xref: /illumos-gate/usr/src/uts/common/fs/zfs/arc.c (revision 44eda4d7)
1fa9e4066Sahrens /*
2fa9e4066Sahrens  * CDDL HEADER START
3fa9e4066Sahrens  *
4fa9e4066Sahrens  * The contents of this file are subject to the terms of the
5033f9833Sek  * Common Development and Distribution License (the "License").
6033f9833Sek  * You may not use this file except in compliance with the License.
7fa9e4066Sahrens  *
8fa9e4066Sahrens  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9fa9e4066Sahrens  * or http://www.opensolaris.org/os/licensing.
10fa9e4066Sahrens  * See the License for the specific language governing permissions
11fa9e4066Sahrens  * and limitations under the License.
12fa9e4066Sahrens  *
13fa9e4066Sahrens  * When distributing Covered Code, include this CDDL HEADER in each
14fa9e4066Sahrens  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15fa9e4066Sahrens  * If applicable, add the following below this CDDL HEADER, with the
16fa9e4066Sahrens  * fields enclosed by brackets "[]" replaced with your own identifying
17fa9e4066Sahrens  * information: Portions Copyright [yyyy] [name of copyright owner]
18fa9e4066Sahrens  *
19fa9e4066Sahrens  * CDDL HEADER END
20fa9e4066Sahrens  */
21fa9e4066Sahrens /*
22033f9833Sek  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
23fa9e4066Sahrens  * Use is subject to license terms.
24fa9e4066Sahrens  */
25fa9e4066Sahrens 
26fa9e4066Sahrens #pragma ident	"%Z%%M%	%I%	%E% SMI"
27fa9e4066Sahrens 
28fa9e4066Sahrens /*
29fa9e4066Sahrens  * DVA-based Adjustable Replacement Cache
30fa9e4066Sahrens  *
31ea8dc4b6Seschrock  * While much of the theory of operation used here is
32ea8dc4b6Seschrock  * based on the self-tuning, low overhead replacement cache
33fa9e4066Sahrens  * presented by Megiddo and Modha at FAST 2003, there are some
34fa9e4066Sahrens  * significant differences:
35fa9e4066Sahrens  *
36fa9e4066Sahrens  * 1. The Megiddo and Modha model assumes any page is evictable.
37fa9e4066Sahrens  * Pages in its cache cannot be "locked" into memory.  This makes
38fa9e4066Sahrens  * the eviction algorithm simple: evict the last page in the list.
39fa9e4066Sahrens  * This also makes the performance characteristics easy to reason
40fa9e4066Sahrens  * about.  Our cache is not so simple.  At any given moment, some
41fa9e4066Sahrens  * subset of the blocks in the cache are un-evictable because we
42fa9e4066Sahrens  * have handed out a reference to them.  Blocks are only evictable
43fa9e4066Sahrens  * when there are no external references active.  This makes
44fa9e4066Sahrens  * eviction far more problematic:  we choose to evict the evictable
45fa9e4066Sahrens  * blocks that are the "lowest" in the list.
46fa9e4066Sahrens  *
47fa9e4066Sahrens  * There are times when it is not possible to evict the requested
48fa9e4066Sahrens  * space.  In these circumstances we are unable to adjust the cache
49fa9e4066Sahrens  * size.  To prevent the cache from growing unbounded at these times,
50fa9e4066Sahrens  * we implement a "cache throttle" that slows the flow of new data
51fa9e4066Sahrens  * into the cache until we can make space available.
52fa9e4066Sahrens  *
53fa9e4066Sahrens  * 2. The Megiddo and Modha model assumes a fixed cache size.
54fa9e4066Sahrens  * Pages are evicted when the cache is full and there is a cache
55fa9e4066Sahrens  * miss.  Our model has a variable sized cache.  It grows with
56fa9e4066Sahrens  * high use, but also tries to react to memory pressure from the
57fa9e4066Sahrens  * operating system: decreasing its size when system memory is
58fa9e4066Sahrens  * tight.
59fa9e4066Sahrens  *
60fa9e4066Sahrens  * 3. The Megiddo and Modha model assumes a fixed page size. All
61fa9e4066Sahrens  * elements of the cache are therefore exactly the same size.  So
62fa9e4066Sahrens  * when adjusting the cache size following a cache miss, it's simply
63fa9e4066Sahrens  * a matter of choosing a single page to evict.  In our model, we
64fa9e4066Sahrens  * have variable sized cache blocks (ranging from 512 bytes to
65fa9e4066Sahrens  * 128K bytes).  We therefore choose a set of blocks to evict to make
66fa9e4066Sahrens  * space for a cache miss that approximates as closely as possible
67fa9e4066Sahrens  * the space used by the new block.
68fa9e4066Sahrens  *
69fa9e4066Sahrens  * See also:  "ARC: A Self-Tuning, Low Overhead Replacement Cache"
70fa9e4066Sahrens  * by N. Megiddo & D. Modha, FAST 2003
71fa9e4066Sahrens  */
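
/*
 * A minimal sketch of the variable-size eviction idea in point 3 above.
 * This is illustrative only -- the toy type and function below are
 * hypothetical and are not part of the ARC implementation (see
 * arc_evict() for the real thing).
 */
#if 0
typedef struct toy_blk {
	struct toy_blk	*tb_next;
	uint64_t	tb_size;	/* anywhere from 512 bytes to 128K */
} toy_blk_t;

/*
 * Unlink blocks from the head of 'list' until at least 'needed' bytes
 * have been freed; because block sizes vary we may slightly overshoot.
 */
static uint64_t
toy_evict_bytes(toy_blk_t **list, uint64_t needed)
{
	uint64_t freed = 0;
	toy_blk_t *tb;

	while (freed < needed && (tb = *list) != NULL) {
		*list = tb->tb_next;
		freed += tb->tb_size;
		kmem_free(tb, sizeof (toy_blk_t));
	}
	return (freed);
}
#endif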
72fa9e4066Sahrens 
73fa9e4066Sahrens /*
74fa9e4066Sahrens  * The locking model:
75fa9e4066Sahrens  *
76fa9e4066Sahrens  * A new reference to a cache buffer can be obtained in two
77fa9e4066Sahrens  * ways: 1) via a hash table lookup using the DVA as a key,
78fa9e4066Sahrens  * or 2) via one of the ARC lists.  The arc_read() interface
79fa9e4066Sahrens  * uses method 1, while the internal arc algorithms for
80fa9e4066Sahrens  * adjusting the cache use method 2.  We therefore provide two
81fa9e4066Sahrens  * types of locks: 1) the hash table lock array, and 2) the
82fa9e4066Sahrens  * arc list locks.
83fa9e4066Sahrens  *
84fa9e4066Sahrens  * Buffers do not have their own mutexes; rather, they rely on the
85fa9e4066Sahrens  * hash table mutexes for the bulk of their protection (i.e. most
86fa9e4066Sahrens  * fields in the arc_buf_hdr_t are protected by these mutexes).
87fa9e4066Sahrens  *
88fa9e4066Sahrens  * buf_hash_find() returns the appropriate mutex (held) when it
89fa9e4066Sahrens  * locates the requested buffer in the hash table.  It returns
90fa9e4066Sahrens  * NULL for the mutex if the buffer was not in the table.
91fa9e4066Sahrens  *
92fa9e4066Sahrens  * buf_hash_remove() expects the appropriate hash mutex to be
93fa9e4066Sahrens  * already held before it is invoked.
94fa9e4066Sahrens  *
95fa9e4066Sahrens  * Each arc state also has a mutex which is used to protect the
96fa9e4066Sahrens  * buffer list associated with the state.  When attempting to
97fa9e4066Sahrens  * obtain a hash table lock while holding an arc list lock, you
98fa9e4066Sahrens  * must use mutex_tryenter() to avoid deadlock.  Also note that
99*44eda4d7Smaybee  * the active state mutex must be held before the ghost state mutex.
100fa9e4066Sahrens  *
101ea8dc4b6Seschrock  * Arc buffers may have an associated eviction callback function.
102ea8dc4b6Seschrock  * This function will be invoked prior to removing the buffer (e.g.
103ea8dc4b6Seschrock  * in arc_do_user_evicts()).  Note however that the data associated
104ea8dc4b6Seschrock  * with the buffer may be evicted prior to the callback.  The callback
105ea8dc4b6Seschrock  * must be made with *no locks held* (to prevent deadlock).  Additionally,
106ea8dc4b6Seschrock  * the users of callbacks must ensure that their private data is
107ea8dc4b6Seschrock  * protected from simultaneous callbacks from arc_buf_evict()
108ea8dc4b6Seschrock  * and arc_do_user_evicts().
109ea8dc4b6Seschrock  *
110fa9e4066Sahrens  * Note that the majority of the performance stats are manipulated
111fa9e4066Sahrens  * with atomic operations.
112fa9e4066Sahrens  */
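
/*
 * A minimal sketch of the lock-ordering rule above, assuming the list
 * walk shape used by the eviction code later in this file (see
 * arc_evict()): while an arc list lock is held, a hash table lock may
 * only be taken with mutex_tryenter(), and a failed try must be
 * handled by skipping the buffer rather than blocking.
 */
#if 0
static void
toy_walk_state_list(arc_state_t *state)
{
	arc_buf_hdr_t *ab, *ab_prev;
	kmutex_t *hash_lock;

	mutex_enter(&state->mtx);		/* arc list lock */
	for (ab = list_tail(&state->list); ab; ab = ab_prev) {
		ab_prev = list_prev(&state->list, ab);
		hash_lock = HDR_LOCK(ab);
		if (!mutex_tryenter(hash_lock))
			continue;	/* blocking here could deadlock */
		/* ... ab is now stable; examine or move it ... */
		mutex_exit(hash_lock);
	}
	mutex_exit(&state->mtx);
}
#endif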
113fa9e4066Sahrens 
114fa9e4066Sahrens #include <sys/spa.h>
115fa9e4066Sahrens #include <sys/zio.h>
116fa9e4066Sahrens #include <sys/zfs_context.h>
117fa9e4066Sahrens #include <sys/arc.h>
118fa9e4066Sahrens #include <sys/refcount.h>
119fa9e4066Sahrens #ifdef _KERNEL
120fa9e4066Sahrens #include <sys/vmsystm.h>
121fa9e4066Sahrens #include <vm/anon.h>
122fa9e4066Sahrens #include <sys/fs/swapnode.h>
123033f9833Sek #include <sys/dnlc.h>
124fa9e4066Sahrens #endif
125fa9e4066Sahrens #include <sys/callb.h>
126fa9e4066Sahrens 
127fa9e4066Sahrens static kmutex_t		arc_reclaim_thr_lock;
128fa9e4066Sahrens static kcondvar_t	arc_reclaim_thr_cv;	/* used to signal reclaim thr */
129fa9e4066Sahrens static uint8_t		arc_thread_exit;
130fa9e4066Sahrens 
131033f9833Sek #define	ARC_REDUCE_DNLC_PERCENT	3
132033f9833Sek uint_t arc_reduce_dnlc_percent = ARC_REDUCE_DNLC_PERCENT;
133033f9833Sek 
134fa9e4066Sahrens typedef enum arc_reclaim_strategy {
135fa9e4066Sahrens 	ARC_RECLAIM_AGGR,		/* Aggressive reclaim strategy */
136fa9e4066Sahrens 	ARC_RECLAIM_CONS		/* Conservative reclaim strategy */
137fa9e4066Sahrens } arc_reclaim_strategy_t;
138fa9e4066Sahrens 
139fa9e4066Sahrens /* number of seconds before growing cache again */
140fa9e4066Sahrens static int		arc_grow_retry = 60;
141fa9e4066Sahrens 
14213506d1eSmaybee /*
143b19a79ecSperrin  * minimum lifespan of a prefetch block in clock ticks
144b19a79ecSperrin  * (initialized in arc_init())
14513506d1eSmaybee  */
146b19a79ecSperrin static int		arc_min_prefetch_lifespan;
14713506d1eSmaybee 
148fa9e4066Sahrens static kmutex_t arc_reclaim_lock;
149fa9e4066Sahrens static int arc_dead;
150fa9e4066Sahrens 
151fa9e4066Sahrens /*
152fa9e4066Sahrens  * Note that buffers can be in one of 5 states:
153fa9e4066Sahrens  *	ARC_anon	- anonymous (discussed below)
154ea8dc4b6Seschrock  *	ARC_mru		- recently used, currently cached
155ea8dc4b6Seschrock  *	ARC_mru_ghost	- recently used, no longer in cache
156ea8dc4b6Seschrock  *	ARC_mfu		- frequently used, currently cached
157ea8dc4b6Seschrock  *	ARC_mfu_ghost	- frequently used, no longer in cache
158fa9e4066Sahrens  * When there are no active references to a buffer, it is
159fa9e4066Sahrens  * linked onto one of the lists in arc.  These are the
160fa9e4066Sahrens  * only buffers that can be evicted or deleted.
161fa9e4066Sahrens  *
162fa9e4066Sahrens  * Anonymous buffers are buffers that are not associated with
163fa9e4066Sahrens  * a DVA.  These are buffers that hold dirty block copies
164fa9e4066Sahrens  * before they are written to stable storage.  By definition,
165ea8dc4b6Seschrock  * they are "ref'd" and are considered part of arc_mru
166fa9e4066Sahrens  * that cannot be freed.  Generally, they will acquire a DVA
167ea8dc4b6Seschrock  * as they are written and migrate onto the arc_mru list.
168fa9e4066Sahrens  */
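
/*
 * Hedged sketch of the anonymous-buffer migration described above.  The
 * real transition happens in the write-completion path; the fragment
 * below is illustrative, and the bp/hdr/hash_lock variables are
 * hypothetical.
 */
#if 0
	/* a dirty anonymous buffer acquires a DVA as it is written ... */
	hdr->b_dva = *BP_IDENTITY(bp);
	hdr->b_birth = bp->blk_birth;
	(void) buf_hash_insert(hdr, &hash_lock);
	/* ... and migrates from arc.anon onto the arc.mru list */
	arc_change_state(arc.mru, hdr, hash_lock);
	mutex_exit(hash_lock);
#endif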
169fa9e4066Sahrens 
170fa9e4066Sahrens typedef struct arc_state {
171fa9e4066Sahrens 	list_t	list;	/* linked list of evictable buffers in state */
172fa9e4066Sahrens 	uint64_t lsize;	/* total size of buffers in the linked list */
173fa9e4066Sahrens 	uint64_t size;	/* total size of all buffers in this state */
174fa9e4066Sahrens 	uint64_t hits;
175fa9e4066Sahrens 	kmutex_t mtx;
176fa9e4066Sahrens } arc_state_t;
177fa9e4066Sahrens 
178fa9e4066Sahrens /* The 5 states: */
179fa9e4066Sahrens static arc_state_t ARC_anon;
180ea8dc4b6Seschrock static arc_state_t ARC_mru;
181ea8dc4b6Seschrock static arc_state_t ARC_mru_ghost;
182ea8dc4b6Seschrock static arc_state_t ARC_mfu;
183ea8dc4b6Seschrock static arc_state_t ARC_mfu_ghost;
184fa9e4066Sahrens 
185fa9e4066Sahrens static struct arc {
186fa9e4066Sahrens 	arc_state_t 	*anon;
187ea8dc4b6Seschrock 	arc_state_t	*mru;
188ea8dc4b6Seschrock 	arc_state_t	*mru_ghost;
189ea8dc4b6Seschrock 	arc_state_t	*mfu;
190ea8dc4b6Seschrock 	arc_state_t	*mfu_ghost;
191fa9e4066Sahrens 	uint64_t	size;		/* Actual total arc size */
192ea8dc4b6Seschrock 	uint64_t	p;		/* Target size (in bytes) of mru */
193fa9e4066Sahrens 	uint64_t	c;		/* Target size of cache (in bytes) */
194fa9e4066Sahrens 	uint64_t	c_min;		/* Minimum target cache size */
195fa9e4066Sahrens 	uint64_t	c_max;		/* Maximum target cache size */
196fa9e4066Sahrens 
197fa9e4066Sahrens 	/* performance stats */
198fa9e4066Sahrens 	uint64_t	hits;
199fa9e4066Sahrens 	uint64_t	misses;
200fa9e4066Sahrens 	uint64_t	deleted;
201*44eda4d7Smaybee 	uint64_t	recycle_miss;
202*44eda4d7Smaybee 	uint64_t	mutex_miss;
203*44eda4d7Smaybee 	uint64_t	evict_skip;
204fa9e4066Sahrens 	uint64_t	hash_elements;
205fa9e4066Sahrens 	uint64_t	hash_elements_max;
206fa9e4066Sahrens 	uint64_t	hash_collisions;
207fa9e4066Sahrens 	uint64_t	hash_chains;
208fa9e4066Sahrens 	uint32_t	hash_chain_max;
209fa9e4066Sahrens 
210fa9e4066Sahrens 	int		no_grow;	/* Don't try to grow cache size */
211fa9e4066Sahrens } arc;
212fa9e4066Sahrens 
213fa9e4066Sahrens static uint64_t arc_tempreserve;
214fa9e4066Sahrens 
215fa9e4066Sahrens typedef struct arc_callback arc_callback_t;
216fa9e4066Sahrens 
217fa9e4066Sahrens struct arc_callback {
218fa9e4066Sahrens 	arc_done_func_t		*acb_done;
219fa9e4066Sahrens 	void			*acb_private;
220fa9e4066Sahrens 	arc_byteswap_func_t	*acb_byteswap;
221fa9e4066Sahrens 	arc_buf_t		*acb_buf;
222fa9e4066Sahrens 	zio_t			*acb_zio_dummy;
223fa9e4066Sahrens 	arc_callback_t		*acb_next;
224fa9e4066Sahrens };
225fa9e4066Sahrens 
226fa9e4066Sahrens struct arc_buf_hdr {
227fa9e4066Sahrens 	/* immutable */
228fa9e4066Sahrens 	uint64_t		b_size;
229fa9e4066Sahrens 	spa_t			*b_spa;
230fa9e4066Sahrens 
231fa9e4066Sahrens 	/* protected by hash lock */
232fa9e4066Sahrens 	dva_t			b_dva;
233fa9e4066Sahrens 	uint64_t		b_birth;
234fa9e4066Sahrens 	uint64_t		b_cksum0;
235fa9e4066Sahrens 
236fa9e4066Sahrens 	arc_buf_hdr_t		*b_hash_next;
237fa9e4066Sahrens 	arc_buf_t		*b_buf;
238fa9e4066Sahrens 	uint32_t		b_flags;
239ea8dc4b6Seschrock 	uint32_t		b_datacnt;
240fa9e4066Sahrens 
241fa9e4066Sahrens 	kcondvar_t		b_cv;
242fa9e4066Sahrens 	arc_callback_t		*b_acb;
243fa9e4066Sahrens 
244fa9e4066Sahrens 	/* protected by arc state mutex */
245fa9e4066Sahrens 	arc_state_t		*b_state;
246fa9e4066Sahrens 	list_node_t		b_arc_node;
247fa9e4066Sahrens 
248fa9e4066Sahrens 	/* updated atomically */
249fa9e4066Sahrens 	clock_t			b_arc_access;
250fa9e4066Sahrens 
251fa9e4066Sahrens 	/* self protecting */
252fa9e4066Sahrens 	refcount_t		b_refcnt;
253fa9e4066Sahrens };
254fa9e4066Sahrens 
255ea8dc4b6Seschrock static arc_buf_t *arc_eviction_list;
256ea8dc4b6Seschrock static kmutex_t arc_eviction_mtx;
257*44eda4d7Smaybee static void arc_get_data_buf(arc_buf_t *buf);
258*44eda4d7Smaybee static void arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock);
259ea8dc4b6Seschrock 
260ea8dc4b6Seschrock #define	GHOST_STATE(state)	\
261ea8dc4b6Seschrock 	((state) == arc.mru_ghost || (state) == arc.mfu_ghost)
262ea8dc4b6Seschrock 
263fa9e4066Sahrens /*
264fa9e4066Sahrens  * Private ARC flags.  These flags are private ARC only flags that will show up
265fa9e4066Sahrens  * in b_flags in the arc_buf_hdr_t.  Some flags are publicly declared, and can
266fa9e4066Sahrens  * be passed in as arc_flags in things like arc_read.  However, these flags
267fa9e4066Sahrens  * should never be passed and should only be set by ARC code.  When adding new
268fa9e4066Sahrens  * public flags, make sure not to smash the private ones.
269fa9e4066Sahrens  */
270fa9e4066Sahrens 
271ea8dc4b6Seschrock #define	ARC_IN_HASH_TABLE	(1 << 9)	/* this buffer is hashed */
272fa9e4066Sahrens #define	ARC_IO_IN_PROGRESS	(1 << 10)	/* I/O in progress for buf */
273fa9e4066Sahrens #define	ARC_IO_ERROR		(1 << 11)	/* I/O failed for buf */
274fa9e4066Sahrens #define	ARC_FREED_IN_READ	(1 << 12)	/* buf freed while in read */
275ea8dc4b6Seschrock #define	ARC_BUF_AVAILABLE	(1 << 13)	/* block not in active use */
27613506d1eSmaybee #define	ARC_INDIRECT		(1 << 14)	/* this is an indirect block */
277fa9e4066Sahrens 
278ea8dc4b6Seschrock #define	HDR_IN_HASH_TABLE(hdr)	((hdr)->b_flags & ARC_IN_HASH_TABLE)
279fa9e4066Sahrens #define	HDR_IO_IN_PROGRESS(hdr)	((hdr)->b_flags & ARC_IO_IN_PROGRESS)
280fa9e4066Sahrens #define	HDR_IO_ERROR(hdr)	((hdr)->b_flags & ARC_IO_ERROR)
281fa9e4066Sahrens #define	HDR_FREED_IN_READ(hdr)	((hdr)->b_flags & ARC_FREED_IN_READ)
282ea8dc4b6Seschrock #define	HDR_BUF_AVAILABLE(hdr)	((hdr)->b_flags & ARC_BUF_AVAILABLE)
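
/*
 * Note (illustrative): the public flags declared in arc.h occupy the
 * low bits, which is why the private flags above start at bit 9.  A
 * hypothetical new private flag would simply continue the sequence:
 */
#if 0
#define	ARC_EXAMPLE		(1 << 15)	/* hypothetical next free bit */
#endif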
283fa9e4066Sahrens 
284fa9e4066Sahrens /*
285fa9e4066Sahrens  * Hash table routines
286fa9e4066Sahrens  */
287fa9e4066Sahrens 
288fa9e4066Sahrens #define	HT_LOCK_PAD	64
289fa9e4066Sahrens 
290fa9e4066Sahrens struct ht_lock {
291fa9e4066Sahrens 	kmutex_t	ht_lock;
292fa9e4066Sahrens #ifdef _KERNEL
293fa9e4066Sahrens 	unsigned char	pad[(HT_LOCK_PAD - sizeof (kmutex_t))];
294fa9e4066Sahrens #endif
295fa9e4066Sahrens };
296fa9e4066Sahrens 
297fa9e4066Sahrens #define	BUF_LOCKS 256
298fa9e4066Sahrens typedef struct buf_hash_table {
299fa9e4066Sahrens 	uint64_t ht_mask;
300fa9e4066Sahrens 	arc_buf_hdr_t **ht_table;
301fa9e4066Sahrens 	struct ht_lock ht_locks[BUF_LOCKS];
302fa9e4066Sahrens } buf_hash_table_t;
303fa9e4066Sahrens 
304fa9e4066Sahrens static buf_hash_table_t buf_hash_table;
305fa9e4066Sahrens 
306fa9e4066Sahrens #define	BUF_HASH_INDEX(spa, dva, birth) \
307fa9e4066Sahrens 	(buf_hash(spa, dva, birth) & buf_hash_table.ht_mask)
308fa9e4066Sahrens #define	BUF_HASH_LOCK_NTRY(idx) (buf_hash_table.ht_locks[idx & (BUF_LOCKS-1)])
309fa9e4066Sahrens #define	BUF_HASH_LOCK(idx)	(&(BUF_HASH_LOCK_NTRY(idx).ht_lock))
310fa9e4066Sahrens #define	HDR_LOCK(buf) \
311fa9e4066Sahrens 	(BUF_HASH_LOCK(BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth)))
312fa9e4066Sahrens 
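/*
 * A worked example of the striping above (sizes assumed for
 * illustration): with BUF_LOCKS = 256 and a table of 2^20 buckets,
 * BUF_HASH_INDEX() selects one of 1048576 hash chains, while
 * BUF_HASH_LOCK_NTRY() folds that index onto one of only 256 locks
 * via (idx & 255) -- so each lock protects 4096 buckets, and two
 * headers contend only when their indices agree in the low 8 bits.
 */
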
313fa9e4066Sahrens uint64_t zfs_crc64_table[256];
314fa9e4066Sahrens 
315fa9e4066Sahrens static uint64_t
316fa9e4066Sahrens buf_hash(spa_t *spa, dva_t *dva, uint64_t birth)
317fa9e4066Sahrens {
318fa9e4066Sahrens 	uintptr_t spav = (uintptr_t)spa;
319fa9e4066Sahrens 	uint8_t *vdva = (uint8_t *)dva;
320fa9e4066Sahrens 	uint64_t crc = -1ULL;
321fa9e4066Sahrens 	int i;
322fa9e4066Sahrens 
323fa9e4066Sahrens 	ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
324fa9e4066Sahrens 
325fa9e4066Sahrens 	for (i = 0; i < sizeof (dva_t); i++)
326fa9e4066Sahrens 		crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ vdva[i]) & 0xFF];
327fa9e4066Sahrens 
328fa9e4066Sahrens 	crc ^= (spav>>8) ^ birth;
329fa9e4066Sahrens 
330fa9e4066Sahrens 	return (crc);
331fa9e4066Sahrens }
332fa9e4066Sahrens 
333fa9e4066Sahrens #define	BUF_EMPTY(buf)						\
334fa9e4066Sahrens 	((buf)->b_dva.dva_word[0] == 0 &&			\
335fa9e4066Sahrens 	(buf)->b_dva.dva_word[1] == 0 &&			\
336fa9e4066Sahrens 	(buf)->b_birth == 0)
337fa9e4066Sahrens 
338fa9e4066Sahrens #define	BUF_EQUAL(spa, dva, birth, buf)				\
339fa9e4066Sahrens 	((buf)->b_dva.dva_word[0] == (dva)->dva_word[0]) &&	\
340fa9e4066Sahrens 	((buf)->b_dva.dva_word[1] == (dva)->dva_word[1]) &&	\
341fa9e4066Sahrens 	((buf)->b_birth == birth) && ((buf)->b_spa == spa)
342fa9e4066Sahrens 
343fa9e4066Sahrens static arc_buf_hdr_t *
344fa9e4066Sahrens buf_hash_find(spa_t *spa, dva_t *dva, uint64_t birth, kmutex_t **lockp)
345fa9e4066Sahrens {
346fa9e4066Sahrens 	uint64_t idx = BUF_HASH_INDEX(spa, dva, birth);
347fa9e4066Sahrens 	kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
348fa9e4066Sahrens 	arc_buf_hdr_t *buf;
349fa9e4066Sahrens 
350fa9e4066Sahrens 	mutex_enter(hash_lock);
351fa9e4066Sahrens 	for (buf = buf_hash_table.ht_table[idx]; buf != NULL;
352fa9e4066Sahrens 	    buf = buf->b_hash_next) {
353fa9e4066Sahrens 		if (BUF_EQUAL(spa, dva, birth, buf)) {
354fa9e4066Sahrens 			*lockp = hash_lock;
355fa9e4066Sahrens 			return (buf);
356fa9e4066Sahrens 		}
357fa9e4066Sahrens 	}
358fa9e4066Sahrens 	mutex_exit(hash_lock);
359fa9e4066Sahrens 	*lockp = NULL;
360fa9e4066Sahrens 	return (NULL);
361fa9e4066Sahrens }
362fa9e4066Sahrens 
363fa9e4066Sahrens /*
364fa9e4066Sahrens  * Insert an entry into the hash table.  If there is already an element
365fa9e4066Sahrens  * equal to elem in the hash table, then the already existing element
366fa9e4066Sahrens  * will be returned and the new element will not be inserted.
367fa9e4066Sahrens  * Otherwise returns NULL.
368fa9e4066Sahrens  */
369fa9e4066Sahrens static arc_buf_hdr_t *
370fa9e4066Sahrens buf_hash_insert(arc_buf_hdr_t *buf, kmutex_t **lockp)
371fa9e4066Sahrens {
372fa9e4066Sahrens 	uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth);
373fa9e4066Sahrens 	kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
374fa9e4066Sahrens 	arc_buf_hdr_t *fbuf;
375fa9e4066Sahrens 	uint32_t max, i;
376fa9e4066Sahrens 
377ea8dc4b6Seschrock 	ASSERT(!HDR_IN_HASH_TABLE(buf));
378fa9e4066Sahrens 	*lockp = hash_lock;
379fa9e4066Sahrens 	mutex_enter(hash_lock);
380fa9e4066Sahrens 	for (fbuf = buf_hash_table.ht_table[idx], i = 0; fbuf != NULL;
381fa9e4066Sahrens 	    fbuf = fbuf->b_hash_next, i++) {
382fa9e4066Sahrens 		if (BUF_EQUAL(buf->b_spa, &buf->b_dva, buf->b_birth, fbuf))
383fa9e4066Sahrens 			return (fbuf);
384fa9e4066Sahrens 	}
385fa9e4066Sahrens 
386fa9e4066Sahrens 	buf->b_hash_next = buf_hash_table.ht_table[idx];
387fa9e4066Sahrens 	buf_hash_table.ht_table[idx] = buf;
388ea8dc4b6Seschrock 	buf->b_flags |= ARC_IN_HASH_TABLE;
389fa9e4066Sahrens 
390fa9e4066Sahrens 	/* collect some hash table performance data */
391fa9e4066Sahrens 	if (i > 0) {
392fa9e4066Sahrens 		atomic_add_64(&arc.hash_collisions, 1);
393fa9e4066Sahrens 		if (i == 1)
394fa9e4066Sahrens 			atomic_add_64(&arc.hash_chains, 1);
395fa9e4066Sahrens 	}
396fa9e4066Sahrens 	while (i > (max = arc.hash_chain_max) &&
397fa9e4066Sahrens 	    max != atomic_cas_32(&arc.hash_chain_max, max, i)) {
398fa9e4066Sahrens 		continue;
399fa9e4066Sahrens 	}
400fa9e4066Sahrens 	atomic_add_64(&arc.hash_elements, 1);
401fa9e4066Sahrens 	if (arc.hash_elements > arc.hash_elements_max)
402fa9e4066Sahrens 		atomic_add_64(&arc.hash_elements_max, 1);
403fa9e4066Sahrens 
404fa9e4066Sahrens 	return (NULL);
405fa9e4066Sahrens }
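
/*
 * Sketch of the intended caller pattern for buf_hash_insert() (the
 * variable names are hypothetical; note that the hash lock is returned
 * held in either case, so the caller must eventually drop it):
 */
#if 0
	arc_buf_hdr_t *exists = buf_hash_insert(hdr, &hash_lock);
	if (exists != NULL) {
		/* somebody beat us to the insert; use 'exists' instead */
		mutex_exit(hash_lock);
		arc_hdr_destroy(hdr);
	}
#endif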
406fa9e4066Sahrens 
407fa9e4066Sahrens static void
408fa9e4066Sahrens buf_hash_remove(arc_buf_hdr_t *buf)
409fa9e4066Sahrens {
410fa9e4066Sahrens 	arc_buf_hdr_t *fbuf, **bufp;
411fa9e4066Sahrens 	uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth);
412fa9e4066Sahrens 
413fa9e4066Sahrens 	ASSERT(MUTEX_HELD(BUF_HASH_LOCK(idx)));
414ea8dc4b6Seschrock 	ASSERT(HDR_IN_HASH_TABLE(buf));
415fa9e4066Sahrens 
416fa9e4066Sahrens 	bufp = &buf_hash_table.ht_table[idx];
417fa9e4066Sahrens 	while ((fbuf = *bufp) != buf) {
418fa9e4066Sahrens 		ASSERT(fbuf != NULL);
419fa9e4066Sahrens 		bufp = &fbuf->b_hash_next;
420fa9e4066Sahrens 	}
421fa9e4066Sahrens 	*bufp = buf->b_hash_next;
422fa9e4066Sahrens 	buf->b_hash_next = NULL;
423ea8dc4b6Seschrock 	buf->b_flags &= ~ARC_IN_HASH_TABLE;
424fa9e4066Sahrens 
425fa9e4066Sahrens 	/* collect some hash table performance data */
426fa9e4066Sahrens 	atomic_add_64(&arc.hash_elements, -1);
427fa9e4066Sahrens 	if (buf_hash_table.ht_table[idx] &&
428fa9e4066Sahrens 	    buf_hash_table.ht_table[idx]->b_hash_next == NULL)
429fa9e4066Sahrens 		atomic_add_64(&arc.hash_chains, -1);
430fa9e4066Sahrens }
431fa9e4066Sahrens 
432fa9e4066Sahrens /*
433fa9e4066Sahrens  * Global data structures and functions for the buf kmem cache.
434fa9e4066Sahrens  */
435fa9e4066Sahrens static kmem_cache_t *hdr_cache;
436fa9e4066Sahrens static kmem_cache_t *buf_cache;
437fa9e4066Sahrens 
438fa9e4066Sahrens static void
439fa9e4066Sahrens buf_fini(void)
440fa9e4066Sahrens {
441fa9e4066Sahrens 	int i;
442fa9e4066Sahrens 
443fa9e4066Sahrens 	kmem_free(buf_hash_table.ht_table,
444fa9e4066Sahrens 	    (buf_hash_table.ht_mask + 1) * sizeof (void *));
445fa9e4066Sahrens 	for (i = 0; i < BUF_LOCKS; i++)
446fa9e4066Sahrens 		mutex_destroy(&buf_hash_table.ht_locks[i].ht_lock);
447fa9e4066Sahrens 	kmem_cache_destroy(hdr_cache);
448fa9e4066Sahrens 	kmem_cache_destroy(buf_cache);
449fa9e4066Sahrens }
450fa9e4066Sahrens 
451fa9e4066Sahrens /*
452fa9e4066Sahrens  * Constructor callback - called when the cache is empty
453fa9e4066Sahrens  * and a new buf is requested.
454fa9e4066Sahrens  */
455fa9e4066Sahrens /* ARGSUSED */
456fa9e4066Sahrens static int
457fa9e4066Sahrens hdr_cons(void *vbuf, void *unused, int kmflag)
458fa9e4066Sahrens {
459fa9e4066Sahrens 	arc_buf_hdr_t *buf = vbuf;
460fa9e4066Sahrens 
461fa9e4066Sahrens 	bzero(buf, sizeof (arc_buf_hdr_t));
462fa9e4066Sahrens 	refcount_create(&buf->b_refcnt);
463fa9e4066Sahrens 	cv_init(&buf->b_cv, NULL, CV_DEFAULT, NULL);
464fa9e4066Sahrens 	return (0);
465fa9e4066Sahrens }
466fa9e4066Sahrens 
467fa9e4066Sahrens /*
468fa9e4066Sahrens  * Destructor callback - called when a cached buf is
469fa9e4066Sahrens  * no longer required.
470fa9e4066Sahrens  */
471fa9e4066Sahrens /* ARGSUSED */
472fa9e4066Sahrens static void
473fa9e4066Sahrens hdr_dest(void *vbuf, void *unused)
474fa9e4066Sahrens {
475fa9e4066Sahrens 	arc_buf_hdr_t *buf = vbuf;
476fa9e4066Sahrens 
477fa9e4066Sahrens 	refcount_destroy(&buf->b_refcnt);
478fa9e4066Sahrens 	cv_destroy(&buf->b_cv);
479fa9e4066Sahrens }
480fa9e4066Sahrens 
481ea8dc4b6Seschrock static int arc_reclaim_needed(void);
482fa9e4066Sahrens void arc_kmem_reclaim(void);
483fa9e4066Sahrens 
484fa9e4066Sahrens /*
485fa9e4066Sahrens  * Reclaim callback -- invoked when memory is low.
486fa9e4066Sahrens  */
487fa9e4066Sahrens /* ARGSUSED */
488fa9e4066Sahrens static void
489fa9e4066Sahrens hdr_recl(void *unused)
490fa9e4066Sahrens {
491fa9e4066Sahrens 	dprintf("hdr_recl called\n");
492ea8dc4b6Seschrock 	if (arc_reclaim_needed())
493ea8dc4b6Seschrock 		arc_kmem_reclaim();
494fa9e4066Sahrens }
495fa9e4066Sahrens 
496fa9e4066Sahrens static void
497fa9e4066Sahrens buf_init(void)
498fa9e4066Sahrens {
499fa9e4066Sahrens 	uint64_t *ct;
500ea8dc4b6Seschrock 	uint64_t hsize = 1ULL << 12;
501fa9e4066Sahrens 	int i, j;
502fa9e4066Sahrens 
503fa9e4066Sahrens 	/*
504fa9e4066Sahrens 	 * The hash table is big enough to fill all of physical memory
505ea8dc4b6Seschrock 	 * with an average 64K block size.  The table will take up
506ea8dc4b6Seschrock 	 * totalmem*sizeof(void*)/64K (e.g. 128KB/GB with 8-byte pointers).
507fa9e4066Sahrens 	 */
508ea8dc4b6Seschrock 	while (hsize * 65536 < physmem * PAGESIZE)
509fa9e4066Sahrens 		hsize <<= 1;
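	/*
	 * Worked example (figures assumed for illustration): with 4GB of
	 * physical memory, the loop above exits at hsize = 2^16, and the
	 * table allocated below consumes 65536 * sizeof (void *) = 512KB
	 * with 8-byte pointers -- i.e. the 128KB per GB noted above.
	 */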
510ea8dc4b6Seschrock retry:
511fa9e4066Sahrens 	buf_hash_table.ht_mask = hsize - 1;
512ea8dc4b6Seschrock 	buf_hash_table.ht_table =
513ea8dc4b6Seschrock 	    kmem_zalloc(hsize * sizeof (void*), KM_NOSLEEP);
514ea8dc4b6Seschrock 	if (buf_hash_table.ht_table == NULL) {
515ea8dc4b6Seschrock 		ASSERT(hsize > (1ULL << 8));
516ea8dc4b6Seschrock 		hsize >>= 1;
517ea8dc4b6Seschrock 		goto retry;
518ea8dc4b6Seschrock 	}
519fa9e4066Sahrens 
520fa9e4066Sahrens 	hdr_cache = kmem_cache_create("arc_buf_hdr_t", sizeof (arc_buf_hdr_t),
521fa9e4066Sahrens 	    0, hdr_cons, hdr_dest, hdr_recl, NULL, NULL, 0);
522fa9e4066Sahrens 	buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t),
523fa9e4066Sahrens 	    0, NULL, NULL, NULL, NULL, NULL, 0);
524fa9e4066Sahrens 
525fa9e4066Sahrens 	for (i = 0; i < 256; i++)
526fa9e4066Sahrens 		for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--)
527fa9e4066Sahrens 			*ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY);
528fa9e4066Sahrens 
529fa9e4066Sahrens 	for (i = 0; i < BUF_LOCKS; i++) {
530fa9e4066Sahrens 		mutex_init(&buf_hash_table.ht_locks[i].ht_lock,
531fa9e4066Sahrens 		    NULL, MUTEX_DEFAULT, NULL);
532fa9e4066Sahrens 	}
533fa9e4066Sahrens }
534fa9e4066Sahrens 
535fa9e4066Sahrens #define	ARC_MINTIME	(hz>>4) /* 62 ms */
536fa9e4066Sahrens 
537fa9e4066Sahrens static void
538fa9e4066Sahrens add_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag)
539fa9e4066Sahrens {
540fa9e4066Sahrens 	ASSERT(MUTEX_HELD(hash_lock));
541fa9e4066Sahrens 
542fa9e4066Sahrens 	if ((refcount_add(&ab->b_refcnt, tag) == 1) &&
543fa9e4066Sahrens 	    (ab->b_state != arc.anon)) {
544ea8dc4b6Seschrock 		int delta = ab->b_size * ab->b_datacnt;
545fa9e4066Sahrens 
546fa9e4066Sahrens 		ASSERT(!MUTEX_HELD(&ab->b_state->mtx));
547fa9e4066Sahrens 		mutex_enter(&ab->b_state->mtx);
548fa9e4066Sahrens 		ASSERT(list_link_active(&ab->b_arc_node));
549fa9e4066Sahrens 		list_remove(&ab->b_state->list, ab);
550ea8dc4b6Seschrock 		if (GHOST_STATE(ab->b_state)) {
551ea8dc4b6Seschrock 			ASSERT3U(ab->b_datacnt, ==, 0);
552ea8dc4b6Seschrock 			ASSERT3P(ab->b_buf, ==, NULL);
553ea8dc4b6Seschrock 			delta = ab->b_size;
554ea8dc4b6Seschrock 		}
555ea8dc4b6Seschrock 		ASSERT(delta > 0);
556ea8dc4b6Seschrock 		ASSERT3U(ab->b_state->lsize, >=, delta);
557ea8dc4b6Seschrock 		atomic_add_64(&ab->b_state->lsize, -delta);
558fa9e4066Sahrens 		mutex_exit(&ab->b_state->mtx);
55913506d1eSmaybee 		/* remove the prefetch flag if we get a reference */
56013506d1eSmaybee 		if (ab->b_flags & ARC_PREFETCH)
56113506d1eSmaybee 			ab->b_flags &= ~ARC_PREFETCH;
562fa9e4066Sahrens 	}
563fa9e4066Sahrens }
564fa9e4066Sahrens 
565fa9e4066Sahrens static int
566fa9e4066Sahrens remove_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag)
567fa9e4066Sahrens {
568fa9e4066Sahrens 	int cnt;
569fa9e4066Sahrens 
570ea8dc4b6Seschrock 	ASSERT(ab->b_state == arc.anon || MUTEX_HELD(hash_lock));
571ea8dc4b6Seschrock 	ASSERT(!GHOST_STATE(ab->b_state));
572fa9e4066Sahrens 
573fa9e4066Sahrens 	if (((cnt = refcount_remove(&ab->b_refcnt, tag)) == 0) &&
574fa9e4066Sahrens 	    (ab->b_state != arc.anon)) {
575fa9e4066Sahrens 
576fa9e4066Sahrens 		ASSERT(!MUTEX_HELD(&ab->b_state->mtx));
577fa9e4066Sahrens 		mutex_enter(&ab->b_state->mtx);
578fa9e4066Sahrens 		ASSERT(!list_link_active(&ab->b_arc_node));
579fa9e4066Sahrens 		list_insert_head(&ab->b_state->list, ab);
580ea8dc4b6Seschrock 		ASSERT(ab->b_datacnt > 0);
581ea8dc4b6Seschrock 		atomic_add_64(&ab->b_state->lsize, ab->b_size * ab->b_datacnt);
582ea8dc4b6Seschrock 		ASSERT3U(ab->b_state->size, >=, ab->b_state->lsize);
583fa9e4066Sahrens 		mutex_exit(&ab->b_state->mtx);
584fa9e4066Sahrens 	}
585fa9e4066Sahrens 	return (cnt);
586fa9e4066Sahrens }
587fa9e4066Sahrens 
588fa9e4066Sahrens /*
589fa9e4066Sahrens  * Move the supplied buffer to the indicated state.  The mutex
590fa9e4066Sahrens  * for the buffer must be held by the caller.
591fa9e4066Sahrens  */
592fa9e4066Sahrens static void
593ea8dc4b6Seschrock arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock)
594fa9e4066Sahrens {
595ea8dc4b6Seschrock 	arc_state_t *old_state = ab->b_state;
596ea8dc4b6Seschrock 	int refcnt = refcount_count(&ab->b_refcnt);
597ea8dc4b6Seschrock 	int from_delta, to_delta;
598fa9e4066Sahrens 
599fa9e4066Sahrens 	ASSERT(MUTEX_HELD(hash_lock));
600ea8dc4b6Seschrock 	ASSERT(new_state != old_state);
601ea8dc4b6Seschrock 	ASSERT(refcnt == 0 || ab->b_datacnt > 0);
602ea8dc4b6Seschrock 	ASSERT(ab->b_datacnt == 0 || !GHOST_STATE(new_state));
603ea8dc4b6Seschrock 
604ea8dc4b6Seschrock 	from_delta = to_delta = ab->b_datacnt * ab->b_size;
605fa9e4066Sahrens 
606fa9e4066Sahrens 	/*
607fa9e4066Sahrens 	 * If this buffer is evictable, transfer it from the
608fa9e4066Sahrens 	 * old state list to the new state list.
609fa9e4066Sahrens 	 */
610ea8dc4b6Seschrock 	if (refcnt == 0) {
611ea8dc4b6Seschrock 		if (old_state != arc.anon) {
612ea8dc4b6Seschrock 			int use_mutex = !MUTEX_HELD(&old_state->mtx);
613ea8dc4b6Seschrock 
614ea8dc4b6Seschrock 			if (use_mutex)
615ea8dc4b6Seschrock 				mutex_enter(&old_state->mtx);
616fa9e4066Sahrens 
617fa9e4066Sahrens 			ASSERT(list_link_active(&ab->b_arc_node));
618ea8dc4b6Seschrock 			list_remove(&old_state->list, ab);
619ea8dc4b6Seschrock 
62013506d1eSmaybee 			/*
62113506d1eSmaybee 			 * If prefetching out of the ghost cache,
62213506d1eSmaybee 			 * we will have a non-zero datacnt.
62313506d1eSmaybee 			 */
62413506d1eSmaybee 			if (GHOST_STATE(old_state) && ab->b_datacnt == 0) {
62513506d1eSmaybee 				/* ghost elements have a ghost size */
626ea8dc4b6Seschrock 				ASSERT(ab->b_buf == NULL);
627ea8dc4b6Seschrock 				from_delta = ab->b_size;
628ea8dc4b6Seschrock 			}
629ea8dc4b6Seschrock 			ASSERT3U(old_state->lsize, >=, from_delta);
630ea8dc4b6Seschrock 			atomic_add_64(&old_state->lsize, -from_delta);
631ea8dc4b6Seschrock 
632ea8dc4b6Seschrock 			if (use_mutex)
633ea8dc4b6Seschrock 				mutex_exit(&old_state->mtx);
634fa9e4066Sahrens 		}
635fa9e4066Sahrens 		if (new_state != arc.anon) {
636ea8dc4b6Seschrock 			int use_mutex = !MUTEX_HELD(&new_state->mtx);
637fa9e4066Sahrens 
638ea8dc4b6Seschrock 			if (use_mutex)
639fa9e4066Sahrens 				mutex_enter(&new_state->mtx);
640ea8dc4b6Seschrock 
641fa9e4066Sahrens 			list_insert_head(&new_state->list, ab);
642ea8dc4b6Seschrock 
643ea8dc4b6Seschrock 			/* ghost elements have a ghost size */
644ea8dc4b6Seschrock 			if (GHOST_STATE(new_state)) {
645ea8dc4b6Seschrock 				ASSERT(ab->b_datacnt == 0);
646ea8dc4b6Seschrock 				ASSERT(ab->b_buf == NULL);
647ea8dc4b6Seschrock 				to_delta = ab->b_size;
648ea8dc4b6Seschrock 			}
649ea8dc4b6Seschrock 			atomic_add_64(&new_state->lsize, to_delta);
650ea8dc4b6Seschrock 			ASSERT3U(new_state->size + to_delta, >=,
651ea8dc4b6Seschrock 			    new_state->lsize);
652ea8dc4b6Seschrock 
653ea8dc4b6Seschrock 			if (use_mutex)
654fa9e4066Sahrens 				mutex_exit(&new_state->mtx);
655fa9e4066Sahrens 		}
656fa9e4066Sahrens 	}
657fa9e4066Sahrens 
658fa9e4066Sahrens 	ASSERT(!BUF_EMPTY(ab));
659ea8dc4b6Seschrock 	if (new_state == arc.anon && old_state != arc.anon) {
660fa9e4066Sahrens 		buf_hash_remove(ab);
661fa9e4066Sahrens 	}
662fa9e4066Sahrens 
663ea8dc4b6Seschrock 	/* adjust state sizes */
664ea8dc4b6Seschrock 	if (to_delta)
665ea8dc4b6Seschrock 		atomic_add_64(&new_state->size, to_delta);
666ea8dc4b6Seschrock 	if (from_delta) {
667ea8dc4b6Seschrock 		ASSERT3U(old_state->size, >=, from_delta);
668ea8dc4b6Seschrock 		atomic_add_64(&old_state->size, -from_delta);
669fa9e4066Sahrens 	}
670fa9e4066Sahrens 	ab->b_state = new_state;
671fa9e4066Sahrens }
672fa9e4066Sahrens 
673fa9e4066Sahrens arc_buf_t *
674fa9e4066Sahrens arc_buf_alloc(spa_t *spa, int size, void *tag)
675fa9e4066Sahrens {
676fa9e4066Sahrens 	arc_buf_hdr_t *hdr;
677fa9e4066Sahrens 	arc_buf_t *buf;
678fa9e4066Sahrens 
679fa9e4066Sahrens 	ASSERT3U(size, >, 0);
680fa9e4066Sahrens 	hdr = kmem_cache_alloc(hdr_cache, KM_SLEEP);
681fa9e4066Sahrens 	ASSERT(BUF_EMPTY(hdr));
682fa9e4066Sahrens 	hdr->b_size = size;
683fa9e4066Sahrens 	hdr->b_spa = spa;
684fa9e4066Sahrens 	hdr->b_state = arc.anon;
685fa9e4066Sahrens 	hdr->b_arc_access = 0;
686fa9e4066Sahrens 	buf = kmem_cache_alloc(buf_cache, KM_SLEEP);
687fa9e4066Sahrens 	buf->b_hdr = hdr;
688*44eda4d7Smaybee 	buf->b_data = NULL;
689ea8dc4b6Seschrock 	buf->b_efunc = NULL;
690ea8dc4b6Seschrock 	buf->b_private = NULL;
691fa9e4066Sahrens 	buf->b_next = NULL;
692fa9e4066Sahrens 	hdr->b_buf = buf;
693*44eda4d7Smaybee 	arc_get_data_buf(buf);
694ea8dc4b6Seschrock 	hdr->b_datacnt = 1;
695fa9e4066Sahrens 	hdr->b_flags = 0;
696fa9e4066Sahrens 	ASSERT(refcount_is_zero(&hdr->b_refcnt));
697fa9e4066Sahrens 	(void) refcount_add(&hdr->b_refcnt, tag);
698fa9e4066Sahrens 
699fa9e4066Sahrens 	return (buf);
700fa9e4066Sahrens }
701fa9e4066Sahrens 
702*44eda4d7Smaybee static arc_buf_t *
703*44eda4d7Smaybee arc_buf_clone(arc_buf_t *from)
704ea8dc4b6Seschrock {
705*44eda4d7Smaybee 	arc_buf_t *buf;
706*44eda4d7Smaybee 	arc_buf_hdr_t *hdr = from->b_hdr;
707*44eda4d7Smaybee 	uint64_t size = hdr->b_size;
708ea8dc4b6Seschrock 
709*44eda4d7Smaybee 	buf = kmem_cache_alloc(buf_cache, KM_SLEEP);
710*44eda4d7Smaybee 	buf->b_hdr = hdr;
711*44eda4d7Smaybee 	buf->b_data = NULL;
712*44eda4d7Smaybee 	buf->b_efunc = NULL;
713*44eda4d7Smaybee 	buf->b_private = NULL;
714*44eda4d7Smaybee 	buf->b_next = hdr->b_buf;
715*44eda4d7Smaybee 	hdr->b_buf = buf;
716*44eda4d7Smaybee 	arc_get_data_buf(buf);
717*44eda4d7Smaybee 	bcopy(from->b_data, buf->b_data, size);
718*44eda4d7Smaybee 	hdr->b_datacnt += 1;
719*44eda4d7Smaybee 	return (buf);
720ea8dc4b6Seschrock }
721ea8dc4b6Seschrock 
722ea8dc4b6Seschrock void
723ea8dc4b6Seschrock arc_buf_add_ref(arc_buf_t *buf, void* tag)
724ea8dc4b6Seschrock {
725ea8dc4b6Seschrock 	arc_buf_hdr_t *hdr;
726ea8dc4b6Seschrock 	kmutex_t *hash_lock;
727ea8dc4b6Seschrock 
728ea8dc4b6Seschrock 	mutex_enter(&arc_eviction_mtx);
729ea8dc4b6Seschrock 	hdr = buf->b_hdr;
730ea8dc4b6Seschrock 	if (buf->b_data == NULL) {
731ea8dc4b6Seschrock 		/*
732ea8dc4b6Seschrock 		 * This buffer is evicted.
733ea8dc4b6Seschrock 		 */
734ea8dc4b6Seschrock 		mutex_exit(&arc_eviction_mtx);
735ea8dc4b6Seschrock 		return;
736ea8dc4b6Seschrock 	} else {
737ea8dc4b6Seschrock 		/*
738ea8dc4b6Seschrock 		 * Prevent this buffer from being evicted
739ea8dc4b6Seschrock 		 * while we add a reference.
740ea8dc4b6Seschrock 		 */
741ea8dc4b6Seschrock 		buf->b_hdr = NULL;
742ea8dc4b6Seschrock 	}
743ea8dc4b6Seschrock 	mutex_exit(&arc_eviction_mtx);
744ea8dc4b6Seschrock 
745ea8dc4b6Seschrock 	ASSERT(hdr->b_state != arc.anon);
746ea8dc4b6Seschrock 	hash_lock = HDR_LOCK(hdr);
747ea8dc4b6Seschrock 	mutex_enter(hash_lock);
748ea8dc4b6Seschrock 	ASSERT(!GHOST_STATE(hdr->b_state));
749ea8dc4b6Seschrock 	buf->b_hdr = hdr;
750ea8dc4b6Seschrock 	add_reference(hdr, hash_lock, tag);
751*44eda4d7Smaybee 	arc_access(hdr, hash_lock);
752*44eda4d7Smaybee 	mutex_exit(hash_lock);
753ea8dc4b6Seschrock 	atomic_add_64(&arc.hits, 1);
754ea8dc4b6Seschrock }
755ea8dc4b6Seschrock 
756ea8dc4b6Seschrock static void
757*44eda4d7Smaybee arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t all)
758ea8dc4b6Seschrock {
759ea8dc4b6Seschrock 	arc_buf_t **bufp;
760ea8dc4b6Seschrock 
761ea8dc4b6Seschrock 	/* free up data associated with the buf */
762ea8dc4b6Seschrock 	if (buf->b_data) {
763ea8dc4b6Seschrock 		arc_state_t *state = buf->b_hdr->b_state;
764ea8dc4b6Seschrock 		uint64_t size = buf->b_hdr->b_size;
765ea8dc4b6Seschrock 
766*44eda4d7Smaybee 		if (!recycle) {
767*44eda4d7Smaybee 			zio_buf_free(buf->b_data, size);
768*44eda4d7Smaybee 			atomic_add_64(&arc.size, -size);
769*44eda4d7Smaybee 		}
770ea8dc4b6Seschrock 		if (list_link_active(&buf->b_hdr->b_arc_node)) {
771ea8dc4b6Seschrock 			ASSERT(refcount_is_zero(&buf->b_hdr->b_refcnt));
772ea8dc4b6Seschrock 			ASSERT(state != arc.anon);
773ea8dc4b6Seschrock 			ASSERT3U(state->lsize, >=, size);
774ea8dc4b6Seschrock 			atomic_add_64(&state->lsize, -size);
775ea8dc4b6Seschrock 		}
776ea8dc4b6Seschrock 		ASSERT3U(state->size, >=, size);
777ea8dc4b6Seschrock 		atomic_add_64(&state->size, -size);
778ea8dc4b6Seschrock 		buf->b_data = NULL;
779ea8dc4b6Seschrock 		ASSERT(buf->b_hdr->b_datacnt > 0);
780ea8dc4b6Seschrock 		buf->b_hdr->b_datacnt -= 1;
781ea8dc4b6Seschrock 	}
782ea8dc4b6Seschrock 
783ea8dc4b6Seschrock 	/* only remove the buf if requested */
784ea8dc4b6Seschrock 	if (!all)
785ea8dc4b6Seschrock 		return;
786ea8dc4b6Seschrock 
787ea8dc4b6Seschrock 	/* remove the buf from the hdr list */
788ea8dc4b6Seschrock 	for (bufp = &buf->b_hdr->b_buf; *bufp != buf; bufp = &(*bufp)->b_next)
789ea8dc4b6Seschrock 		continue;
790ea8dc4b6Seschrock 	*bufp = buf->b_next;
791ea8dc4b6Seschrock 
792ea8dc4b6Seschrock 	ASSERT(buf->b_efunc == NULL);
793ea8dc4b6Seschrock 
794ea8dc4b6Seschrock 	/* clean up the buf */
795ea8dc4b6Seschrock 	buf->b_hdr = NULL;
796ea8dc4b6Seschrock 	kmem_cache_free(buf_cache, buf);
797ea8dc4b6Seschrock }
798ea8dc4b6Seschrock 
799fa9e4066Sahrens static void
800ea8dc4b6Seschrock arc_hdr_destroy(arc_buf_hdr_t *hdr)
801fa9e4066Sahrens {
802fa9e4066Sahrens 	ASSERT(refcount_is_zero(&hdr->b_refcnt));
803fa9e4066Sahrens 	ASSERT3P(hdr->b_state, ==, arc.anon);
804ea8dc4b6Seschrock 	ASSERT(!HDR_IO_IN_PROGRESS(hdr));
805fa9e4066Sahrens 
806fa9e4066Sahrens 	if (!BUF_EMPTY(hdr)) {
807ea8dc4b6Seschrock 		ASSERT(!HDR_IN_HASH_TABLE(hdr));
808fa9e4066Sahrens 		bzero(&hdr->b_dva, sizeof (dva_t));
809fa9e4066Sahrens 		hdr->b_birth = 0;
810fa9e4066Sahrens 		hdr->b_cksum0 = 0;
811fa9e4066Sahrens 	}
812ea8dc4b6Seschrock 	while (hdr->b_buf) {
813fa9e4066Sahrens 		arc_buf_t *buf = hdr->b_buf;
814fa9e4066Sahrens 
815ea8dc4b6Seschrock 		if (buf->b_efunc) {
816ea8dc4b6Seschrock 			mutex_enter(&arc_eviction_mtx);
817ea8dc4b6Seschrock 			ASSERT(buf->b_hdr != NULL);
818*44eda4d7Smaybee 			arc_buf_destroy(hdr->b_buf, FALSE, FALSE);
819ea8dc4b6Seschrock 			hdr->b_buf = buf->b_next;
820ea8dc4b6Seschrock 			buf->b_next = arc_eviction_list;
821ea8dc4b6Seschrock 			arc_eviction_list = buf;
822ea8dc4b6Seschrock 			mutex_exit(&arc_eviction_mtx);
823ea8dc4b6Seschrock 		} else {
824*44eda4d7Smaybee 			arc_buf_destroy(hdr->b_buf, FALSE, TRUE);
825ea8dc4b6Seschrock 		}
826fa9e4066Sahrens 	}
827ea8dc4b6Seschrock 
828fa9e4066Sahrens 	ASSERT(!list_link_active(&hdr->b_arc_node));
829fa9e4066Sahrens 	ASSERT3P(hdr->b_hash_next, ==, NULL);
830fa9e4066Sahrens 	ASSERT3P(hdr->b_acb, ==, NULL);
831fa9e4066Sahrens 	kmem_cache_free(hdr_cache, hdr);
832fa9e4066Sahrens }
833fa9e4066Sahrens 
834fa9e4066Sahrens void
835fa9e4066Sahrens arc_buf_free(arc_buf_t *buf, void *tag)
836fa9e4066Sahrens {
837fa9e4066Sahrens 	arc_buf_hdr_t *hdr = buf->b_hdr;
838ea8dc4b6Seschrock 	int hashed = hdr->b_state != arc.anon;
839fa9e4066Sahrens 
840ea8dc4b6Seschrock 	ASSERT(buf->b_efunc == NULL);
841ea8dc4b6Seschrock 	ASSERT(buf->b_data != NULL);
842ea8dc4b6Seschrock 
843ea8dc4b6Seschrock 	if (hashed) {
844ea8dc4b6Seschrock 		kmutex_t *hash_lock = HDR_LOCK(hdr);
845ea8dc4b6Seschrock 
846ea8dc4b6Seschrock 		mutex_enter(hash_lock);
847ea8dc4b6Seschrock 		(void) remove_reference(hdr, hash_lock, tag);
848ea8dc4b6Seschrock 		if (hdr->b_datacnt > 1)
849*44eda4d7Smaybee 			arc_buf_destroy(buf, FALSE, TRUE);
850ea8dc4b6Seschrock 		else
851ea8dc4b6Seschrock 			hdr->b_flags |= ARC_BUF_AVAILABLE;
852fa9e4066Sahrens 		mutex_exit(hash_lock);
853ea8dc4b6Seschrock 	} else if (HDR_IO_IN_PROGRESS(hdr)) {
854ea8dc4b6Seschrock 		int destroy_hdr;
855ea8dc4b6Seschrock 		/*
856ea8dc4b6Seschrock 		 * We are in the middle of an async write.  Don't destroy
857ea8dc4b6Seschrock 		 * this buffer unless the write completes before we finish
858ea8dc4b6Seschrock 		 * decrementing the reference count.
859ea8dc4b6Seschrock 		 */
860ea8dc4b6Seschrock 		mutex_enter(&arc_eviction_mtx);
861ea8dc4b6Seschrock 		(void) remove_reference(hdr, NULL, tag);
862ea8dc4b6Seschrock 		ASSERT(refcount_is_zero(&hdr->b_refcnt));
863ea8dc4b6Seschrock 		destroy_hdr = !HDR_IO_IN_PROGRESS(hdr);
864ea8dc4b6Seschrock 		mutex_exit(&arc_eviction_mtx);
865ea8dc4b6Seschrock 		if (destroy_hdr)
866ea8dc4b6Seschrock 			arc_hdr_destroy(hdr);
867ea8dc4b6Seschrock 	} else {
868ea8dc4b6Seschrock 		if (remove_reference(hdr, NULL, tag) > 0) {
869ea8dc4b6Seschrock 			ASSERT(HDR_IO_ERROR(hdr));
870*44eda4d7Smaybee 			arc_buf_destroy(buf, FALSE, TRUE);
871ea8dc4b6Seschrock 		} else {
872ea8dc4b6Seschrock 			arc_hdr_destroy(hdr);
873ea8dc4b6Seschrock 		}
874fa9e4066Sahrens 	}
875ea8dc4b6Seschrock }
876fa9e4066Sahrens 
877ea8dc4b6Seschrock int
878ea8dc4b6Seschrock arc_buf_remove_ref(arc_buf_t *buf, void* tag)
879ea8dc4b6Seschrock {
880ea8dc4b6Seschrock 	arc_buf_hdr_t *hdr = buf->b_hdr;
881ea8dc4b6Seschrock 	kmutex_t *hash_lock = HDR_LOCK(hdr);
882ea8dc4b6Seschrock 	int no_callback = (buf->b_efunc == NULL);
883fa9e4066Sahrens 
884ea8dc4b6Seschrock 	if (hdr->b_state == arc.anon) {
885ea8dc4b6Seschrock 		arc_buf_free(buf, tag);
886ea8dc4b6Seschrock 		return (no_callback);
887ea8dc4b6Seschrock 	}
888ea8dc4b6Seschrock 
889ea8dc4b6Seschrock 	mutex_enter(hash_lock);
890ea8dc4b6Seschrock 	ASSERT(hdr->b_state != arc.anon);
891ea8dc4b6Seschrock 	ASSERT(buf->b_data != NULL);
892ea8dc4b6Seschrock 
893ea8dc4b6Seschrock 	(void) remove_reference(hdr, hash_lock, tag);
894ea8dc4b6Seschrock 	if (hdr->b_datacnt > 1) {
895ea8dc4b6Seschrock 		if (no_callback)
896*44eda4d7Smaybee 			arc_buf_destroy(buf, FALSE, TRUE);
897ea8dc4b6Seschrock 	} else if (no_callback) {
898ea8dc4b6Seschrock 		ASSERT(hdr->b_buf == buf && buf->b_next == NULL);
899ea8dc4b6Seschrock 		hdr->b_flags |= ARC_BUF_AVAILABLE;
900ea8dc4b6Seschrock 	}
901ea8dc4b6Seschrock 	ASSERT(no_callback || hdr->b_datacnt > 1 ||
902ea8dc4b6Seschrock 	    refcount_is_zero(&hdr->b_refcnt));
903ea8dc4b6Seschrock 	mutex_exit(hash_lock);
904ea8dc4b6Seschrock 	return (no_callback);
905fa9e4066Sahrens }
906fa9e4066Sahrens 
907fa9e4066Sahrens int
908fa9e4066Sahrens arc_buf_size(arc_buf_t *buf)
909fa9e4066Sahrens {
910fa9e4066Sahrens 	return (buf->b_hdr->b_size);
911fa9e4066Sahrens }
912fa9e4066Sahrens 
913fa9e4066Sahrens /*
914fa9e4066Sahrens  * Evict buffers from list until we've removed the specified number of
915fa9e4066Sahrens  * bytes.  Move the removed buffers to the appropriate evict state.
916*44eda4d7Smaybee  * If the recycle flag is set, then attempt to "recycle" a buffer:
917*44eda4d7Smaybee  * - look for a buffer to evict that is `bytes' long.
918*44eda4d7Smaybee  * - return the data block from this buffer rather than freeing it.
919*44eda4d7Smaybee  * This flag is used by callers that are trying to make space for a
920*44eda4d7Smaybee  * new buffer in a full arc cache.
921fa9e4066Sahrens  */
922*44eda4d7Smaybee static void *
923*44eda4d7Smaybee arc_evict(arc_state_t *state, int64_t bytes, boolean_t recycle)
924fa9e4066Sahrens {
925fa9e4066Sahrens 	arc_state_t *evicted_state;
926*44eda4d7Smaybee 	uint64_t bytes_evicted = 0, skipped = 0, missed = 0;
927fa9e4066Sahrens 	arc_buf_hdr_t *ab, *ab_prev;
928fa9e4066Sahrens 	kmutex_t *hash_lock;
929*44eda4d7Smaybee 	boolean_t have_lock;
930*44eda4d7Smaybee 	void *steal = NULL;
931fa9e4066Sahrens 
932ea8dc4b6Seschrock 	ASSERT(state == arc.mru || state == arc.mfu);
933fa9e4066Sahrens 
934ea8dc4b6Seschrock 	evicted_state = (state == arc.mru) ? arc.mru_ghost : arc.mfu_ghost;
935fa9e4066Sahrens 
936fa9e4066Sahrens 	mutex_enter(&state->mtx);
937fa9e4066Sahrens 	mutex_enter(&evicted_state->mtx);
938fa9e4066Sahrens 
939fa9e4066Sahrens 	for (ab = list_tail(&state->list); ab; ab = ab_prev) {
940fa9e4066Sahrens 		ab_prev = list_prev(&state->list, ab);
94113506d1eSmaybee 		/* prefetch buffers have a minimum lifespan */
942*44eda4d7Smaybee 		if (HDR_IO_IN_PROGRESS(ab) ||
943*44eda4d7Smaybee 		    (ab->b_flags & (ARC_PREFETCH|ARC_INDIRECT) &&
944*44eda4d7Smaybee 		    lbolt - ab->b_arc_access < arc_min_prefetch_lifespan)) {
94513506d1eSmaybee 			skipped++;
94613506d1eSmaybee 			continue;
94713506d1eSmaybee 		}
948*44eda4d7Smaybee 		if (recycle && (ab->b_size != bytes || ab->b_datacnt > 1))
949*44eda4d7Smaybee 			continue;
950fa9e4066Sahrens 		hash_lock = HDR_LOCK(ab);
951*44eda4d7Smaybee 		have_lock = MUTEX_HELD(hash_lock);
952*44eda4d7Smaybee 		if (have_lock || mutex_tryenter(hash_lock)) {
953fa9e4066Sahrens 			ASSERT3U(refcount_count(&ab->b_refcnt), ==, 0);
954ea8dc4b6Seschrock 			ASSERT(ab->b_datacnt > 0);
955ea8dc4b6Seschrock 			while (ab->b_buf) {
956ea8dc4b6Seschrock 				arc_buf_t *buf = ab->b_buf;
957*44eda4d7Smaybee 				if (buf->b_data) {
958ea8dc4b6Seschrock 					bytes_evicted += ab->b_size;
959*44eda4d7Smaybee 					if (recycle)
960*44eda4d7Smaybee 						steal = buf->b_data;
961*44eda4d7Smaybee 				}
962ea8dc4b6Seschrock 				if (buf->b_efunc) {
963ea8dc4b6Seschrock 					mutex_enter(&arc_eviction_mtx);
964ea8dc4b6Seschrock 					/*
965ea8dc4b6Seschrock 					 * arc_buf_add_ref() could derail
966ea8dc4b6Seschrock 					 * this eviction.
967ea8dc4b6Seschrock 					 */
968ea8dc4b6Seschrock 					if (buf->b_hdr == NULL) {
969ea8dc4b6Seschrock 						mutex_exit(&arc_eviction_mtx);
970*44eda4d7Smaybee 						bytes_evicted -= ab->b_size;
971*44eda4d7Smaybee 						if (recycle)
972*44eda4d7Smaybee 							steal = NULL;
973*44eda4d7Smaybee 						if (!have_lock)
974*44eda4d7Smaybee 							mutex_exit(hash_lock);
975*44eda4d7Smaybee 						goto derailed;
976ea8dc4b6Seschrock 					}
977*44eda4d7Smaybee 					arc_buf_destroy(buf, recycle, FALSE);
978ea8dc4b6Seschrock 					ab->b_buf = buf->b_next;
979ea8dc4b6Seschrock 					buf->b_next = arc_eviction_list;
980ea8dc4b6Seschrock 					arc_eviction_list = buf;
981ea8dc4b6Seschrock 					mutex_exit(&arc_eviction_mtx);
982ea8dc4b6Seschrock 				} else {
983*44eda4d7Smaybee 					arc_buf_destroy(buf, recycle, TRUE);
984ea8dc4b6Seschrock 				}
985ea8dc4b6Seschrock 			}
986ea8dc4b6Seschrock 			ASSERT(ab->b_datacnt == 0);
987fa9e4066Sahrens 			arc_change_state(evicted_state, ab, hash_lock);
988ea8dc4b6Seschrock 			ASSERT(HDR_IN_HASH_TABLE(ab));
989ea8dc4b6Seschrock 			ab->b_flags = ARC_IN_HASH_TABLE;
990fa9e4066Sahrens 			DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, ab);
991*44eda4d7Smaybee 			if (!have_lock)
992*44eda4d7Smaybee 				mutex_exit(hash_lock);
993ea8dc4b6Seschrock 			if (bytes >= 0 && bytes_evicted >= bytes)
994fa9e4066Sahrens 				break;
995fa9e4066Sahrens 		} else {
996*44eda4d7Smaybee 			missed += 1;
997fa9e4066Sahrens 		}
998*44eda4d7Smaybee derailed:
999*44eda4d7Smaybee 		/* null statement */;
1000fa9e4066Sahrens 	}
1001fa9e4066Sahrens 	mutex_exit(&evicted_state->mtx);
1002fa9e4066Sahrens 	mutex_exit(&state->mtx);
1003fa9e4066Sahrens 
1004fa9e4066Sahrens 	if (bytes_evicted < bytes)
1005fa9e4066Sahrens 		dprintf("only evicted %lld bytes from %x",
1006fa9e4066Sahrens 		    (longlong_t)bytes_evicted, state);
1007fa9e4066Sahrens 
1008*44eda4d7Smaybee 	if (skipped)
1009*44eda4d7Smaybee 		atomic_add_64(&arc.evict_skip, skipped);
1010*44eda4d7Smaybee 	if (missed)
1011*44eda4d7Smaybee 		atomic_add_64(&arc.mutex_miss, missed);
1012*44eda4d7Smaybee 	return (steal);
1013fa9e4066Sahrens }
1014fa9e4066Sahrens 
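/*
 * Hedged sketch of how the recycle path above is used by a caller that
 * needs a data block of a specific size (compare arc_get_data_buf();
 * the fragment below is illustrative):
 */
#if 0
	/* try to steal an evictable data block of exactly 'size' bytes */
	buf->b_data = arc_evict(state, size, TRUE);
	if (buf->b_data == NULL) {
		/* nothing recyclable: fall back to a fresh allocation */
		buf->b_data = zio_buf_alloc(size);
		atomic_add_64(&arc.size, size);
	}
#endif
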
1015fa9e4066Sahrens /*
1016fa9e4066Sahrens  * Remove buffers from list until we've removed the specified number of
1017fa9e4066Sahrens  * bytes.  Destroy the buffers that are removed.
1018fa9e4066Sahrens  */
1019fa9e4066Sahrens static void
1020ea8dc4b6Seschrock arc_evict_ghost(arc_state_t *state, int64_t bytes)
1021fa9e4066Sahrens {
1022fa9e4066Sahrens 	arc_buf_hdr_t *ab, *ab_prev;
1023fa9e4066Sahrens 	kmutex_t *hash_lock;
1024ea8dc4b6Seschrock 	uint64_t bytes_deleted = 0;
1025ea8dc4b6Seschrock 	uint_t bufs_skipped = 0;
1026fa9e4066Sahrens 
1027ea8dc4b6Seschrock 	ASSERT(GHOST_STATE(state));
1028fa9e4066Sahrens top:
1029fa9e4066Sahrens 	mutex_enter(&state->mtx);
1030fa9e4066Sahrens 	for (ab = list_tail(&state->list); ab; ab = ab_prev) {
1031fa9e4066Sahrens 		ab_prev = list_prev(&state->list, ab);
1032fa9e4066Sahrens 		hash_lock = HDR_LOCK(ab);
1033fa9e4066Sahrens 		if (mutex_tryenter(hash_lock)) {
103413506d1eSmaybee 			ASSERT(!HDR_IO_IN_PROGRESS(ab));
1035ea8dc4b6Seschrock 			ASSERT(ab->b_buf == NULL);
1036fa9e4066Sahrens 			arc_change_state(arc.anon, ab, hash_lock);
1037fa9e4066Sahrens 			mutex_exit(hash_lock);
1038fa9e4066Sahrens 			atomic_add_64(&arc.deleted, 1);
1039fa9e4066Sahrens 			bytes_deleted += ab->b_size;
1040ea8dc4b6Seschrock 			arc_hdr_destroy(ab);
1041ea8dc4b6Seschrock 			DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, ab);
1042fa9e4066Sahrens 			if (bytes >= 0 && bytes_deleted >= bytes)
1043fa9e4066Sahrens 				break;
1044fa9e4066Sahrens 		} else {
1045fa9e4066Sahrens 			if (bytes < 0) {
1046fa9e4066Sahrens 				mutex_exit(&state->mtx);
1047fa9e4066Sahrens 				mutex_enter(hash_lock);
1048fa9e4066Sahrens 				mutex_exit(hash_lock);
1049fa9e4066Sahrens 				goto top;
1050fa9e4066Sahrens 			}
1051fa9e4066Sahrens 			bufs_skipped += 1;
1052fa9e4066Sahrens 		}
1053fa9e4066Sahrens 	}
1054fa9e4066Sahrens 	mutex_exit(&state->mtx);
1055fa9e4066Sahrens 
1056fa9e4066Sahrens 	if (bufs_skipped) {
1057*44eda4d7Smaybee 		atomic_add_64(&arc.mutex_miss, bufs_skipped);
1058fa9e4066Sahrens 		ASSERT(bytes >= 0);
1059fa9e4066Sahrens 	}
1060fa9e4066Sahrens 
1061fa9e4066Sahrens 	if (bytes_deleted < bytes)
1062fa9e4066Sahrens 		dprintf("only deleted %lld bytes from %p",
1063fa9e4066Sahrens 		    (longlong_t)bytes_deleted, state);
1064fa9e4066Sahrens }
1065fa9e4066Sahrens 
1066fa9e4066Sahrens static void
1067fa9e4066Sahrens arc_adjust(void)
1068fa9e4066Sahrens {
1069fa9e4066Sahrens 	int64_t top_sz, mru_over, arc_over;
1070fa9e4066Sahrens 
1071ea8dc4b6Seschrock 	top_sz = arc.anon->size + arc.mru->size;
1072fa9e4066Sahrens 
1073ea8dc4b6Seschrock 	if (top_sz > arc.p && arc.mru->lsize > 0) {
1074ea8dc4b6Seschrock 		int64_t toevict = MIN(arc.mru->lsize, top_sz-arc.p);
1075*44eda4d7Smaybee 		(void) arc_evict(arc.mru, toevict, FALSE);
1076ea8dc4b6Seschrock 		top_sz = arc.anon->size + arc.mru->size;
1077fa9e4066Sahrens 	}
1078fa9e4066Sahrens 
1079ea8dc4b6Seschrock 	mru_over = top_sz + arc.mru_ghost->size - arc.c;
1080fa9e4066Sahrens 
1081fa9e4066Sahrens 	if (mru_over > 0) {
1082ea8dc4b6Seschrock 		if (arc.mru_ghost->lsize > 0) {
1083ea8dc4b6Seschrock 			int64_t todelete = MIN(arc.mru_ghost->lsize, mru_over);
1084ea8dc4b6Seschrock 			arc_evict_ghost(arc.mru_ghost, todelete);
1085fa9e4066Sahrens 		}
1086fa9e4066Sahrens 	}
1087fa9e4066Sahrens 
1088fa9e4066Sahrens 	if ((arc_over = arc.size - arc.c) > 0) {
1089ea8dc4b6Seschrock 		int64_t tbl_over;
1090fa9e4066Sahrens 
1091ea8dc4b6Seschrock 		if (arc.mfu->lsize > 0) {
1092ea8dc4b6Seschrock 			int64_t toevict = MIN(arc.mfu->lsize, arc_over);
1093*44eda4d7Smaybee 			(void) arc_evict(arc.mfu, toevict, FALSE);
1094fa9e4066Sahrens 		}
1095fa9e4066Sahrens 
1096ea8dc4b6Seschrock 		tbl_over = arc.size + arc.mru_ghost->lsize +
1097ea8dc4b6Seschrock 		    arc.mfu_ghost->lsize - arc.c*2;
1098fa9e4066Sahrens 
1099ea8dc4b6Seschrock 		if (tbl_over > 0 && arc.mfu_ghost->lsize > 0) {
1100ea8dc4b6Seschrock 			int64_t todelete = MIN(arc.mfu_ghost->lsize, tbl_over);
1101ea8dc4b6Seschrock 			arc_evict_ghost(arc.mfu_ghost, todelete);
1102fa9e4066Sahrens 		}
1103fa9e4066Sahrens 	}
1104fa9e4066Sahrens }
1105fa9e4066Sahrens 
1106ea8dc4b6Seschrock static void
1107ea8dc4b6Seschrock arc_do_user_evicts(void)
1108ea8dc4b6Seschrock {
1109ea8dc4b6Seschrock 	mutex_enter(&arc_eviction_mtx);
1110ea8dc4b6Seschrock 	while (arc_eviction_list != NULL) {
1111ea8dc4b6Seschrock 		arc_buf_t *buf = arc_eviction_list;
1112ea8dc4b6Seschrock 		arc_eviction_list = buf->b_next;
1113ea8dc4b6Seschrock 		buf->b_hdr = NULL;
1114ea8dc4b6Seschrock 		mutex_exit(&arc_eviction_mtx);
1115ea8dc4b6Seschrock 
1116dd6ef538Smaybee 		if (buf->b_efunc != NULL)
1117dd6ef538Smaybee 			VERIFY(buf->b_efunc(buf) == 0);
1118ea8dc4b6Seschrock 
1119ea8dc4b6Seschrock 		buf->b_efunc = NULL;
1120ea8dc4b6Seschrock 		buf->b_private = NULL;
1121ea8dc4b6Seschrock 		kmem_cache_free(buf_cache, buf);
1122ea8dc4b6Seschrock 		mutex_enter(&arc_eviction_mtx);
1123ea8dc4b6Seschrock 	}
1124ea8dc4b6Seschrock 	mutex_exit(&arc_eviction_mtx);
1125ea8dc4b6Seschrock }
1126ea8dc4b6Seschrock 
1127fa9e4066Sahrens /*
1128fa9e4066Sahrens  * Flush all *evictable* data from the cache.
1129fa9e4066Sahrens  * NOTE: this will not touch "active" (i.e. referenced) data.
1130fa9e4066Sahrens  */
1131fa9e4066Sahrens void
1132fa9e4066Sahrens arc_flush(void)
1133fa9e4066Sahrens {
1134*44eda4d7Smaybee 	while (list_head(&arc.mru->list))
1135*44eda4d7Smaybee 		(void) arc_evict(arc.mru, -1, FALSE);
1136*44eda4d7Smaybee 	while (list_head(&arc.mfu->list))
1137*44eda4d7Smaybee 		(void) arc_evict(arc.mfu, -1, FALSE);
1138fa9e4066Sahrens 
1139ea8dc4b6Seschrock 	arc_evict_ghost(arc.mru_ghost, -1);
1140ea8dc4b6Seschrock 	arc_evict_ghost(arc.mfu_ghost, -1);
1141ea8dc4b6Seschrock 
1142ea8dc4b6Seschrock 	mutex_enter(&arc_reclaim_thr_lock);
1143ea8dc4b6Seschrock 	arc_do_user_evicts();
1144ea8dc4b6Seschrock 	mutex_exit(&arc_reclaim_thr_lock);
1145ea8dc4b6Seschrock 	ASSERT(arc_eviction_list == NULL);
1146fa9e4066Sahrens }
1147fa9e4066Sahrens 
114813506d1eSmaybee int arc_kmem_reclaim_shift = 5;		/* log2(fraction of arc to reclaim) */
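
/*
 * Worked example (illustrative): with the default shift of 5, each
 * reclaim pass targets arc.c >> 5, i.e. 1/32nd of the current cache
 * target -- 32MB for a 1GB arc.c (or more, if 'needfree' demands it).
 */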
114913506d1eSmaybee 
1150fa9e4066Sahrens void
1151fa9e4066Sahrens arc_kmem_reclaim(void)
1152fa9e4066Sahrens {
11533cff2f43Sstans 	uint64_t to_free;
11543cff2f43Sstans 
1155fa9e4066Sahrens 	/*
1156fa9e4066Sahrens 	 * We need arc_reclaim_lock because we don't want multiple
1157fa9e4066Sahrens 	 * threads trying to reclaim concurrently.
1158fa9e4066Sahrens 	 */
1159fa9e4066Sahrens 
1160fa9e4066Sahrens 	/*
1161fa9e4066Sahrens 	 * umem calls the reclaim func when we destroy the buf cache,
1162fa9e4066Sahrens 	 * which is after we do arc_fini().  So we set a flag to prevent
1163fa9e4066Sahrens 	 * accessing the destroyed mutexes and lists.
1164fa9e4066Sahrens 	 */
1165fa9e4066Sahrens 	if (arc_dead)
1166fa9e4066Sahrens 		return;
1167fa9e4066Sahrens 
1168ea8dc4b6Seschrock 	if (arc.c <= arc.c_min)
1169ea8dc4b6Seschrock 		return;
1170ea8dc4b6Seschrock 
1171fa9e4066Sahrens 	mutex_enter(&arc_reclaim_lock);
1172fa9e4066Sahrens 
11733cff2f43Sstans #ifdef _KERNEL
117413506d1eSmaybee 	to_free = MAX(arc.c >> arc_kmem_reclaim_shift, ptob(needfree));
11753cff2f43Sstans #else
117613506d1eSmaybee 	to_free = arc.c >> arc_kmem_reclaim_shift;
11773cff2f43Sstans #endif
11783cff2f43Sstans 	if (arc.c > to_free)
11793cff2f43Sstans 		atomic_add_64(&arc.c, -to_free);
11803cff2f43Sstans 	else
11813cff2f43Sstans 		arc.c = arc.c_min;
11823cff2f43Sstans 
118313506d1eSmaybee 	atomic_add_64(&arc.p, -(arc.p >> arc_kmem_reclaim_shift));
1184ea8dc4b6Seschrock 	if (arc.c > arc.size)
1185ea8dc4b6Seschrock 		arc.c = arc.size;
1186fa9e4066Sahrens 	if (arc.c < arc.c_min)
1187fa9e4066Sahrens 		arc.c = arc.c_min;
1188ea8dc4b6Seschrock 	if (arc.p > arc.c)
1189ea8dc4b6Seschrock 		arc.p = (arc.c >> 1);
1190ea8dc4b6Seschrock 	ASSERT((int64_t)arc.p >= 0);
1191fa9e4066Sahrens 
1192fa9e4066Sahrens 	arc_adjust();
1193fa9e4066Sahrens 
1194fa9e4066Sahrens 	mutex_exit(&arc_reclaim_lock);
1195fa9e4066Sahrens }
1196fa9e4066Sahrens 
1197fa9e4066Sahrens static int
1198fa9e4066Sahrens arc_reclaim_needed(void)
1199fa9e4066Sahrens {
1200fa9e4066Sahrens 	uint64_t extra;
1201fa9e4066Sahrens 
1202fa9e4066Sahrens #ifdef _KERNEL
12033cff2f43Sstans 
12043cff2f43Sstans 	if (needfree)
12053cff2f43Sstans 		return (1);
12063cff2f43Sstans 
1207fa9e4066Sahrens 	/*
1208fa9e4066Sahrens 	 * take 'desfree' extra pages, so we reclaim sooner rather than later
1209fa9e4066Sahrens 	 */
1210fa9e4066Sahrens 	extra = desfree;
1211fa9e4066Sahrens 
1212fa9e4066Sahrens 	/*
1213fa9e4066Sahrens 	 * check that we're out of range of the pageout scanner.  It starts to
1214fa9e4066Sahrens 	 * schedule paging if freemem is less than lotsfree and needfree.
1215fa9e4066Sahrens 	 * lotsfree is the high-water mark for pageout, and needfree is the
1216fa9e4066Sahrens 	 * number of needed free pages.  We add extra pages here to make sure
1217fa9e4066Sahrens 	 * the scanner doesn't start up while we're freeing memory.
1218fa9e4066Sahrens 	 */
1219fa9e4066Sahrens 	if (freemem < lotsfree + needfree + extra)
1220fa9e4066Sahrens 		return (1);
1221fa9e4066Sahrens 
1222fa9e4066Sahrens 	/*
1223fa9e4066Sahrens 	 * check to make sure that swapfs has enough space so that anon
1224fa9e4066Sahrens 	 * reservations can still succeed.  anon_resvmem() checks that
1225fa9e4066Sahrens 	 * availrmem is greater than swapfs_minfree plus the number of reserved
1226fa9e4066Sahrens 	 * swap pages.  We also add a bit of extra here just to prevent
1227fa9e4066Sahrens 	 * circumstances from getting really dire.
1228fa9e4066Sahrens 	 */
1229fa9e4066Sahrens 	if (availrmem < swapfs_minfree + swapfs_reserve + extra)
1230fa9e4066Sahrens 		return (1);
1231fa9e4066Sahrens 
12325dc8af33Smaybee #if defined(__i386)
1233fa9e4066Sahrens 	/*
1234fa9e4066Sahrens 	 * If we're on an i386 platform, it's possible that we'll exhaust the
1235fa9e4066Sahrens 	 * kernel heap space before we ever run out of available physical
1236fa9e4066Sahrens 	 * memory.  Most checks of the size of the heap_area compare against
1237fa9e4066Sahrens 	 * tune.t_minarmem, which is the minimum available real memory that we
1238fa9e4066Sahrens 	 * can have in the system.  However, this is generally fixed at 25 pages
1239fa9e4066Sahrens 	 * which is so low that it's useless.  In this comparison, we seek to
1240fa9e4066Sahrens 	 * calculate the total heap-size, and reclaim if more than 3/4ths of the
1241fa9e4066Sahrens 	 * heap is allocated.  (Or, in the calculation, if less than 1/4th is
1242fa9e4066Sahrens 	 * free.)
1243fa9e4066Sahrens 	 */
1244fa9e4066Sahrens 	if (btop(vmem_size(heap_arena, VMEM_FREE)) <
1245fa9e4066Sahrens 	    (btop(vmem_size(heap_arena, VMEM_FREE | VMEM_ALLOC)) >> 2))
1246fa9e4066Sahrens 		return (1);
1247fa9e4066Sahrens #endif
1248fa9e4066Sahrens 
1249fa9e4066Sahrens #else
1250fa9e4066Sahrens 	if (spa_get_random(100) == 0)
1251fa9e4066Sahrens 		return (1);
1252fa9e4066Sahrens #endif
1253fa9e4066Sahrens 	return (0);
1254fa9e4066Sahrens }
1255fa9e4066Sahrens 
1256fa9e4066Sahrens static void
1257fa9e4066Sahrens arc_kmem_reap_now(arc_reclaim_strategy_t strat)
1258fa9e4066Sahrens {
1259fa9e4066Sahrens 	size_t			i;
1260fa9e4066Sahrens 	kmem_cache_t		*prev_cache = NULL;
1261fa9e4066Sahrens 	extern kmem_cache_t	*zio_buf_cache[];
1262fa9e4066Sahrens 
1263033f9833Sek #ifdef _KERNEL
1264033f9833Sek 	/*
1265033f9833Sek 	 * First purge some DNLC entries, in case the DNLC is using
1266033f9833Sek 	 * up too much memory.
1267033f9833Sek 	 */
1268cee972f8Sek 	dnlc_reduce_cache((void *)(uintptr_t)arc_reduce_dnlc_percent);
12695dc8af33Smaybee 
12705dc8af33Smaybee #if defined(__i386)
12715dc8af33Smaybee 	/*
12725dc8af33Smaybee 	 * Reclaim unused memory from all kmem caches.
12735dc8af33Smaybee 	 */
12745dc8af33Smaybee 	kmem_reap();
12755dc8af33Smaybee #endif
1276033f9833Sek #endif
1277033f9833Sek 
1278fa9e4066Sahrens 	/*
1279ea8dc4b6Seschrock 	 * An aggressive reclamation will shrink the cache size as well as
1280ea8dc4b6Seschrock 	 * reap free buffers from the arc kmem caches.
1281fa9e4066Sahrens 	 */
1282fa9e4066Sahrens 	if (strat == ARC_RECLAIM_AGGR)
1283ea8dc4b6Seschrock 		arc_kmem_reclaim();
1284fa9e4066Sahrens 
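	/*
	 * Note (assumed rationale): adjacent zio_buf_cache[] slots may
	 * reference the same underlying kmem cache when several buffer
	 * sizes round up to a shared cache, so we track prev_cache to
	 * avoid reaping the same cache more than once.
	 */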
1285fa9e4066Sahrens 	for (i = 0; i < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; i++) {
1286fa9e4066Sahrens 		if (zio_buf_cache[i] != prev_cache) {
1287fa9e4066Sahrens 			prev_cache = zio_buf_cache[i];
1288fa9e4066Sahrens 			kmem_cache_reap_now(zio_buf_cache[i]);
1289fa9e4066Sahrens 		}
1290fa9e4066Sahrens 	}
1291ea8dc4b6Seschrock 	kmem_cache_reap_now(buf_cache);
1292ea8dc4b6Seschrock 	kmem_cache_reap_now(hdr_cache);
1293fa9e4066Sahrens }
1294fa9e4066Sahrens 
1295fa9e4066Sahrens static void
1296fa9e4066Sahrens arc_reclaim_thread(void)
1297fa9e4066Sahrens {
1298fa9e4066Sahrens 	clock_t			growtime = 0;
1299fa9e4066Sahrens 	arc_reclaim_strategy_t	last_reclaim = ARC_RECLAIM_CONS;
1300fa9e4066Sahrens 	callb_cpr_t		cpr;
1301fa9e4066Sahrens 
1302fa9e4066Sahrens 	CALLB_CPR_INIT(&cpr, &arc_reclaim_thr_lock, callb_generic_cpr, FTAG);
1303fa9e4066Sahrens 
1304fa9e4066Sahrens 	mutex_enter(&arc_reclaim_thr_lock);
1305fa9e4066Sahrens 	while (arc_thread_exit == 0) {
1306fa9e4066Sahrens 		if (arc_reclaim_needed()) {
1307fa9e4066Sahrens 
1308fa9e4066Sahrens 			if (arc.no_grow) {
1309fa9e4066Sahrens 				if (last_reclaim == ARC_RECLAIM_CONS) {
1310fa9e4066Sahrens 					last_reclaim = ARC_RECLAIM_AGGR;
1311fa9e4066Sahrens 				} else {
1312fa9e4066Sahrens 					last_reclaim = ARC_RECLAIM_CONS;
1313fa9e4066Sahrens 				}
1314fa9e4066Sahrens 			} else {
1315fa9e4066Sahrens 				arc.no_grow = TRUE;
1316fa9e4066Sahrens 				last_reclaim = ARC_RECLAIM_AGGR;
1317fa9e4066Sahrens 				membar_producer();
1318fa9e4066Sahrens 			}
1319fa9e4066Sahrens 
1320fa9e4066Sahrens 			/* reset the growth delay for every reclaim */
1321fa9e4066Sahrens 			growtime = lbolt + (arc_grow_retry * hz);
1322fa9e4066Sahrens 
1323fa9e4066Sahrens 			arc_kmem_reap_now(last_reclaim);
1324fa9e4066Sahrens 
1325fa9e4066Sahrens 		} else if ((growtime > 0) && ((growtime - lbolt) <= 0)) {
1326fa9e4066Sahrens 			arc.no_grow = FALSE;
1327fa9e4066Sahrens 		}
1328fa9e4066Sahrens 
1329ea8dc4b6Seschrock 		if (arc_eviction_list != NULL)
1330ea8dc4b6Seschrock 			arc_do_user_evicts();
1331ea8dc4b6Seschrock 
1332fa9e4066Sahrens 		/* block until needed, or one second, whichever is shorter */
1333fa9e4066Sahrens 		CALLB_CPR_SAFE_BEGIN(&cpr);
1334fa9e4066Sahrens 		(void) cv_timedwait(&arc_reclaim_thr_cv,
1335fa9e4066Sahrens 		    &arc_reclaim_thr_lock, (lbolt + hz));
1336fa9e4066Sahrens 		CALLB_CPR_SAFE_END(&cpr, &arc_reclaim_thr_lock);
1337fa9e4066Sahrens 	}
1338fa9e4066Sahrens 
1339fa9e4066Sahrens 	arc_thread_exit = 0;
1340fa9e4066Sahrens 	cv_broadcast(&arc_reclaim_thr_cv);
1341fa9e4066Sahrens 	CALLB_CPR_EXIT(&cpr);		/* drops arc_reclaim_thr_lock */
1342fa9e4066Sahrens 	thread_exit();
1343fa9e4066Sahrens }
1344fa9e4066Sahrens 
1345ea8dc4b6Seschrock /*
1346ea8dc4b6Seschrock  * Adapt arc info given the number of bytes we are trying to add and
1347ea8dc4b6Seschrock  * the state that we are coming from.  This function is only called
1348ea8dc4b6Seschrock  * when we are adding new content to the cache.
1349ea8dc4b6Seschrock  */
1350fa9e4066Sahrens static void
1351ea8dc4b6Seschrock arc_adapt(int bytes, arc_state_t *state)
1352fa9e4066Sahrens {
1353ea8dc4b6Seschrock 	int mult;
1354ea8dc4b6Seschrock 
1355ea8dc4b6Seschrock 	ASSERT(bytes > 0);
1356fa9e4066Sahrens 	/*
1357ea8dc4b6Seschrock 	 * Adapt the target size of the MRU list:
1358ea8dc4b6Seschrock 	 *	- if we just hit in the MRU ghost list, then increase
1359ea8dc4b6Seschrock 	 *	  the target size of the MRU list.
1360ea8dc4b6Seschrock 	 *	- if we just hit in the MFU ghost list, then increase
1361ea8dc4b6Seschrock 	 *	  the target size of the MFU list by decreasing the
1362ea8dc4b6Seschrock 	 *	  target size of the MRU list.
1363fa9e4066Sahrens 	 */
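	/*
	 * A worked example (sizes assumed for illustration): if
	 * arc.mru_ghost->size == 1GB and arc.mfu_ghost->size == 3GB,
	 * a hit in the MRU ghost list yields mult == 3, so the MRU
	 * target grows three times as fast:
	 *
	 *	arc.p = MIN(arc.c, arc.p + bytes * 3);
	 */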
1364ea8dc4b6Seschrock 	if (state == arc.mru_ghost) {
1365ea8dc4b6Seschrock 		mult = ((arc.mru_ghost->size >= arc.mfu_ghost->size) ?
1366ea8dc4b6Seschrock 		    1 : (arc.mfu_ghost->size/arc.mru_ghost->size));
1367ea8dc4b6Seschrock 
1368ea8dc4b6Seschrock 		arc.p = MIN(arc.c, arc.p + bytes * mult);
1369ea8dc4b6Seschrock 	} else if (state == arc.mfu_ghost) {
1370ea8dc4b6Seschrock 		mult = ((arc.mfu_ghost->size >= arc.mru_ghost->size) ?
1371ea8dc4b6Seschrock 		    1 : (arc.mru_ghost->size/arc.mfu_ghost->size));
1372ea8dc4b6Seschrock 
1373ea8dc4b6Seschrock 		arc.p = MAX(0, (int64_t)arc.p - bytes * mult);
1374ea8dc4b6Seschrock 	}
1375ea8dc4b6Seschrock 	ASSERT((int64_t)arc.p >= 0);
1376fa9e4066Sahrens 
1377fa9e4066Sahrens 	if (arc_reclaim_needed()) {
1378fa9e4066Sahrens 		cv_signal(&arc_reclaim_thr_cv);
1379fa9e4066Sahrens 		return;
1380fa9e4066Sahrens 	}
1381fa9e4066Sahrens 
1382fa9e4066Sahrens 	if (arc.no_grow)
1383fa9e4066Sahrens 		return;
1384fa9e4066Sahrens 
1385ea8dc4b6Seschrock 	if (arc.c >= arc.c_max)
1386ea8dc4b6Seschrock 		return;
1387ea8dc4b6Seschrock 
1388fa9e4066Sahrens 	/*
1389ea8dc4b6Seschrock 	 * If we're within (2 * maxblocksize) bytes of the target
1390ea8dc4b6Seschrock 	 * cache size, increment the target cache size
1391fa9e4066Sahrens 	 */
1392ea8dc4b6Seschrock 	if (arc.size > arc.c - (2ULL << SPA_MAXBLOCKSHIFT)) {
1393ea8dc4b6Seschrock 		atomic_add_64(&arc.c, (int64_t)bytes);
1394fa9e4066Sahrens 		if (arc.c > arc.c_max)
1395fa9e4066Sahrens 			arc.c = arc.c_max;
1396ea8dc4b6Seschrock 		else if (state == arc.anon)
1397ea8dc4b6Seschrock 			atomic_add_64(&arc.p, (int64_t)bytes);
1398ea8dc4b6Seschrock 		if (arc.p > arc.c)
1399ea8dc4b6Seschrock 			arc.p = arc.c;
1400fa9e4066Sahrens 	}
1401ea8dc4b6Seschrock 	ASSERT((int64_t)arc.p >= 0);
1402fa9e4066Sahrens }
1403fa9e4066Sahrens 
1404fa9e4066Sahrens /*
1405ea8dc4b6Seschrock  * Check if the cache has reached its limits and eviction is required
1406ea8dc4b6Seschrock  * prior to insertion.
1407fa9e4066Sahrens  */
1408fa9e4066Sahrens static int
1409fa9e4066Sahrens arc_evict_needed()
1410fa9e4066Sahrens {
1411fa9e4066Sahrens 	if (arc_reclaim_needed())
1412fa9e4066Sahrens 		return (1);
1413fa9e4066Sahrens 
1414ea8dc4b6Seschrock 	return (arc.size > arc.c);
1415fa9e4066Sahrens }
1416fa9e4066Sahrens 
1417fa9e4066Sahrens /*
1418*44eda4d7Smaybee  * The buffer, supplied as the first argument, needs a data block.
1419*44eda4d7Smaybee  * So, if we are at cache max, determine which cache should be victimized.
1420*44eda4d7Smaybee  * We have the following cases:
1421fa9e4066Sahrens  *
1422ea8dc4b6Seschrock  * 1. Insert for MRU, p > sizeof(arc.anon + arc.mru) ->
1423fa9e4066Sahrens  * In this situation if we're out of space, but the resident size of the MFU is
1424fa9e4066Sahrens  * under the limit, victimize the MFU cache to satisfy this insertion request.
1425fa9e4066Sahrens  *
1426ea8dc4b6Seschrock  * 2. Insert for MRU, p <= sizeof(arc.anon + arc.mru) ->
1427fa9e4066Sahrens  * Here, we've used up all of the available space for the MRU, so we need to
1428fa9e4066Sahrens  * evict from our own cache instead.  Evict from the set of resident MRU
1429fa9e4066Sahrens  * entries.
1430fa9e4066Sahrens  *
1431ea8dc4b6Seschrock  * 3. Insert for MFU (c - p) > sizeof(arc.mfu) ->
1432fa9e4066Sahrens  * c minus p represents the MFU space in the cache, since p is the size of the
1433fa9e4066Sahrens  * cache that is dedicated to the MRU.  In this situation there's still space on
1434fa9e4066Sahrens  * the MFU side, so the MRU side needs to be victimized.
1435fa9e4066Sahrens  *
1436ea8dc4b6Seschrock  * 4. Insert for MFU (c - p) < sizeof(arc.mfu) ->
1437fa9e4066Sahrens  * MFU's resident set is consuming more space than it has been allotted.  In
1438fa9e4066Sahrens  * this situation, we must victimize our own cache, the MFU, for this insertion.
1439fa9e4066Sahrens  */
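/*
 * A compact restatement of the four cases above (a sketch, using the
 * same informal sizeof() notation):
 *
 *	insert for MRU:  victim = (p > sizeof(anon + mru)) ? MFU : MRU
 *	insert for MFU:  victim = ((c - p) > sizeof(mfu))  ? MRU : MFU
 */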
1440fa9e4066Sahrens static void
1441*44eda4d7Smaybee arc_get_data_buf(arc_buf_t *buf)
1442fa9e4066Sahrens {
1443*44eda4d7Smaybee 	arc_state_t	*state = buf->b_hdr->b_state;
1444*44eda4d7Smaybee 	uint64_t	size = buf->b_hdr->b_size;
1445fa9e4066Sahrens 
1446*44eda4d7Smaybee 	arc_adapt(size, state);
1447fa9e4066Sahrens 
1448*44eda4d7Smaybee 	/*
1449*44eda4d7Smaybee 	 * We have not yet reached cache maximum size,
1450*44eda4d7Smaybee 	 * just allocate a new buffer.
1451*44eda4d7Smaybee 	 */
1452*44eda4d7Smaybee 	if (!arc_evict_needed()) {
1453*44eda4d7Smaybee 		buf->b_data = zio_buf_alloc(size);
1454*44eda4d7Smaybee 		atomic_add_64(&arc.size, size);
1455*44eda4d7Smaybee 		goto out;
1456*44eda4d7Smaybee 	}
1457*44eda4d7Smaybee 
1458*44eda4d7Smaybee 	/*
1459*44eda4d7Smaybee 	 * If we are prefetching from the mfu ghost list, this buffer
1460*44eda4d7Smaybee 	 * will end up on the mru list; so steal space from there.
1461*44eda4d7Smaybee 	 */
1462*44eda4d7Smaybee 	if (state == arc.mfu_ghost)
1463*44eda4d7Smaybee 		state = buf->b_hdr->b_flags & ARC_PREFETCH ? arc.mru : arc.mfu;
1464*44eda4d7Smaybee 	else if (state == arc.mru_ghost)
1465*44eda4d7Smaybee 		state = arc.mru;
1466*44eda4d7Smaybee 
1467*44eda4d7Smaybee 	if (state == arc.mru || state == arc.anon) {
1468*44eda4d7Smaybee 		uint64_t mru_used = arc.anon->size + arc.mru->size;
1469*44eda4d7Smaybee 		state = (arc.p > mru_used) ? arc.mfu : arc.mru;
1470fa9e4066Sahrens 	} else {
1471*44eda4d7Smaybee 		/* MFU cases */
1472*44eda4d7Smaybee 		uint64_t mfu_space = arc.c - arc.p;
1473*44eda4d7Smaybee 		state = (mfu_space > arc.mfu->size) ? arc.mru : arc.mfu;
1474*44eda4d7Smaybee 	}
1475*44eda4d7Smaybee 	if ((buf->b_data = arc_evict(state, size, TRUE)) == NULL) {
1476*44eda4d7Smaybee 		(void) arc_evict(state, size, FALSE);
1477*44eda4d7Smaybee 		buf->b_data = zio_buf_alloc(size);
1478*44eda4d7Smaybee 		atomic_add_64(&arc.size, size);
1479*44eda4d7Smaybee 		atomic_add_64(&arc.recycle_miss, 1);
1480*44eda4d7Smaybee 		if (arc.size > arc.c)
1481*44eda4d7Smaybee 			arc_adjust();
1482*44eda4d7Smaybee 	}
1483*44eda4d7Smaybee 	ASSERT(buf->b_data != NULL);
1484*44eda4d7Smaybee out:
1485*44eda4d7Smaybee 	/*
1486*44eda4d7Smaybee 	 * Update the state size.  Note that ghost states have a
1487*44eda4d7Smaybee 	 * "ghost size" and so don't need to be updated.
1488*44eda4d7Smaybee 	 */
1489*44eda4d7Smaybee 	if (!GHOST_STATE(buf->b_hdr->b_state)) {
1490*44eda4d7Smaybee 		arc_buf_hdr_t *hdr = buf->b_hdr;
1491*44eda4d7Smaybee 
1492*44eda4d7Smaybee 		atomic_add_64(&hdr->b_state->size, size);
1493*44eda4d7Smaybee 		if (list_link_active(&hdr->b_arc_node)) {
1494*44eda4d7Smaybee 			ASSERT(refcount_is_zero(&hdr->b_refcnt));
1495*44eda4d7Smaybee 			atomic_add_64(&hdr->b_state->lsize, size);
1496fa9e4066Sahrens 		}
1497fa9e4066Sahrens 	}
1498fa9e4066Sahrens }
1499fa9e4066Sahrens 
1500fa9e4066Sahrens /*
1501fa9e4066Sahrens  * This routine is called whenever a buffer is accessed.
1502ea8dc4b6Seschrock  * NOTE: the hash lock must be held by the caller; it is not dropped here.
1503fa9e4066Sahrens  */
1504fa9e4066Sahrens static void
1505*44eda4d7Smaybee arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock)
1506fa9e4066Sahrens {
1507fa9e4066Sahrens 	ASSERT(MUTEX_HELD(hash_lock));
1508fa9e4066Sahrens 
1509fa9e4066Sahrens 	if (buf->b_state == arc.anon) {
1510fa9e4066Sahrens 		/*
1511fa9e4066Sahrens 		 * This buffer is not in the cache, and does not
1512fa9e4066Sahrens 		 * appear in our "ghost" list.  Add the new buffer
1513fa9e4066Sahrens 		 * to the MRU state.
1514fa9e4066Sahrens 		 */
1515fa9e4066Sahrens 
1516fa9e4066Sahrens 		ASSERT(buf->b_arc_access == 0);
1517fa9e4066Sahrens 		buf->b_arc_access = lbolt;
1518ea8dc4b6Seschrock 		DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, buf);
1519ea8dc4b6Seschrock 		arc_change_state(arc.mru, buf, hash_lock);
1520fa9e4066Sahrens 
1521ea8dc4b6Seschrock 	} else if (buf->b_state == arc.mru) {
1522fa9e4066Sahrens 		/*
152313506d1eSmaybee 		 * If this buffer is here because of a prefetch, then either:
152413506d1eSmaybee 		 * - clear the flag if this is a "referencing" read
152513506d1eSmaybee 		 *   (any subsequent access will bump this into the MFU state).
152613506d1eSmaybee 		 * or
152713506d1eSmaybee 		 * - move the buffer to the head of the list if this is
152813506d1eSmaybee 		 *   another prefetch (to make it less likely to be evicted).
1529fa9e4066Sahrens 		 */
1530fa9e4066Sahrens 		if ((buf->b_flags & ARC_PREFETCH) != 0) {
153113506d1eSmaybee 			if (refcount_count(&buf->b_refcnt) == 0) {
153213506d1eSmaybee 				ASSERT(list_link_active(&buf->b_arc_node));
153313506d1eSmaybee 				mutex_enter(&arc.mru->mtx);
153413506d1eSmaybee 				list_remove(&arc.mru->list, buf);
153513506d1eSmaybee 				list_insert_head(&arc.mru->list, buf);
153613506d1eSmaybee 				mutex_exit(&arc.mru->mtx);
153713506d1eSmaybee 			} else {
153813506d1eSmaybee 				buf->b_flags &= ~ARC_PREFETCH;
153913506d1eSmaybee 				atomic_add_64(&arc.mru->hits, 1);
154013506d1eSmaybee 			}
154113506d1eSmaybee 			buf->b_arc_access = lbolt;
1542fa9e4066Sahrens 			return;
1543fa9e4066Sahrens 		}
1544fa9e4066Sahrens 
1545fa9e4066Sahrens 		/*
1546fa9e4066Sahrens 		 * This buffer has been "accessed" only once so far,
1547fa9e4066Sahrens 		 * but it is still in the cache. Move it to the MFU
1548fa9e4066Sahrens 		 * state.
1549fa9e4066Sahrens 		 */
1550fa9e4066Sahrens 		if (lbolt > buf->b_arc_access + ARC_MINTIME) {
1551fa9e4066Sahrens 			/*
1552fa9e4066Sahrens 			 * More than 125ms have passed since we
1553fa9e4066Sahrens 			 * instantiated this buffer.  Move it to the
1554fa9e4066Sahrens 			 * most frequently used state.
1555fa9e4066Sahrens 			 */
1556fa9e4066Sahrens 			buf->b_arc_access = lbolt;
1557ea8dc4b6Seschrock 			DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
1558ea8dc4b6Seschrock 			arc_change_state(arc.mfu, buf, hash_lock);
1559fa9e4066Sahrens 		}
1560ea8dc4b6Seschrock 		atomic_add_64(&arc.mru->hits, 1);
1561ea8dc4b6Seschrock 	} else if (buf->b_state == arc.mru_ghost) {
1562fa9e4066Sahrens 		arc_state_t	*new_state;
1563fa9e4066Sahrens 		/*
1564fa9e4066Sahrens 		 * This buffer has been "accessed" recently, but
1565fa9e4066Sahrens 		 * was evicted from the cache.  Move it to the
1566fa9e4066Sahrens 		 * MFU state.
1567fa9e4066Sahrens 		 */
1568fa9e4066Sahrens 
1569fa9e4066Sahrens 		if (buf->b_flags & ARC_PREFETCH) {
1570ea8dc4b6Seschrock 			new_state = arc.mru;
157113506d1eSmaybee 			if (refcount_count(&buf->b_refcnt) > 0)
157213506d1eSmaybee 				buf->b_flags &= ~ARC_PREFETCH;
1573ea8dc4b6Seschrock 			DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, buf);
1574fa9e4066Sahrens 		} else {
1575ea8dc4b6Seschrock 			new_state = arc.mfu;
1576ea8dc4b6Seschrock 			DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
1577fa9e4066Sahrens 		}
1578fa9e4066Sahrens 
1579fa9e4066Sahrens 		buf->b_arc_access = lbolt;
1580fa9e4066Sahrens 		arc_change_state(new_state, buf, hash_lock);
1581fa9e4066Sahrens 
1582ea8dc4b6Seschrock 		atomic_add_64(&arc.mru_ghost->hits, 1);
1583ea8dc4b6Seschrock 	} else if (buf->b_state == arc.mfu) {
1584fa9e4066Sahrens 		/*
1585fa9e4066Sahrens 		 * This buffer has been accessed more than once and is
1586fa9e4066Sahrens 		 * still in the cache.  Keep it in the MFU state.
1587fa9e4066Sahrens 		 *
158813506d1eSmaybee 		 * NOTE: an add_reference() that occurred when we did
158913506d1eSmaybee 		 * the arc_read() will have kicked this off the list.
159013506d1eSmaybee 		 * If it was a prefetch, we will explicitly move it to
159113506d1eSmaybee 		 * the head of the list now.
1592fa9e4066Sahrens 		 */
159313506d1eSmaybee 		if ((buf->b_flags & ARC_PREFETCH) != 0) {
159413506d1eSmaybee 			ASSERT(refcount_count(&buf->b_refcnt) == 0);
159513506d1eSmaybee 			ASSERT(list_link_active(&buf->b_arc_node));
159613506d1eSmaybee 			mutex_enter(&arc.mfu->mtx);
159713506d1eSmaybee 			list_remove(&arc.mfu->list, buf);
159813506d1eSmaybee 			list_insert_head(&arc.mfu->list, buf);
159913506d1eSmaybee 			mutex_exit(&arc.mfu->mtx);
160013506d1eSmaybee 		}
1601ea8dc4b6Seschrock 		atomic_add_64(&arc.mfu->hits, 1);
160213506d1eSmaybee 		buf->b_arc_access = lbolt;
1603ea8dc4b6Seschrock 	} else if (buf->b_state == arc.mfu_ghost) {
160413506d1eSmaybee 		arc_state_t	*new_state = arc.mfu;
1605fa9e4066Sahrens 		/*
1606fa9e4066Sahrens 		 * This buffer has been accessed more than once but has
1607fa9e4066Sahrens 		 * been evicted from the cache.  Move it back to the
1608fa9e4066Sahrens 		 * MFU state.
1609fa9e4066Sahrens 		 */
1610fa9e4066Sahrens 
161113506d1eSmaybee 		if (buf->b_flags & ARC_PREFETCH) {
161213506d1eSmaybee 			/*
161313506d1eSmaybee 			 * This is a prefetch access...
161413506d1eSmaybee 			 * move this block back to the MRU state.
161513506d1eSmaybee 			 */
161613506d1eSmaybee 			ASSERT3U(refcount_count(&buf->b_refcnt), ==, 0);
161713506d1eSmaybee 			new_state = arc.mru;
161813506d1eSmaybee 		}
161913506d1eSmaybee 
1620fa9e4066Sahrens 		buf->b_arc_access = lbolt;
1621ea8dc4b6Seschrock 		DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
162213506d1eSmaybee 		arc_change_state(new_state, buf, hash_lock);
1623fa9e4066Sahrens 
1624ea8dc4b6Seschrock 		atomic_add_64(&arc.mfu_ghost->hits, 1);
1625fa9e4066Sahrens 	} else {
1626fa9e4066Sahrens 		ASSERT(!"invalid arc state");
1627fa9e4066Sahrens 	}
1628fa9e4066Sahrens }
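/*
 * For reference, the state transitions that arc_access() above drives
 * (a sketch of the code; prefetch cases in parens):
 *
 *	anon      --access-->  mru
 *	mru       --access after ARC_MINTIME-->  mfu
 *	mru_ghost --access-->  mfu  (mru if prefetch)
 *	mfu       --access-->  mfu
 *	mfu_ghost --access-->  mfu  (mru if prefetch)
 */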
1629fa9e4066Sahrens 
1630fa9e4066Sahrens /* a generic arc_done_func_t which you can use */
1631fa9e4066Sahrens /* ARGSUSED */
1632fa9e4066Sahrens void
1633fa9e4066Sahrens arc_bcopy_func(zio_t *zio, arc_buf_t *buf, void *arg)
1634fa9e4066Sahrens {
1635fa9e4066Sahrens 	bcopy(buf->b_data, arg, buf->b_hdr->b_size);
1636ea8dc4b6Seschrock 	VERIFY(arc_buf_remove_ref(buf, arg) == 1);
1637fa9e4066Sahrens }
1638fa9e4066Sahrens 
1639fa9e4066Sahrens /* a generic arc_done_func_t which you can use */
1640fa9e4066Sahrens void
1641fa9e4066Sahrens arc_getbuf_func(zio_t *zio, arc_buf_t *buf, void *arg)
1642fa9e4066Sahrens {
1643fa9e4066Sahrens 	arc_buf_t **bufp = arg;
1644fa9e4066Sahrens 	if (zio && zio->io_error) {
1645ea8dc4b6Seschrock 		VERIFY(arc_buf_remove_ref(buf, arg) == 1);
1646fa9e4066Sahrens 		*bufp = NULL;
1647fa9e4066Sahrens 	} else {
1648fa9e4066Sahrens 		*bufp = buf;
1649fa9e4066Sahrens 	}
1650fa9e4066Sahrens }
1651fa9e4066Sahrens 
1652fa9e4066Sahrens static void
1653fa9e4066Sahrens arc_read_done(zio_t *zio)
1654fa9e4066Sahrens {
1655bbf4a8dfSmaybee 	arc_buf_hdr_t	*hdr, *found;
1656fa9e4066Sahrens 	arc_buf_t	*buf;
1657fa9e4066Sahrens 	arc_buf_t	*abuf;	/* buffer we're assigning to callback */
1658fa9e4066Sahrens 	kmutex_t	*hash_lock;
1659fa9e4066Sahrens 	arc_callback_t	*callback_list, *acb;
1660fa9e4066Sahrens 	int		freeable = FALSE;
1661fa9e4066Sahrens 
1662fa9e4066Sahrens 	buf = zio->io_private;
1663fa9e4066Sahrens 	hdr = buf->b_hdr;
1664fa9e4066Sahrens 
1665bbf4a8dfSmaybee 	/*
1666bbf4a8dfSmaybee 	 * The hdr was inserted into hash-table and removed from lists
1667bbf4a8dfSmaybee 	 * prior to starting I/O.  We should find this header, since
1668bbf4a8dfSmaybee 	 * it's in the hash table, and it should be legit since it's
1669bbf4a8dfSmaybee 	 * not possible to evict it during the I/O.  The only possible
1670bbf4a8dfSmaybee 	 * reason for it not to be found is if the block was freed during the
1671bbf4a8dfSmaybee 	 * read.
1672bbf4a8dfSmaybee 	 */
1673bbf4a8dfSmaybee 	found = buf_hash_find(zio->io_spa, &hdr->b_dva, hdr->b_birth,
1674fa9e4066Sahrens 		    &hash_lock);
1675fa9e4066Sahrens 
1676bbf4a8dfSmaybee 	ASSERT((found == NULL && HDR_FREED_IN_READ(hdr) && hash_lock == NULL) ||
1677bbf4a8dfSmaybee 	    (found == hdr && DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp))));
1678fa9e4066Sahrens 
1679fa9e4066Sahrens 	/* byteswap if necessary */
1680fa9e4066Sahrens 	callback_list = hdr->b_acb;
1681fa9e4066Sahrens 	ASSERT(callback_list != NULL);
1682fa9e4066Sahrens 	if (BP_SHOULD_BYTESWAP(zio->io_bp) && callback_list->acb_byteswap)
1683fa9e4066Sahrens 		callback_list->acb_byteswap(buf->b_data, hdr->b_size);
1684fa9e4066Sahrens 
1685fa9e4066Sahrens 	/* create copies of the data buffer for the callers */
1686fa9e4066Sahrens 	abuf = buf;
1687fa9e4066Sahrens 	for (acb = callback_list; acb; acb = acb->acb_next) {
1688fa9e4066Sahrens 		if (acb->acb_done) {
1689*44eda4d7Smaybee 			if (abuf == NULL)
1690*44eda4d7Smaybee 				abuf = arc_buf_clone(buf);
1691fa9e4066Sahrens 			acb->acb_buf = abuf;
1692fa9e4066Sahrens 			abuf = NULL;
1693fa9e4066Sahrens 		}
1694fa9e4066Sahrens 	}
1695fa9e4066Sahrens 	hdr->b_acb = NULL;
1696fa9e4066Sahrens 	hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
1697ea8dc4b6Seschrock 	ASSERT(!HDR_BUF_AVAILABLE(hdr));
1698ea8dc4b6Seschrock 	if (abuf == buf)
1699ea8dc4b6Seschrock 		hdr->b_flags |= ARC_BUF_AVAILABLE;
1700fa9e4066Sahrens 
1701fa9e4066Sahrens 	ASSERT(refcount_is_zero(&hdr->b_refcnt) || callback_list != NULL);
1702fa9e4066Sahrens 
1703fa9e4066Sahrens 	if (zio->io_error != 0) {
1704fa9e4066Sahrens 		hdr->b_flags |= ARC_IO_ERROR;
1705fa9e4066Sahrens 		if (hdr->b_state != arc.anon)
1706fa9e4066Sahrens 			arc_change_state(arc.anon, hdr, hash_lock);
1707ea8dc4b6Seschrock 		if (HDR_IN_HASH_TABLE(hdr))
1708ea8dc4b6Seschrock 			buf_hash_remove(hdr);
1709fa9e4066Sahrens 		freeable = refcount_is_zero(&hdr->b_refcnt);
171013506d1eSmaybee 		/* convert checksum errors into IO errors */
1711ea8dc4b6Seschrock 		if (zio->io_error == ECKSUM)
1712ea8dc4b6Seschrock 			zio->io_error = EIO;
1713fa9e4066Sahrens 	}
1714fa9e4066Sahrens 
1715ea8dc4b6Seschrock 	/*
171613506d1eSmaybee 	 * Broadcast before we drop the hash_lock to avoid the possibility
171713506d1eSmaybee 	 * that the hdr (and hence the cv) might be freed before we get to
171813506d1eSmaybee 	 * the cv_broadcast().
1719ea8dc4b6Seschrock 	 */
1720ea8dc4b6Seschrock 	cv_broadcast(&hdr->b_cv);
1721ea8dc4b6Seschrock 
1722bbf4a8dfSmaybee 	if (hash_lock) {
1723fa9e4066Sahrens 		/*
1724fa9e4066Sahrens 		 * Only call arc_access on anonymous buffers.  This is because
1725fa9e4066Sahrens 		 * if we've issued an I/O for an evicted buffer, we've already
1726fa9e4066Sahrens 		 * called arc_access (to prevent any simultaneous readers from
1727fa9e4066Sahrens 		 * getting confused).
1728fa9e4066Sahrens 		 */
1729fa9e4066Sahrens 		if (zio->io_error == 0 && hdr->b_state == arc.anon)
1730*44eda4d7Smaybee 			arc_access(hdr, hash_lock);
1731*44eda4d7Smaybee 		mutex_exit(hash_lock);
1732fa9e4066Sahrens 	} else {
1733fa9e4066Sahrens 		/*
1734fa9e4066Sahrens 		 * This block was freed while we waited for the read to
1735fa9e4066Sahrens 		 * complete.  It has been removed from the hash table and
1736fa9e4066Sahrens 		 * moved to the anonymous state (so that it won't show up
1737fa9e4066Sahrens 		 * in the cache).
1738fa9e4066Sahrens 		 */
1739fa9e4066Sahrens 		ASSERT3P(hdr->b_state, ==, arc.anon);
1740fa9e4066Sahrens 		freeable = refcount_is_zero(&hdr->b_refcnt);
1741fa9e4066Sahrens 	}
1742fa9e4066Sahrens 
1743fa9e4066Sahrens 	/* execute each callback and free its structure */
1744fa9e4066Sahrens 	while ((acb = callback_list) != NULL) {
1745fa9e4066Sahrens 		if (acb->acb_done)
1746fa9e4066Sahrens 			acb->acb_done(zio, acb->acb_buf, acb->acb_private);
1747fa9e4066Sahrens 
1748fa9e4066Sahrens 		if (acb->acb_zio_dummy != NULL) {
1749fa9e4066Sahrens 			acb->acb_zio_dummy->io_error = zio->io_error;
1750fa9e4066Sahrens 			zio_nowait(acb->acb_zio_dummy);
1751fa9e4066Sahrens 		}
1752fa9e4066Sahrens 
1753fa9e4066Sahrens 		callback_list = acb->acb_next;
1754fa9e4066Sahrens 		kmem_free(acb, sizeof (arc_callback_t));
1755fa9e4066Sahrens 	}
1756fa9e4066Sahrens 
1757fa9e4066Sahrens 	if (freeable)
1758ea8dc4b6Seschrock 		arc_hdr_destroy(hdr);
1759fa9e4066Sahrens }
1760fa9e4066Sahrens 
1761fa9e4066Sahrens /*
1762fa9e4066Sahrens  * "Read" the block at the specified DVA (in bp) via the
1763fa9e4066Sahrens  * cache.  If the block is found in the cache, invoke the provided
1764fa9e4066Sahrens  * callback immediately and return.  Note that the `zio' parameter
1765fa9e4066Sahrens  * in the callback will be NULL in this case, since no IO was
1766fa9e4066Sahrens  * required.  If the block is not in the cache pass the read request
1767fa9e4066Sahrens  * on to the spa with a substitute callback function, so that the
1768fa9e4066Sahrens  * requested block will be added to the cache.
1769fa9e4066Sahrens  *
1770fa9e4066Sahrens  * If a read request arrives for a block that has a read in-progress,
1771fa9e4066Sahrens  * either wait for the in-progress read to complete (and return the
1772fa9e4066Sahrens  * results); or, if this is a read with a "done" func, add a record
1773fa9e4066Sahrens  * to the read to invoke the "done" func when the read completes,
1774fa9e4066Sahrens  * and return; or just return.
1775fa9e4066Sahrens  *
1776fa9e4066Sahrens  * arc_read_done() will invoke all the requested "done" functions
1777fa9e4066Sahrens  * for readers of this block.
1778fa9e4066Sahrens  */
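/*
 * Illustrative caller sketch for arc_read() below (hypothetical; the
 * byteswap func, priority, and flag values are assumptions, not fixed
 * by this interface).  A synchronous cached read using the generic
 * arc_getbuf_func() above:
 *
 *	uint32_t aflags = ARC_WAIT;
 *	arc_buf_t *abuf = NULL;
 *	int err = arc_read(NULL, spa, bp, byteswap_uint64_array,
 *	    arc_getbuf_func, &abuf, ZIO_PRIORITY_SYNC_READ,
 *	    ZIO_FLAG_CANFAIL, &aflags, zb);
 */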
1779fa9e4066Sahrens int
1780fa9e4066Sahrens arc_read(zio_t *pio, spa_t *spa, blkptr_t *bp, arc_byteswap_func_t *swap,
1781fa9e4066Sahrens     arc_done_func_t *done, void *private, int priority, int flags,
178213506d1eSmaybee     uint32_t *arc_flags, zbookmark_t *zb)
1783fa9e4066Sahrens {
1784fa9e4066Sahrens 	arc_buf_hdr_t *hdr;
1785fa9e4066Sahrens 	arc_buf_t *buf;
1786fa9e4066Sahrens 	kmutex_t *hash_lock;
1787fa9e4066Sahrens 	zio_t	*rzio;
1788fa9e4066Sahrens 
1789fa9e4066Sahrens top:
1790fa9e4066Sahrens 	hdr = buf_hash_find(spa, BP_IDENTITY(bp), bp->blk_birth, &hash_lock);
1791ea8dc4b6Seschrock 	if (hdr && hdr->b_datacnt > 0) {
1792fa9e4066Sahrens 
179313506d1eSmaybee 		*arc_flags |= ARC_CACHED;
179413506d1eSmaybee 
1795fa9e4066Sahrens 		if (HDR_IO_IN_PROGRESS(hdr)) {
179613506d1eSmaybee 
179713506d1eSmaybee 			if (*arc_flags & ARC_WAIT) {
179813506d1eSmaybee 				cv_wait(&hdr->b_cv, hash_lock);
179913506d1eSmaybee 				mutex_exit(hash_lock);
180013506d1eSmaybee 				goto top;
180113506d1eSmaybee 			}
180213506d1eSmaybee 			ASSERT(*arc_flags & ARC_NOWAIT);
180313506d1eSmaybee 
180413506d1eSmaybee 			if (done) {
1805fa9e4066Sahrens 				arc_callback_t	*acb = NULL;
1806fa9e4066Sahrens 
1807fa9e4066Sahrens 				acb = kmem_zalloc(sizeof (arc_callback_t),
1808fa9e4066Sahrens 				    KM_SLEEP);
1809fa9e4066Sahrens 				acb->acb_done = done;
1810fa9e4066Sahrens 				acb->acb_private = private;
1811fa9e4066Sahrens 				acb->acb_byteswap = swap;
1812fa9e4066Sahrens 				if (pio != NULL)
1813fa9e4066Sahrens 					acb->acb_zio_dummy = zio_null(pio,
1814fa9e4066Sahrens 					    spa, NULL, NULL, flags);
1815fa9e4066Sahrens 
1816fa9e4066Sahrens 				ASSERT(acb->acb_done != NULL);
1817fa9e4066Sahrens 				acb->acb_next = hdr->b_acb;
1818fa9e4066Sahrens 				hdr->b_acb = acb;
1819fa9e4066Sahrens 				add_reference(hdr, hash_lock, private);
1820fa9e4066Sahrens 				mutex_exit(hash_lock);
1821fa9e4066Sahrens 				return (0);
1822fa9e4066Sahrens 			}
1823fa9e4066Sahrens 			mutex_exit(hash_lock);
1824fa9e4066Sahrens 			return (0);
1825fa9e4066Sahrens 		}
1826fa9e4066Sahrens 
1827ea8dc4b6Seschrock 		ASSERT(hdr->b_state == arc.mru || hdr->b_state == arc.mfu);
1828fa9e4066Sahrens 
1829ea8dc4b6Seschrock 		if (done) {
1830*44eda4d7Smaybee 			add_reference(hdr, hash_lock, private);
1831ea8dc4b6Seschrock 			/*
1832ea8dc4b6Seschrock 			 * If this block is already in use, create a new
1833ea8dc4b6Seschrock 			 * copy of the data so that we will be guaranteed
1834ea8dc4b6Seschrock 			 * that arc_release() will always succeed.
1835ea8dc4b6Seschrock 			 */
1836fa9e4066Sahrens 			buf = hdr->b_buf;
1837ea8dc4b6Seschrock 			ASSERT(buf);
1838ea8dc4b6Seschrock 			ASSERT(buf->b_data);
1839*44eda4d7Smaybee 			if (HDR_BUF_AVAILABLE(hdr)) {
1840ea8dc4b6Seschrock 				ASSERT(buf->b_efunc == NULL);
1841ea8dc4b6Seschrock 				hdr->b_flags &= ~ARC_BUF_AVAILABLE;
1842*44eda4d7Smaybee 			} else {
1843*44eda4d7Smaybee 				buf = arc_buf_clone(buf);
1844ea8dc4b6Seschrock 			}
184513506d1eSmaybee 		} else if (*arc_flags & ARC_PREFETCH &&
184613506d1eSmaybee 		    refcount_count(&hdr->b_refcnt) == 0) {
184713506d1eSmaybee 			hdr->b_flags |= ARC_PREFETCH;
1848fa9e4066Sahrens 		}
1849fa9e4066Sahrens 		DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
1850*44eda4d7Smaybee 		arc_access(hdr, hash_lock);
1851*44eda4d7Smaybee 		mutex_exit(hash_lock);
1852fa9e4066Sahrens 		atomic_add_64(&arc.hits, 1);
1853fa9e4066Sahrens 		if (done)
1854fa9e4066Sahrens 			done(NULL, buf, private);
1855fa9e4066Sahrens 	} else {
1856fa9e4066Sahrens 		uint64_t size = BP_GET_LSIZE(bp);
1857fa9e4066Sahrens 		arc_callback_t	*acb;
1858fa9e4066Sahrens 
1859fa9e4066Sahrens 		if (hdr == NULL) {
1860fa9e4066Sahrens 			/* this block is not in the cache */
1861fa9e4066Sahrens 			arc_buf_hdr_t	*exists;
1862fa9e4066Sahrens 
1863fa9e4066Sahrens 			buf = arc_buf_alloc(spa, size, private);
1864fa9e4066Sahrens 			hdr = buf->b_hdr;
1865fa9e4066Sahrens 			hdr->b_dva = *BP_IDENTITY(bp);
1866fa9e4066Sahrens 			hdr->b_birth = bp->blk_birth;
1867fa9e4066Sahrens 			hdr->b_cksum0 = bp->blk_cksum.zc_word[0];
1868fa9e4066Sahrens 			exists = buf_hash_insert(hdr, &hash_lock);
1869fa9e4066Sahrens 			if (exists) {
1870fa9e4066Sahrens 				/* somebody beat us to the hash insert */
1871fa9e4066Sahrens 				mutex_exit(hash_lock);
1872fa9e4066Sahrens 				bzero(&hdr->b_dva, sizeof (dva_t));
1873fa9e4066Sahrens 				hdr->b_birth = 0;
1874fa9e4066Sahrens 				hdr->b_cksum0 = 0;
1875ea8dc4b6Seschrock 				(void) arc_buf_remove_ref(buf, private);
1876fa9e4066Sahrens 				goto top; /* restart the IO request */
1877fa9e4066Sahrens 			}
187813506d1eSmaybee 			/* if this is a prefetch, we don't have a reference */
187913506d1eSmaybee 			if (*arc_flags & ARC_PREFETCH) {
188013506d1eSmaybee 				(void) remove_reference(hdr, hash_lock,
188113506d1eSmaybee 				    private);
188213506d1eSmaybee 				hdr->b_flags |= ARC_PREFETCH;
188313506d1eSmaybee 			}
188413506d1eSmaybee 			if (BP_GET_LEVEL(bp) > 0)
188513506d1eSmaybee 				hdr->b_flags |= ARC_INDIRECT;
1886fa9e4066Sahrens 		} else {
1887fa9e4066Sahrens 			/* this block is in the ghost cache */
1888ea8dc4b6Seschrock 			ASSERT(GHOST_STATE(hdr->b_state));
1889ea8dc4b6Seschrock 			ASSERT(!HDR_IO_IN_PROGRESS(hdr));
189013506d1eSmaybee 			ASSERT3U(refcount_count(&hdr->b_refcnt), ==, 0);
1891ea8dc4b6Seschrock 			ASSERT(hdr->b_buf == NULL);
189213506d1eSmaybee 
189313506d1eSmaybee 			/* if this is a prefetch, we don't have a reference */
189413506d1eSmaybee 			if (*arc_flags & ARC_PREFETCH)
189513506d1eSmaybee 				hdr->b_flags |= ARC_PREFETCH;
189613506d1eSmaybee 			else
189713506d1eSmaybee 				add_reference(hdr, hash_lock, private);
1898fa9e4066Sahrens 			buf = kmem_cache_alloc(buf_cache, KM_SLEEP);
1899fa9e4066Sahrens 			buf->b_hdr = hdr;
1900*44eda4d7Smaybee 			buf->b_data = NULL;
1901ea8dc4b6Seschrock 			buf->b_efunc = NULL;
1902ea8dc4b6Seschrock 			buf->b_private = NULL;
1903fa9e4066Sahrens 			buf->b_next = NULL;
1904fa9e4066Sahrens 			hdr->b_buf = buf;
1905*44eda4d7Smaybee 			arc_get_data_buf(buf);
1906ea8dc4b6Seschrock 			ASSERT(hdr->b_datacnt == 0);
1907ea8dc4b6Seschrock 			hdr->b_datacnt = 1;
1909fa9e4066Sahrens 		}
1910fa9e4066Sahrens 
1911fa9e4066Sahrens 		acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP);
1912fa9e4066Sahrens 		acb->acb_done = done;
1913fa9e4066Sahrens 		acb->acb_private = private;
1914fa9e4066Sahrens 		acb->acb_byteswap = swap;
1915fa9e4066Sahrens 
1916fa9e4066Sahrens 		ASSERT(hdr->b_acb == NULL);
1917fa9e4066Sahrens 		hdr->b_acb = acb;
1918fa9e4066Sahrens 		hdr->b_flags |= ARC_IO_IN_PROGRESS;
1919fa9e4066Sahrens 
1920fa9e4066Sahrens 		/*
1921fa9e4066Sahrens 		 * If the buffer has been evicted, migrate it to a present state
1922fa9e4066Sahrens 		 * before issuing the I/O.  Once we drop the hash-table lock,
1923fa9e4066Sahrens 		 * the header will be marked as I/O in progress and have an
1924fa9e4066Sahrens 		 * attached buffer.  At this point, anybody who finds this
1925fa9e4066Sahrens 		 * buffer ought to notice that it's legit but has a pending I/O.
1926fa9e4066Sahrens 		 */
1927fa9e4066Sahrens 
1928ea8dc4b6Seschrock 		if (GHOST_STATE(hdr->b_state))
1929*44eda4d7Smaybee 			arc_access(hdr, hash_lock);
1930*44eda4d7Smaybee 		mutex_exit(hash_lock);
1931fa9e4066Sahrens 
1932fa9e4066Sahrens 		ASSERT3U(hdr->b_size, ==, size);
1933c543ec06Sahrens 		DTRACE_PROBE3(arc__miss, blkptr_t *, bp, uint64_t, size,
1934c543ec06Sahrens 		    zbookmark_t *, zb);
1935fa9e4066Sahrens 		atomic_add_64(&arc.misses, 1);
1936ea8dc4b6Seschrock 
1937fa9e4066Sahrens 		rzio = zio_read(pio, spa, bp, buf->b_data, size,
1938ea8dc4b6Seschrock 		    arc_read_done, buf, priority, flags, zb);
1939fa9e4066Sahrens 
194013506d1eSmaybee 		if (*arc_flags & ARC_WAIT)
1941fa9e4066Sahrens 			return (zio_wait(rzio));
1942fa9e4066Sahrens 
194313506d1eSmaybee 		ASSERT(*arc_flags & ARC_NOWAIT);
1944fa9e4066Sahrens 		zio_nowait(rzio);
1945fa9e4066Sahrens 	}
1946fa9e4066Sahrens 	return (0);
1947fa9e4066Sahrens }
1948fa9e4066Sahrens 
1949fa9e4066Sahrens /*
1950fa9e4066Sahrens  * arc_read() variant to support pool traversal.  If the block is already
1951fa9e4066Sahrens  * in the ARC, make a copy of it; otherwise, the caller will do the I/O.
1952fa9e4066Sahrens  * The idea is that we don't want pool traversal filling up memory, but
1953fa9e4066Sahrens  * if the ARC already has the data anyway, we shouldn't pay for the I/O.
1954fa9e4066Sahrens  */
1955fa9e4066Sahrens int
1956fa9e4066Sahrens arc_tryread(spa_t *spa, blkptr_t *bp, void *data)
1957fa9e4066Sahrens {
1958fa9e4066Sahrens 	arc_buf_hdr_t *hdr;
1959fa9e4066Sahrens 	kmutex_t *hash_mtx;
1960fa9e4066Sahrens 	int rc = 0;
1961fa9e4066Sahrens 
1962fa9e4066Sahrens 	hdr = buf_hash_find(spa, BP_IDENTITY(bp), bp->blk_birth, &hash_mtx);
1963fa9e4066Sahrens 
1964ea8dc4b6Seschrock 	if (hdr && hdr->b_datacnt > 0 && !HDR_IO_IN_PROGRESS(hdr)) {
1965ea8dc4b6Seschrock 		arc_buf_t *buf = hdr->b_buf;
1966ea8dc4b6Seschrock 
1967ea8dc4b6Seschrock 		ASSERT(buf);
1968ea8dc4b6Seschrock 		while (buf->b_data == NULL) {
1969ea8dc4b6Seschrock 			buf = buf->b_next;
1970ea8dc4b6Seschrock 			ASSERT(buf);
1971ea8dc4b6Seschrock 		}
1972ea8dc4b6Seschrock 		bcopy(buf->b_data, data, hdr->b_size);
1973ea8dc4b6Seschrock 	} else {
1974fa9e4066Sahrens 		rc = ENOENT;
1975ea8dc4b6Seschrock 	}
1976fa9e4066Sahrens 
1977fa9e4066Sahrens 	if (hash_mtx)
1978fa9e4066Sahrens 		mutex_exit(hash_mtx);
1979fa9e4066Sahrens 
1980fa9e4066Sahrens 	return (rc);
1981fa9e4066Sahrens }
1982fa9e4066Sahrens 
1983ea8dc4b6Seschrock void
1984ea8dc4b6Seschrock arc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *private)
1985ea8dc4b6Seschrock {
1986ea8dc4b6Seschrock 	ASSERT(buf->b_hdr != NULL);
1987ea8dc4b6Seschrock 	ASSERT(buf->b_hdr->b_state != arc.anon);
1988ea8dc4b6Seschrock 	ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt) || func == NULL);
1989ea8dc4b6Seschrock 	buf->b_efunc = func;
1990ea8dc4b6Seschrock 	buf->b_private = private;
1991ea8dc4b6Seschrock }
1992ea8dc4b6Seschrock 
1993ea8dc4b6Seschrock /*
1994ea8dc4b6Seschrock  * This is used by the DMU to let the ARC know that a buffer is
1995ea8dc4b6Seschrock  * being evicted, so the ARC should clean up.  If this arc buf
1996ea8dc4b6Seschrock  * is not yet in the evicted state, it will be put there.
1997ea8dc4b6Seschrock  */
1998ea8dc4b6Seschrock int
1999ea8dc4b6Seschrock arc_buf_evict(arc_buf_t *buf)
2000ea8dc4b6Seschrock {
2001ea8dc4b6Seschrock 	arc_buf_hdr_t *hdr;
2002ea8dc4b6Seschrock 	kmutex_t *hash_lock;
2003ea8dc4b6Seschrock 	arc_buf_t **bufp;
2004ea8dc4b6Seschrock 
2005ea8dc4b6Seschrock 	mutex_enter(&arc_eviction_mtx);
2006ea8dc4b6Seschrock 	hdr = buf->b_hdr;
2007ea8dc4b6Seschrock 	if (hdr == NULL) {
2008ea8dc4b6Seschrock 		/*
2009ea8dc4b6Seschrock 		 * We are in arc_do_user_evicts().
2010ea8dc4b6Seschrock 		 * NOTE: We can't be in arc_buf_add_ref() because
2011ea8dc4b6Seschrock 		 * that would violate the interface rules.
2012ea8dc4b6Seschrock 		 */
2013ea8dc4b6Seschrock 		ASSERT(buf->b_data == NULL);
2014ea8dc4b6Seschrock 		mutex_exit(&arc_eviction_mtx);
2015ea8dc4b6Seschrock 		return (0);
2016ea8dc4b6Seschrock 	} else if (buf->b_data == NULL) {
2017dd6ef538Smaybee 		arc_buf_t copy = *buf; /* structure assignment */
2018ea8dc4b6Seschrock 		/*
2019dd6ef538Smaybee 		 * We are on the eviction list.  Process this buffer
2020dd6ef538Smaybee 		 * now but let arc_do_user_evicts() do the reaping.
2021ea8dc4b6Seschrock 		 */
2022dd6ef538Smaybee 		buf->b_efunc = NULL;
2023dd6ef538Smaybee 		buf->b_hdr = NULL;
2024ea8dc4b6Seschrock 		mutex_exit(&arc_eviction_mtx);
2025dd6ef538Smaybee 		VERIFY(copy.b_efunc(&copy) == 0);
2026dd6ef538Smaybee 		return (1);
2027ea8dc4b6Seschrock 	} else {
2028ea8dc4b6Seschrock 		/*
2029ea8dc4b6Seschrock 		 * Prevent a race with arc_evict()
2030ea8dc4b6Seschrock 		 */
2031ea8dc4b6Seschrock 		ASSERT3U(refcount_count(&hdr->b_refcnt), <, hdr->b_datacnt);
2032ea8dc4b6Seschrock 		buf->b_hdr = NULL;
2033ea8dc4b6Seschrock 	}
2034ea8dc4b6Seschrock 	mutex_exit(&arc_eviction_mtx);
2035ea8dc4b6Seschrock 
2036ea8dc4b6Seschrock 	hash_lock = HDR_LOCK(hdr);
2037ea8dc4b6Seschrock 	mutex_enter(hash_lock);
2038ea8dc4b6Seschrock 
2039ea8dc4b6Seschrock 	ASSERT(hdr->b_state == arc.mru || hdr->b_state == arc.mfu);
2040ea8dc4b6Seschrock 
2041ea8dc4b6Seschrock 	/*
2042ea8dc4b6Seschrock 	 * Pull this buffer off of the hdr
2043ea8dc4b6Seschrock 	 */
2044ea8dc4b6Seschrock 	bufp = &hdr->b_buf;
2045ea8dc4b6Seschrock 	while (*bufp != buf)
2046ea8dc4b6Seschrock 		bufp = &(*bufp)->b_next;
2047ea8dc4b6Seschrock 	*bufp = buf->b_next;
2048ea8dc4b6Seschrock 
2049ea8dc4b6Seschrock 	ASSERT(buf->b_data != NULL);
2050ea8dc4b6Seschrock 	buf->b_hdr = hdr;
2051*44eda4d7Smaybee 	arc_buf_destroy(buf, FALSE, FALSE);
2052ea8dc4b6Seschrock 
2053ea8dc4b6Seschrock 	if (hdr->b_datacnt == 0) {
2054ea8dc4b6Seschrock 		arc_state_t *old_state = hdr->b_state;
2055ea8dc4b6Seschrock 		arc_state_t *evicted_state;
2056ea8dc4b6Seschrock 
2057ea8dc4b6Seschrock 		ASSERT(refcount_is_zero(&hdr->b_refcnt));
2058ea8dc4b6Seschrock 
2059ea8dc4b6Seschrock 		evicted_state =
2060ea8dc4b6Seschrock 		    (old_state == arc.mru) ? arc.mru_ghost : arc.mfu_ghost;
2061ea8dc4b6Seschrock 
2062ea8dc4b6Seschrock 		mutex_enter(&old_state->mtx);
2063ea8dc4b6Seschrock 		mutex_enter(&evicted_state->mtx);
2064ea8dc4b6Seschrock 
2065ea8dc4b6Seschrock 		arc_change_state(evicted_state, hdr, hash_lock);
2066ea8dc4b6Seschrock 		ASSERT(HDR_IN_HASH_TABLE(hdr));
2067ea8dc4b6Seschrock 		hdr->b_flags = ARC_IN_HASH_TABLE;
2068ea8dc4b6Seschrock 
2069ea8dc4b6Seschrock 		mutex_exit(&evicted_state->mtx);
2070ea8dc4b6Seschrock 		mutex_exit(&old_state->mtx);
2071ea8dc4b6Seschrock 	}
2072ea8dc4b6Seschrock 	mutex_exit(hash_lock);
2073dd6ef538Smaybee 
2074ea8dc4b6Seschrock 	VERIFY(buf->b_efunc(buf) == 0);
2075ea8dc4b6Seschrock 	buf->b_efunc = NULL;
2076ea8dc4b6Seschrock 	buf->b_private = NULL;
2077ea8dc4b6Seschrock 	buf->b_hdr = NULL;
2078ea8dc4b6Seschrock 	kmem_cache_free(buf_cache, buf);
2079ea8dc4b6Seschrock 	return (1);
2080ea8dc4b6Seschrock }
2081ea8dc4b6Seschrock 
2082fa9e4066Sahrens /*
2083fa9e4066Sahrens  * Release this buffer from the cache.  This must be done
2084fa9e4066Sahrens  * after a read and prior to modifying the buffer contents.
2085fa9e4066Sahrens  * If the buffer has more than one reference, we must
2086fa9e4066Sahrens  * make a new hdr for the buffer.
2087fa9e4066Sahrens  */
2088fa9e4066Sahrens void
2089fa9e4066Sahrens arc_release(arc_buf_t *buf, void *tag)
2090fa9e4066Sahrens {
2091fa9e4066Sahrens 	arc_buf_hdr_t *hdr = buf->b_hdr;
2092fa9e4066Sahrens 	kmutex_t *hash_lock = HDR_LOCK(hdr);
2093fa9e4066Sahrens 
2094fa9e4066Sahrens 	/* this buffer is not on any list */
2095fa9e4066Sahrens 	ASSERT(refcount_count(&hdr->b_refcnt) > 0);
2096fa9e4066Sahrens 
2097fa9e4066Sahrens 	if (hdr->b_state == arc.anon) {
2098fa9e4066Sahrens 		/* this buffer is already released */
2099fa9e4066Sahrens 		ASSERT3U(refcount_count(&hdr->b_refcnt), ==, 1);
2100fa9e4066Sahrens 		ASSERT(BUF_EMPTY(hdr));
2101ea8dc4b6Seschrock 		ASSERT(buf->b_efunc == NULL);
2102fa9e4066Sahrens 		return;
2103fa9e4066Sahrens 	}
2104fa9e4066Sahrens 
2105fa9e4066Sahrens 	mutex_enter(hash_lock);
2106fa9e4066Sahrens 
2107ea8dc4b6Seschrock 	/*
2108ea8dc4b6Seschrock 	 * Do we have more than one buf?
2109ea8dc4b6Seschrock 	 */
2110ea8dc4b6Seschrock 	if (hdr->b_buf != buf || buf->b_next != NULL) {
2111fa9e4066Sahrens 		arc_buf_hdr_t *nhdr;
2112fa9e4066Sahrens 		arc_buf_t **bufp;
2113fa9e4066Sahrens 		uint64_t blksz = hdr->b_size;
2114fa9e4066Sahrens 		spa_t *spa = hdr->b_spa;
2115fa9e4066Sahrens 
2116ea8dc4b6Seschrock 		ASSERT(hdr->b_datacnt > 1);
2117fa9e4066Sahrens 		/*
2118fa9e4066Sahrens 		 * Pull the data off of this buf and attach it to
2119fa9e4066Sahrens 		 * a new anonymous buf.
2120fa9e4066Sahrens 		 */
2121ea8dc4b6Seschrock 		(void) remove_reference(hdr, hash_lock, tag);
2122fa9e4066Sahrens 		bufp = &hdr->b_buf;
2123ea8dc4b6Seschrock 		while (*bufp != buf)
2124fa9e4066Sahrens 			bufp = &(*bufp)->b_next;
2125fa9e4066Sahrens 		*bufp = (*bufp)->b_next;
2126ea8dc4b6Seschrock 
2127fa9e4066Sahrens 		ASSERT3U(hdr->b_state->size, >=, hdr->b_size);
2128fa9e4066Sahrens 		atomic_add_64(&hdr->b_state->size, -hdr->b_size);
2129ea8dc4b6Seschrock 		if (refcount_is_zero(&hdr->b_refcnt)) {
2130ea8dc4b6Seschrock 			ASSERT3U(hdr->b_state->lsize, >=, hdr->b_size);
2131ea8dc4b6Seschrock 			atomic_add_64(&hdr->b_state->lsize, -hdr->b_size);
2132ea8dc4b6Seschrock 		}
2133ea8dc4b6Seschrock 		hdr->b_datacnt -= 1;
2134ea8dc4b6Seschrock 
2135fa9e4066Sahrens 		mutex_exit(hash_lock);
2136fa9e4066Sahrens 
2137fa9e4066Sahrens 		nhdr = kmem_cache_alloc(hdr_cache, KM_SLEEP);
2138fa9e4066Sahrens 		nhdr->b_size = blksz;
2139fa9e4066Sahrens 		nhdr->b_spa = spa;
2140fa9e4066Sahrens 		nhdr->b_buf = buf;
2141fa9e4066Sahrens 		nhdr->b_state = arc.anon;
2142fa9e4066Sahrens 		nhdr->b_arc_access = 0;
2143fa9e4066Sahrens 		nhdr->b_flags = 0;
2144ea8dc4b6Seschrock 		nhdr->b_datacnt = 1;
2145fa9e4066Sahrens 		buf->b_hdr = nhdr;
2146fa9e4066Sahrens 		buf->b_next = NULL;
2147fa9e4066Sahrens 		(void) refcount_add(&nhdr->b_refcnt, tag);
2148fa9e4066Sahrens 		atomic_add_64(&arc.anon->size, blksz);
2149fa9e4066Sahrens 
2150fa9e4066Sahrens 		hdr = nhdr;
2151fa9e4066Sahrens 	} else {
2152ea8dc4b6Seschrock 		ASSERT(refcount_count(&hdr->b_refcnt) == 1);
2153fa9e4066Sahrens 		ASSERT(!list_link_active(&hdr->b_arc_node));
2154fa9e4066Sahrens 		ASSERT(!HDR_IO_IN_PROGRESS(hdr));
2155fa9e4066Sahrens 		arc_change_state(arc.anon, hdr, hash_lock);
2156fa9e4066Sahrens 		hdr->b_arc_access = 0;
2157fa9e4066Sahrens 		mutex_exit(hash_lock);
2158fa9e4066Sahrens 		bzero(&hdr->b_dva, sizeof (dva_t));
2159fa9e4066Sahrens 		hdr->b_birth = 0;
2160fa9e4066Sahrens 		hdr->b_cksum0 = 0;
2161fa9e4066Sahrens 	}
2162ea8dc4b6Seschrock 	buf->b_efunc = NULL;
2163ea8dc4b6Seschrock 	buf->b_private = NULL;
2164fa9e4066Sahrens }
2165fa9e4066Sahrens 
2166fa9e4066Sahrens int
2167fa9e4066Sahrens arc_released(arc_buf_t *buf)
2168fa9e4066Sahrens {
2169ea8dc4b6Seschrock 	return (buf->b_data != NULL && buf->b_hdr->b_state == arc.anon);
2170ea8dc4b6Seschrock }
2171ea8dc4b6Seschrock 
2172ea8dc4b6Seschrock int
2173ea8dc4b6Seschrock arc_has_callback(arc_buf_t *buf)
2174ea8dc4b6Seschrock {
2175ea8dc4b6Seschrock 	return (buf->b_efunc != NULL);
2176fa9e4066Sahrens }
2177fa9e4066Sahrens 
2178ea8dc4b6Seschrock #ifdef ZFS_DEBUG
2179ea8dc4b6Seschrock int
2180ea8dc4b6Seschrock arc_referenced(arc_buf_t *buf)
2181ea8dc4b6Seschrock {
2182ea8dc4b6Seschrock 	return (refcount_count(&buf->b_hdr->b_refcnt));
2183ea8dc4b6Seschrock }
2184ea8dc4b6Seschrock #endif
2185ea8dc4b6Seschrock 
2186fa9e4066Sahrens static void
2187fa9e4066Sahrens arc_write_done(zio_t *zio)
2188fa9e4066Sahrens {
2189fa9e4066Sahrens 	arc_buf_t *buf;
2190fa9e4066Sahrens 	arc_buf_hdr_t *hdr;
2191fa9e4066Sahrens 	arc_callback_t *acb;
2192fa9e4066Sahrens 
2193fa9e4066Sahrens 	buf = zio->io_private;
2194fa9e4066Sahrens 	hdr = buf->b_hdr;
2195fa9e4066Sahrens 	acb = hdr->b_acb;
2196fa9e4066Sahrens 	hdr->b_acb = NULL;
2197ea8dc4b6Seschrock 	ASSERT(acb != NULL);
2198fa9e4066Sahrens 
2199fa9e4066Sahrens 	/* this buffer is on no lists and is not in the hash table */
2200fa9e4066Sahrens 	ASSERT3P(hdr->b_state, ==, arc.anon);
2201fa9e4066Sahrens 
2202fa9e4066Sahrens 	hdr->b_dva = *BP_IDENTITY(zio->io_bp);
2203fa9e4066Sahrens 	hdr->b_birth = zio->io_bp->blk_birth;
2204fa9e4066Sahrens 	hdr->b_cksum0 = zio->io_bp->blk_cksum.zc_word[0];
2205ea8dc4b6Seschrock 	/*
2206ea8dc4b6Seschrock 	 * If the block to be written was all-zero, we may have
2207ea8dc4b6Seschrock 	 * compressed it away.  In this case no write was performed
2208ea8dc4b6Seschrock 	 * so there will be no dva/birth-date/checksum.  The buffer
2209ea8dc4b6Seschrock 	 * must therefore remain anonymous (and uncached).
2210ea8dc4b6Seschrock 	 */
2211fa9e4066Sahrens 	if (!BUF_EMPTY(hdr)) {
2212fa9e4066Sahrens 		arc_buf_hdr_t *exists;
2213fa9e4066Sahrens 		kmutex_t *hash_lock;
2214fa9e4066Sahrens 
2215fa9e4066Sahrens 		exists = buf_hash_insert(hdr, &hash_lock);
2216fa9e4066Sahrens 		if (exists) {
2217fa9e4066Sahrens 			/*
2218fa9e4066Sahrens 			 * This can only happen if we overwrite for
2219fa9e4066Sahrens 			 * sync-to-convergence, because we remove
2220fa9e4066Sahrens 			 * buffers from the hash table when we arc_free().
2221fa9e4066Sahrens 			 */
2222fa9e4066Sahrens 			ASSERT(DVA_EQUAL(BP_IDENTITY(&zio->io_bp_orig),
2223fa9e4066Sahrens 			    BP_IDENTITY(zio->io_bp)));
2224fa9e4066Sahrens 			ASSERT3U(zio->io_bp_orig.blk_birth, ==,
2225fa9e4066Sahrens 			    zio->io_bp->blk_birth);
2226fa9e4066Sahrens 
2227fa9e4066Sahrens 			ASSERT(refcount_is_zero(&exists->b_refcnt));
2228fa9e4066Sahrens 			arc_change_state(arc.anon, exists, hash_lock);
2229fa9e4066Sahrens 			mutex_exit(hash_lock);
2230ea8dc4b6Seschrock 			arc_hdr_destroy(exists);
2231fa9e4066Sahrens 			exists = buf_hash_insert(hdr, &hash_lock);
2232fa9e4066Sahrens 			ASSERT3P(exists, ==, NULL);
2233fa9e4066Sahrens 		}
2234ea8dc4b6Seschrock 		hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
2235*44eda4d7Smaybee 		arc_access(hdr, hash_lock);
2236*44eda4d7Smaybee 		mutex_exit(hash_lock);
2237ea8dc4b6Seschrock 	} else if (acb->acb_done == NULL) {
2238ea8dc4b6Seschrock 		int destroy_hdr;
2239ea8dc4b6Seschrock 		/*
2240ea8dc4b6Seschrock 		 * This is an anonymous buffer with no user callback,
2241ea8dc4b6Seschrock 		 * destroy it if there are no active references.
2242ea8dc4b6Seschrock 		 */
2243ea8dc4b6Seschrock 		mutex_enter(&arc_eviction_mtx);
2244ea8dc4b6Seschrock 		destroy_hdr = refcount_is_zero(&hdr->b_refcnt);
2245ea8dc4b6Seschrock 		hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
2246ea8dc4b6Seschrock 		mutex_exit(&arc_eviction_mtx);
2247ea8dc4b6Seschrock 		if (destroy_hdr)
2248ea8dc4b6Seschrock 			arc_hdr_destroy(hdr);
2249ea8dc4b6Seschrock 	} else {
2250ea8dc4b6Seschrock 		hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
2251fa9e4066Sahrens 	}
2252ea8dc4b6Seschrock 
2253ea8dc4b6Seschrock 	if (acb->acb_done) {
2254fa9e4066Sahrens 		ASSERT(!refcount_is_zero(&hdr->b_refcnt));
2255fa9e4066Sahrens 		acb->acb_done(zio, buf, acb->acb_private);
2256fa9e4066Sahrens 	}
2257fa9e4066Sahrens 
2258ea8dc4b6Seschrock 	kmem_free(acb, sizeof (arc_callback_t));
2259fa9e4066Sahrens }
2260fa9e4066Sahrens 
2261fa9e4066Sahrens int
226244cd46caSbillm arc_write(zio_t *pio, spa_t *spa, int checksum, int compress, int ncopies,
2263fa9e4066Sahrens     uint64_t txg, blkptr_t *bp, arc_buf_t *buf,
2264fa9e4066Sahrens     arc_done_func_t *done, void *private, int priority, int flags,
2265ea8dc4b6Seschrock     uint32_t arc_flags, zbookmark_t *zb)
2266fa9e4066Sahrens {
2267fa9e4066Sahrens 	arc_buf_hdr_t *hdr = buf->b_hdr;
2268fa9e4066Sahrens 	arc_callback_t	*acb;
2269fa9e4066Sahrens 	zio_t	*rzio;
2270fa9e4066Sahrens 
2271fa9e4066Sahrens 	/* this is a private buffer - no locking required */
2272fa9e4066Sahrens 	ASSERT3P(hdr->b_state, ==, arc.anon);
2273fa9e4066Sahrens 	ASSERT(BUF_EMPTY(hdr));
2274fa9e4066Sahrens 	ASSERT(!HDR_IO_ERROR(hdr));
2275c5c6ffa0Smaybee 	ASSERT((hdr->b_flags & ARC_IO_IN_PROGRESS) == 0);
2276c5c6ffa0Smaybee 	ASSERT(hdr->b_acb == NULL);
2277fa9e4066Sahrens 	acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP);
2278fa9e4066Sahrens 	acb->acb_done = done;
2279fa9e4066Sahrens 	acb->acb_private = private;
2280fa9e4066Sahrens 	acb->acb_byteswap = (arc_byteswap_func_t *)-1;
2281fa9e4066Sahrens 	hdr->b_acb = acb;
2282ea8dc4b6Seschrock 	hdr->b_flags |= ARC_IO_IN_PROGRESS;
228344cd46caSbillm 	rzio = zio_write(pio, spa, checksum, compress, ncopies, txg, bp,
2284ea8dc4b6Seschrock 	    buf->b_data, hdr->b_size, arc_write_done, buf, priority, flags, zb);
2285fa9e4066Sahrens 
2286fa9e4066Sahrens 	if (arc_flags & ARC_WAIT)
2287fa9e4066Sahrens 		return (zio_wait(rzio));
2288fa9e4066Sahrens 
2289fa9e4066Sahrens 	ASSERT(arc_flags & ARC_NOWAIT);
2290fa9e4066Sahrens 	zio_nowait(rzio);
2291fa9e4066Sahrens 
2292fa9e4066Sahrens 	return (0);
2293fa9e4066Sahrens }
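/*
 * Illustrative caller sketch for arc_write() above (hypothetical; the
 * checksum, compress, and priority constants are assumptions).  A
 * single-copy asynchronous write:
 *
 *	(void) arc_write(pio, spa, ZIO_CHECKSUM_FLETCHER_4,
 *	    ZIO_COMPRESS_LZJB, 1, txg, bp, buf, done_func, private,
 *	    ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_CANFAIL, ARC_NOWAIT, zb);
 */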
2294fa9e4066Sahrens 
2295fa9e4066Sahrens int
2296fa9e4066Sahrens arc_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
2297fa9e4066Sahrens     zio_done_func_t *done, void *private, uint32_t arc_flags)
2298fa9e4066Sahrens {
2299fa9e4066Sahrens 	arc_buf_hdr_t *ab;
2300fa9e4066Sahrens 	kmutex_t *hash_lock;
2301fa9e4066Sahrens 	zio_t	*zio;
2302fa9e4066Sahrens 
2303fa9e4066Sahrens 	/*
2304fa9e4066Sahrens 	 * If this buffer is in the cache, release it, so it
2305fa9e4066Sahrens 	 * can be re-used.
2306fa9e4066Sahrens 	 */
2307fa9e4066Sahrens 	ab = buf_hash_find(spa, BP_IDENTITY(bp), bp->blk_birth, &hash_lock);
2308fa9e4066Sahrens 	if (ab != NULL) {
2309fa9e4066Sahrens 		/*
2310fa9e4066Sahrens 		 * The checksum of blocks to free is not always
2311fa9e4066Sahrens 		 * preserved (e.g. on the deadlist).  However, if it is
2312fa9e4066Sahrens 		 * nonzero, it should match what we have in the cache.
2313fa9e4066Sahrens 		 */
2314fa9e4066Sahrens 		ASSERT(bp->blk_cksum.zc_word[0] == 0 ||
2315fa9e4066Sahrens 		    ab->b_cksum0 == bp->blk_cksum.zc_word[0]);
231677ed8509Smaybee 		if (ab->b_state != arc.anon)
231777ed8509Smaybee 			arc_change_state(arc.anon, ab, hash_lock);
231813506d1eSmaybee 		if (HDR_IO_IN_PROGRESS(ab)) {
231913506d1eSmaybee 			/*
232013506d1eSmaybee 			 * This should only happen when we prefetch.
232113506d1eSmaybee 			 */
232213506d1eSmaybee 			ASSERT(ab->b_flags & ARC_PREFETCH);
232313506d1eSmaybee 			ASSERT3U(ab->b_datacnt, ==, 1);
232413506d1eSmaybee 			ab->b_flags |= ARC_FREED_IN_READ;
232513506d1eSmaybee 			if (HDR_IN_HASH_TABLE(ab))
232613506d1eSmaybee 				buf_hash_remove(ab);
232713506d1eSmaybee 			ab->b_arc_access = 0;
232813506d1eSmaybee 			bzero(&ab->b_dva, sizeof (dva_t));
232913506d1eSmaybee 			ab->b_birth = 0;
233013506d1eSmaybee 			ab->b_cksum0 = 0;
233113506d1eSmaybee 			ab->b_buf->b_efunc = NULL;
233213506d1eSmaybee 			ab->b_buf->b_private = NULL;
233313506d1eSmaybee 			mutex_exit(hash_lock);
233413506d1eSmaybee 		} else if (refcount_is_zero(&ab->b_refcnt)) {
2335fa9e4066Sahrens 			mutex_exit(hash_lock);
2336ea8dc4b6Seschrock 			arc_hdr_destroy(ab);
2337fa9e4066Sahrens 			atomic_add_64(&arc.deleted, 1);
2338fa9e4066Sahrens 		} else {
2339bbf4a8dfSmaybee 			/*
234013506d1eSmaybee 			 * We still have an active reference on this
234113506d1eSmaybee 			 * buffer.  This can happen, e.g., from
234213506d1eSmaybee 			 * dbuf_unoverride().
2343bbf4a8dfSmaybee 			 */
234413506d1eSmaybee 			ASSERT(!HDR_IN_HASH_TABLE(ab));
2345fa9e4066Sahrens 			ab->b_arc_access = 0;
2346fa9e4066Sahrens 			bzero(&ab->b_dva, sizeof (dva_t));
2347fa9e4066Sahrens 			ab->b_birth = 0;
2348fa9e4066Sahrens 			ab->b_cksum0 = 0;
2349ea8dc4b6Seschrock 			ab->b_buf->b_efunc = NULL;
2350ea8dc4b6Seschrock 			ab->b_buf->b_private = NULL;
2351fa9e4066Sahrens 			mutex_exit(hash_lock);
2352fa9e4066Sahrens 		}
2353fa9e4066Sahrens 	}
2354fa9e4066Sahrens 
2355fa9e4066Sahrens 	zio = zio_free(pio, spa, txg, bp, done, private);
2356fa9e4066Sahrens 
2357fa9e4066Sahrens 	if (arc_flags & ARC_WAIT)
2358fa9e4066Sahrens 		return (zio_wait(zio));
2359fa9e4066Sahrens 
2360fa9e4066Sahrens 	ASSERT(arc_flags & ARC_NOWAIT);
2361fa9e4066Sahrens 	zio_nowait(zio);
2362fa9e4066Sahrens 
2363fa9e4066Sahrens 	return (0);
2364fa9e4066Sahrens }
2365fa9e4066Sahrens 
2366fa9e4066Sahrens void
2367fa9e4066Sahrens arc_tempreserve_clear(uint64_t tempreserve)
2368fa9e4066Sahrens {
2369fa9e4066Sahrens 	atomic_add_64(&arc_tempreserve, -tempreserve);
2370fa9e4066Sahrens 	ASSERT((int64_t)arc_tempreserve >= 0);
2371fa9e4066Sahrens }
2372fa9e4066Sahrens 
2373fa9e4066Sahrens int
2374fa9e4066Sahrens arc_tempreserve_space(uint64_t tempreserve)
2375fa9e4066Sahrens {
2376fa9e4066Sahrens #ifdef ZFS_DEBUG
2377fa9e4066Sahrens 	/*
2378fa9e4066Sahrens 	 * Once in a while, fail for no reason.  Everything should cope.
2379fa9e4066Sahrens 	 */
2380fa9e4066Sahrens 	if (spa_get_random(10000) == 0) {
2381fa9e4066Sahrens 		dprintf("forcing random failure\n");
2382fa9e4066Sahrens 		return (ERESTART);
2383fa9e4066Sahrens 	}
2384fa9e4066Sahrens #endif
2385112fe045Smaybee 	if (tempreserve > arc.c/4 && !arc.no_grow)
2386112fe045Smaybee 		arc.c = MIN(arc.c_max, tempreserve * 4);
2387112fe045Smaybee 	if (tempreserve > arc.c)
2388112fe045Smaybee 		return (ENOMEM);
2389112fe045Smaybee 
2390fa9e4066Sahrens 	/*
2391112fe045Smaybee 	 * Throttle writes when the amount of dirty data in the cache
2392112fe045Smaybee 	 * gets too large.  We try to keep the cache less than half full
2393112fe045Smaybee 	 * of dirty blocks so that our sync times don't grow too large.
2394112fe045Smaybee 	 * Note: if two requests come in concurrently, we might let them
2395112fe045Smaybee 	 * both succeed, when one of them should fail.  Not a huge deal.
2396112fe045Smaybee 	 *
2397112fe045Smaybee 	 * XXX The limit should be adjusted dynamically to keep the time
2398112fe045Smaybee 	 * to sync a dataset fixed (around 1-5 seconds?).
2399fa9e4066Sahrens 	 */
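	/*
	 * Worked example (numbers assumed): with arc.c == 1GB, the
	 * check below fails a request with ERESTART once
	 *
	 *	tempreserve + arc_tempreserve + anon->size > 512MB
	 *	and arc_tempreserve + anon->size > 256MB
	 *
	 * i.e. when roughly half the cache would be dirty.
	 */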
2400fa9e4066Sahrens 
2401112fe045Smaybee 	if (tempreserve + arc_tempreserve + arc.anon->size > arc.c / 2 &&
2402112fe045Smaybee 	    arc_tempreserve + arc.anon->size > arc.c / 4) {
2403fa9e4066Sahrens 		dprintf("failing, arc_tempreserve=%lluK anon=%lluK "
2404fa9e4066Sahrens 		    "tempreserve=%lluK arc.c=%lluK\n",
2405fa9e4066Sahrens 		    arc_tempreserve>>10, arc.anon->lsize>>10,
2406fa9e4066Sahrens 		    tempreserve>>10, arc.c>>10);
2407fa9e4066Sahrens 		return (ERESTART);
2408fa9e4066Sahrens 	}
2409fa9e4066Sahrens 	atomic_add_64(&arc_tempreserve, tempreserve);
2410fa9e4066Sahrens 	return (0);
2411fa9e4066Sahrens }
2412fa9e4066Sahrens 
2413fa9e4066Sahrens void
2414fa9e4066Sahrens arc_init(void)
2415fa9e4066Sahrens {
2416fa9e4066Sahrens 	mutex_init(&arc_reclaim_lock, NULL, MUTEX_DEFAULT, NULL);
2417fa9e4066Sahrens 	mutex_init(&arc_reclaim_thr_lock, NULL, MUTEX_DEFAULT, NULL);
2418fa9e4066Sahrens 	cv_init(&arc_reclaim_thr_cv, NULL, CV_DEFAULT, NULL);
2419fa9e4066Sahrens 
242013506d1eSmaybee 	/* Convert seconds to clock ticks */
2421b19a79ecSperrin 	arc_min_prefetch_lifespan = 1 * hz;
242213506d1eSmaybee 
2423fa9e4066Sahrens 	/* Start out with 1/8 of all memory */
2424fa9e4066Sahrens 	arc.c = physmem * PAGESIZE / 8;
2425fa9e4066Sahrens 
2426fa9e4066Sahrens #ifdef _KERNEL
2427fa9e4066Sahrens 	/*
2428fa9e4066Sahrens 	 * On architectures where the physical memory can be larger
2429fa9e4066Sahrens 	 * than the addressable space (Intel in 32-bit mode), we may
2430fa9e4066Sahrens 	 * need to limit the cache to 1/8 of the kernel heap size.
2431fa9e4066Sahrens 	 */
2432fa9e4066Sahrens 	arc.c = MIN(arc.c, vmem_size(heap_arena, VMEM_ALLOC | VMEM_FREE) / 8);
2433fa9e4066Sahrens #endif
2434fa9e4066Sahrens 
2435112fe045Smaybee 	/* set min cache to 1/32 of all memory, or 64MB, whichever is more */
2436fa9e4066Sahrens 	arc.c_min = MAX(arc.c / 4, 64<<20);
2437112fe045Smaybee 	/* set max to 3/4 of all memory, or all but 1GB, whichever is more */
2438fa9e4066Sahrens 	if (arc.c * 8 >= 1<<30)
2439fa9e4066Sahrens 		arc.c_max = (arc.c * 8) - (1<<30);
2440fa9e4066Sahrens 	else
2441fa9e4066Sahrens 		arc.c_max = arc.c_min;
2442fa9e4066Sahrens 	arc.c_max = MAX(arc.c * 6, arc.c_max);
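	/*
	 * Worked example (assuming 8GB of physical memory, no kernel
	 * heap clamp, and no kmem debugging): c starts at 1GB, c_min =
	 * MAX(256MB, 64MB) = 256MB, and c_max = MAX(6GB, 8GB - 1GB) =
	 * 7GB.
	 *
	 * Start with the target at its maximum; arc.p is the adaptive
	 * target for the MRU portion of the cache, initially half of c.
	 */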
2443fa9e4066Sahrens 	arc.c = arc.c_max;
2444fa9e4066Sahrens 	arc.p = (arc.c >> 1);
2445fa9e4066Sahrens 
2446fa9e4066Sahrens 	/* if kmem_flags are set, let's try to use less memory */
2447fa9e4066Sahrens 	if (kmem_debugging())
2448fa9e4066Sahrens 		arc.c = arc.c / 2;
2449fa9e4066Sahrens 	if (arc.c < arc.c_min)
2450fa9e4066Sahrens 		arc.c = arc.c_min;
2451fa9e4066Sahrens 
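	/*
	 * Wire up the five ARC states: anon holds in-flight buffers
	 * with no identity (not yet in the hash table); mru and mfu
	 * hold cached data; the ghost states track recently evicted
	 * headers so the cache can adapt its mru/mfu balance.
	 */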
2452fa9e4066Sahrens 	arc.anon = &ARC_anon;
2453ea8dc4b6Seschrock 	arc.mru = &ARC_mru;
2454ea8dc4b6Seschrock 	arc.mru_ghost = &ARC_mru_ghost;
2455ea8dc4b6Seschrock 	arc.mfu = &ARC_mfu;
2456ea8dc4b6Seschrock 	arc.mfu_ghost = &ARC_mfu_ghost;
2457ea8dc4b6Seschrock 	arc.size = 0;
2458fa9e4066Sahrens 
2459*44eda4d7Smaybee 	arc.hits = 0;
2460*44eda4d7Smaybee 	arc.recycle_miss = 0;
2461*44eda4d7Smaybee 	arc.evict_skip = 0;
2462*44eda4d7Smaybee 	arc.mutex_miss = 0;
2463*44eda4d7Smaybee 
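	/*
	 * Each list-backed state keeps its headers on a list ordered
	 * by recency of access; the anon state needs no list.
	 */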
2464ea8dc4b6Seschrock 	list_create(&arc.mru->list, sizeof (arc_buf_hdr_t),
2465fa9e4066Sahrens 	    offsetof(arc_buf_hdr_t, b_arc_node));
2466ea8dc4b6Seschrock 	list_create(&arc.mru_ghost->list, sizeof (arc_buf_hdr_t),
2467fa9e4066Sahrens 	    offsetof(arc_buf_hdr_t, b_arc_node));
2468ea8dc4b6Seschrock 	list_create(&arc.mfu->list, sizeof (arc_buf_hdr_t),
2469fa9e4066Sahrens 	    offsetof(arc_buf_hdr_t, b_arc_node));
2470ea8dc4b6Seschrock 	list_create(&arc.mfu_ghost->list, sizeof (arc_buf_hdr_t),
2471fa9e4066Sahrens 	    offsetof(arc_buf_hdr_t, b_arc_node));
2472fa9e4066Sahrens 
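	/*
	 * buf_init() sets up the global buffer hash table and the kmem
	 * caches for headers and buffers.
	 */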
2473fa9e4066Sahrens 	buf_init();
2474fa9e4066Sahrens 
2475fa9e4066Sahrens 	arc_thread_exit = 0;
2476ea8dc4b6Seschrock 	arc_eviction_list = NULL;
2477ea8dc4b6Seschrock 	mutex_init(&arc_eviction_mtx, NULL, MUTEX_DEFAULT, NULL);
2478fa9e4066Sahrens 
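	/*
	 * Spawn the reclaim thread, which adapts the cache size in
	 * response to memory pressure.
	 */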
2479fa9e4066Sahrens 	(void) thread_create(NULL, 0, arc_reclaim_thread, NULL, 0, &p0,
2480fa9e4066Sahrens 	    TS_RUN, minclsyspri);
2481fa9e4066Sahrens }
2482fa9e4066Sahrens 
2483fa9e4066Sahrens void
2484fa9e4066Sahrens arc_fini(void)
2485fa9e4066Sahrens {
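	/*
	 * Tell the reclaim thread to exit and wait for it to
	 * acknowledge by clearing arc_thread_exit.
	 */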
2486fa9e4066Sahrens 	mutex_enter(&arc_reclaim_thr_lock);
2487fa9e4066Sahrens 	arc_thread_exit = 1;
2488fa9e4066Sahrens 	while (arc_thread_exit != 0)
2489fa9e4066Sahrens 		cv_wait(&arc_reclaim_thr_cv, &arc_reclaim_thr_lock);
2490fa9e4066Sahrens 	mutex_exit(&arc_reclaim_thr_lock);
2491fa9e4066Sahrens 
2492fa9e4066Sahrens 	arc_flush();
2493fa9e4066Sahrens 
2494fa9e4066Sahrens 	arc_dead = TRUE;
2495fa9e4066Sahrens 
2496ea8dc4b6Seschrock 	mutex_destroy(&arc_eviction_mtx);
2497fa9e4066Sahrens 	mutex_destroy(&arc_reclaim_lock);
2498fa9e4066Sahrens 	mutex_destroy(&arc_reclaim_thr_lock);
2499fa9e4066Sahrens 	cv_destroy(&arc_reclaim_thr_cv);
2500fa9e4066Sahrens 
2501ea8dc4b6Seschrock 	list_destroy(&arc.mru->list);
2502ea8dc4b6Seschrock 	list_destroy(&arc.mru_ghost->list);
2503ea8dc4b6Seschrock 	list_destroy(&arc.mfu->list);
2504ea8dc4b6Seschrock 	list_destroy(&arc.mfu_ghost->list);
2505fa9e4066Sahrens 
2506fa9e4066Sahrens 	buf_fini();
2507fa9e4066Sahrens }
2508