xref: /illumos-gate/usr/src/uts/common/fs/zfs/arc.c (revision 13506d1eefbbc37e2f12a0528831d9f6d4c361d7)
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

/*
 * DVA-based Adjustable Replacement Cache
 *
 * While much of the theory of operation used here is
 * based on the self-tuning, low overhead replacement cache
 * presented by Megiddo and Modha at FAST 2003, there are some
 * significant differences:
 *
 * 1. The Megiddo and Modha model assumes any page is evictable.
 * Pages in its cache cannot be "locked" into memory.  This makes
 * the eviction algorithm simple: evict the last page in the list.
 * This also makes the performance characteristics easy to reason
 * about.  Our cache is not so simple.  At any given moment, some
 * subset of the blocks in the cache are un-evictable because we
 * have handed out a reference to them.  Blocks are only evictable
 * when there are no external references active.  This makes
 * eviction far more problematic:  we choose to evict the evictable
 * blocks that are the "lowest" in the list.
 *
 * There are times when it is not possible to evict the requested
 * space.  In these circumstances we are unable to adjust the cache
 * size.  To prevent the cache growing unbounded at these times we
 * implement a "cache throttle" that slows the flow of new data
 * into the cache until we can make space available.
 *
 * 2. The Megiddo and Modha model assumes a fixed cache size.
 * Pages are evicted when the cache is full and there is a cache
 * miss.  Our model has a variable sized cache.  It grows with
 * high use, but also tries to react to memory pressure from the
 * operating system: decreasing its size when system memory is
 * tight.
 *
 * 3. The Megiddo and Modha model assumes a fixed page size. All
 * elements of the cache are therefore exactly the same size.  So
 * when adjusting the cache size following a cache miss, it's simply
 * a matter of choosing a single page to evict.  In our model, we
 * have variable sized cache blocks (ranging from 512 bytes to
 * 128K bytes).  We therefore choose a set of blocks to evict to make
 * space for a cache miss that approximates as closely as possible
 * the space used by the new block.
 *
 * See also:  "ARC: A Self-Tuning, Low Overhead Replacement Cache"
 * by N. Megiddo & D. Modha, FAST 2003
 */

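/*
 * Illustrative sketch only (hypothetical names, declarations omitted):
 * because cache blocks range from 512 bytes to 128K, difference 3 above
 * reduces to accumulating evictions until the missed block fits.  The
 * real logic lives in arc_evict() and arc_adjust() below.
 */
#if 0
	uint64_t freed = 0;
	while (freed < new_block_size) {
		blk = lowest_evictable_block(list);	/* hypothetical */
		if (blk == NULL)
			break;			/* no space: cache throttle */
		freed += blk->size;		/* 512 bytes .. 128K */
		evict_block(blk);		/* hypothetical */
	}
#endif
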
/*
 * The locking model:
 *
 * A new reference to a cache buffer can be obtained in two
 * ways: 1) via a hash table lookup using the DVA as a key,
 * or 2) via one of the ARC lists.  The arc_read() interface
 * uses method 1, while the internal arc algorithms for
 * adjusting the cache use method 2.  We therefore provide two
 * types of locks: 1) the hash table lock array, and 2) the
 * arc list locks.
 *
 * Buffers do not have their own mutexes, rather they rely on the
 * hash table mutexes for the bulk of their protection (i.e. most
 * fields in the arc_buf_hdr_t are protected by these mutexes).
 *
 * buf_hash_find() returns the appropriate mutex (held) when it
 * locates the requested buffer in the hash table.  It returns
 * NULL for the mutex if the buffer was not in the table.
 *
 * buf_hash_remove() expects the appropriate hash mutex to be
 * already held before it is invoked.
 *
 * Each arc state also has a mutex which is used to protect the
 * buffer list associated with the state.  When attempting to
 * obtain a hash table lock while holding an arc list lock you
 * must use mutex_tryenter() to avoid deadlock.  Also note that
 * the "top" state mutex must be held before the "bot" state mutex.
 *
 * Arc buffers may have an associated eviction callback function.
 * This function will be invoked prior to removing the buffer (e.g.
 * in arc_do_user_evicts()).  Note however that the data associated
 * with the buffer may be evicted prior to the callback.  The callback
 * must be made with *no locks held* (to prevent deadlock).  Additionally,
 * the users of callbacks must ensure that their private data is
 * protected from simultaneous callbacks from arc_buf_evict()
 * and arc_do_user_evicts().
 *
 * Note that the majority of the performance stats are manipulated
 * with atomic operations.
 */

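/*
 * Illustrative sketch only (declarations omitted): the lock ordering
 * rule above as it is used in practice.  While an arc list lock is
 * held, a hash table lock may only be taken with mutex_tryenter();
 * blocking there could deadlock against a thread that takes the same
 * two locks in the opposite order.  arc_evict() below is the real
 * instance of this pattern.
 */
#if 0
	mutex_enter(&state->mtx);		/* arc list lock */
	for (ab = list_tail(&state->list); ab; ab = ab_prev) {
		ab_prev = list_prev(&state->list, ab);
		hash_lock = HDR_LOCK(ab);
		if (!mutex_tryenter(hash_lock))
			continue;		/* skip rather than deadlock */
		/* ... ab is now stable; evict or move it ... */
		mutex_exit(hash_lock);
	}
	mutex_exit(&state->mtx);
#endif
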
#include <sys/spa.h>
#include <sys/zio.h>
#include <sys/zfs_context.h>
#include <sys/arc.h>
#include <sys/refcount.h>
#ifdef _KERNEL
#include <sys/vmsystm.h>
#include <vm/anon.h>
#include <sys/fs/swapnode.h>
#include <sys/dnlc.h>
#endif
#include <sys/callb.h>

static kmutex_t		arc_reclaim_thr_lock;
static kcondvar_t	arc_reclaim_thr_cv;	/* used to signal reclaim thr */
static uint8_t		arc_thread_exit;

#define	ARC_REDUCE_DNLC_PERCENT	3
uint_t arc_reduce_dnlc_percent = ARC_REDUCE_DNLC_PERCENT;

typedef enum arc_reclaim_strategy {
	ARC_RECLAIM_AGGR,		/* Aggressive reclaim strategy */
	ARC_RECLAIM_CONS		/* Conservative reclaim strategy */
} arc_reclaim_strategy_t;

/* number of seconds before growing cache again */
static int		arc_grow_retry = 60;

/*
 * minimum lifespan of a prefetched block in seconds
 * (this is converted to ticks during the arc initialization)
 */
static int		arc_min_prefetch_lifespan = 1;

static kmutex_t arc_reclaim_lock;
static int arc_dead;

/*
 * Note that buffers can be in one of 5 states:
 *	ARC_anon	- anonymous (discussed below)
 *	ARC_mru		- recently used, currently cached
 *	ARC_mru_ghost	- recently used, no longer in cache
 *	ARC_mfu		- frequently used, currently cached
 *	ARC_mfu_ghost	- frequently used, no longer in cache
 * When there are no active references to a buffer, it is
 * linked onto one of the lists in arc.  These are the
 * only buffers that can be evicted or deleted.
 *
 * Anonymous buffers are buffers that are not associated with
 * a DVA.  These are buffers that hold dirty block copies
 * before they are written to stable storage.  By definition,
 * they are "ref'd" and are considered part of arc_mru
 * that cannot be freed.  Generally, they will acquire a DVA
 * as they are written and migrate onto the arc_mru list.
 */

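/*
 * Illustrative sketch only (hypothetical caller, glue omitted): the
 * life cycle described above.  A dirty buffer is born anonymous and
 * referenced; the write assigns it a DVA and migrates it onto the
 * arc_mru list, and dropping the last reference makes it evictable.
 */
#if 0
	buf = arc_buf_alloc(spa, size, tag);	/* b_state == arc.anon */
	/* ... fill buf->b_data and issue the write; on completion the */
	/* header acquires a DVA, is hashed, and moves to arc.mru ... */
	(void) arc_buf_remove_ref(buf, tag);	/* now evictable */
#endif
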
typedef struct arc_state {
	list_t	list;	/* linked list of evictable buffers in state */
	uint64_t lsize;	/* total size of buffers in the linked list */
	uint64_t size;	/* total size of all buffers in this state */
	uint64_t hits;
	kmutex_t mtx;
} arc_state_t;

/* The 5 states: */
static arc_state_t ARC_anon;
static arc_state_t ARC_mru;
static arc_state_t ARC_mru_ghost;
static arc_state_t ARC_mfu;
static arc_state_t ARC_mfu_ghost;

static struct arc {
	arc_state_t	*anon;
	arc_state_t	*mru;
	arc_state_t	*mru_ghost;
	arc_state_t	*mfu;
	arc_state_t	*mfu_ghost;
	uint64_t	size;		/* Actual total arc size */
	uint64_t	p;		/* Target size (in bytes) of mru */
	uint64_t	c;		/* Target size of cache (in bytes) */
	uint64_t	c_min;		/* Minimum target cache size */
	uint64_t	c_max;		/* Maximum target cache size */

	/* performance stats */
	uint64_t	hits;
	uint64_t	misses;
	uint64_t	deleted;
	uint64_t	skipped;
	uint64_t	hash_elements;
	uint64_t	hash_elements_max;
	uint64_t	hash_collisions;
	uint64_t	hash_chains;
	uint32_t	hash_chain_max;

	int		no_grow;	/* Don't try to grow cache size */
} arc;

static uint64_t arc_tempreserve;

typedef struct arc_callback arc_callback_t;

struct arc_callback {
	arc_done_func_t		*acb_done;
	void			*acb_private;
	arc_byteswap_func_t	*acb_byteswap;
	arc_buf_t		*acb_buf;
	zio_t			*acb_zio_dummy;
	arc_callback_t		*acb_next;
};

struct arc_buf_hdr {
	/* immutable */
	uint64_t		b_size;
	spa_t			*b_spa;

	/* protected by hash lock */
	dva_t			b_dva;
	uint64_t		b_birth;
	uint64_t		b_cksum0;

	arc_buf_hdr_t		*b_hash_next;
	arc_buf_t		*b_buf;
	uint32_t		b_flags;
	uint32_t		b_datacnt;

	kcondvar_t		b_cv;
	arc_callback_t		*b_acb;

	/* protected by arc state mutex */
	arc_state_t		*b_state;
	list_node_t		b_arc_node;

	/* updated atomically */
	clock_t			b_arc_access;

	/* self protecting */
	refcount_t		b_refcnt;
};

static arc_buf_t *arc_eviction_list;
static kmutex_t arc_eviction_mtx;
static void arc_access_and_exit(arc_buf_hdr_t *buf, kmutex_t *hash_lock);

#define	GHOST_STATE(state)	\
	((state) == arc.mru_ghost || (state) == arc.mfu_ghost)

/*
 * Private ARC flags.  These flags are private, ARC-only flags that will
 * show up in b_flags in the arc_buf_hdr_t.  Some flags are publicly
 * declared, and can be passed in as arc_flags in things like arc_read.
 * However, these private flags should never be passed in and should only
 * be set by ARC code.  When adding new public flags, make sure not to
 * smash the private ones.
 */

#define	ARC_IN_HASH_TABLE	(1 << 9)	/* this buffer is hashed */
#define	ARC_IO_IN_PROGRESS	(1 << 10)	/* I/O in progress for buf */
#define	ARC_IO_ERROR		(1 << 11)	/* I/O failed for buf */
#define	ARC_FREED_IN_READ	(1 << 12)	/* buf freed while in read */
#define	ARC_BUF_AVAILABLE	(1 << 13)	/* block not in active use */
#define	ARC_INDIRECT		(1 << 14)	/* this is an indirect block */

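/*
 * Illustrative sketch only: one way to catch a public flag (declared in
 * arc.h, e.g. ARC_PREFETCH) colliding with the private flags above at
 * compile time instead of at run time.
 */
#if 0
CTASSERT((ARC_PREFETCH &
    (ARC_IN_HASH_TABLE | ARC_IO_IN_PROGRESS | ARC_IO_ERROR |
    ARC_FREED_IN_READ | ARC_BUF_AVAILABLE | ARC_INDIRECT)) == 0);
#endif
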
#define	HDR_IN_HASH_TABLE(hdr)	((hdr)->b_flags & ARC_IN_HASH_TABLE)
#define	HDR_IO_IN_PROGRESS(hdr)	((hdr)->b_flags & ARC_IO_IN_PROGRESS)
#define	HDR_IO_ERROR(hdr)	((hdr)->b_flags & ARC_IO_ERROR)
#define	HDR_FREED_IN_READ(hdr)	((hdr)->b_flags & ARC_FREED_IN_READ)
#define	HDR_BUF_AVAILABLE(hdr)	((hdr)->b_flags & ARC_BUF_AVAILABLE)

/*
 * Hash table routines
 */

#define	HT_LOCK_PAD	64

struct ht_lock {
	kmutex_t	ht_lock;
#ifdef _KERNEL
	unsigned char	pad[(HT_LOCK_PAD - sizeof (kmutex_t))];
#endif
};

#define	BUF_LOCKS 256
typedef struct buf_hash_table {
	uint64_t ht_mask;
	arc_buf_hdr_t **ht_table;
	struct ht_lock ht_locks[BUF_LOCKS];
} buf_hash_table_t;

static buf_hash_table_t buf_hash_table;

#define	BUF_HASH_INDEX(spa, dva, birth) \
	(buf_hash(spa, dva, birth) & buf_hash_table.ht_mask)
#define	BUF_HASH_LOCK_NTRY(idx) (buf_hash_table.ht_locks[idx & (BUF_LOCKS-1)])
#define	BUF_HASH_LOCK(idx)	(&(BUF_HASH_LOCK_NTRY(idx).ht_lock))
#define	HDR_LOCK(buf) \
	(BUF_HASH_LOCK(BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth)))

uint64_t zfs_crc64_table[256];

static uint64_t
buf_hash(spa_t *spa, dva_t *dva, uint64_t birth)
{
	uintptr_t spav = (uintptr_t)spa;
	uint8_t *vdva = (uint8_t *)dva;
	uint64_t crc = -1ULL;
	int i;

	ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);

	for (i = 0; i < sizeof (dva_t); i++)
		crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ vdva[i]) & 0xFF];

	crc ^= (spav>>8) ^ birth;

	return (crc);
}

#define	BUF_EMPTY(buf)						\
	((buf)->b_dva.dva_word[0] == 0 &&			\
	(buf)->b_dva.dva_word[1] == 0 &&			\
	(buf)->b_birth == 0)

#define	BUF_EQUAL(spa, dva, birth, buf)				\
	((buf)->b_dva.dva_word[0] == (dva)->dva_word[0]) &&	\
	((buf)->b_dva.dva_word[1] == (dva)->dva_word[1]) &&	\
	((buf)->b_birth == birth) && ((buf)->b_spa == spa)

static arc_buf_hdr_t *
buf_hash_find(spa_t *spa, dva_t *dva, uint64_t birth, kmutex_t **lockp)
{
	uint64_t idx = BUF_HASH_INDEX(spa, dva, birth);
	kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
	arc_buf_hdr_t *buf;

	mutex_enter(hash_lock);
	for (buf = buf_hash_table.ht_table[idx]; buf != NULL;
	    buf = buf->b_hash_next) {
		if (BUF_EQUAL(spa, dva, birth, buf)) {
			*lockp = hash_lock;
			return (buf);
		}
	}
	mutex_exit(hash_lock);
	*lockp = NULL;
	return (NULL);
}

/*
 * Insert an entry into the hash table.  If there is already an element
 * equal to elem in the hash table, then the already existing element
 * will be returned and the new element will not be inserted.
 * Otherwise returns NULL.
 */
static arc_buf_hdr_t *
buf_hash_insert(arc_buf_hdr_t *buf, kmutex_t **lockp)
{
	uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth);
	kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
	arc_buf_hdr_t *fbuf;
	uint32_t max, i;

	ASSERT(!HDR_IN_HASH_TABLE(buf));
	*lockp = hash_lock;
	mutex_enter(hash_lock);
	for (fbuf = buf_hash_table.ht_table[idx], i = 0; fbuf != NULL;
	    fbuf = fbuf->b_hash_next, i++) {
		if (BUF_EQUAL(buf->b_spa, &buf->b_dva, buf->b_birth, fbuf))
			return (fbuf);
	}

	buf->b_hash_next = buf_hash_table.ht_table[idx];
	buf_hash_table.ht_table[idx] = buf;
	buf->b_flags |= ARC_IN_HASH_TABLE;

	/* collect some hash table performance data */
	if (i > 0) {
		atomic_add_64(&arc.hash_collisions, 1);
		if (i == 1)
			atomic_add_64(&arc.hash_chains, 1);
	}
	while (i > (max = arc.hash_chain_max) &&
	    max != atomic_cas_32(&arc.hash_chain_max, max, i)) {
		continue;
	}
	atomic_add_64(&arc.hash_elements, 1);
	if (arc.hash_elements > arc.hash_elements_max)
		atomic_add_64(&arc.hash_elements_max, 1);

	return (NULL);
}

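/*
 * Illustrative sketch only (hypothetical caller): the intended
 * check-and-insert usage of buf_hash_insert().  The hash lock is
 * returned held through *lockp on both outcomes; a non-NULL return
 * means an equal header already exists and the new one must not be
 * used.
 */
#if 0
	arc_buf_hdr_t *exists;
	kmutex_t *hash_lock;

	exists = buf_hash_insert(hdr, &hash_lock);
	if (exists != NULL) {
		/* lost the race: use "exists", discard "hdr" */
	} else {
		/* "hdr" is now hashed; ARC_IN_HASH_TABLE is set */
	}
	mutex_exit(hash_lock);
#endif
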
static void
buf_hash_remove(arc_buf_hdr_t *buf)
{
	arc_buf_hdr_t *fbuf, **bufp;
	uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth);

	ASSERT(MUTEX_HELD(BUF_HASH_LOCK(idx)));
	ASSERT(HDR_IN_HASH_TABLE(buf));

	bufp = &buf_hash_table.ht_table[idx];
	while ((fbuf = *bufp) != buf) {
		ASSERT(fbuf != NULL);
		bufp = &fbuf->b_hash_next;
	}
	*bufp = buf->b_hash_next;
	buf->b_hash_next = NULL;
	buf->b_flags &= ~ARC_IN_HASH_TABLE;

	/* collect some hash table performance data */
	atomic_add_64(&arc.hash_elements, -1);
	if (buf_hash_table.ht_table[idx] &&
	    buf_hash_table.ht_table[idx]->b_hash_next == NULL)
		atomic_add_64(&arc.hash_chains, -1);
}

/*
 * Global data structures and functions for the buf kmem cache.
 */
static kmem_cache_t *hdr_cache;
static kmem_cache_t *buf_cache;

static void
buf_fini(void)
{
	int i;

	kmem_free(buf_hash_table.ht_table,
	    (buf_hash_table.ht_mask + 1) * sizeof (void *));
	for (i = 0; i < BUF_LOCKS; i++)
		mutex_destroy(&buf_hash_table.ht_locks[i].ht_lock);
	kmem_cache_destroy(hdr_cache);
	kmem_cache_destroy(buf_cache);
}

/*
 * Constructor callback - called when the cache is empty
 * and a new buf is requested.
 */
/* ARGSUSED */
static int
hdr_cons(void *vbuf, void *unused, int kmflag)
{
	arc_buf_hdr_t *buf = vbuf;

	bzero(buf, sizeof (arc_buf_hdr_t));
	refcount_create(&buf->b_refcnt);
	cv_init(&buf->b_cv, NULL, CV_DEFAULT, NULL);
	return (0);
}

/*
 * Destructor callback - called when a cached buf is
 * no longer required.
 */
/* ARGSUSED */
static void
hdr_dest(void *vbuf, void *unused)
{
	arc_buf_hdr_t *buf = vbuf;

	refcount_destroy(&buf->b_refcnt);
	cv_destroy(&buf->b_cv);
}

static int arc_reclaim_needed(void);
void arc_kmem_reclaim(void);

/*
 * Reclaim callback -- invoked when memory is low.
 */
/* ARGSUSED */
static void
hdr_recl(void *unused)
{
	dprintf("hdr_recl called\n");
	if (arc_reclaim_needed())
		arc_kmem_reclaim();
}

static void
buf_init(void)
{
	uint64_t *ct;
	uint64_t hsize = 1ULL << 12;
	int i, j;

	/*
	 * The hash table is big enough to fill all of physical memory
	 * with an average 64K block size.  The table will take up
	 * totalmem*sizeof(void*)/64K (e.g. 128KB/GB with 8-byte pointers).
	 */
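	/*
	 * Worked example (illustrative): with 4GB of physical memory,
	 * physmem * PAGESIZE == 2^32, so the loop below stops at
	 * hsize == 2^16 (65536 buckets); the table then consumes
	 * 65536 * 8 bytes == 512KB, i.e. the 128KB per GB noted above.
	 */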
	while (hsize * 65536 < physmem * PAGESIZE)
		hsize <<= 1;
retry:
	buf_hash_table.ht_mask = hsize - 1;
	buf_hash_table.ht_table =
	    kmem_zalloc(hsize * sizeof (void*), KM_NOSLEEP);
	if (buf_hash_table.ht_table == NULL) {
		ASSERT(hsize > (1ULL << 8));
		hsize >>= 1;
		goto retry;
	}

	hdr_cache = kmem_cache_create("arc_buf_hdr_t", sizeof (arc_buf_hdr_t),
	    0, hdr_cons, hdr_dest, hdr_recl, NULL, NULL, 0);
	buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t),
	    0, NULL, NULL, NULL, NULL, NULL, 0);

	for (i = 0; i < 256; i++)
		for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--)
			*ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY);

	for (i = 0; i < BUF_LOCKS; i++) {
		mutex_init(&buf_hash_table.ht_locks[i].ht_lock,
		    NULL, MUTEX_DEFAULT, NULL);
	}
}

#define	ARC_MINTIME	(hz>>4) /* 62 ms */

static void
add_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag)
{
	ASSERT(MUTEX_HELD(hash_lock));

	if ((refcount_add(&ab->b_refcnt, tag) == 1) &&
	    (ab->b_state != arc.anon)) {
		int delta = ab->b_size * ab->b_datacnt;

		ASSERT(!MUTEX_HELD(&ab->b_state->mtx));
		mutex_enter(&ab->b_state->mtx);
		ASSERT(list_link_active(&ab->b_arc_node));
		list_remove(&ab->b_state->list, ab);
		if (GHOST_STATE(ab->b_state)) {
			ASSERT3U(ab->b_datacnt, ==, 0);
			ASSERT3P(ab->b_buf, ==, NULL);
			delta = ab->b_size;
		}
		ASSERT(delta > 0);
		ASSERT3U(ab->b_state->lsize, >=, delta);
		atomic_add_64(&ab->b_state->lsize, -delta);
		mutex_exit(&ab->b_state->mtx);
		/* remove the prefetch flag if we get a reference */
		if (ab->b_flags & ARC_PREFETCH)
			ab->b_flags &= ~ARC_PREFETCH;
	}
}

static int
remove_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag)
{
	int cnt;

	ASSERT(ab->b_state == arc.anon || MUTEX_HELD(hash_lock));
	ASSERT(!GHOST_STATE(ab->b_state));

	if (((cnt = refcount_remove(&ab->b_refcnt, tag)) == 0) &&
	    (ab->b_state != arc.anon)) {

		ASSERT(!MUTEX_HELD(&ab->b_state->mtx));
		mutex_enter(&ab->b_state->mtx);
		ASSERT(!list_link_active(&ab->b_arc_node));
		list_insert_head(&ab->b_state->list, ab);
		ASSERT(ab->b_datacnt > 0);
		atomic_add_64(&ab->b_state->lsize, ab->b_size * ab->b_datacnt);
		ASSERT3U(ab->b_state->size, >=, ab->b_state->lsize);
		mutex_exit(&ab->b_state->mtx);
	}
	return (cnt);
}

/*
 * Move the supplied buffer to the indicated state.  The mutex
 * for the buffer must be held by the caller.
 */
static void
arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock)
{
	arc_state_t *old_state = ab->b_state;
	int refcnt = refcount_count(&ab->b_refcnt);
	int from_delta, to_delta;

	ASSERT(MUTEX_HELD(hash_lock));
	ASSERT(new_state != old_state);
	ASSERT(refcnt == 0 || ab->b_datacnt > 0);
	ASSERT(ab->b_datacnt == 0 || !GHOST_STATE(new_state));

	from_delta = to_delta = ab->b_datacnt * ab->b_size;

	/*
	 * If this buffer is evictable, transfer it from the
	 * old state list to the new state list.
	 */
	if (refcnt == 0) {
		if (old_state != arc.anon) {
			int use_mutex = !MUTEX_HELD(&old_state->mtx);

			if (use_mutex)
				mutex_enter(&old_state->mtx);

			ASSERT(list_link_active(&ab->b_arc_node));
			list_remove(&old_state->list, ab);

			/*
			 * If prefetching out of the ghost cache,
			 * we will have a nonzero datacnt.
			 */
			if (GHOST_STATE(old_state) && ab->b_datacnt == 0) {
				/* ghost elements have a ghost size */
				ASSERT(ab->b_buf == NULL);
				from_delta = ab->b_size;
			}
			ASSERT3U(old_state->lsize, >=, from_delta);
			atomic_add_64(&old_state->lsize, -from_delta);

			if (use_mutex)
				mutex_exit(&old_state->mtx);
		}
		if (new_state != arc.anon) {
			int use_mutex = !MUTEX_HELD(&new_state->mtx);

			if (use_mutex)
				mutex_enter(&new_state->mtx);

			list_insert_head(&new_state->list, ab);

			/* ghost elements have a ghost size */
			if (GHOST_STATE(new_state)) {
				ASSERT(ab->b_datacnt == 0);
				ASSERT(ab->b_buf == NULL);
				to_delta = ab->b_size;
			}
			atomic_add_64(&new_state->lsize, to_delta);
			ASSERT3U(new_state->size + to_delta, >=,
			    new_state->lsize);

			if (use_mutex)
				mutex_exit(&new_state->mtx);
		}
	}

	ASSERT(!BUF_EMPTY(ab));
	if (new_state == arc.anon && old_state != arc.anon) {
		buf_hash_remove(ab);
	}

	/* adjust state sizes */
	if (to_delta)
		atomic_add_64(&new_state->size, to_delta);
	if (from_delta) {
		ASSERT3U(old_state->size, >=, from_delta);
		atomic_add_64(&old_state->size, -from_delta);
	}
	ab->b_state = new_state;
}

arc_buf_t *
arc_buf_alloc(spa_t *spa, int size, void *tag)
{
	arc_buf_hdr_t *hdr;
	arc_buf_t *buf;

	ASSERT3U(size, >, 0);
	hdr = kmem_cache_alloc(hdr_cache, KM_SLEEP);
	ASSERT(BUF_EMPTY(hdr));
	hdr->b_size = size;
	hdr->b_spa = spa;
	hdr->b_state = arc.anon;
	hdr->b_arc_access = 0;
	buf = kmem_cache_alloc(buf_cache, KM_SLEEP);
	buf->b_hdr = hdr;
	buf->b_efunc = NULL;
	buf->b_private = NULL;
	buf->b_next = NULL;
	buf->b_data = zio_buf_alloc(size);
	hdr->b_buf = buf;
	hdr->b_datacnt = 1;
	hdr->b_flags = 0;
	ASSERT(refcount_is_zero(&hdr->b_refcnt));
	(void) refcount_add(&hdr->b_refcnt, tag);

	atomic_add_64(&arc.size, size);
	atomic_add_64(&arc.anon->size, size);

	return (buf);
}

static void *
arc_data_copy(arc_buf_hdr_t *hdr, void *old_data)
{
	void *new_data = zio_buf_alloc(hdr->b_size);

	atomic_add_64(&arc.size, hdr->b_size);
	bcopy(old_data, new_data, hdr->b_size);
	atomic_add_64(&hdr->b_state->size, hdr->b_size);
	if (list_link_active(&hdr->b_arc_node)) {
		ASSERT(refcount_is_zero(&hdr->b_refcnt));
		atomic_add_64(&hdr->b_state->lsize, hdr->b_size);
	}
	return (new_data);
}

void
arc_buf_add_ref(arc_buf_t *buf, void* tag)
{
	arc_buf_hdr_t *hdr;
	kmutex_t *hash_lock;

	mutex_enter(&arc_eviction_mtx);
	hdr = buf->b_hdr;
	if (buf->b_data == NULL) {
		/*
		 * This buffer is evicted.
		 */
		mutex_exit(&arc_eviction_mtx);
		return;
	} else {
		/*
		 * Prevent this buffer from being evicted
		 * while we add a reference.
		 */
		buf->b_hdr = NULL;
	}
	mutex_exit(&arc_eviction_mtx);

	ASSERT(hdr->b_state != arc.anon);
	hash_lock = HDR_LOCK(hdr);
	mutex_enter(hash_lock);
	ASSERT(!GHOST_STATE(hdr->b_state));
	buf->b_hdr = hdr;
	add_reference(hdr, hash_lock, tag);
	arc_access_and_exit(hdr, hash_lock);
	atomic_add_64(&arc.hits, 1);
}

static void
arc_buf_destroy(arc_buf_t *buf, boolean_t all)
{
	arc_buf_t **bufp;

	/* free up data associated with the buf */
	if (buf->b_data) {
		arc_state_t *state = buf->b_hdr->b_state;
		uint64_t size = buf->b_hdr->b_size;

		zio_buf_free(buf->b_data, size);
		atomic_add_64(&arc.size, -size);
		if (list_link_active(&buf->b_hdr->b_arc_node)) {
			ASSERT(refcount_is_zero(&buf->b_hdr->b_refcnt));
			ASSERT(state != arc.anon);
			ASSERT3U(state->lsize, >=, size);
			atomic_add_64(&state->lsize, -size);
		}
		ASSERT3U(state->size, >=, size);
		atomic_add_64(&state->size, -size);
		buf->b_data = NULL;
		ASSERT(buf->b_hdr->b_datacnt > 0);
		buf->b_hdr->b_datacnt -= 1;
	}

	/* only remove the buf if requested */
	if (!all)
		return;

	/* remove the buf from the hdr list */
	for (bufp = &buf->b_hdr->b_buf; *bufp != buf; bufp = &(*bufp)->b_next)
		continue;
	*bufp = buf->b_next;

	ASSERT(buf->b_efunc == NULL);

	/* clean up the buf */
	buf->b_hdr = NULL;
	kmem_cache_free(buf_cache, buf);
}

static void
arc_hdr_destroy(arc_buf_hdr_t *hdr)
{
	ASSERT(refcount_is_zero(&hdr->b_refcnt));
	ASSERT3P(hdr->b_state, ==, arc.anon);
	ASSERT(!HDR_IO_IN_PROGRESS(hdr));

	if (!BUF_EMPTY(hdr)) {
		ASSERT(!HDR_IN_HASH_TABLE(hdr));
		bzero(&hdr->b_dva, sizeof (dva_t));
		hdr->b_birth = 0;
		hdr->b_cksum0 = 0;
	}
	while (hdr->b_buf) {
		arc_buf_t *buf = hdr->b_buf;

		if (buf->b_efunc) {
			mutex_enter(&arc_eviction_mtx);
			ASSERT(buf->b_hdr != NULL);
			arc_buf_destroy(hdr->b_buf, FALSE);
			hdr->b_buf = buf->b_next;
			buf->b_next = arc_eviction_list;
			arc_eviction_list = buf;
			mutex_exit(&arc_eviction_mtx);
		} else {
			arc_buf_destroy(hdr->b_buf, TRUE);
		}
	}

	ASSERT(!list_link_active(&hdr->b_arc_node));
	ASSERT3P(hdr->b_hash_next, ==, NULL);
	ASSERT3P(hdr->b_acb, ==, NULL);
	kmem_cache_free(hdr_cache, hdr);
}

void
arc_buf_free(arc_buf_t *buf, void *tag)
{
	arc_buf_hdr_t *hdr = buf->b_hdr;
	int hashed = hdr->b_state != arc.anon;

	ASSERT(buf->b_efunc == NULL);
	ASSERT(buf->b_data != NULL);

	if (hashed) {
		kmutex_t *hash_lock = HDR_LOCK(hdr);

		mutex_enter(hash_lock);
		(void) remove_reference(hdr, hash_lock, tag);
		if (hdr->b_datacnt > 1)
			arc_buf_destroy(buf, TRUE);
		else
			hdr->b_flags |= ARC_BUF_AVAILABLE;
		mutex_exit(hash_lock);
	} else if (HDR_IO_IN_PROGRESS(hdr)) {
		int destroy_hdr;
		/*
		 * We are in the middle of an async write.  Don't destroy
		 * this buffer unless the write completes before we finish
		 * decrementing the reference count.
		 */
		mutex_enter(&arc_eviction_mtx);
		(void) remove_reference(hdr, NULL, tag);
		ASSERT(refcount_is_zero(&hdr->b_refcnt));
		destroy_hdr = !HDR_IO_IN_PROGRESS(hdr);
		mutex_exit(&arc_eviction_mtx);
		if (destroy_hdr)
			arc_hdr_destroy(hdr);
	} else {
		if (remove_reference(hdr, NULL, tag) > 0) {
			ASSERT(HDR_IO_ERROR(hdr));
			arc_buf_destroy(buf, TRUE);
		} else {
			arc_hdr_destroy(hdr);
		}
	}
}

int
arc_buf_remove_ref(arc_buf_t *buf, void* tag)
{
	arc_buf_hdr_t *hdr = buf->b_hdr;
	kmutex_t *hash_lock = HDR_LOCK(hdr);
	int no_callback = (buf->b_efunc == NULL);

	if (hdr->b_state == arc.anon) {
		arc_buf_free(buf, tag);
		return (no_callback);
	}

	mutex_enter(hash_lock);
	ASSERT(hdr->b_state != arc.anon);
	ASSERT(buf->b_data != NULL);

	(void) remove_reference(hdr, hash_lock, tag);
	if (hdr->b_datacnt > 1) {
		if (no_callback)
			arc_buf_destroy(buf, TRUE);
	} else if (no_callback) {
		ASSERT(hdr->b_buf == buf && buf->b_next == NULL);
		hdr->b_flags |= ARC_BUF_AVAILABLE;
	}
	ASSERT(no_callback || hdr->b_datacnt > 1 ||
	    refcount_is_zero(&hdr->b_refcnt));
	mutex_exit(hash_lock);
	return (no_callback);
}

int
arc_buf_size(arc_buf_t *buf)
{
	return (buf->b_hdr->b_size);
}

/*
 * Evict buffers from list until we've removed the specified number of
 * bytes.  Move the removed buffers to the appropriate evict state.
 */
static uint64_t
arc_evict(arc_state_t *state, int64_t bytes)
{
	arc_state_t *evicted_state;
	uint64_t bytes_evicted = 0, skipped = 0;
	arc_buf_hdr_t *ab, *ab_prev;
	kmutex_t *hash_lock;

	ASSERT(state == arc.mru || state == arc.mfu);

	evicted_state = (state == arc.mru) ? arc.mru_ghost : arc.mfu_ghost;

	mutex_enter(&state->mtx);
	mutex_enter(&evicted_state->mtx);

	for (ab = list_tail(&state->list); ab; ab = ab_prev) {
		ab_prev = list_prev(&state->list, ab);
		/* prefetch buffers have a minimum lifespan */
		if (ab->b_flags & (ARC_PREFETCH|ARC_INDIRECT) &&
		    lbolt - ab->b_arc_access < arc_min_prefetch_lifespan) {
			skipped++;
			continue;
		}
		hash_lock = HDR_LOCK(ab);
		if (!HDR_IO_IN_PROGRESS(ab) && mutex_tryenter(hash_lock)) {
			ASSERT3U(refcount_count(&ab->b_refcnt), ==, 0);
			ASSERT(ab->b_datacnt > 0);
			while (ab->b_buf) {
				arc_buf_t *buf = ab->b_buf;
				if (buf->b_data)
					bytes_evicted += ab->b_size;
				if (buf->b_efunc) {
					mutex_enter(&arc_eviction_mtx);
					/*
					 * arc_buf_add_ref() could derail
					 * this eviction.
					 */
					if (buf->b_hdr == NULL) {
						mutex_exit(&arc_eviction_mtx);
						mutex_exit(hash_lock);
						goto skip;
					}
					arc_buf_destroy(buf, FALSE);
					ab->b_buf = buf->b_next;
					buf->b_next = arc_eviction_list;
					arc_eviction_list = buf;
					mutex_exit(&arc_eviction_mtx);
				} else {
					arc_buf_destroy(buf, TRUE);
				}
			}
			ASSERT(ab->b_datacnt == 0);
			arc_change_state(evicted_state, ab, hash_lock);
			ASSERT(HDR_IN_HASH_TABLE(ab));
			ab->b_flags = ARC_IN_HASH_TABLE;
			DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, ab);
			mutex_exit(hash_lock);
			if (bytes >= 0 && bytes_evicted >= bytes)
				break;
		} else {
skip:
			skipped += 1;
		}
	}
	mutex_exit(&evicted_state->mtx);
	mutex_exit(&state->mtx);

	if (bytes_evicted < bytes)
		dprintf("only evicted %lld bytes from %x",
		    (longlong_t)bytes_evicted, state);

	atomic_add_64(&arc.skipped, skipped);
	if (bytes < 0)
		return (skipped);
	return (bytes_evicted);
}

/*
 * Remove buffers from list until we've removed the specified number of
 * bytes.  Destroy the buffers that are removed.
 */
static void
arc_evict_ghost(arc_state_t *state, int64_t bytes)
{
	arc_buf_hdr_t *ab, *ab_prev;
	kmutex_t *hash_lock;
	uint64_t bytes_deleted = 0;
	uint_t bufs_skipped = 0;

	ASSERT(GHOST_STATE(state));
top:
	mutex_enter(&state->mtx);
	for (ab = list_tail(&state->list); ab; ab = ab_prev) {
		ab_prev = list_prev(&state->list, ab);
		hash_lock = HDR_LOCK(ab);
		if (mutex_tryenter(hash_lock)) {
			ASSERT(!HDR_IO_IN_PROGRESS(ab));
			ASSERT(ab->b_buf == NULL);
			arc_change_state(arc.anon, ab, hash_lock);
			mutex_exit(hash_lock);
			atomic_add_64(&arc.deleted, 1);
			bytes_deleted += ab->b_size;
			arc_hdr_destroy(ab);
			DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, ab);
			if (bytes >= 0 && bytes_deleted >= bytes)
				break;
		} else {
			if (bytes < 0) {
				mutex_exit(&state->mtx);
				mutex_enter(hash_lock);
				mutex_exit(hash_lock);
				goto top;
			}
			bufs_skipped += 1;
		}
	}
	mutex_exit(&state->mtx);

	if (bufs_skipped) {
		atomic_add_64(&arc.skipped, bufs_skipped);
		ASSERT(bytes >= 0);
	}

	if (bytes_deleted < bytes)
		dprintf("only deleted %lld bytes from %p",
		    (longlong_t)bytes_deleted, state);
}

static void
arc_adjust(void)
{
	int64_t top_sz, mru_over, arc_over;

	top_sz = arc.anon->size + arc.mru->size;

	if (top_sz > arc.p && arc.mru->lsize > 0) {
		int64_t toevict = MIN(arc.mru->lsize, top_sz-arc.p);
		(void) arc_evict(arc.mru, toevict);
		top_sz = arc.anon->size + arc.mru->size;
	}

	mru_over = top_sz + arc.mru_ghost->size - arc.c;

	if (mru_over > 0) {
		if (arc.mru_ghost->lsize > 0) {
			int64_t todelete = MIN(arc.mru_ghost->lsize, mru_over);
			arc_evict_ghost(arc.mru_ghost, todelete);
		}
	}

	if ((arc_over = arc.size - arc.c) > 0) {
		int64_t tbl_over;

		if (arc.mfu->lsize > 0) {
			int64_t toevict = MIN(arc.mfu->lsize, arc_over);
			(void) arc_evict(arc.mfu, toevict);
		}

		tbl_over = arc.size + arc.mru_ghost->lsize +
		    arc.mfu_ghost->lsize - arc.c*2;

		if (tbl_over > 0 && arc.mfu_ghost->lsize > 0) {
			int64_t todelete = MIN(arc.mfu_ghost->lsize, tbl_over);
			arc_evict_ghost(arc.mfu_ghost, todelete);
		}
	}
}

static void
arc_do_user_evicts(void)
{
	mutex_enter(&arc_eviction_mtx);
	while (arc_eviction_list != NULL) {
		arc_buf_t *buf = arc_eviction_list;
		arc_eviction_list = buf->b_next;
		buf->b_hdr = NULL;
		mutex_exit(&arc_eviction_mtx);

		if (buf->b_efunc != NULL)
			VERIFY(buf->b_efunc(buf) == 0);

		buf->b_efunc = NULL;
		buf->b_private = NULL;
		kmem_cache_free(buf_cache, buf);
		mutex_enter(&arc_eviction_mtx);
	}
	mutex_exit(&arc_eviction_mtx);
}

/*
 * Flush all *evictable* data from the cache.
 * NOTE: this will not touch "active" (i.e. referenced) data.
 */
void
arc_flush(void)
{
	while (arc_evict(arc.mru, -1));
	while (arc_evict(arc.mfu, -1));

	arc_evict_ghost(arc.mru_ghost, -1);
	arc_evict_ghost(arc.mfu_ghost, -1);

	mutex_enter(&arc_reclaim_thr_lock);
	arc_do_user_evicts();
	mutex_exit(&arc_reclaim_thr_lock);
	ASSERT(arc_eviction_list == NULL);
}

int arc_kmem_reclaim_shift = 5;		/* log2(fraction of arc to reclaim) */

void
arc_kmem_reclaim(void)
{
	uint64_t to_free;

	/*
	 * We need arc_reclaim_lock because we don't want multiple
	 * threads trying to reclaim concurrently.
	 */

	/*
	 * umem calls the reclaim func when we destroy the buf cache,
	 * which is after we do arc_fini().  So we set a flag to prevent
	 * accessing the destroyed mutexes and lists.
	 */
	if (arc_dead)
		return;

	if (arc.c <= arc.c_min)
		return;

	mutex_enter(&arc_reclaim_lock);

#ifdef _KERNEL
	to_free = MAX(arc.c >> arc_kmem_reclaim_shift, ptob(needfree));
#else
	to_free = arc.c >> arc_kmem_reclaim_shift;
#endif
	if (arc.c > to_free)
		atomic_add_64(&arc.c, -to_free);
	else
		arc.c = arc.c_min;

	atomic_add_64(&arc.p, -(arc.p >> arc_kmem_reclaim_shift));
	if (arc.c > arc.size)
		arc.c = arc.size;
	if (arc.c < arc.c_min)
		arc.c = arc.c_min;
	if (arc.p > arc.c)
		arc.p = (arc.c >> 1);
	ASSERT((int64_t)arc.p >= 0);

	arc_adjust();

	mutex_exit(&arc_reclaim_lock);
}

static int
arc_reclaim_needed(void)
{
	uint64_t extra;

#ifdef _KERNEL

	if (needfree)
		return (1);

	/*
	 * take 'desfree' extra pages, so we reclaim sooner, rather than later
	 */
	extra = desfree;

	/*
	 * check that we're out of range of the pageout scanner.  It starts to
	 * schedule paging if freemem is less than lotsfree and needfree.
	 * lotsfree is the high-water mark for pageout, and needfree is the
	 * number of needed free pages.  We add extra pages here to make sure
	 * the scanner doesn't start up while we're freeing memory.
	 */
	if (freemem < lotsfree + needfree + extra)
		return (1);

	/*
	 * check to make sure that swapfs has enough space so that anon
	 * reservations can still succeed. anon_resvmem() checks that the
	 * availrmem is greater than swapfs_minfree, and the number of reserved
	 * swap pages.  We also add a bit of extra here just to prevent
	 * circumstances from getting really dire.
	 */
	if (availrmem < swapfs_minfree + swapfs_reserve + extra)
		return (1);

#if defined(__i386)
	/*
	 * If we're on an i386 platform, it's possible that we'll exhaust the
	 * kernel heap space before we ever run out of available physical
	 * memory.  Most checks of the size of the heap_area compare against
	 * tune.t_minarmem, which is the minimum available real memory that we
	 * can have in the system.  However, this is generally fixed at 25 pages
	 * which is so low that it's useless.  In this comparison, we seek to
	 * calculate the total heap-size, and reclaim if more than 3/4ths of the
1209fa9e4066Sahrens 	 * heap is allocated.  (Or, in the calculation, if less than 1/4th is
1210fa9e4066Sahrens 	 * free)
1211fa9e4066Sahrens 	 */
1212fa9e4066Sahrens 	if (btop(vmem_size(heap_arena, VMEM_FREE)) <
1213fa9e4066Sahrens 	    (btop(vmem_size(heap_arena, VMEM_FREE | VMEM_ALLOC)) >> 2))
1214fa9e4066Sahrens 		return (1);
1215fa9e4066Sahrens #endif
1216fa9e4066Sahrens 
1217fa9e4066Sahrens #else
1218fa9e4066Sahrens 	if (spa_get_random(100) == 0)
1219fa9e4066Sahrens 		return (1);
1220fa9e4066Sahrens #endif
1221fa9e4066Sahrens 	return (0);
1222fa9e4066Sahrens }
1223fa9e4066Sahrens 
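/*
 * Reap free memory back from the ARC's kmem caches.  We trim the DNLC
 * first (it can pin a lot of memory), reap every distinct zio buffer
 * cache plus the buf and hdr caches, and, when the strategy is
 * ARC_RECLAIM_AGGR, also shrink the cache target via arc_kmem_reclaim().
 */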
1224fa9e4066Sahrens static void
1225fa9e4066Sahrens arc_kmem_reap_now(arc_reclaim_strategy_t strat)
1226fa9e4066Sahrens {
1227fa9e4066Sahrens 	size_t			i;
1228fa9e4066Sahrens 	kmem_cache_t		*prev_cache = NULL;
1229fa9e4066Sahrens 	extern kmem_cache_t	*zio_buf_cache[];
1230fa9e4066Sahrens 
1231033f9833Sek #ifdef _KERNEL
1232033f9833Sek 	/*
1233033f9833Sek 	 * First purge some DNLC entries, in case the DNLC is using
1234033f9833Sek 	 * up too much memory.
1235033f9833Sek 	 */
1236cee972f8Sek 	dnlc_reduce_cache((void *)(uintptr_t)arc_reduce_dnlc_percent);
12375dc8af33Smaybee 
12385dc8af33Smaybee #if defined(__i386)
12395dc8af33Smaybee 	/*
12405dc8af33Smaybee 	 * Reclaim unused memory from all kmem caches.
12415dc8af33Smaybee 	 */
12425dc8af33Smaybee 	kmem_reap();
12435dc8af33Smaybee #endif
1244033f9833Sek #endif
1245033f9833Sek 
1246fa9e4066Sahrens 	/*
1247ea8dc4b6Seschrock 	 * An aggressive reclamation will shrink the cache size as well as
1248ea8dc4b6Seschrock 	 * reap free buffers from the arc kmem caches.
1249fa9e4066Sahrens 	 */
1250fa9e4066Sahrens 	if (strat == ARC_RECLAIM_AGGR)
1251ea8dc4b6Seschrock 		arc_kmem_reclaim();
1252fa9e4066Sahrens 
1253fa9e4066Sahrens 	for (i = 0; i < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; i++) {
1254fa9e4066Sahrens 		if (zio_buf_cache[i] != prev_cache) {
1255fa9e4066Sahrens 			prev_cache = zio_buf_cache[i];
1256fa9e4066Sahrens 			kmem_cache_reap_now(zio_buf_cache[i]);
1257fa9e4066Sahrens 		}
1258fa9e4066Sahrens 	}
1259ea8dc4b6Seschrock 	kmem_cache_reap_now(buf_cache);
1260ea8dc4b6Seschrock 	kmem_cache_reap_now(hdr_cache);
1261fa9e4066Sahrens }
1262fa9e4066Sahrens 
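/*
 * The dedicated reclaim thread: it wakes at least once a second (or
 * when signalled via arc_reclaim_thr_cv), alternates between
 * conservative and aggressive reclaim while memory remains tight, and
 * keeps cache growth disabled until arc_grow_retry seconds after the
 * most recent reclaim.
 */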
1263fa9e4066Sahrens static void
1264fa9e4066Sahrens arc_reclaim_thread(void)
1265fa9e4066Sahrens {
1266fa9e4066Sahrens 	clock_t			growtime = 0;
1267fa9e4066Sahrens 	arc_reclaim_strategy_t	last_reclaim = ARC_RECLAIM_CONS;
1268fa9e4066Sahrens 	callb_cpr_t		cpr;
1269fa9e4066Sahrens 
1270fa9e4066Sahrens 	CALLB_CPR_INIT(&cpr, &arc_reclaim_thr_lock, callb_generic_cpr, FTAG);
1271fa9e4066Sahrens 
1272fa9e4066Sahrens 	mutex_enter(&arc_reclaim_thr_lock);
1273fa9e4066Sahrens 	while (arc_thread_exit == 0) {
1274fa9e4066Sahrens 		if (arc_reclaim_needed()) {
1275fa9e4066Sahrens 
1276fa9e4066Sahrens 			if (arc.no_grow) {
1277fa9e4066Sahrens 				if (last_reclaim == ARC_RECLAIM_CONS) {
1278fa9e4066Sahrens 					last_reclaim = ARC_RECLAIM_AGGR;
1279fa9e4066Sahrens 				} else {
1280fa9e4066Sahrens 					last_reclaim = ARC_RECLAIM_CONS;
1281fa9e4066Sahrens 				}
1282fa9e4066Sahrens 			} else {
1283fa9e4066Sahrens 				arc.no_grow = TRUE;
1284fa9e4066Sahrens 				last_reclaim = ARC_RECLAIM_AGGR;
1285fa9e4066Sahrens 				membar_producer();
1286fa9e4066Sahrens 			}
1287fa9e4066Sahrens 
1288fa9e4066Sahrens 			/* reset the growth delay for every reclaim */
1289fa9e4066Sahrens 			growtime = lbolt + (arc_grow_retry * hz);
1290fa9e4066Sahrens 
1291fa9e4066Sahrens 			arc_kmem_reap_now(last_reclaim);
1292fa9e4066Sahrens 
1293fa9e4066Sahrens 		} else if ((growtime > 0) && ((growtime - lbolt) <= 0)) {
1294fa9e4066Sahrens 			arc.no_grow = FALSE;
1295fa9e4066Sahrens 		}
1296fa9e4066Sahrens 
1297ea8dc4b6Seschrock 		if (arc_eviction_list != NULL)
1298ea8dc4b6Seschrock 			arc_do_user_evicts();
1299ea8dc4b6Seschrock 
1300fa9e4066Sahrens 		/* block until needed, or one second, whichever is shorter */
1301fa9e4066Sahrens 		CALLB_CPR_SAFE_BEGIN(&cpr);
1302fa9e4066Sahrens 		(void) cv_timedwait(&arc_reclaim_thr_cv,
1303fa9e4066Sahrens 		    &arc_reclaim_thr_lock, (lbolt + hz));
1304fa9e4066Sahrens 		CALLB_CPR_SAFE_END(&cpr, &arc_reclaim_thr_lock);
1305fa9e4066Sahrens 	}
1306fa9e4066Sahrens 
1307fa9e4066Sahrens 	arc_thread_exit = 0;
1308fa9e4066Sahrens 	cv_broadcast(&arc_reclaim_thr_cv);
1309fa9e4066Sahrens 	CALLB_CPR_EXIT(&cpr);		/* drops arc_reclaim_thr_lock */
1310fa9e4066Sahrens 	thread_exit();
1311fa9e4066Sahrens }
1312fa9e4066Sahrens 
1313ea8dc4b6Seschrock /*
1314ea8dc4b6Seschrock  * Adapt arc info given the number of bytes we are trying to add and
1315ea8dc4b6Seschrock  * the state that we are coming from.  This function is only called
1316ea8dc4b6Seschrock  * when we are adding new content to the cache.
1317ea8dc4b6Seschrock  */
1318fa9e4066Sahrens static void
1319ea8dc4b6Seschrock arc_adapt(int bytes, arc_state_t *state)
1320fa9e4066Sahrens {
1321ea8dc4b6Seschrock 	int mult;
1322ea8dc4b6Seschrock 
1323ea8dc4b6Seschrock 	ASSERT(bytes > 0);
1324fa9e4066Sahrens 	/*
1325ea8dc4b6Seschrock 	 * Adapt the target size of the MRU list:
1326ea8dc4b6Seschrock 	 *	- if we just hit in the MRU ghost list, then increase
1327ea8dc4b6Seschrock 	 *	  the target size of the MRU list.
1328ea8dc4b6Seschrock 	 *	- if we just hit in the MFU ghost list, then increase
1329ea8dc4b6Seschrock 	 *	  the target size of the MFU list by decreasing the
1330ea8dc4b6Seschrock 	 *	  target size of the MRU list.
1331fa9e4066Sahrens 	 */
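	/*
	 * Worked example: if mru_ghost holds 1MB and mfu_ghost holds
	 * 4MB, a hit in mru_ghost bumps arc.p by 4 * bytes (clamped to
	 * arc.c); the idea being that ghost hits in the smaller list
	 * are weighted more heavily, so the target adapts faster.
	 */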
1332ea8dc4b6Seschrock 	if (state == arc.mru_ghost) {
1333ea8dc4b6Seschrock 		mult = ((arc.mru_ghost->size >= arc.mfu_ghost->size) ?
1334ea8dc4b6Seschrock 		    1 : (arc.mfu_ghost->size/arc.mru_ghost->size));
1335ea8dc4b6Seschrock 
1336ea8dc4b6Seschrock 		arc.p = MIN(arc.c, arc.p + bytes * mult);
1337ea8dc4b6Seschrock 	} else if (state == arc.mfu_ghost) {
1338ea8dc4b6Seschrock 		mult = ((arc.mfu_ghost->size >= arc.mru_ghost->size) ?
1339ea8dc4b6Seschrock 		    1 : (arc.mru_ghost->size/arc.mfu_ghost->size));
1340ea8dc4b6Seschrock 
1341ea8dc4b6Seschrock 		arc.p = MAX(0, (int64_t)arc.p - bytes * mult);
1342ea8dc4b6Seschrock 	}
1343ea8dc4b6Seschrock 	ASSERT((int64_t)arc.p >= 0);
1344fa9e4066Sahrens 
1345fa9e4066Sahrens 	if (arc_reclaim_needed()) {
1346fa9e4066Sahrens 		cv_signal(&arc_reclaim_thr_cv);
1347fa9e4066Sahrens 		return;
1348fa9e4066Sahrens 	}
1349fa9e4066Sahrens 
1350fa9e4066Sahrens 	if (arc.no_grow)
1351fa9e4066Sahrens 		return;
1352fa9e4066Sahrens 
1353ea8dc4b6Seschrock 	if (arc.c >= arc.c_max)
1354ea8dc4b6Seschrock 		return;
1355ea8dc4b6Seschrock 
1356fa9e4066Sahrens 	/*
1357ea8dc4b6Seschrock 	 * If we're within (2 * maxblocksize) bytes of the target
1358ea8dc4b6Seschrock 	 * cache size, increment the target cache size
1359fa9e4066Sahrens 	 */
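	/*
	 * (With the usual 128KB SPA_MAXBLOCKSIZE that window is 256KB,
	 * so growth only happens once the cache is essentially full.)
	 */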
1360ea8dc4b6Seschrock 	if (arc.size > arc.c - (2ULL << SPA_MAXBLOCKSHIFT)) {
1361ea8dc4b6Seschrock 		atomic_add_64(&arc.c, (int64_t)bytes);
1362fa9e4066Sahrens 		if (arc.c > arc.c_max)
1363fa9e4066Sahrens 			arc.c = arc.c_max;
1364ea8dc4b6Seschrock 		else if (state == arc.anon)
1365ea8dc4b6Seschrock 			atomic_add_64(&arc.p, (int64_t)bytes);
1366ea8dc4b6Seschrock 		if (arc.p > arc.c)
1367ea8dc4b6Seschrock 			arc.p = arc.c;
1368fa9e4066Sahrens 	}
1369ea8dc4b6Seschrock 	ASSERT((int64_t)arc.p >= 0);
1370fa9e4066Sahrens }
1371fa9e4066Sahrens 
1372fa9e4066Sahrens /*
1373ea8dc4b6Seschrock  * Check if the cache has reached its limits and eviction is required
1374ea8dc4b6Seschrock  * prior to insert.
1375fa9e4066Sahrens  */
1376fa9e4066Sahrens static int
1377fa9e4066Sahrens arc_evict_needed()
1378fa9e4066Sahrens {
1379fa9e4066Sahrens 	if (arc_reclaim_needed())
1380fa9e4066Sahrens 		return (1);
1381fa9e4066Sahrens 
1382ea8dc4b6Seschrock 	return (arc.size > arc.c);
1383fa9e4066Sahrens }
1384fa9e4066Sahrens 
1385fa9e4066Sahrens /*
1386fa9e4066Sahrens  * The state, supplied as the first argument, is going to have something
1387fa9e4066Sahrens  * inserted on its behalf. So, determine which cache must be victimized to
1388fa9e4066Sahrens  * satisfy an insertion for this state.  We have the following cases:
1389fa9e4066Sahrens  *
1390ea8dc4b6Seschrock  * 1. Insert for MRU, p > sizeof(arc.anon + arc.mru) ->
1391fa9e4066Sahrens  * In this situation, if we're out of space but the resident size of the MFU
1392fa9e4066Sahrens  * is under its limit, victimize the MFU cache to satisfy this insertion request.
1393fa9e4066Sahrens  *
1394ea8dc4b6Seschrock  * 2. Insert for MRU, p <= sizeof(arc.anon + arc.mru) ->
1395fa9e4066Sahrens  * Here, we've used up all of the available space for the MRU, so we need to
1396fa9e4066Sahrens  * evict from our own cache instead.  Evict from the set of resident MRU
1397fa9e4066Sahrens  * entries.
1398fa9e4066Sahrens  *
1399ea8dc4b6Seschrock  * 3. Insert for MFU (c - p) > sizeof(arc.mfu) ->
1400fa9e4066Sahrens  * c minus p represents the MFU space in the cache, since p is the size of the
1401fa9e4066Sahrens  * cache that is dedicated to the MRU.  In this situation there's still space on
1402fa9e4066Sahrens  * the MFU side, so the MRU side needs to be victimized.
1403fa9e4066Sahrens  *
1404ea8dc4b6Seschrock  * 4. Insert for MFU (c - p) < sizeof(arc.mfu) ->
1405fa9e4066Sahrens  * MFU's resident set is consuming more space than it has been allotted.  In
1406fa9e4066Sahrens  * this situation, we must victimize our own cache, the MFU, for this insertion.
1407fa9e4066Sahrens  */
1408fa9e4066Sahrens static void
1409fa9e4066Sahrens arc_evict_for_state(arc_state_t *state, uint64_t bytes)
1410fa9e4066Sahrens {
1411fa9e4066Sahrens 	uint64_t	mru_used;
1412fa9e4066Sahrens 	uint64_t	mfu_space;
1413fa9e4066Sahrens 	uint64_t	evicted;
1414fa9e4066Sahrens 
1415ea8dc4b6Seschrock 	ASSERT(state == arc.mru || state == arc.mfu);
1416fa9e4066Sahrens 
1417ea8dc4b6Seschrock 	if (state == arc.mru) {
1418ea8dc4b6Seschrock 		mru_used = arc.anon->size + arc.mru->size;
1419fa9e4066Sahrens 		if (arc.p > mru_used) {
1420fa9e4066Sahrens 			/* case 1 */
1421ea8dc4b6Seschrock 			evicted = arc_evict(arc.mfu, bytes);
1422fa9e4066Sahrens 			if (evicted < bytes) {
1423fa9e4066Sahrens 				arc_adjust();
1424fa9e4066Sahrens 			}
1425fa9e4066Sahrens 		} else {
1426fa9e4066Sahrens 			/* case 2 */
1427ea8dc4b6Seschrock 			evicted = arc_evict(arc.mru, bytes);
1428fa9e4066Sahrens 			if (evicted < bytes) {
1429fa9e4066Sahrens 				arc_adjust();
1430fa9e4066Sahrens 			}
1431fa9e4066Sahrens 		}
1432fa9e4066Sahrens 	} else {
1433ea8dc4b6Seschrock 		/* MFU case */
1434fa9e4066Sahrens 		mfu_space = arc.c - arc.p;
1435ea8dc4b6Seschrock 		if (mfu_space > arc.mfu->size) {
1436fa9e4066Sahrens 			/* case 3 */
1437ea8dc4b6Seschrock 			evicted = arc_evict(arc.mru, bytes);
1438fa9e4066Sahrens 			if (evicted < bytes) {
1439fa9e4066Sahrens 				arc_adjust();
1440fa9e4066Sahrens 			}
1441fa9e4066Sahrens 		} else {
1442fa9e4066Sahrens 			/* case 4 */
1443ea8dc4b6Seschrock 			evicted = arc_evict(arc.mfu, bytes);
1444fa9e4066Sahrens 			if (evicted < bytes) {
1445fa9e4066Sahrens 				arc_adjust();
1446fa9e4066Sahrens 			}
1447fa9e4066Sahrens 		}
1448fa9e4066Sahrens 	}
1449fa9e4066Sahrens }
1450fa9e4066Sahrens 
1451fa9e4066Sahrens /*
1452fa9e4066Sahrens  * This routine is called whenever a buffer is accessed.
1453ea8dc4b6Seschrock  * NOTE: the hash lock is dropped in this function.
1454fa9e4066Sahrens  */
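/*
 * Summary of the transitions below: anon -> mru on first insert;
 * mru -> mfu once a second access arrives more than ARC_MINTIME after
 * the first; hits in mru_ghost or mfu_ghost bring the block back in
 * as mfu (or as mru when the access is only a prefetch).
 */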
1455fa9e4066Sahrens static void
1456ea8dc4b6Seschrock arc_access_and_exit(arc_buf_hdr_t *buf, kmutex_t *hash_lock)
1457fa9e4066Sahrens {
1458ea8dc4b6Seschrock 	arc_state_t	*evict_state = NULL;
1459ea8dc4b6Seschrock 	int		blksz;
1460fa9e4066Sahrens 
1461fa9e4066Sahrens 	ASSERT(MUTEX_HELD(hash_lock));
1462fa9e4066Sahrens 
1463fa9e4066Sahrens 	blksz = buf->b_size;
1464fa9e4066Sahrens 
1465fa9e4066Sahrens 	if (buf->b_state == arc.anon) {
1466fa9e4066Sahrens 		/*
1467fa9e4066Sahrens 		 * This buffer is not in the cache, and does not
1468fa9e4066Sahrens 		 * appear in our "ghost" list.  Add the new buffer
1469fa9e4066Sahrens 		 * to the MRU state.
1470fa9e4066Sahrens 		 */
1471fa9e4066Sahrens 
1472ea8dc4b6Seschrock 		arc_adapt(blksz, arc.anon);
1473ea8dc4b6Seschrock 		if (arc_evict_needed())
1474ea8dc4b6Seschrock 			evict_state = arc.mru;
1475fa9e4066Sahrens 
1476fa9e4066Sahrens 		ASSERT(buf->b_arc_access == 0);
1477fa9e4066Sahrens 		buf->b_arc_access = lbolt;
1478ea8dc4b6Seschrock 		DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, buf);
1479ea8dc4b6Seschrock 		arc_change_state(arc.mru, buf, hash_lock);
1480fa9e4066Sahrens 
1481ea8dc4b6Seschrock 	} else if (buf->b_state == arc.mru) {
1482fa9e4066Sahrens 		/*
1483*13506d1eSmaybee 		 * If this buffer is here because of a prefetch, then either:
1484*13506d1eSmaybee 		 * - clear the flag if this is a "referencing" read
1485*13506d1eSmaybee 		 *   (any subsequent access will bump this into the MFU state).
1486*13506d1eSmaybee 		 * or
1487*13506d1eSmaybee 		 * - move the buffer to the head of the list if this is
1488*13506d1eSmaybee 		 *   another prefetch (to make it less likely to be evicted).
1489fa9e4066Sahrens 		 */
1490fa9e4066Sahrens 		if ((buf->b_flags & ARC_PREFETCH) != 0) {
1491*13506d1eSmaybee 			if (refcount_count(&buf->b_refcnt) == 0) {
1492*13506d1eSmaybee 				ASSERT(list_link_active(&buf->b_arc_node));
1493*13506d1eSmaybee 				mutex_enter(&arc.mru->mtx);
1494*13506d1eSmaybee 				list_remove(&arc.mru->list, buf);
1495*13506d1eSmaybee 				list_insert_head(&arc.mru->list, buf);
1496*13506d1eSmaybee 				mutex_exit(&arc.mru->mtx);
1497*13506d1eSmaybee 			} else {
1498*13506d1eSmaybee 				buf->b_flags &= ~ARC_PREFETCH;
1499*13506d1eSmaybee 				atomic_add_64(&arc.mru->hits, 1);
1500*13506d1eSmaybee 			}
1501*13506d1eSmaybee 			buf->b_arc_access = lbolt;
1502ea8dc4b6Seschrock 			mutex_exit(hash_lock);
1503fa9e4066Sahrens 			return;
1504fa9e4066Sahrens 		}
1505fa9e4066Sahrens 
1506fa9e4066Sahrens 		/*
1507fa9e4066Sahrens 		 * This buffer has been "accessed" only once so far,
1508fa9e4066Sahrens 		 * but it is still in the cache.  If enough time has
1509fa9e4066Sahrens 		 * passed since it was cached, move it to the MFU state.
1510fa9e4066Sahrens 		 */
1511fa9e4066Sahrens 		if (lbolt > buf->b_arc_access + ARC_MINTIME) {
1512fa9e4066Sahrens 			/*
1513fa9e4066Sahrens 			 * More than 125ms have passed since we
1514fa9e4066Sahrens 			 * instantiated this buffer.  Move it to the
1515fa9e4066Sahrens 			 * most frequently used state.
1516fa9e4066Sahrens 			 */
1517fa9e4066Sahrens 			buf->b_arc_access = lbolt;
1518ea8dc4b6Seschrock 			DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
1519ea8dc4b6Seschrock 			arc_change_state(arc.mfu, buf, hash_lock);
1520fa9e4066Sahrens 		}
1521ea8dc4b6Seschrock 		atomic_add_64(&arc.mru->hits, 1);
1522ea8dc4b6Seschrock 	} else if (buf->b_state == arc.mru_ghost) {
1523fa9e4066Sahrens 		arc_state_t	*new_state;
1524fa9e4066Sahrens 		/*
1525fa9e4066Sahrens 		 * This buffer has been "accessed" recently, but
1526fa9e4066Sahrens 		 * was evicted from the cache.  Move it to the
1527fa9e4066Sahrens 		 * MFU state.
1528fa9e4066Sahrens 		 */
1529fa9e4066Sahrens 
1530fa9e4066Sahrens 		if (buf->b_flags & ARC_PREFETCH) {
1531ea8dc4b6Seschrock 			new_state = arc.mru;
1532*13506d1eSmaybee 			if (refcount_count(&buf->b_refcnt) > 0)
1533*13506d1eSmaybee 				buf->b_flags &= ~ARC_PREFETCH;
1534ea8dc4b6Seschrock 			DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, buf);
1535fa9e4066Sahrens 		} else {
1536ea8dc4b6Seschrock 			new_state = arc.mfu;
1537ea8dc4b6Seschrock 			DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
1538fa9e4066Sahrens 		}
1539fa9e4066Sahrens 
1540ea8dc4b6Seschrock 		arc_adapt(blksz, arc.mru_ghost);
1541ea8dc4b6Seschrock 		if (arc_evict_needed())
1542ea8dc4b6Seschrock 			evict_state = new_state;
1543fa9e4066Sahrens 
1544fa9e4066Sahrens 		buf->b_arc_access = lbolt;
1545fa9e4066Sahrens 		arc_change_state(new_state, buf, hash_lock);
1546fa9e4066Sahrens 
1547ea8dc4b6Seschrock 		atomic_add_64(&arc.mru_ghost->hits, 1);
1548ea8dc4b6Seschrock 	} else if (buf->b_state == arc.mfu) {
1549fa9e4066Sahrens 		/*
1550fa9e4066Sahrens 		 * This buffer has been accessed more than once and is
1551fa9e4066Sahrens 		 * still in the cache.  Keep it in the MFU state.
1552fa9e4066Sahrens 		 *
1553*13506d1eSmaybee 		 * NOTE: an add_reference() that occurred when we did
1554*13506d1eSmaybee 		 * the arc_read() will have kicked this off the list.
1555*13506d1eSmaybee 		 * If it was a prefetch, we will explicitly move it to
1556*13506d1eSmaybee 		 * the head of the list now.
1557fa9e4066Sahrens 		 */
1558*13506d1eSmaybee 		if ((buf->b_flags & ARC_PREFETCH) != 0) {
1559*13506d1eSmaybee 			ASSERT(refcount_count(&buf->b_refcnt) == 0);
1560*13506d1eSmaybee 			ASSERT(list_link_active(&buf->b_arc_node));
1561*13506d1eSmaybee 			mutex_enter(&arc.mfu->mtx);
1562*13506d1eSmaybee 			list_remove(&arc.mfu->list, buf);
1563*13506d1eSmaybee 			list_insert_head(&arc.mfu->list, buf);
1564*13506d1eSmaybee 			mutex_exit(&arc.mfu->mtx);
1565*13506d1eSmaybee 		}
1566ea8dc4b6Seschrock 		atomic_add_64(&arc.mfu->hits, 1);
1567*13506d1eSmaybee 		buf->b_arc_access = lbolt;
1568ea8dc4b6Seschrock 	} else if (buf->b_state == arc.mfu_ghost) {
1569*13506d1eSmaybee 		arc_state_t	*new_state = arc.mfu;
1570fa9e4066Sahrens 		/*
1571fa9e4066Sahrens 		 * This buffer has been accessed more than once but has
1572fa9e4066Sahrens 		 * been evicted from the cache.  Move it back to the
1573fa9e4066Sahrens 		 * MFU state.
1574fa9e4066Sahrens 		 */
1575fa9e4066Sahrens 
1576*13506d1eSmaybee 		if (buf->b_flags & ARC_PREFETCH) {
1577*13506d1eSmaybee 			/*
1578*13506d1eSmaybee 			 * This is a prefetch access...
1579*13506d1eSmaybee 			 * move this block back to the MRU state.
1580*13506d1eSmaybee 			 */
1581*13506d1eSmaybee 			ASSERT3U(refcount_count(&buf->b_refcnt), ==, 0);
1582*13506d1eSmaybee 			new_state = arc.mru;
1583*13506d1eSmaybee 		}
1584*13506d1eSmaybee 
1585ea8dc4b6Seschrock 		arc_adapt(blksz, arc.mfu_ghost);
1586ea8dc4b6Seschrock 		if (arc_evict_needed())
1587*13506d1eSmaybee 			evict_state = new_state;
1588fa9e4066Sahrens 
1589fa9e4066Sahrens 		buf->b_arc_access = lbolt;
1590ea8dc4b6Seschrock 		DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
1591*13506d1eSmaybee 		arc_change_state(new_state, buf, hash_lock);
1592fa9e4066Sahrens 
1593ea8dc4b6Seschrock 		atomic_add_64(&arc.mfu_ghost->hits, 1);
1594fa9e4066Sahrens 	} else {
1595fa9e4066Sahrens 		ASSERT(!"invalid arc state");
1596fa9e4066Sahrens 	}
1597fa9e4066Sahrens 
1598ea8dc4b6Seschrock 	mutex_exit(hash_lock);
1599ea8dc4b6Seschrock 	if (evict_state)
1600ea8dc4b6Seschrock 		arc_evict_for_state(evict_state, blksz);
1601fa9e4066Sahrens }
1602fa9e4066Sahrens 
1603fa9e4066Sahrens /* a generic arc_done_func_t which you can use */
1604fa9e4066Sahrens /* ARGSUSED */
1605fa9e4066Sahrens void
1606fa9e4066Sahrens arc_bcopy_func(zio_t *zio, arc_buf_t *buf, void *arg)
1607fa9e4066Sahrens {
1608fa9e4066Sahrens 	bcopy(buf->b_data, arg, buf->b_hdr->b_size);
1609ea8dc4b6Seschrock 	VERIFY(arc_buf_remove_ref(buf, arg) == 1);
1610fa9e4066Sahrens }
1611fa9e4066Sahrens 
1612fa9e4066Sahrens /* a generic arc_done_func_t which you can use */
1613fa9e4066Sahrens void
1614fa9e4066Sahrens arc_getbuf_func(zio_t *zio, arc_buf_t *buf, void *arg)
1615fa9e4066Sahrens {
1616fa9e4066Sahrens 	arc_buf_t **bufp = arg;
1617fa9e4066Sahrens 	if (zio && zio->io_error) {
1618ea8dc4b6Seschrock 		VERIFY(arc_buf_remove_ref(buf, arg) == 1);
1619fa9e4066Sahrens 		*bufp = NULL;
1620fa9e4066Sahrens 	} else {
1621fa9e4066Sahrens 		*bufp = buf;
1622fa9e4066Sahrens 	}
1623fa9e4066Sahrens }
1624fa9e4066Sahrens 
1625fa9e4066Sahrens static void
1626fa9e4066Sahrens arc_read_done(zio_t *zio)
1627fa9e4066Sahrens {
1628bbf4a8dfSmaybee 	arc_buf_hdr_t	*hdr, *found;
1629fa9e4066Sahrens 	arc_buf_t	*buf;
1630fa9e4066Sahrens 	arc_buf_t	*abuf;	/* buffer we're assigning to callback */
1631fa9e4066Sahrens 	kmutex_t	*hash_lock;
1632fa9e4066Sahrens 	arc_callback_t	*callback_list, *acb;
1633fa9e4066Sahrens 	int		freeable = FALSE;
1634fa9e4066Sahrens 
1635fa9e4066Sahrens 	buf = zio->io_private;
1636fa9e4066Sahrens 	hdr = buf->b_hdr;
1637fa9e4066Sahrens 
1638bbf4a8dfSmaybee 	/*
1639bbf4a8dfSmaybee 	 * The hdr was inserted into hash-table and removed from lists
1640bbf4a8dfSmaybee 	 * prior to starting I/O.  We should find this header, since
1641bbf4a8dfSmaybee 	 * it's in the hash table, and it should be legit since it's
1642bbf4a8dfSmaybee 	 * not possible to evict it during the I/O.  The only possible
1643bbf4a8dfSmaybee 	 * reason for it not to be found is if we were freed during the
1644bbf4a8dfSmaybee 	 * read.
1645bbf4a8dfSmaybee 	 */
1646bbf4a8dfSmaybee 	found = buf_hash_find(zio->io_spa, &hdr->b_dva, hdr->b_birth,
1647fa9e4066Sahrens 		    &hash_lock);
1648fa9e4066Sahrens 
1649bbf4a8dfSmaybee 	ASSERT((found == NULL && HDR_FREED_IN_READ(hdr) && hash_lock == NULL) ||
1650bbf4a8dfSmaybee 	    (found == hdr && DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp))));
1651fa9e4066Sahrens 
1652fa9e4066Sahrens 	/* byteswap if necessary */
1653fa9e4066Sahrens 	callback_list = hdr->b_acb;
1654fa9e4066Sahrens 	ASSERT(callback_list != NULL);
1655fa9e4066Sahrens 	if (BP_SHOULD_BYTESWAP(zio->io_bp) && callback_list->acb_byteswap)
1656fa9e4066Sahrens 		callback_list->acb_byteswap(buf->b_data, hdr->b_size);
1657fa9e4066Sahrens 
1658fa9e4066Sahrens 	/* create copies of the data buffer for the callers */
1659fa9e4066Sahrens 	abuf = buf;
1660fa9e4066Sahrens 	for (acb = callback_list; acb; acb = acb->acb_next) {
1661fa9e4066Sahrens 		if (acb->acb_done) {
1662fa9e4066Sahrens 			if (abuf == NULL) {
1663fa9e4066Sahrens 				abuf = kmem_cache_alloc(buf_cache, KM_SLEEP);
1664ea8dc4b6Seschrock 				abuf->b_data = arc_data_copy(hdr, buf->b_data);
1665fa9e4066Sahrens 				abuf->b_hdr = hdr;
1666ea8dc4b6Seschrock 				abuf->b_efunc = NULL;
1667ea8dc4b6Seschrock 				abuf->b_private = NULL;
1668fa9e4066Sahrens 				abuf->b_next = hdr->b_buf;
1669fa9e4066Sahrens 				hdr->b_buf = abuf;
1670ea8dc4b6Seschrock 				hdr->b_datacnt += 1;
1671fa9e4066Sahrens 			}
1672fa9e4066Sahrens 			acb->acb_buf = abuf;
1673fa9e4066Sahrens 			abuf = NULL;
1674fa9e4066Sahrens 		}
1675fa9e4066Sahrens 	}
1676fa9e4066Sahrens 	hdr->b_acb = NULL;
1677fa9e4066Sahrens 	hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
1678ea8dc4b6Seschrock 	ASSERT(!HDR_BUF_AVAILABLE(hdr));
1679ea8dc4b6Seschrock 	if (abuf == buf)
1680ea8dc4b6Seschrock 		hdr->b_flags |= ARC_BUF_AVAILABLE;
1681fa9e4066Sahrens 
1682fa9e4066Sahrens 	ASSERT(refcount_is_zero(&hdr->b_refcnt) || callback_list != NULL);
1683fa9e4066Sahrens 
1684fa9e4066Sahrens 	if (zio->io_error != 0) {
1685fa9e4066Sahrens 		hdr->b_flags |= ARC_IO_ERROR;
1686fa9e4066Sahrens 		if (hdr->b_state != arc.anon)
1687fa9e4066Sahrens 			arc_change_state(arc.anon, hdr, hash_lock);
1688ea8dc4b6Seschrock 		if (HDR_IN_HASH_TABLE(hdr))
1689ea8dc4b6Seschrock 			buf_hash_remove(hdr);
1690fa9e4066Sahrens 		freeable = refcount_is_zero(&hdr->b_refcnt);
1691*13506d1eSmaybee 		/* convert checksum errors into IO errors */
1692ea8dc4b6Seschrock 		if (zio->io_error == ECKSUM)
1693ea8dc4b6Seschrock 			zio->io_error = EIO;
1694fa9e4066Sahrens 	}
1695fa9e4066Sahrens 
1696ea8dc4b6Seschrock 	/*
1697*13506d1eSmaybee 	 * Broadcast before we drop the hash_lock to avoid the possibility
1698*13506d1eSmaybee 	 * that the hdr (and hence the cv) might be freed before we get to
1699*13506d1eSmaybee 	 * the cv_broadcast().
1700ea8dc4b6Seschrock 	 */
1701ea8dc4b6Seschrock 	cv_broadcast(&hdr->b_cv);
1702ea8dc4b6Seschrock 
1703bbf4a8dfSmaybee 	if (hash_lock) {
1704fa9e4066Sahrens 		/*
1705fa9e4066Sahrens 		 * Only call arc_access on anonymous buffers.  This is because
1706fa9e4066Sahrens 		 * if we've issued an I/O for an evicted buffer, we've already
1707fa9e4066Sahrens 		 * called arc_access (to prevent any simultaneous readers from
1708fa9e4066Sahrens 		 * getting confused).
1709fa9e4066Sahrens 		 */
1710fa9e4066Sahrens 		if (zio->io_error == 0 && hdr->b_state == arc.anon)
1711ea8dc4b6Seschrock 			arc_access_and_exit(hdr, hash_lock);
1712ea8dc4b6Seschrock 		else
1713ea8dc4b6Seschrock 			mutex_exit(hash_lock);
1714fa9e4066Sahrens 	} else {
1715fa9e4066Sahrens 		/*
1716fa9e4066Sahrens 		 * This block was freed while we waited for the read to
1717fa9e4066Sahrens 		 * complete.  It has been removed from the hash table and
1718fa9e4066Sahrens 		 * moved to the anonymous state (so that it won't show up
1719fa9e4066Sahrens 		 * in the cache).
1720fa9e4066Sahrens 		 */
1721fa9e4066Sahrens 		ASSERT3P(hdr->b_state, ==, arc.anon);
1722fa9e4066Sahrens 		freeable = refcount_is_zero(&hdr->b_refcnt);
1723fa9e4066Sahrens 	}
1724fa9e4066Sahrens 
1725fa9e4066Sahrens 	/* execute each callback and free its structure */
1726fa9e4066Sahrens 	while ((acb = callback_list) != NULL) {
1727fa9e4066Sahrens 		if (acb->acb_done)
1728fa9e4066Sahrens 			acb->acb_done(zio, acb->acb_buf, acb->acb_private);
1729fa9e4066Sahrens 
1730fa9e4066Sahrens 		if (acb->acb_zio_dummy != NULL) {
1731fa9e4066Sahrens 			acb->acb_zio_dummy->io_error = zio->io_error;
1732fa9e4066Sahrens 			zio_nowait(acb->acb_zio_dummy);
1733fa9e4066Sahrens 		}
1734fa9e4066Sahrens 
1735fa9e4066Sahrens 		callback_list = acb->acb_next;
1736fa9e4066Sahrens 		kmem_free(acb, sizeof (arc_callback_t));
1737fa9e4066Sahrens 	}
1738fa9e4066Sahrens 
1739fa9e4066Sahrens 	if (freeable)
1740ea8dc4b6Seschrock 		arc_hdr_destroy(hdr);
1741fa9e4066Sahrens }
1742fa9e4066Sahrens 
1743fa9e4066Sahrens /*
1744fa9e4066Sahrens  * "Read" the block at the specified DVA (in bp) via the
1745fa9e4066Sahrens  * cache.  If the block is found in the cache, invoke the provided
1746fa9e4066Sahrens  * callback immediately and return.  Note that the `zio' parameter
1747fa9e4066Sahrens  * in the callback will be NULL in this case, since no IO was
1748fa9e4066Sahrens  * required.  If the block is not in the cache pass the read request
1749fa9e4066Sahrens  * on to the spa with a substitute callback function, so that the
1750fa9e4066Sahrens  * requested block will be added to the cache.
1751fa9e4066Sahrens  *
1752fa9e4066Sahrens  * If a read request arrives for a block that has a read in-progress,
1753fa9e4066Sahrens  * either wait for the in-progress read to complete (and return the
1754fa9e4066Sahrens  * results); or, if this is a read with a "done" func, add a record
1755fa9e4066Sahrens  * to the read to invoke the "done" func when the read completes,
1756fa9e4066Sahrens  * and return; or just return.
1757fa9e4066Sahrens  *
1758fa9e4066Sahrens  * arc_read_done() will invoke all the requested "done" functions
1759fa9e4066Sahrens  * for readers of this block.
1760fa9e4066Sahrens  */
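/*
 * Illustrative caller sketch (hypothetical values, not from this
 * file): a prefetching reader that does not need the data back might
 * do
 *
 *	uint32_t aflags = ARC_NOWAIT | ARC_PREFETCH;
 *	(void) arc_read(NULL, spa, bp, byteswap, NULL, NULL,
 *	    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &aflags, &zb);
 *
 * and check ARC_CACHED in aflags afterwards to see whether the block
 * was already resident.
 */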
1761fa9e4066Sahrens int
1762fa9e4066Sahrens arc_read(zio_t *pio, spa_t *spa, blkptr_t *bp, arc_byteswap_func_t *swap,
1763fa9e4066Sahrens     arc_done_func_t *done, void *private, int priority, int flags,
1764*13506d1eSmaybee     uint32_t *arc_flags, zbookmark_t *zb)
1765fa9e4066Sahrens {
1766fa9e4066Sahrens 	arc_buf_hdr_t *hdr;
1767fa9e4066Sahrens 	arc_buf_t *buf;
1768fa9e4066Sahrens 	kmutex_t *hash_lock;
1769fa9e4066Sahrens 	zio_t	*rzio;
1770fa9e4066Sahrens 
1771fa9e4066Sahrens top:
1772fa9e4066Sahrens 	hdr = buf_hash_find(spa, BP_IDENTITY(bp), bp->blk_birth, &hash_lock);
1773ea8dc4b6Seschrock 	if (hdr && hdr->b_datacnt > 0) {
1774fa9e4066Sahrens 
1775*13506d1eSmaybee 		*arc_flags |= ARC_CACHED;
1776*13506d1eSmaybee 
1777fa9e4066Sahrens 		if (HDR_IO_IN_PROGRESS(hdr)) {
1778*13506d1eSmaybee 
1779*13506d1eSmaybee 			if (*arc_flags & ARC_WAIT) {
1780*13506d1eSmaybee 				cv_wait(&hdr->b_cv, hash_lock);
1781*13506d1eSmaybee 				mutex_exit(hash_lock);
1782*13506d1eSmaybee 				goto top;
1783*13506d1eSmaybee 			}
1784*13506d1eSmaybee 			ASSERT(*arc_flags & ARC_NOWAIT);
1785*13506d1eSmaybee 
1786*13506d1eSmaybee 			if (done) {
1787fa9e4066Sahrens 				arc_callback_t	*acb = NULL;
1788fa9e4066Sahrens 
1789fa9e4066Sahrens 				acb = kmem_zalloc(sizeof (arc_callback_t),
1790fa9e4066Sahrens 				    KM_SLEEP);
1791fa9e4066Sahrens 				acb->acb_done = done;
1792fa9e4066Sahrens 				acb->acb_private = private;
1793fa9e4066Sahrens 				acb->acb_byteswap = swap;
1794fa9e4066Sahrens 				if (pio != NULL)
1795fa9e4066Sahrens 					acb->acb_zio_dummy = zio_null(pio,
1796fa9e4066Sahrens 					    spa, NULL, NULL, flags);
1797fa9e4066Sahrens 
1798fa9e4066Sahrens 				ASSERT(acb->acb_done != NULL);
1799fa9e4066Sahrens 				acb->acb_next = hdr->b_acb;
1800fa9e4066Sahrens 				hdr->b_acb = acb;
1801fa9e4066Sahrens 				add_reference(hdr, hash_lock, private);
1802fa9e4066Sahrens 				mutex_exit(hash_lock);
1803fa9e4066Sahrens 				return (0);
1804fa9e4066Sahrens 			}
1805fa9e4066Sahrens 			mutex_exit(hash_lock);
1806fa9e4066Sahrens 			return (0);
1807fa9e4066Sahrens 		}
1808fa9e4066Sahrens 
1809ea8dc4b6Seschrock 		ASSERT(hdr->b_state == arc.mru || hdr->b_state == arc.mfu);
1810fa9e4066Sahrens 
1811ea8dc4b6Seschrock 		if (done) {
1812ea8dc4b6Seschrock 			/*
1813ea8dc4b6Seschrock 			 * If this block is already in use, create a new
1814ea8dc4b6Seschrock 			 * copy of the data so that we will be guaranteed
1815ea8dc4b6Seschrock 			 * that arc_release() will always succeed.
1816ea8dc4b6Seschrock 			 */
1817fa9e4066Sahrens 			buf = hdr->b_buf;
1818ea8dc4b6Seschrock 			ASSERT(buf);
1819ea8dc4b6Seschrock 			ASSERT(buf->b_data);
1820ea8dc4b6Seschrock 			if (!HDR_BUF_AVAILABLE(hdr)) {
1821ea8dc4b6Seschrock 				void *data = arc_data_copy(hdr, buf->b_data);
1822ea8dc4b6Seschrock 				buf = kmem_cache_alloc(buf_cache, KM_SLEEP);
1823ea8dc4b6Seschrock 				buf->b_hdr = hdr;
1824ea8dc4b6Seschrock 				buf->b_data = data;
1825ea8dc4b6Seschrock 				buf->b_efunc = NULL;
1826ea8dc4b6Seschrock 				buf->b_private = NULL;
1827ea8dc4b6Seschrock 				buf->b_next = hdr->b_buf;
1828ea8dc4b6Seschrock 				hdr->b_buf = buf;
1829ea8dc4b6Seschrock 				hdr->b_datacnt += 1;
1830ea8dc4b6Seschrock 			} else {
1831ea8dc4b6Seschrock 				ASSERT(buf->b_efunc == NULL);
1832ea8dc4b6Seschrock 				hdr->b_flags &= ~ARC_BUF_AVAILABLE;
1833ea8dc4b6Seschrock 			}
1834ea8dc4b6Seschrock 			add_reference(hdr, hash_lock, private);
1835*13506d1eSmaybee 		} else if (*arc_flags & ARC_PREFETCH &&
1836*13506d1eSmaybee 		    refcount_count(&hdr->b_refcnt) == 0) {
1837*13506d1eSmaybee 			hdr->b_flags |= ARC_PREFETCH;
1838fa9e4066Sahrens 		}
1839fa9e4066Sahrens 		DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
1840ea8dc4b6Seschrock 		arc_access_and_exit(hdr, hash_lock);
1841fa9e4066Sahrens 		atomic_add_64(&arc.hits, 1);
1842fa9e4066Sahrens 		if (done)
1843fa9e4066Sahrens 			done(NULL, buf, private);
1844fa9e4066Sahrens 	} else {
1845fa9e4066Sahrens 		uint64_t size = BP_GET_LSIZE(bp);
1846fa9e4066Sahrens 		arc_callback_t	*acb;
1847fa9e4066Sahrens 
1848fa9e4066Sahrens 		if (hdr == NULL) {
1849fa9e4066Sahrens 			/* this block is not in the cache */
1850fa9e4066Sahrens 			arc_buf_hdr_t	*exists;
1851fa9e4066Sahrens 
1852fa9e4066Sahrens 			buf = arc_buf_alloc(spa, size, private);
1853fa9e4066Sahrens 			hdr = buf->b_hdr;
1854fa9e4066Sahrens 			hdr->b_dva = *BP_IDENTITY(bp);
1855fa9e4066Sahrens 			hdr->b_birth = bp->blk_birth;
1856fa9e4066Sahrens 			hdr->b_cksum0 = bp->blk_cksum.zc_word[0];
1857fa9e4066Sahrens 			exists = buf_hash_insert(hdr, &hash_lock);
1858fa9e4066Sahrens 			if (exists) {
1859fa9e4066Sahrens 				/* somebody beat us to the hash insert */
1860fa9e4066Sahrens 				mutex_exit(hash_lock);
1861fa9e4066Sahrens 				bzero(&hdr->b_dva, sizeof (dva_t));
1862fa9e4066Sahrens 				hdr->b_birth = 0;
1863fa9e4066Sahrens 				hdr->b_cksum0 = 0;
1864ea8dc4b6Seschrock 				(void) arc_buf_remove_ref(buf, private);
1865fa9e4066Sahrens 				goto top; /* restart the IO request */
1866fa9e4066Sahrens 			}
1867*13506d1eSmaybee 			/* if this is a prefetch, we don't have a reference */
1868*13506d1eSmaybee 			if (*arc_flags & ARC_PREFETCH) {
1869*13506d1eSmaybee 				(void) remove_reference(hdr, hash_lock,
1870*13506d1eSmaybee 				    private);
1871*13506d1eSmaybee 				hdr->b_flags |= ARC_PREFETCH;
1872*13506d1eSmaybee 			}
1873*13506d1eSmaybee 			if (BP_GET_LEVEL(bp) > 0)
1874*13506d1eSmaybee 				hdr->b_flags |= ARC_INDIRECT;
1875fa9e4066Sahrens 		} else {
1876fa9e4066Sahrens 			/* this block is in the ghost cache */
1877ea8dc4b6Seschrock 			ASSERT(GHOST_STATE(hdr->b_state));
1878ea8dc4b6Seschrock 			ASSERT(!HDR_IO_IN_PROGRESS(hdr));
1879*13506d1eSmaybee 			ASSERT3U(refcount_count(&hdr->b_refcnt), ==, 0);
1880ea8dc4b6Seschrock 			ASSERT(hdr->b_buf == NULL);
1881*13506d1eSmaybee 
1882*13506d1eSmaybee 			/* if this is a prefetch, we don't have a reference */
1883*13506d1eSmaybee 			if (*arc_flags & ARC_PREFETCH)
1884*13506d1eSmaybee 				hdr->b_flags |= ARC_PREFETCH;
1885*13506d1eSmaybee 			else
1886*13506d1eSmaybee 				add_reference(hdr, hash_lock, private);
1887fa9e4066Sahrens 			buf = kmem_cache_alloc(buf_cache, KM_SLEEP);
1888fa9e4066Sahrens 			buf->b_hdr = hdr;
1889ea8dc4b6Seschrock 			buf->b_efunc = NULL;
1890ea8dc4b6Seschrock 			buf->b_private = NULL;
1891fa9e4066Sahrens 			buf->b_next = NULL;
1892fa9e4066Sahrens 			hdr->b_buf = buf;
1893ea8dc4b6Seschrock 			buf->b_data = zio_buf_alloc(hdr->b_size);
1894ea8dc4b6Seschrock 			atomic_add_64(&arc.size, hdr->b_size);
1895ea8dc4b6Seschrock 			ASSERT(hdr->b_datacnt == 0);
1896ea8dc4b6Seschrock 			hdr->b_datacnt = 1;
1897*13506d1eSmaybee 
1898fa9e4066Sahrens 		}
1899fa9e4066Sahrens 
1900fa9e4066Sahrens 		acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP);
1901fa9e4066Sahrens 		acb->acb_done = done;
1902fa9e4066Sahrens 		acb->acb_private = private;
1903fa9e4066Sahrens 		acb->acb_byteswap = swap;
1904fa9e4066Sahrens 
1905fa9e4066Sahrens 		ASSERT(hdr->b_acb == NULL);
1906fa9e4066Sahrens 		hdr->b_acb = acb;
1907fa9e4066Sahrens 		hdr->b_flags |= ARC_IO_IN_PROGRESS;
1908fa9e4066Sahrens 
1909fa9e4066Sahrens 		/*
1910fa9e4066Sahrens 		 * If the buffer has been evicted, migrate it to a present state
1911fa9e4066Sahrens 		 * before issuing the I/O.  Once we drop the hash-table lock,
1912fa9e4066Sahrens 		 * the header will be marked as I/O in progress and have an
1913fa9e4066Sahrens 		 * attached buffer.  At this point, anybody who finds this
1914fa9e4066Sahrens 		 * buffer ought to notice that it's legit but has a pending I/O.
1915fa9e4066Sahrens 		 */
1916fa9e4066Sahrens 
1917ea8dc4b6Seschrock 		if (GHOST_STATE(hdr->b_state))
1918ea8dc4b6Seschrock 			arc_access_and_exit(hdr, hash_lock);
1919ea8dc4b6Seschrock 		else
1920ea8dc4b6Seschrock 			mutex_exit(hash_lock);
1921fa9e4066Sahrens 
1922fa9e4066Sahrens 		ASSERT3U(hdr->b_size, ==, size);
1923c543ec06Sahrens 		DTRACE_PROBE3(arc__miss, blkptr_t *, bp, uint64_t, size,
1924c543ec06Sahrens 		    zbookmark_t *, zb);
1925fa9e4066Sahrens 		atomic_add_64(&arc.misses, 1);
1926ea8dc4b6Seschrock 
1927fa9e4066Sahrens 		rzio = zio_read(pio, spa, bp, buf->b_data, size,
1928ea8dc4b6Seschrock 		    arc_read_done, buf, priority, flags, zb);
1929fa9e4066Sahrens 
1930*13506d1eSmaybee 		if (*arc_flags & ARC_WAIT)
1931fa9e4066Sahrens 			return (zio_wait(rzio));
1932fa9e4066Sahrens 
1933*13506d1eSmaybee 		ASSERT(*arc_flags & ARC_NOWAIT);
1934fa9e4066Sahrens 		zio_nowait(rzio);
1935fa9e4066Sahrens 	}
1936fa9e4066Sahrens 	return (0);
1937fa9e4066Sahrens }
1938fa9e4066Sahrens 
1939fa9e4066Sahrens /*
1940fa9e4066Sahrens  * arc_read() variant to support pool traversal.  If the block is already
1941fa9e4066Sahrens  * in the ARC, make a copy of it; otherwise, the caller will do the I/O.
1942fa9e4066Sahrens  * The idea is that we don't want pool traversal filling up memory, but
1943fa9e4066Sahrens  * if the ARC already has the data anyway, we shouldn't pay for the I/O.
1944fa9e4066Sahrens  */
1945fa9e4066Sahrens int
1946fa9e4066Sahrens arc_tryread(spa_t *spa, blkptr_t *bp, void *data)
1947fa9e4066Sahrens {
1948fa9e4066Sahrens 	arc_buf_hdr_t *hdr;
1949fa9e4066Sahrens 	kmutex_t *hash_mtx;
1950fa9e4066Sahrens 	int rc = 0;
1951fa9e4066Sahrens 
1952fa9e4066Sahrens 	hdr = buf_hash_find(spa, BP_IDENTITY(bp), bp->blk_birth, &hash_mtx);
1953fa9e4066Sahrens 
1954ea8dc4b6Seschrock 	if (hdr && hdr->b_datacnt > 0 && !HDR_IO_IN_PROGRESS(hdr)) {
1955ea8dc4b6Seschrock 		arc_buf_t *buf = hdr->b_buf;
1956ea8dc4b6Seschrock 
1957ea8dc4b6Seschrock 		ASSERT(buf);
1958ea8dc4b6Seschrock 		while (buf->b_data == NULL) {
1959ea8dc4b6Seschrock 			buf = buf->b_next;
1960ea8dc4b6Seschrock 			ASSERT(buf);
1961ea8dc4b6Seschrock 		}
1962ea8dc4b6Seschrock 		bcopy(buf->b_data, data, hdr->b_size);
1963ea8dc4b6Seschrock 	} else {
1964fa9e4066Sahrens 		rc = ENOENT;
1965ea8dc4b6Seschrock 	}
1966fa9e4066Sahrens 
1967fa9e4066Sahrens 	if (hash_mtx)
1968fa9e4066Sahrens 		mutex_exit(hash_mtx);
1969fa9e4066Sahrens 
1970fa9e4066Sahrens 	return (rc);
1971fa9e4066Sahrens }
1972fa9e4066Sahrens 
1973ea8dc4b6Seschrock void
1974ea8dc4b6Seschrock arc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *private)
1975ea8dc4b6Seschrock {
1976ea8dc4b6Seschrock 	ASSERT(buf->b_hdr != NULL);
1977ea8dc4b6Seschrock 	ASSERT(buf->b_hdr->b_state != arc.anon);
1978ea8dc4b6Seschrock 	ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt) || func == NULL);
1979ea8dc4b6Seschrock 	buf->b_efunc = func;
1980ea8dc4b6Seschrock 	buf->b_private = private;
1981ea8dc4b6Seschrock }
1982ea8dc4b6Seschrock 
1983ea8dc4b6Seschrock /*
1984ea8dc4b6Seschrock  * This is used by the DMU to let the ARC know that a buffer is
1985ea8dc4b6Seschrock  * being evicted, so the ARC should clean up.  If this arc buf
1986ea8dc4b6Seschrock  * is not yet in the evicted state, it will be put there.
1987ea8dc4b6Seschrock  */
1988ea8dc4b6Seschrock int
1989ea8dc4b6Seschrock arc_buf_evict(arc_buf_t *buf)
1990ea8dc4b6Seschrock {
1991ea8dc4b6Seschrock 	arc_buf_hdr_t *hdr;
1992ea8dc4b6Seschrock 	kmutex_t *hash_lock;
1993ea8dc4b6Seschrock 	arc_buf_t **bufp;
1994ea8dc4b6Seschrock 
1995ea8dc4b6Seschrock 	mutex_enter(&arc_eviction_mtx);
1996ea8dc4b6Seschrock 	hdr = buf->b_hdr;
1997ea8dc4b6Seschrock 	if (hdr == NULL) {
1998ea8dc4b6Seschrock 		/*
1999ea8dc4b6Seschrock 		 * We are in arc_do_user_evicts().
2000ea8dc4b6Seschrock 		 * NOTE: We can't be in arc_buf_add_ref() because
2001ea8dc4b6Seschrock 		 * that would violate the interface rules.
2002ea8dc4b6Seschrock 		 */
2003ea8dc4b6Seschrock 		ASSERT(buf->b_data == NULL);
2004ea8dc4b6Seschrock 		mutex_exit(&arc_eviction_mtx);
2005ea8dc4b6Seschrock 		return (0);
2006ea8dc4b6Seschrock 	} else if (buf->b_data == NULL) {
2007dd6ef538Smaybee 		arc_buf_t copy = *buf; /* structure assignment */
2008ea8dc4b6Seschrock 		/*
2009dd6ef538Smaybee 		 * We are on the eviction list.  Process this buffer
2010dd6ef538Smaybee 		 * now but let arc_do_user_evicts() do the reaping.
2011ea8dc4b6Seschrock 		 */
2012dd6ef538Smaybee 		buf->b_efunc = NULL;
2013dd6ef538Smaybee 		buf->b_hdr = NULL;
2014ea8dc4b6Seschrock 		mutex_exit(&arc_eviction_mtx);
2015dd6ef538Smaybee 		VERIFY(copy.b_efunc(&copy) == 0);
2016dd6ef538Smaybee 		return (1);
2017ea8dc4b6Seschrock 	} else {
2018ea8dc4b6Seschrock 		/*
2019ea8dc4b6Seschrock 		 * Prevent a race with arc_evict()
2020ea8dc4b6Seschrock 		 */
2021ea8dc4b6Seschrock 		ASSERT3U(refcount_count(&hdr->b_refcnt), <, hdr->b_datacnt);
2022ea8dc4b6Seschrock 		buf->b_hdr = NULL;
2023ea8dc4b6Seschrock 	}
2024ea8dc4b6Seschrock 	mutex_exit(&arc_eviction_mtx);
2025ea8dc4b6Seschrock 
2026ea8dc4b6Seschrock 	hash_lock = HDR_LOCK(hdr);
2027ea8dc4b6Seschrock 	mutex_enter(hash_lock);
2028ea8dc4b6Seschrock 
2029ea8dc4b6Seschrock 	ASSERT(hdr->b_state == arc.mru || hdr->b_state == arc.mfu);
2030ea8dc4b6Seschrock 
2031ea8dc4b6Seschrock 	/*
2032ea8dc4b6Seschrock 	 * Pull this buffer off of the hdr
2033ea8dc4b6Seschrock 	 */
2034ea8dc4b6Seschrock 	bufp = &hdr->b_buf;
2035ea8dc4b6Seschrock 	while (*bufp != buf)
2036ea8dc4b6Seschrock 		bufp = &(*bufp)->b_next;
2037ea8dc4b6Seschrock 	*bufp = buf->b_next;
2038ea8dc4b6Seschrock 
2039ea8dc4b6Seschrock 	ASSERT(buf->b_data != NULL);
2040ea8dc4b6Seschrock 	buf->b_hdr = hdr;
2041ea8dc4b6Seschrock 	arc_buf_destroy(buf, FALSE);
2042ea8dc4b6Seschrock 
2043ea8dc4b6Seschrock 	if (hdr->b_datacnt == 0) {
2044ea8dc4b6Seschrock 		arc_state_t *old_state = hdr->b_state;
2045ea8dc4b6Seschrock 		arc_state_t *evicted_state;
2046ea8dc4b6Seschrock 
2047ea8dc4b6Seschrock 		ASSERT(refcount_is_zero(&hdr->b_refcnt));
2048ea8dc4b6Seschrock 
2049ea8dc4b6Seschrock 		evicted_state =
2050ea8dc4b6Seschrock 		    (old_state == arc.mru) ? arc.mru_ghost : arc.mfu_ghost;
2051ea8dc4b6Seschrock 
2052ea8dc4b6Seschrock 		mutex_enter(&old_state->mtx);
2053ea8dc4b6Seschrock 		mutex_enter(&evicted_state->mtx);
2054ea8dc4b6Seschrock 
2055ea8dc4b6Seschrock 		arc_change_state(evicted_state, hdr, hash_lock);
2056ea8dc4b6Seschrock 		ASSERT(HDR_IN_HASH_TABLE(hdr));
2057ea8dc4b6Seschrock 		hdr->b_flags = ARC_IN_HASH_TABLE;
2058ea8dc4b6Seschrock 
2059ea8dc4b6Seschrock 		mutex_exit(&evicted_state->mtx);
2060ea8dc4b6Seschrock 		mutex_exit(&old_state->mtx);
2061ea8dc4b6Seschrock 	}
2062ea8dc4b6Seschrock 	mutex_exit(hash_lock);
2063dd6ef538Smaybee 
2064ea8dc4b6Seschrock 	VERIFY(buf->b_efunc(buf) == 0);
2065ea8dc4b6Seschrock 	buf->b_efunc = NULL;
2066ea8dc4b6Seschrock 	buf->b_private = NULL;
2067ea8dc4b6Seschrock 	buf->b_hdr = NULL;
2068ea8dc4b6Seschrock 	kmem_cache_free(buf_cache, buf);
2069ea8dc4b6Seschrock 	return (1);
2070ea8dc4b6Seschrock }
2071ea8dc4b6Seschrock 
2072fa9e4066Sahrens /*
2073fa9e4066Sahrens  * Release this buffer from the cache.  This must be done
2074fa9e4066Sahrens  * after a read and prior to modifying the buffer contents.
2075fa9e4066Sahrens  * If the buffer has more than one reference, we must make
2076fa9e4066Sahrens  * a new hdr for the buffer.
2077fa9e4066Sahrens  */
2078fa9e4066Sahrens void
2079fa9e4066Sahrens arc_release(arc_buf_t *buf, void *tag)
2080fa9e4066Sahrens {
2081fa9e4066Sahrens 	arc_buf_hdr_t *hdr = buf->b_hdr;
2082fa9e4066Sahrens 	kmutex_t *hash_lock = HDR_LOCK(hdr);
2083fa9e4066Sahrens 
2084fa9e4066Sahrens 	/* this buffer is not on any list */
2085fa9e4066Sahrens 	ASSERT(refcount_count(&hdr->b_refcnt) > 0);
2086fa9e4066Sahrens 
2087fa9e4066Sahrens 	if (hdr->b_state == arc.anon) {
2088fa9e4066Sahrens 		/* this buffer is already released */
2089fa9e4066Sahrens 		ASSERT3U(refcount_count(&hdr->b_refcnt), ==, 1);
2090fa9e4066Sahrens 		ASSERT(BUF_EMPTY(hdr));
2091ea8dc4b6Seschrock 		ASSERT(buf->b_efunc == NULL);
2092fa9e4066Sahrens 		return;
2093fa9e4066Sahrens 	}
2094fa9e4066Sahrens 
2095fa9e4066Sahrens 	mutex_enter(hash_lock);
2096fa9e4066Sahrens 
2097ea8dc4b6Seschrock 	/*
2098ea8dc4b6Seschrock 	 * Do we have more than one buf?
2099ea8dc4b6Seschrock 	 */
2100ea8dc4b6Seschrock 	if (hdr->b_buf != buf || buf->b_next != NULL) {
2101fa9e4066Sahrens 		arc_buf_hdr_t *nhdr;
2102fa9e4066Sahrens 		arc_buf_t **bufp;
2103fa9e4066Sahrens 		uint64_t blksz = hdr->b_size;
2104fa9e4066Sahrens 		spa_t *spa = hdr->b_spa;
2105fa9e4066Sahrens 
2106ea8dc4b6Seschrock 		ASSERT(hdr->b_datacnt > 1);
2107fa9e4066Sahrens 		/*
2108fa9e4066Sahrens 		 * Pull the data off of this buf and attach it to
2109fa9e4066Sahrens 		 * a new anonymous buf.
2110fa9e4066Sahrens 		 */
2111ea8dc4b6Seschrock 		(void) remove_reference(hdr, hash_lock, tag);
2112fa9e4066Sahrens 		bufp = &hdr->b_buf;
2113ea8dc4b6Seschrock 		while (*bufp != buf)
2114fa9e4066Sahrens 			bufp = &(*bufp)->b_next;
2115fa9e4066Sahrens 		*bufp = (*bufp)->b_next;
2116ea8dc4b6Seschrock 
2117fa9e4066Sahrens 		ASSERT3U(hdr->b_state->size, >=, hdr->b_size);
2118fa9e4066Sahrens 		atomic_add_64(&hdr->b_state->size, -hdr->b_size);
2119ea8dc4b6Seschrock 		if (refcount_is_zero(&hdr->b_refcnt)) {
2120ea8dc4b6Seschrock 			ASSERT3U(hdr->b_state->lsize, >=, hdr->b_size);
2121ea8dc4b6Seschrock 			atomic_add_64(&hdr->b_state->lsize, -hdr->b_size);
2122ea8dc4b6Seschrock 		}
2123ea8dc4b6Seschrock 		hdr->b_datacnt -= 1;
2124ea8dc4b6Seschrock 
2125fa9e4066Sahrens 		mutex_exit(hash_lock);
2126fa9e4066Sahrens 
2127fa9e4066Sahrens 		nhdr = kmem_cache_alloc(hdr_cache, KM_SLEEP);
2128fa9e4066Sahrens 		nhdr->b_size = blksz;
2129fa9e4066Sahrens 		nhdr->b_spa = spa;
2130fa9e4066Sahrens 		nhdr->b_buf = buf;
2131fa9e4066Sahrens 		nhdr->b_state = arc.anon;
2132fa9e4066Sahrens 		nhdr->b_arc_access = 0;
2133fa9e4066Sahrens 		nhdr->b_flags = 0;
2134ea8dc4b6Seschrock 		nhdr->b_datacnt = 1;
2135fa9e4066Sahrens 		buf->b_hdr = nhdr;
2136fa9e4066Sahrens 		buf->b_next = NULL;
2137fa9e4066Sahrens 		(void) refcount_add(&nhdr->b_refcnt, tag);
2138fa9e4066Sahrens 		atomic_add_64(&arc.anon->size, blksz);
2139fa9e4066Sahrens 
2140fa9e4066Sahrens 		hdr = nhdr;
2141fa9e4066Sahrens 	} else {
2142ea8dc4b6Seschrock 		ASSERT(refcount_count(&hdr->b_refcnt) == 1);
2143fa9e4066Sahrens 		ASSERT(!list_link_active(&hdr->b_arc_node));
2144fa9e4066Sahrens 		ASSERT(!HDR_IO_IN_PROGRESS(hdr));
2145fa9e4066Sahrens 		arc_change_state(arc.anon, hdr, hash_lock);
2146fa9e4066Sahrens 		hdr->b_arc_access = 0;
2147fa9e4066Sahrens 		mutex_exit(hash_lock);
2148fa9e4066Sahrens 		bzero(&hdr->b_dva, sizeof (dva_t));
2149fa9e4066Sahrens 		hdr->b_birth = 0;
2150fa9e4066Sahrens 		hdr->b_cksum0 = 0;
2151fa9e4066Sahrens 	}
2152ea8dc4b6Seschrock 	buf->b_efunc = NULL;
2153ea8dc4b6Seschrock 	buf->b_private = NULL;
2154fa9e4066Sahrens }
2155fa9e4066Sahrens 
2156fa9e4066Sahrens int
2157fa9e4066Sahrens arc_released(arc_buf_t *buf)
2158fa9e4066Sahrens {
2159ea8dc4b6Seschrock 	return (buf->b_data != NULL && buf->b_hdr->b_state == arc.anon);
2160ea8dc4b6Seschrock }
2161ea8dc4b6Seschrock 
2162ea8dc4b6Seschrock int
2163ea8dc4b6Seschrock arc_has_callback(arc_buf_t *buf)
2164ea8dc4b6Seschrock {
2165ea8dc4b6Seschrock 	return (buf->b_efunc != NULL);
2166fa9e4066Sahrens }
2167fa9e4066Sahrens 
2168ea8dc4b6Seschrock #ifdef ZFS_DEBUG
2169ea8dc4b6Seschrock int
2170ea8dc4b6Seschrock arc_referenced(arc_buf_t *buf)
2171ea8dc4b6Seschrock {
2172ea8dc4b6Seschrock 	return (refcount_count(&buf->b_hdr->b_refcnt));
2173ea8dc4b6Seschrock }
2174ea8dc4b6Seschrock #endif
2175ea8dc4b6Seschrock 
2176fa9e4066Sahrens static void
2177fa9e4066Sahrens arc_write_done(zio_t *zio)
2178fa9e4066Sahrens {
2179fa9e4066Sahrens 	arc_buf_t *buf;
2180fa9e4066Sahrens 	arc_buf_hdr_t *hdr;
2181fa9e4066Sahrens 	arc_callback_t *acb;
2182fa9e4066Sahrens 
2183fa9e4066Sahrens 	buf = zio->io_private;
2184fa9e4066Sahrens 	hdr = buf->b_hdr;
2185fa9e4066Sahrens 	acb = hdr->b_acb;
2186fa9e4066Sahrens 	hdr->b_acb = NULL;
2187ea8dc4b6Seschrock 	ASSERT(acb != NULL);
2188fa9e4066Sahrens 
2189fa9e4066Sahrens 	/* this buffer is on no lists and is not in the hash table */
2190fa9e4066Sahrens 	ASSERT3P(hdr->b_state, ==, arc.anon);
2191fa9e4066Sahrens 
2192fa9e4066Sahrens 	hdr->b_dva = *BP_IDENTITY(zio->io_bp);
2193fa9e4066Sahrens 	hdr->b_birth = zio->io_bp->blk_birth;
2194fa9e4066Sahrens 	hdr->b_cksum0 = zio->io_bp->blk_cksum.zc_word[0];
2195ea8dc4b6Seschrock 	/*
2196ea8dc4b6Seschrock 	 * If the block to be written was all-zero, we may have
2197ea8dc4b6Seschrock 	 * compressed it away.  In this case no write was performed
2198ea8dc4b6Seschrock 	 * so there will be no dva/birth-date/checksum.  The buffer
2199ea8dc4b6Seschrock 	 * must therefore remain anonymous (and uncached).
2200ea8dc4b6Seschrock 	 */
2201fa9e4066Sahrens 	if (!BUF_EMPTY(hdr)) {
2202fa9e4066Sahrens 		arc_buf_hdr_t *exists;
2203fa9e4066Sahrens 		kmutex_t *hash_lock;
2204fa9e4066Sahrens 
2205fa9e4066Sahrens 		exists = buf_hash_insert(hdr, &hash_lock);
2206fa9e4066Sahrens 		if (exists) {
2207fa9e4066Sahrens 			/*
2208fa9e4066Sahrens 			 * This can only happen if we overwrite for
2209fa9e4066Sahrens 			 * sync-to-convergence, because we remove
2210fa9e4066Sahrens 			 * buffers from the hash table when we arc_free().
2211fa9e4066Sahrens 			 */
2212fa9e4066Sahrens 			ASSERT(DVA_EQUAL(BP_IDENTITY(&zio->io_bp_orig),
2213fa9e4066Sahrens 			    BP_IDENTITY(zio->io_bp)));
2214fa9e4066Sahrens 			ASSERT3U(zio->io_bp_orig.blk_birth, ==,
2215fa9e4066Sahrens 			    zio->io_bp->blk_birth);
2216fa9e4066Sahrens 
2217fa9e4066Sahrens 			ASSERT(refcount_is_zero(&exists->b_refcnt));
2218fa9e4066Sahrens 			arc_change_state(arc.anon, exists, hash_lock);
2219fa9e4066Sahrens 			mutex_exit(hash_lock);
2220ea8dc4b6Seschrock 			arc_hdr_destroy(exists);
2221fa9e4066Sahrens 			exists = buf_hash_insert(hdr, &hash_lock);
2222fa9e4066Sahrens 			ASSERT3P(exists, ==, NULL);
2223fa9e4066Sahrens 		}
2224ea8dc4b6Seschrock 		hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
2225ea8dc4b6Seschrock 		arc_access_and_exit(hdr, hash_lock);
2226ea8dc4b6Seschrock 	} else if (acb->acb_done == NULL) {
2227ea8dc4b6Seschrock 		int destroy_hdr;
2228ea8dc4b6Seschrock 		/*
2229ea8dc4b6Seschrock 		 * This is an anonymous buffer with no user callback,
2230ea8dc4b6Seschrock 		 * destroy it if there are no active references.
2231ea8dc4b6Seschrock 		 */
2232ea8dc4b6Seschrock 		mutex_enter(&arc_eviction_mtx);
2233ea8dc4b6Seschrock 		destroy_hdr = refcount_is_zero(&hdr->b_refcnt);
2234ea8dc4b6Seschrock 		hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
2235ea8dc4b6Seschrock 		mutex_exit(&arc_eviction_mtx);
2236ea8dc4b6Seschrock 		if (destroy_hdr)
2237ea8dc4b6Seschrock 			arc_hdr_destroy(hdr);
2238ea8dc4b6Seschrock 	} else {
2239ea8dc4b6Seschrock 		hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
2240fa9e4066Sahrens 	}
2241ea8dc4b6Seschrock 
2242ea8dc4b6Seschrock 	if (acb->acb_done) {
2243fa9e4066Sahrens 		ASSERT(!refcount_is_zero(&hdr->b_refcnt));
2244fa9e4066Sahrens 		acb->acb_done(zio, buf, acb->acb_private);
2245fa9e4066Sahrens 	}
2246fa9e4066Sahrens 
2247ea8dc4b6Seschrock 	kmem_free(acb, sizeof (arc_callback_t));
2248fa9e4066Sahrens }
2249fa9e4066Sahrens 
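/*
 * Write the anonymous buffer 'buf' to disk via zio_write().  The
 * buffer must be released (not in the hash table, no I/O pending);
 * arc_write_done() later inserts the header into the hash table so
 * the newly written data remains cached.
 */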
2250fa9e4066Sahrens int
225144cd46caSbillm arc_write(zio_t *pio, spa_t *spa, int checksum, int compress, int ncopies,
2252fa9e4066Sahrens     uint64_t txg, blkptr_t *bp, arc_buf_t *buf,
2253fa9e4066Sahrens     arc_done_func_t *done, void *private, int priority, int flags,
2254ea8dc4b6Seschrock     uint32_t arc_flags, zbookmark_t *zb)
2255fa9e4066Sahrens {
2256fa9e4066Sahrens 	arc_buf_hdr_t *hdr = buf->b_hdr;
2257fa9e4066Sahrens 	arc_callback_t	*acb;
2258fa9e4066Sahrens 	zio_t	*rzio;
2259fa9e4066Sahrens 
2260fa9e4066Sahrens 	/* this is a private buffer - no locking required */
2261fa9e4066Sahrens 	ASSERT3P(hdr->b_state, ==, arc.anon);
2262fa9e4066Sahrens 	ASSERT(BUF_EMPTY(hdr));
2263fa9e4066Sahrens 	ASSERT(!HDR_IO_ERROR(hdr));
2264c5c6ffa0Smaybee 	ASSERT((hdr->b_flags & ARC_IO_IN_PROGRESS) == 0);
2265c5c6ffa0Smaybee 	ASSERT(hdr->b_acb == 0);
2266fa9e4066Sahrens 	acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP);
2267fa9e4066Sahrens 	acb->acb_done = done;
2268fa9e4066Sahrens 	acb->acb_private = private;
2269fa9e4066Sahrens 	acb->acb_byteswap = (arc_byteswap_func_t *)-1;
2270fa9e4066Sahrens 	hdr->b_acb = acb;
2271ea8dc4b6Seschrock 	hdr->b_flags |= ARC_IO_IN_PROGRESS;
227244cd46caSbillm 	rzio = zio_write(pio, spa, checksum, compress, ncopies, txg, bp,
2273ea8dc4b6Seschrock 	    buf->b_data, hdr->b_size, arc_write_done, buf, priority, flags, zb);
2274fa9e4066Sahrens 
2275fa9e4066Sahrens 	if (arc_flags & ARC_WAIT)
2276fa9e4066Sahrens 		return (zio_wait(rzio));
2277fa9e4066Sahrens 
2278fa9e4066Sahrens 	ASSERT(arc_flags & ARC_NOWAIT);
2279fa9e4066Sahrens 	zio_nowait(rzio);
2280fa9e4066Sahrens 
2281fa9e4066Sahrens 	return (0);
2282fa9e4066Sahrens }
2283fa9e4066Sahrens 
2284fa9e4066Sahrens int
2285fa9e4066Sahrens arc_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
2286fa9e4066Sahrens     zio_done_func_t *done, void *private, uint32_t arc_flags)
2287fa9e4066Sahrens {
2288fa9e4066Sahrens 	arc_buf_hdr_t *ab;
2289fa9e4066Sahrens 	kmutex_t *hash_lock;
2290fa9e4066Sahrens 	zio_t	*zio;
2291fa9e4066Sahrens 
2292fa9e4066Sahrens 	/*
2293fa9e4066Sahrens 	 * If this buffer is in the cache, release it, so it
2294fa9e4066Sahrens 	 * can be re-used.
2295fa9e4066Sahrens 	 */
2296fa9e4066Sahrens 	ab = buf_hash_find(spa, BP_IDENTITY(bp), bp->blk_birth, &hash_lock);
2297fa9e4066Sahrens 	if (ab != NULL) {
2298fa9e4066Sahrens 		/*
2299fa9e4066Sahrens 		 * The checksum of blocks to free is not always
2300fa9e4066Sahrens 		 * preserved (e.g. on the deadlist).  However, if it is
2301fa9e4066Sahrens 		 * nonzero, it should match what we have in the cache.
2302fa9e4066Sahrens 		 */
2303fa9e4066Sahrens 		ASSERT(bp->blk_cksum.zc_word[0] == 0 ||
2304fa9e4066Sahrens 		    ab->b_cksum0 == bp->blk_cksum.zc_word[0]);
230577ed8509Smaybee 		if (ab->b_state != arc.anon)
230677ed8509Smaybee 			arc_change_state(arc.anon, ab, hash_lock);
2307*13506d1eSmaybee 		if (HDR_IO_IN_PROGRESS(ab)) {
2308*13506d1eSmaybee 			/*
2309*13506d1eSmaybee 			 * This should only happen when we prefetch.
2310*13506d1eSmaybee 			 */
2311*13506d1eSmaybee 			ASSERT(ab->b_flags & ARC_PREFETCH);
2312*13506d1eSmaybee 			ASSERT3U(ab->b_datacnt, ==, 1);
2313*13506d1eSmaybee 			ab->b_flags |= ARC_FREED_IN_READ;
2314*13506d1eSmaybee 			if (HDR_IN_HASH_TABLE(ab))
2315*13506d1eSmaybee 				buf_hash_remove(ab);
2316*13506d1eSmaybee 			ab->b_arc_access = 0;
2317*13506d1eSmaybee 			bzero(&ab->b_dva, sizeof (dva_t));
2318*13506d1eSmaybee 			ab->b_birth = 0;
2319*13506d1eSmaybee 			ab->b_cksum0 = 0;
2320*13506d1eSmaybee 			ab->b_buf->b_efunc = NULL;
2321*13506d1eSmaybee 			ab->b_buf->b_private = NULL;
2322*13506d1eSmaybee 			mutex_exit(hash_lock);
2323*13506d1eSmaybee 		} else if (refcount_is_zero(&ab->b_refcnt)) {
2324fa9e4066Sahrens 			mutex_exit(hash_lock);
2325ea8dc4b6Seschrock 			arc_hdr_destroy(ab);
2326fa9e4066Sahrens 			atomic_add_64(&arc.deleted, 1);
2327fa9e4066Sahrens 		} else {
2328bbf4a8dfSmaybee 			/*
2329*13506d1eSmaybee 			 * We still have an active reference on this
2330*13506d1eSmaybee 			 * buffer.  This can happen, e.g., from
2331*13506d1eSmaybee 			 * dbuf_unoverride().
2332bbf4a8dfSmaybee 			 */
2333*13506d1eSmaybee 			ASSERT(!HDR_IN_HASH_TABLE(ab));
2334fa9e4066Sahrens 			ab->b_arc_access = 0;
2335fa9e4066Sahrens 			bzero(&ab->b_dva, sizeof (dva_t));
2336fa9e4066Sahrens 			ab->b_birth = 0;
2337fa9e4066Sahrens 			ab->b_cksum0 = 0;
2338ea8dc4b6Seschrock 			ab->b_buf->b_efunc = NULL;
2339ea8dc4b6Seschrock 			ab->b_buf->b_private = NULL;
2340fa9e4066Sahrens 			mutex_exit(hash_lock);
2341fa9e4066Sahrens 		}
2342fa9e4066Sahrens 	}
2343fa9e4066Sahrens 
2344fa9e4066Sahrens 	zio = zio_free(pio, spa, txg, bp, done, private);
2345fa9e4066Sahrens 
2346fa9e4066Sahrens 	if (arc_flags & ARC_WAIT)
2347fa9e4066Sahrens 		return (zio_wait(zio));
2348fa9e4066Sahrens 
2349fa9e4066Sahrens 	ASSERT(arc_flags & ARC_NOWAIT);
2350fa9e4066Sahrens 	zio_nowait(zio);
2351fa9e4066Sahrens 
2352fa9e4066Sahrens 	return (0);
2353fa9e4066Sahrens }
2354fa9e4066Sahrens 
2355fa9e4066Sahrens void
2356fa9e4066Sahrens arc_tempreserve_clear(uint64_t tempreserve)
2357fa9e4066Sahrens {
2358fa9e4066Sahrens 	atomic_add_64(&arc_tempreserve, -tempreserve);
2359fa9e4066Sahrens 	ASSERT((int64_t)arc_tempreserve >= 0);
2360fa9e4066Sahrens }
2361fa9e4066Sahrens 
2362fa9e4066Sahrens int
2363fa9e4066Sahrens arc_tempreserve_space(uint64_t tempreserve)
2364fa9e4066Sahrens {
2365fa9e4066Sahrens #ifdef ZFS_DEBUG
2366fa9e4066Sahrens 	/*
2367fa9e4066Sahrens 	 * Once in a while, fail for no reason.  Everything should cope.
2368fa9e4066Sahrens 	 */
2369fa9e4066Sahrens 	if (spa_get_random(10000) == 0) {
2370fa9e4066Sahrens 		dprintf("forcing random failure\n");
2371fa9e4066Sahrens 		return (ERESTART);
2372fa9e4066Sahrens 	}
2373fa9e4066Sahrens #endif
2374112fe045Smaybee 	if (tempreserve > arc.c/4 && !arc.no_grow)
2375112fe045Smaybee 		arc.c = MIN(arc.c_max, tempreserve * 4);
2376112fe045Smaybee 	if (tempreserve > arc.c)
2377112fe045Smaybee 		return (ENOMEM);
2378112fe045Smaybee 
2379fa9e4066Sahrens 	/*
2380112fe045Smaybee 	 * Throttle writes when the amount of dirty data in the cache
2381112fe045Smaybee 	 * gets too large.  We try to keep the cache less than half full
2382112fe045Smaybee 	 * of dirty blocks so that our sync times don't grow too large.
2383112fe045Smaybee 	 * Note: if two requests come in concurrently, we might let them
2384112fe045Smaybee 	 * both succeed, when one of them should fail.  Not a huge deal.
2385112fe045Smaybee 	 *
2386112fe045Smaybee 	 * XXX The limit should be adjusted dynamically to keep the time
2387112fe045Smaybee 	 * to sync a dataset fixed (around 1-5 seconds?).
2388fa9e4066Sahrens 	 */
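	/*
	 * Worked example: with arc.c at 1GB, a reservation fails with
	 * ERESTART once it would push reserved-plus-anonymous data past
	 * 512MB while existing reserves plus anonymous data already
	 * exceed 256MB.
	 */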
2389fa9e4066Sahrens 
2390112fe045Smaybee 	if (tempreserve + arc_tempreserve + arc.anon->size > arc.c / 2 &&
2391112fe045Smaybee 	    arc_tempreserve + arc.anon->size > arc.c / 4) {
2392fa9e4066Sahrens 		dprintf("failing, arc_tempreserve=%lluK anon=%lluK "
2393fa9e4066Sahrens 		    "tempreserve=%lluK arc.c=%lluK\n",
2394fa9e4066Sahrens 		    arc_tempreserve>>10, arc.anon->lsize>>10,
2395fa9e4066Sahrens 		    tempreserve>>10, arc.c>>10);
2396fa9e4066Sahrens 		return (ERESTART);
2397fa9e4066Sahrens 	}
2398fa9e4066Sahrens 	atomic_add_64(&arc_tempreserve, tempreserve);
2399fa9e4066Sahrens 	return (0);
2400fa9e4066Sahrens }

void
arc_init(void)
{
	mutex_init(&arc_reclaim_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&arc_reclaim_thr_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&arc_reclaim_thr_cv, NULL, CV_DEFAULT, NULL);

	/* Convert seconds to clock ticks */
	arc_min_prefetch_lifespan *= hz;

	/* Start out with 1/8 of all memory */
	arc.c = physmem * PAGESIZE / 8;

#ifdef _KERNEL
	/*
	 * On architectures where the physical memory can be larger
	 * than the addressable space (Intel in 32-bit mode), we may
	 * need to limit the cache to 1/8 of VM size.
	 */
	arc.c = MIN(arc.c, vmem_size(heap_arena, VMEM_ALLOC | VMEM_FREE) / 8);
#endif
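
	/*
	 * Worked example (assumed figures, for illustration only): on a
	 * 32-bit x86 machine with 8 GB of physical memory but roughly
	 * 1 GB of kernel heap, physmem * PAGESIZE / 8 would be 1 GB,
	 * while vmem_size(heap_arena, ...) / 8 would be about 128 MB,
	 * so the MIN() above clamps arc.c to what the kernel can
	 * actually address.
	 */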

	/* set min cache to 1/32 of all memory, or 64MB, whichever is more */
	arc.c_min = MAX(arc.c / 4, 64<<20);
	/* set max to 3/4 of all memory, or all but 1GB, whichever is more */
	if (arc.c * 8 >= 1<<30)
		arc.c_max = (arc.c * 8) - (1<<30);
	else
		arc.c_max = arc.c_min;
	arc.c_max = MAX(arc.c * 6, arc.c_max);
	arc.c = arc.c_max;
	arc.p = (arc.c >> 1);

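	/*
	 * Worked example of the sizing above (assuming a 64-bit machine
	 * with 4 GB of memory, before the kmem_debugging() adjustment
	 * below): arc.c starts at 4 GB / 8 = 512 MB, so arc.c_min =
	 * MAX(128 MB, 64 MB) = 128 MB; arc.c * 8 = 4 GB >= 1 GB, so
	 * arc.c_max = 4 GB - 1 GB = 3 GB, then MAX(512 MB * 6, 3 GB) =
	 * 3 GB.  The cache starts at arc.c = 3 GB with the adaptive
	 * target arc.p at half of that, 1.5 GB.
	 */
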
	/* if kmem_flags are set, let's try to use less memory */
	if (kmem_debugging())
		arc.c = arc.c / 2;
	if (arc.c < arc.c_min)
		arc.c = arc.c_min;

	/*
	 * Hook up the five ARC states: anonymous buffers (not yet
	 * associated with an on-disk block), the MRU and MFU caches,
	 * and their ghost lists, which track recently evicted headers.
	 */
	arc.anon = &ARC_anon;
	arc.mru = &ARC_mru;
	arc.mru_ghost = &ARC_mru_ghost;
	arc.mfu = &ARC_mfu;
	arc.mfu_ghost = &ARC_mfu_ghost;
	arc.size = 0;

	list_create(&arc.mru->list, sizeof (arc_buf_hdr_t),
	    offsetof(arc_buf_hdr_t, b_arc_node));
	list_create(&arc.mru_ghost->list, sizeof (arc_buf_hdr_t),
	    offsetof(arc_buf_hdr_t, b_arc_node));
	list_create(&arc.mfu->list, sizeof (arc_buf_hdr_t),
	    offsetof(arc_buf_hdr_t, b_arc_node));
	list_create(&arc.mfu_ghost->list, sizeof (arc_buf_hdr_t),
	    offsetof(arc_buf_hdr_t, b_arc_node));

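	/*
	 * Note on the list_create() calls above: the offsetof() argument
	 * tells the kernel list code where the link node lives inside
	 * each arc_buf_hdr_t, so insertion and removal never allocate
	 * memory.  A hedged sketch of how a header would move onto and
	 * off a list (hdr is a hypothetical arc_buf_hdr_t *, linked
	 * through its embedded b_arc_node):
	 *
	 *	list_insert_head(&arc.mru->list, hdr);
	 *	...
	 *	list_remove(&arc.mru->list, hdr);
	 */
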
	buf_init();

	arc_thread_exit = 0;
	arc_eviction_list = NULL;
	mutex_init(&arc_eviction_mtx, NULL, MUTEX_DEFAULT, NULL);

	(void) thread_create(NULL, 0, arc_reclaim_thread, NULL, 0, &p0,
	    TS_RUN, minclsyspri);
}

void
arc_fini(void)
{
	/*
	 * Signal the reclaim thread to exit, then wait for it to
	 * acknowledge by clearing arc_thread_exit.
	 */
	mutex_enter(&arc_reclaim_thr_lock);
	arc_thread_exit = 1;
	while (arc_thread_exit != 0)
		cv_wait(&arc_reclaim_thr_cv, &arc_reclaim_thr_lock);
	mutex_exit(&arc_reclaim_thr_lock);
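
	/*
	 * A hedged sketch of the other half of this handshake, as the
	 * reclaim thread (defined earlier in this file) would perform
	 * it on its way out; exact details may differ:
	 *
	 *	mutex_enter(&arc_reclaim_thr_lock);
	 *	arc_thread_exit = 0;
	 *	cv_broadcast(&arc_reclaim_thr_cv);
	 *	mutex_exit(&arc_reclaim_thr_lock);
	 *	thread_exit();
	 */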

	arc_flush();

	arc_dead = TRUE;

	mutex_destroy(&arc_eviction_mtx);
	mutex_destroy(&arc_reclaim_lock);
	mutex_destroy(&arc_reclaim_thr_lock);
	cv_destroy(&arc_reclaim_thr_cv);

	list_destroy(&arc.mru->list);
	list_destroy(&arc.mru_ghost->list);
	list_destroy(&arc.mfu->list);
	list_destroy(&arc.mfu_ghost->list);

	buf_fini();
}