xref: /illumos-gate/usr/src/uts/common/fs/zfs/arc.c (revision 033f983390fa5d2b54e3e09d83ac9000d71ddaae)
1fa9e4066Sahrens /*
2fa9e4066Sahrens  * CDDL HEADER START
3fa9e4066Sahrens  *
4fa9e4066Sahrens  * The contents of this file are subject to the terms of the
5*033f9833Sek  * Common Development and Distribution License (the "License").
6*033f9833Sek  * You may not use this file except in compliance with the License.
7fa9e4066Sahrens  *
8fa9e4066Sahrens  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9fa9e4066Sahrens  * or http://www.opensolaris.org/os/licensing.
10fa9e4066Sahrens  * See the License for the specific language governing permissions
11fa9e4066Sahrens  * and limitations under the License.
12fa9e4066Sahrens  *
13fa9e4066Sahrens  * When distributing Covered Code, include this CDDL HEADER in each
14fa9e4066Sahrens  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15fa9e4066Sahrens  * If applicable, add the following below this CDDL HEADER, with the
16fa9e4066Sahrens  * fields enclosed by brackets "[]" replaced with your own identifying
17fa9e4066Sahrens  * information: Portions Copyright [yyyy] [name of copyright owner]
18fa9e4066Sahrens  *
19fa9e4066Sahrens  * CDDL HEADER END
20fa9e4066Sahrens  */
21fa9e4066Sahrens /*
22*033f9833Sek  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
23fa9e4066Sahrens  * Use is subject to license terms.
24fa9e4066Sahrens  */
25fa9e4066Sahrens 
26fa9e4066Sahrens #pragma ident	"%Z%%M%	%I%	%E% SMI"
27fa9e4066Sahrens 
28fa9e4066Sahrens /*
29fa9e4066Sahrens  * DVA-based Adjustable Replacement Cache
30fa9e4066Sahrens  *
31fa9e4066Sahrens  * While much of the theory of operation and algorithms used here
32fa9e4066Sahrens  * are based on the self-tuning, low overhead replacement cache
33fa9e4066Sahrens  * presented by Megiddo and Modha at FAST 2003, there are some
34fa9e4066Sahrens  * significant differences:
35fa9e4066Sahrens  *
36fa9e4066Sahrens  * 1. The Megiddo and Modha model assumes any page is evictable.
37fa9e4066Sahrens  * Pages in its cache cannot be "locked" into memory.  This makes
38fa9e4066Sahrens  * the eviction algorithm simple: evict the last page in the list.
39fa9e4066Sahrens  * This also makes the performance characteristics easy to reason
40fa9e4066Sahrens  * about.  Our cache is not so simple.  At any given moment, some
41fa9e4066Sahrens  * subset of the blocks in the cache are un-evictable because we
42fa9e4066Sahrens  * have handed out a reference to them.  Blocks are only evictable
43fa9e4066Sahrens  * when there are no external references active.  This makes
44fa9e4066Sahrens  * eviction far more problematic:  we choose to evict the evictable
45fa9e4066Sahrens  * blocks that are the "lowest" in the list.
46fa9e4066Sahrens  *
47fa9e4066Sahrens  * There are times when it is not possible to evict the requested
48fa9e4066Sahrens  * space.  In these circumstances we are unable to adjust the cache
49fa9e4066Sahrens  * size.  To prevent the cache growing unbounded at these times we
50fa9e4066Sahrens  * implement a "cache throttle" that slows the flow of new data
51fa9e4066Sahrens  * into the cache until we can make space available.
52fa9e4066Sahrens  *
53fa9e4066Sahrens  * 2. The Megiddo and Modha model assumes a fixed cache size.
54fa9e4066Sahrens  * Pages are evicted when the cache is full and there is a cache
55fa9e4066Sahrens  * miss.  Our model has a variable-sized cache.  It grows with
56fa9e4066Sahrens  * high use, but also tries to react to memory pressure from the
57fa9e4066Sahrens  * operating system: decreasing its size when system memory is
58fa9e4066Sahrens  * tight.
59fa9e4066Sahrens  *
60fa9e4066Sahrens  * 3. The Megiddo and Modha model assumes a fixed page size. All
61fa9e4066Sahrens  * elements of the cache are therefore exactly the same size.  So
62fa9e4066Sahrens  * when adjusting the cache size following a cache miss, it's simply
63fa9e4066Sahrens  * a matter of choosing a single page to evict.  In our model, we
64fa9e4066Sahrens  * have variable-sized cache blocks (ranging from 512 bytes to
65fa9e4066Sahrens  * 128K bytes).  We therefore choose a set of blocks to evict to make
66fa9e4066Sahrens  * space for a cache miss that approximates as closely as possible
67fa9e4066Sahrens  * the space used by the new block.
68fa9e4066Sahrens  *
69fa9e4066Sahrens  * See also:  "ARC: A Self-Tuning, Low Overhead Replacement Cache"
70fa9e4066Sahrens  * by N. Megiddo & D. Modha, FAST 2003
71fa9e4066Sahrens  */
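/*
 * For illustration only: a miss that needs roughly 128K of space might be
 * satisfied by evicting a single 128K buffer, or by walking the tail of an
 * evictable list and removing, say, a 64K, a 32K and two 16K buffers until
 * about 128K has been reclaimed.  This evict-until-enough-bytes behavior is
 * what arc_evict_state() below implements.
 */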
72fa9e4066Sahrens 
73fa9e4066Sahrens /*
74fa9e4066Sahrens  * The locking model:
75fa9e4066Sahrens  *
76fa9e4066Sahrens  * A new reference to a cache buffer can be obtained in two
77fa9e4066Sahrens  * ways: 1) via a hash table lookup using the DVA as a key,
78fa9e4066Sahrens  * or 2) via one of the ARC lists.  The arc_read() interface
79fa9e4066Sahrens  * uses method 1, while the internal arc algorithms for
80fa9e4066Sahrens  * adjusting the cache use method 2.  We therefore provide two
81fa9e4066Sahrens  * types of locks: 1) the hash table lock array, and 2) the
82fa9e4066Sahrens  * arc list locks.
83fa9e4066Sahrens  *
84fa9e4066Sahrens  * Buffers do not have their own mutexes; rather they rely on the
85fa9e4066Sahrens  * hash table mutexes for the bulk of their protection (i.e. most
86fa9e4066Sahrens  * fields in the arc_buf_hdr_t are protected by these mutexes).
87fa9e4066Sahrens  *
88fa9e4066Sahrens  * buf_hash_find() returns the appropriate mutex (held) when it
89fa9e4066Sahrens  * locates the requested buffer in the hash table.  It returns
90fa9e4066Sahrens  * NULL for the mutex if the buffer was not in the table.
91fa9e4066Sahrens  *
92fa9e4066Sahrens  * buf_hash_remove() expects the appropriate hash mutex to be
93fa9e4066Sahrens  * already held before it is invoked.
94fa9e4066Sahrens  *
95fa9e4066Sahrens  * Each arc state also has a mutex which is used to protect the
96fa9e4066Sahrens  * buffer list associated with the state.  When attempting to
97fa9e4066Sahrens  * obtain a hash table lock while holding an arc list lock you
98fa9e4066Sahrens  * must use mutex_tryenter() to avoid deadlock.  Also note that
99fa9e4066Sahrens  * the "top" state mutex must be held before the "bot" state mutex.
100fa9e4066Sahrens  *
101fa9e4066Sahrens  * Note that the majority of the performance stats are manipulated
102fa9e4066Sahrens  * with atomic operations.
103fa9e4066Sahrens  */
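/*
 * A minimal sketch of that lock ordering (names are illustrative; the real
 * code is in arc_evict_state() and arc_delete_state() below).  While a
 * state list mutex is held, the hash lock may only be acquired with
 * mutex_tryenter(), and the buffer is simply skipped if the lock is busy:
 *
 *	mutex_enter(&state->mtx);
 *	for (ab = list_tail(&state->list); ab != NULL; ab = ab_prev) {
 *		ab_prev = list_prev(&state->list, ab);
 *		hash_lock = HDR_LOCK(ab);
 *		if (!mutex_tryenter(hash_lock))
 *			continue;		(cannot block here)
 *		... operate on ab under both locks ...
 *		mutex_exit(hash_lock);
 *	}
 *	mutex_exit(&state->mtx);
 */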
104fa9e4066Sahrens 
105fa9e4066Sahrens #include <sys/spa.h>
106fa9e4066Sahrens #include <sys/zio.h>
107fa9e4066Sahrens #include <sys/zfs_context.h>
108fa9e4066Sahrens #include <sys/arc.h>
109fa9e4066Sahrens #include <sys/refcount.h>
110fa9e4066Sahrens #ifdef _KERNEL
111fa9e4066Sahrens #include <sys/vmsystm.h>
112fa9e4066Sahrens #include <vm/anon.h>
113fa9e4066Sahrens #include <sys/fs/swapnode.h>
114*033f9833Sek #include <sys/dnlc.h>
115fa9e4066Sahrens #endif
116fa9e4066Sahrens #include <sys/callb.h>
117fa9e4066Sahrens 
118fa9e4066Sahrens static kmutex_t		arc_reclaim_thr_lock;
119fa9e4066Sahrens static kcondvar_t	arc_reclaim_thr_cv;	/* used to signal reclaim thr */
120fa9e4066Sahrens static uint8_t		arc_thread_exit;
121fa9e4066Sahrens 
122*033f9833Sek #define	ARC_REDUCE_DNLC_PERCENT	3
123*033f9833Sek uint_t arc_reduce_dnlc_percent = ARC_REDUCE_DNLC_PERCENT;
124*033f9833Sek 
125fa9e4066Sahrens typedef enum arc_reclaim_strategy {
126fa9e4066Sahrens 	ARC_RECLAIM_AGGR,		/* Aggressive reclaim strategy */
127fa9e4066Sahrens 	ARC_RECLAIM_CONS		/* Conservative reclaim strategy */
128fa9e4066Sahrens } arc_reclaim_strategy_t;
129fa9e4066Sahrens 
130fa9e4066Sahrens /* number of seconds before growing cache again */
131fa9e4066Sahrens static int		arc_grow_retry = 60;
132fa9e4066Sahrens 
133fa9e4066Sahrens static kmutex_t arc_reclaim_lock;
134fa9e4066Sahrens static int arc_dead;
135fa9e4066Sahrens 
136fa9e4066Sahrens /*
137fa9e4066Sahrens  * Note that buffers can be in one of 5 states:
138fa9e4066Sahrens  *	ARC_anon	- anonymous (discussed below)
139fa9e4066Sahrens  *	ARC_mru_top	- recently used, currently cached
140fa9e4066Sahrens  *	ARC_mru_bot	- recently used, no longer in cache
141fa9e4066Sahrens  *	ARC_mfu_top	- frequently used, currently cached
142fa9e4066Sahrens  *	ARC_mfu_bot	- frequently used, no longer in cache
143fa9e4066Sahrens  * When there are no active references to a buffer, it is
144fa9e4066Sahrens  * linked onto one of the lists in arc.  These are the
145fa9e4066Sahrens  * only buffers that can be evicted or deleted.
146fa9e4066Sahrens  *
147fa9e4066Sahrens  * Anonymous buffers are buffers that are not associated with
148fa9e4066Sahrens  * a DVA.  These are buffers that hold dirty block copies
149fa9e4066Sahrens  * before they are written to stable storage.  By definition,
150fa9e4066Sahrens  * they are "ref'd" and are considered part of arc_mru_top
151fa9e4066Sahrens  * that cannot be freed.  Generally, they will acquire a DVA
152fa9e4066Sahrens  * as they are written and migrate onto the arc_mru_top list.
153fa9e4066Sahrens  */
154fa9e4066Sahrens 
155fa9e4066Sahrens typedef struct arc_state {
156fa9e4066Sahrens 	list_t	list;	/* linked list of evictable buffers in state */
157fa9e4066Sahrens 	uint64_t lsize;	/* total size of buffers in the linked list */
158fa9e4066Sahrens 	uint64_t size;	/* total size of all buffers in this state */
159fa9e4066Sahrens 	uint64_t hits;
160fa9e4066Sahrens 	kmutex_t mtx;
161fa9e4066Sahrens } arc_state_t;
162fa9e4066Sahrens 
163fa9e4066Sahrens /* The 5 states: */
164fa9e4066Sahrens static arc_state_t ARC_anon;
165fa9e4066Sahrens static arc_state_t ARC_mru_top;
166fa9e4066Sahrens static arc_state_t ARC_mru_bot;
167fa9e4066Sahrens static arc_state_t ARC_mfu_top;
168fa9e4066Sahrens static arc_state_t ARC_mfu_bot;
169fa9e4066Sahrens 
170fa9e4066Sahrens static struct arc {
171fa9e4066Sahrens 	arc_state_t 	*anon;
172fa9e4066Sahrens 	arc_state_t	*mru_top;
173fa9e4066Sahrens 	arc_state_t	*mru_bot;
174fa9e4066Sahrens 	arc_state_t	*mfu_top;
175fa9e4066Sahrens 	arc_state_t	*mfu_bot;
176fa9e4066Sahrens 	uint64_t	size;		/* Actual total arc size */
177fa9e4066Sahrens 	uint64_t	p;		/* Target size (in bytes) of mru_top */
178fa9e4066Sahrens 	uint64_t	c;		/* Target size of cache (in bytes) */
179fa9e4066Sahrens 	uint64_t	c_min;		/* Minimum target cache size */
180fa9e4066Sahrens 	uint64_t	c_max;		/* Maximum target cache size */
181fa9e4066Sahrens 	uint64_t	incr;		/* Size by which to increment arc.c */
182fa9e4066Sahrens 	int64_t		size_check;
183fa9e4066Sahrens 
184fa9e4066Sahrens 	/* performance stats */
185fa9e4066Sahrens 	uint64_t	hits;
186fa9e4066Sahrens 	uint64_t	misses;
187fa9e4066Sahrens 	uint64_t	deleted;
188fa9e4066Sahrens 	uint64_t	skipped;
189fa9e4066Sahrens 	uint64_t	hash_elements;
190fa9e4066Sahrens 	uint64_t	hash_elements_max;
191fa9e4066Sahrens 	uint64_t	hash_collisions;
192fa9e4066Sahrens 	uint64_t	hash_chains;
193fa9e4066Sahrens 	uint32_t	hash_chain_max;
194fa9e4066Sahrens 
195fa9e4066Sahrens 	int		no_grow;	/* Don't try to grow cache size */
196fa9e4066Sahrens } arc;
197fa9e4066Sahrens 
198fa9e4066Sahrens /* Default amount to grow arc.incr */
199fa9e4066Sahrens static int64_t arc_incr_size = 1024;
200fa9e4066Sahrens 
201fa9e4066Sahrens /* > 0 ==> time to increment arc.c */
202fa9e4066Sahrens static int64_t arc_size_check_default = -1000;
203fa9e4066Sahrens 
204fa9e4066Sahrens static uint64_t arc_tempreserve;
205fa9e4066Sahrens 
206fa9e4066Sahrens typedef struct arc_callback arc_callback_t;
207fa9e4066Sahrens 
208fa9e4066Sahrens struct arc_callback {
209fa9e4066Sahrens 	arc_done_func_t		*acb_done;
210fa9e4066Sahrens 	void			*acb_private;
211fa9e4066Sahrens 	arc_byteswap_func_t	*acb_byteswap;
212fa9e4066Sahrens 	arc_buf_t		*acb_buf;
213fa9e4066Sahrens 	zio_t			*acb_zio_dummy;
214fa9e4066Sahrens 	arc_callback_t		*acb_next;
215fa9e4066Sahrens };
216fa9e4066Sahrens 
217fa9e4066Sahrens struct arc_buf_hdr {
218fa9e4066Sahrens 	/* immutable */
219fa9e4066Sahrens 	uint64_t		b_size;
220fa9e4066Sahrens 	spa_t			*b_spa;
221fa9e4066Sahrens 
222fa9e4066Sahrens 	/* protected by hash lock */
223fa9e4066Sahrens 	dva_t			b_dva;
224fa9e4066Sahrens 	uint64_t		b_birth;
225fa9e4066Sahrens 	uint64_t		b_cksum0;
226fa9e4066Sahrens 
227fa9e4066Sahrens 	arc_buf_hdr_t		*b_hash_next;
228fa9e4066Sahrens 	arc_buf_t		*b_buf;
229fa9e4066Sahrens 	uint32_t		b_flags;
230fa9e4066Sahrens 
231fa9e4066Sahrens 	kcondvar_t		b_cv;
232fa9e4066Sahrens 	arc_callback_t		*b_acb;
233fa9e4066Sahrens 
234fa9e4066Sahrens 	/* protected by arc state mutex */
235fa9e4066Sahrens 	arc_state_t		*b_state;
236fa9e4066Sahrens 	list_node_t		b_arc_node;
237fa9e4066Sahrens 
238fa9e4066Sahrens 	/* updated atomically */
239fa9e4066Sahrens 	clock_t			b_arc_access;
240fa9e4066Sahrens 
241fa9e4066Sahrens 	/* self protecting */
242fa9e4066Sahrens 	refcount_t		b_refcnt;
243fa9e4066Sahrens };
244fa9e4066Sahrens 
245fa9e4066Sahrens /*
246fa9e4066Sahrens  * Private ARC flags.  These flags are private, ARC-only flags that will show up
247fa9e4066Sahrens  * in b_flags in the arc_buf_hdr_t.  Some flags are publicly declared, and can
248fa9e4066Sahrens  * be passed in as arc_flags in things like arc_read.  However, these flags
249fa9e4066Sahrens  * should never be passed and should only be set by ARC code.  When adding new
250fa9e4066Sahrens  * public flags, make sure not to smash the private ones.
251fa9e4066Sahrens  */
252fa9e4066Sahrens 
253fa9e4066Sahrens #define	ARC_IO_IN_PROGRESS	(1 << 10)	/* I/O in progress for buf */
254fa9e4066Sahrens #define	ARC_IO_ERROR		(1 << 11)	/* I/O failed for buf */
255fa9e4066Sahrens #define	ARC_FREED_IN_READ	(1 << 12)	/* buf freed while in read */
256fa9e4066Sahrens 
257fa9e4066Sahrens #define	HDR_IO_IN_PROGRESS(hdr)	((hdr)->b_flags & ARC_IO_IN_PROGRESS)
258fa9e4066Sahrens #define	HDR_IO_ERROR(hdr)	((hdr)->b_flags & ARC_IO_ERROR)
259fa9e4066Sahrens #define	HDR_FREED_IN_READ(hdr)	((hdr)->b_flags & ARC_FREED_IN_READ)
260fa9e4066Sahrens 
261fa9e4066Sahrens /*
262fa9e4066Sahrens  * Hash table routines
263fa9e4066Sahrens  */
264fa9e4066Sahrens 
265fa9e4066Sahrens #define	HT_LOCK_PAD	64
266fa9e4066Sahrens 
267fa9e4066Sahrens struct ht_lock {
268fa9e4066Sahrens 	kmutex_t	ht_lock;
269fa9e4066Sahrens #ifdef _KERNEL
270fa9e4066Sahrens 	unsigned char	pad[(HT_LOCK_PAD - sizeof (kmutex_t))];
271fa9e4066Sahrens #endif
272fa9e4066Sahrens };
273fa9e4066Sahrens 
274fa9e4066Sahrens #define	BUF_LOCKS 256
275fa9e4066Sahrens typedef struct buf_hash_table {
276fa9e4066Sahrens 	uint64_t ht_mask;
277fa9e4066Sahrens 	arc_buf_hdr_t **ht_table;
278fa9e4066Sahrens 	struct ht_lock ht_locks[BUF_LOCKS];
279fa9e4066Sahrens } buf_hash_table_t;
280fa9e4066Sahrens 
281fa9e4066Sahrens static buf_hash_table_t buf_hash_table;
282fa9e4066Sahrens 
283fa9e4066Sahrens #define	BUF_HASH_INDEX(spa, dva, birth) \
284fa9e4066Sahrens 	(buf_hash(spa, dva, birth) & buf_hash_table.ht_mask)
285fa9e4066Sahrens #define	BUF_HASH_LOCK_NTRY(idx) (buf_hash_table.ht_locks[idx & (BUF_LOCKS-1)])
286fa9e4066Sahrens #define	BUF_HASH_LOCK(idx)	(&(BUF_HASH_LOCK_NTRY(idx).ht_lock))
287fa9e4066Sahrens #define	HDR_LOCK(buf) \
288fa9e4066Sahrens 	(BUF_HASH_LOCK(BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth)))
289fa9e4066Sahrens 
290fa9e4066Sahrens uint64_t zfs_crc64_table[256];
291fa9e4066Sahrens 
292fa9e4066Sahrens static uint64_t
293fa9e4066Sahrens buf_hash(spa_t *spa, dva_t *dva, uint64_t birth)
294fa9e4066Sahrens {
295fa9e4066Sahrens 	uintptr_t spav = (uintptr_t)spa;
296fa9e4066Sahrens 	uint8_t *vdva = (uint8_t *)dva;
297fa9e4066Sahrens 	uint64_t crc = -1ULL;
298fa9e4066Sahrens 	int i;
299fa9e4066Sahrens 
300fa9e4066Sahrens 	ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
301fa9e4066Sahrens 
302fa9e4066Sahrens 	for (i = 0; i < sizeof (dva_t); i++)
303fa9e4066Sahrens 		crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ vdva[i]) & 0xFF];
304fa9e4066Sahrens 
305fa9e4066Sahrens 	crc ^= (spav>>8) ^ birth;
306fa9e4066Sahrens 
307fa9e4066Sahrens 	return (crc);
308fa9e4066Sahrens }
309fa9e4066Sahrens 
310fa9e4066Sahrens #define	BUF_EMPTY(buf)						\
311fa9e4066Sahrens 	((buf)->b_dva.dva_word[0] == 0 &&			\
312fa9e4066Sahrens 	(buf)->b_dva.dva_word[1] == 0 &&			\
313fa9e4066Sahrens 	(buf)->b_birth == 0)
314fa9e4066Sahrens 
315fa9e4066Sahrens #define	BUF_EQUAL(spa, dva, birth, buf)				\
316fa9e4066Sahrens 	((buf)->b_dva.dva_word[0] == (dva)->dva_word[0]) &&	\
317fa9e4066Sahrens 	((buf)->b_dva.dva_word[1] == (dva)->dva_word[1]) &&	\
318fa9e4066Sahrens 	((buf)->b_birth == birth) && ((buf)->b_spa == spa)
319fa9e4066Sahrens 
320fa9e4066Sahrens static arc_buf_hdr_t *
321fa9e4066Sahrens buf_hash_find(spa_t *spa, dva_t *dva, uint64_t birth, kmutex_t **lockp)
322fa9e4066Sahrens {
323fa9e4066Sahrens 	uint64_t idx = BUF_HASH_INDEX(spa, dva, birth);
324fa9e4066Sahrens 	kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
325fa9e4066Sahrens 	arc_buf_hdr_t *buf;
326fa9e4066Sahrens 
327fa9e4066Sahrens 	mutex_enter(hash_lock);
328fa9e4066Sahrens 	for (buf = buf_hash_table.ht_table[idx]; buf != NULL;
329fa9e4066Sahrens 	    buf = buf->b_hash_next) {
330fa9e4066Sahrens 		if (BUF_EQUAL(spa, dva, birth, buf)) {
331fa9e4066Sahrens 			*lockp = hash_lock;
332fa9e4066Sahrens 			return (buf);
333fa9e4066Sahrens 		}
334fa9e4066Sahrens 	}
335fa9e4066Sahrens 	mutex_exit(hash_lock);
336fa9e4066Sahrens 	*lockp = NULL;
337fa9e4066Sahrens 	return (NULL);
338fa9e4066Sahrens }
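/*
 * Sketch of a typical buf_hash_find() caller (spa, dva and birth are
 * placeholders for the block's identity).  A non-NULL return is protected
 * by the hash lock handed back through lockp, and the caller must drop
 * that lock when it is done with the header:
 *
 *	kmutex_t *hash_lock;
 *	arc_buf_hdr_t *hdr = buf_hash_find(spa, &dva, birth, &hash_lock);
 *
 *	if (hdr != NULL) {
 *		... examine or update hdr while hash_lock is held ...
 *		mutex_exit(hash_lock);
 *	}
 */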
339fa9e4066Sahrens 
340fa9e4066Sahrens /*
341fa9e4066Sahrens  * Insert an entry into the hash table.  If there is already an element
342fa9e4066Sahrens  * equal to the new entry in the hash table, then the existing element
343fa9e4066Sahrens  * will be returned and the new element will not be inserted.
344fa9e4066Sahrens  * Otherwise returns NULL.
345fa9e4066Sahrens  */
346fa9e4066Sahrens static arc_buf_hdr_t *fbufs[4]; /* XXX to find 6341326 */
347fa9e4066Sahrens static kthread_t *fbufs_lastthread;
348fa9e4066Sahrens static arc_buf_hdr_t *
349fa9e4066Sahrens buf_hash_insert(arc_buf_hdr_t *buf, kmutex_t **lockp)
350fa9e4066Sahrens {
351fa9e4066Sahrens 	uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth);
352fa9e4066Sahrens 	kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
353fa9e4066Sahrens 	arc_buf_hdr_t *fbuf;
354fa9e4066Sahrens 	uint32_t max, i;
355fa9e4066Sahrens 
356fa9e4066Sahrens 	fbufs_lastthread = curthread;
357fa9e4066Sahrens 	*lockp = hash_lock;
358fa9e4066Sahrens 	mutex_enter(hash_lock);
359fa9e4066Sahrens 	for (fbuf = buf_hash_table.ht_table[idx], i = 0; fbuf != NULL;
360fa9e4066Sahrens 	    fbuf = fbuf->b_hash_next, i++) {
361fa9e4066Sahrens 		if (i < sizeof (fbufs) / sizeof (fbufs[0]))
362fa9e4066Sahrens 			fbufs[i] = fbuf;
363fa9e4066Sahrens 		if (BUF_EQUAL(buf->b_spa, &buf->b_dva, buf->b_birth, fbuf))
364fa9e4066Sahrens 			return (fbuf);
365fa9e4066Sahrens 	}
366fa9e4066Sahrens 
367fa9e4066Sahrens 	buf->b_hash_next = buf_hash_table.ht_table[idx];
368fa9e4066Sahrens 	buf_hash_table.ht_table[idx] = buf;
369fa9e4066Sahrens 
370fa9e4066Sahrens 	/* collect some hash table performance data */
371fa9e4066Sahrens 	if (i > 0) {
372fa9e4066Sahrens 		atomic_add_64(&arc.hash_collisions, 1);
373fa9e4066Sahrens 		if (i == 1)
374fa9e4066Sahrens 			atomic_add_64(&arc.hash_chains, 1);
375fa9e4066Sahrens 	}
376fa9e4066Sahrens 	while (i > (max = arc.hash_chain_max) &&
377fa9e4066Sahrens 	    max != atomic_cas_32(&arc.hash_chain_max, max, i)) {
378fa9e4066Sahrens 		continue;
379fa9e4066Sahrens 	}
380fa9e4066Sahrens 	atomic_add_64(&arc.hash_elements, 1);
381fa9e4066Sahrens 	if (arc.hash_elements > arc.hash_elements_max)
382fa9e4066Sahrens 		atomic_add_64(&arc.hash_elements_max, 1);
383fa9e4066Sahrens 
384fa9e4066Sahrens 	return (NULL);
385fa9e4066Sahrens }
386fa9e4066Sahrens 
387fa9e4066Sahrens static void
388fa9e4066Sahrens buf_hash_remove(arc_buf_hdr_t *buf)
389fa9e4066Sahrens {
390fa9e4066Sahrens 	arc_buf_hdr_t *fbuf, **bufp;
391fa9e4066Sahrens 	uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth);
392fa9e4066Sahrens 
393fa9e4066Sahrens 	ASSERT(MUTEX_HELD(BUF_HASH_LOCK(idx)));
394fa9e4066Sahrens 
395fa9e4066Sahrens 	bufp = &buf_hash_table.ht_table[idx];
396fa9e4066Sahrens 	while ((fbuf = *bufp) != buf) {
397fa9e4066Sahrens 		ASSERT(fbuf != NULL);
398fa9e4066Sahrens 		bufp = &fbuf->b_hash_next;
399fa9e4066Sahrens 	}
400fa9e4066Sahrens 	*bufp = buf->b_hash_next;
401fa9e4066Sahrens 	buf->b_hash_next = NULL;
402fa9e4066Sahrens 
403fa9e4066Sahrens 	/* collect some hash table performance data */
404fa9e4066Sahrens 	atomic_add_64(&arc.hash_elements, -1);
405fa9e4066Sahrens 	if (buf_hash_table.ht_table[idx] &&
406fa9e4066Sahrens 	    buf_hash_table.ht_table[idx]->b_hash_next == NULL)
407fa9e4066Sahrens 		atomic_add_64(&arc.hash_chains, -1);
408fa9e4066Sahrens }
409fa9e4066Sahrens 
410fa9e4066Sahrens /*
411fa9e4066Sahrens  * Global data structures and functions for the buf kmem cache.
412fa9e4066Sahrens  */
413fa9e4066Sahrens static kmem_cache_t *hdr_cache;
414fa9e4066Sahrens static kmem_cache_t *buf_cache;
415fa9e4066Sahrens 
416fa9e4066Sahrens static void
417fa9e4066Sahrens buf_fini(void)
418fa9e4066Sahrens {
419fa9e4066Sahrens 	int i;
420fa9e4066Sahrens 
421fa9e4066Sahrens 	kmem_free(buf_hash_table.ht_table,
422fa9e4066Sahrens 	    (buf_hash_table.ht_mask + 1) * sizeof (void *));
423fa9e4066Sahrens 	for (i = 0; i < BUF_LOCKS; i++)
424fa9e4066Sahrens 		mutex_destroy(&buf_hash_table.ht_locks[i].ht_lock);
425fa9e4066Sahrens 	kmem_cache_destroy(hdr_cache);
426fa9e4066Sahrens 	kmem_cache_destroy(buf_cache);
427fa9e4066Sahrens }
428fa9e4066Sahrens 
429fa9e4066Sahrens /*
430fa9e4066Sahrens  * Constructor callback - called when the cache is empty
431fa9e4066Sahrens  * and a new buf is requested.
432fa9e4066Sahrens  */
433fa9e4066Sahrens /* ARGSUSED */
434fa9e4066Sahrens static int
435fa9e4066Sahrens hdr_cons(void *vbuf, void *unused, int kmflag)
436fa9e4066Sahrens {
437fa9e4066Sahrens 	arc_buf_hdr_t *buf = vbuf;
438fa9e4066Sahrens 
439fa9e4066Sahrens 	bzero(buf, sizeof (arc_buf_hdr_t));
440fa9e4066Sahrens 	refcount_create(&buf->b_refcnt);
441fa9e4066Sahrens 	cv_init(&buf->b_cv, NULL, CV_DEFAULT, NULL);
442fa9e4066Sahrens 	return (0);
443fa9e4066Sahrens }
444fa9e4066Sahrens 
445fa9e4066Sahrens /*
446fa9e4066Sahrens  * Destructor callback - called when a cached buf is
447fa9e4066Sahrens  * no longer required.
448fa9e4066Sahrens  */
449fa9e4066Sahrens /* ARGSUSED */
450fa9e4066Sahrens static void
451fa9e4066Sahrens hdr_dest(void *vbuf, void *unused)
452fa9e4066Sahrens {
453fa9e4066Sahrens 	arc_buf_hdr_t *buf = vbuf;
454fa9e4066Sahrens 
455fa9e4066Sahrens 	refcount_destroy(&buf->b_refcnt);
456fa9e4066Sahrens 	cv_destroy(&buf->b_cv);
457fa9e4066Sahrens }
458fa9e4066Sahrens 
459fa9e4066Sahrens void arc_kmem_reclaim(void);
460fa9e4066Sahrens 
461fa9e4066Sahrens /*
462fa9e4066Sahrens  * Reclaim callback -- invoked when memory is low.
463fa9e4066Sahrens  */
464fa9e4066Sahrens /* ARGSUSED */
465fa9e4066Sahrens static void
466fa9e4066Sahrens hdr_recl(void *unused)
467fa9e4066Sahrens {
468fa9e4066Sahrens 	dprintf("hdr_recl called\n");
469fa9e4066Sahrens 	arc_kmem_reclaim();
470fa9e4066Sahrens }
471fa9e4066Sahrens 
472fa9e4066Sahrens static void
473fa9e4066Sahrens buf_init(void)
474fa9e4066Sahrens {
475fa9e4066Sahrens 	uint64_t *ct;
476fa9e4066Sahrens 	uint64_t hsize = 1ULL << 10;
477fa9e4066Sahrens 	int i, j;
478fa9e4066Sahrens 
479fa9e4066Sahrens 	/*
480fa9e4066Sahrens 	 * The hash table is big enough to fill all of physical memory
481fa9e4066Sahrens 	 * with an average 4k block size.  The table will take up
482fa9e4066Sahrens 	 * totalmem*sizeof(void*)/4k bytes (e.g. 2MB/GB with 8-byte
483fa9e4066Sahrens 	 * pointers).
484fa9e4066Sahrens 	 */
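	/*
	 * For example, with 8GB of physical memory the loop below settles
	 * on hsize = 2^21 buckets (2^21 * 4K covers 8GB), and the bucket
	 * array then occupies 2^21 * 8 bytes = 16MB on a 64-bit kernel.
	 */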
485fa9e4066Sahrens 	while (hsize * 4096 < physmem * PAGESIZE)
486fa9e4066Sahrens 		hsize <<= 1;
487fa9e4066Sahrens 
488fa9e4066Sahrens 	buf_hash_table.ht_mask = hsize - 1;
489fa9e4066Sahrens 	buf_hash_table.ht_table = kmem_zalloc(hsize * sizeof (void*), KM_SLEEP);
490fa9e4066Sahrens 
491fa9e4066Sahrens 	hdr_cache = kmem_cache_create("arc_buf_hdr_t", sizeof (arc_buf_hdr_t),
492fa9e4066Sahrens 	    0, hdr_cons, hdr_dest, hdr_recl, NULL, NULL, 0);
493fa9e4066Sahrens 	buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t),
494fa9e4066Sahrens 	    0, NULL, NULL, NULL, NULL, NULL, 0);
495fa9e4066Sahrens 
496fa9e4066Sahrens 	for (i = 0; i < 256; i++)
497fa9e4066Sahrens 		for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--)
498fa9e4066Sahrens 			*ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY);
499fa9e4066Sahrens 
500fa9e4066Sahrens 	for (i = 0; i < BUF_LOCKS; i++) {
501fa9e4066Sahrens 		mutex_init(&buf_hash_table.ht_locks[i].ht_lock,
502fa9e4066Sahrens 		    NULL, MUTEX_DEFAULT, NULL);
503fa9e4066Sahrens 	}
504fa9e4066Sahrens }
505fa9e4066Sahrens 
506fa9e4066Sahrens #define	ARC_MINTIME	(hz>>4) /* 62 ms */
507fa9e4066Sahrens 
508fa9e4066Sahrens #define	ARC_TAG		(void *)0x05201962
509fa9e4066Sahrens 
510fa9e4066Sahrens static void
511fa9e4066Sahrens add_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag)
512fa9e4066Sahrens {
513fa9e4066Sahrens 	ASSERT(MUTEX_HELD(hash_lock));
514fa9e4066Sahrens 
515fa9e4066Sahrens 	if ((refcount_add(&ab->b_refcnt, tag) == 1) &&
516fa9e4066Sahrens 	    (ab->b_state != arc.anon)) {
517fa9e4066Sahrens 
518fa9e4066Sahrens 		ASSERT(!MUTEX_HELD(&ab->b_state->mtx));
519fa9e4066Sahrens 		mutex_enter(&ab->b_state->mtx);
520fa9e4066Sahrens 		ASSERT(!refcount_is_zero(&ab->b_refcnt));
521fa9e4066Sahrens 		ASSERT(list_link_active(&ab->b_arc_node));
522fa9e4066Sahrens 		list_remove(&ab->b_state->list, ab);
523fa9e4066Sahrens 		ASSERT3U(ab->b_state->lsize, >=, ab->b_size);
524fa9e4066Sahrens 		ab->b_state->lsize -= ab->b_size;
525fa9e4066Sahrens 		mutex_exit(&ab->b_state->mtx);
526fa9e4066Sahrens 	}
527fa9e4066Sahrens }
528fa9e4066Sahrens 
529fa9e4066Sahrens static int
530fa9e4066Sahrens remove_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag)
531fa9e4066Sahrens {
532fa9e4066Sahrens 	int cnt;
533fa9e4066Sahrens 
534fa9e4066Sahrens 	ASSERT(MUTEX_HELD(hash_lock));
535fa9e4066Sahrens 
536fa9e4066Sahrens 	if (((cnt = refcount_remove(&ab->b_refcnt, tag)) == 0) &&
537fa9e4066Sahrens 	    (ab->b_state != arc.anon)) {
538fa9e4066Sahrens 
539fa9e4066Sahrens 		ASSERT(!MUTEX_HELD(&ab->b_state->mtx));
540fa9e4066Sahrens 		mutex_enter(&ab->b_state->mtx);
541fa9e4066Sahrens 		ASSERT(!list_link_active(&ab->b_arc_node));
542fa9e4066Sahrens 		list_insert_head(&ab->b_state->list, ab);
543fa9e4066Sahrens 		ASSERT(ab->b_buf != NULL);
544fa9e4066Sahrens 		ab->b_state->lsize += ab->b_size;
545fa9e4066Sahrens 		mutex_exit(&ab->b_state->mtx);
546fa9e4066Sahrens 	}
547fa9e4066Sahrens 	return (cnt);
548fa9e4066Sahrens }
549fa9e4066Sahrens 
550fa9e4066Sahrens /*
551fa9e4066Sahrens  * Move the supplied buffer to the indicated state.  The mutex
552fa9e4066Sahrens  * for the buffer must be held by the caller.
553fa9e4066Sahrens  */
554fa9e4066Sahrens static void
555fa9e4066Sahrens arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab,
556fa9e4066Sahrens     kmutex_t *hash_lock)
557fa9e4066Sahrens {
558fa9e4066Sahrens 	arc_buf_t *buf;
559fa9e4066Sahrens 
560fa9e4066Sahrens 	ASSERT(MUTEX_HELD(hash_lock));
561fa9e4066Sahrens 
562fa9e4066Sahrens 	/*
563fa9e4066Sahrens 	 * If this buffer is evictable, transfer it from the
564fa9e4066Sahrens 	 * old state list to the new state list.
565fa9e4066Sahrens 	 */
566fa9e4066Sahrens 	if (refcount_is_zero(&ab->b_refcnt)) {
567fa9e4066Sahrens 		if (ab->b_state != arc.anon) {
568fa9e4066Sahrens 			int drop_mutex = FALSE;
569fa9e4066Sahrens 
570fa9e4066Sahrens 			if (!MUTEX_HELD(&ab->b_state->mtx)) {
571fa9e4066Sahrens 				mutex_enter(&ab->b_state->mtx);
572fa9e4066Sahrens 				drop_mutex = TRUE;
573fa9e4066Sahrens 			}
574fa9e4066Sahrens 			ASSERT(list_link_active(&ab->b_arc_node));
575fa9e4066Sahrens 			list_remove(&ab->b_state->list, ab);
576fa9e4066Sahrens 			ASSERT3U(ab->b_state->lsize, >=, ab->b_size);
577fa9e4066Sahrens 			ab->b_state->lsize -= ab->b_size;
578fa9e4066Sahrens 			if (drop_mutex)
579fa9e4066Sahrens 				mutex_exit(&ab->b_state->mtx);
580fa9e4066Sahrens 		}
581fa9e4066Sahrens 		if (new_state != arc.anon) {
582fa9e4066Sahrens 			int drop_mutex = FALSE;
583fa9e4066Sahrens 
584fa9e4066Sahrens 			if (!MUTEX_HELD(&new_state->mtx)) {
585fa9e4066Sahrens 				mutex_enter(&new_state->mtx);
586fa9e4066Sahrens 				drop_mutex = TRUE;
587fa9e4066Sahrens 			}
588fa9e4066Sahrens 			list_insert_head(&new_state->list, ab);
589fa9e4066Sahrens 			ASSERT(ab->b_buf != NULL);
590fa9e4066Sahrens 			new_state->lsize += ab->b_size;
591fa9e4066Sahrens 			if (drop_mutex)
592fa9e4066Sahrens 				mutex_exit(&new_state->mtx);
593fa9e4066Sahrens 		}
594fa9e4066Sahrens 	}
595fa9e4066Sahrens 
596fa9e4066Sahrens 	ASSERT(!BUF_EMPTY(ab));
597fa9e4066Sahrens 	if (new_state == arc.anon && ab->b_state != arc.anon) {
598fa9e4066Sahrens 		buf_hash_remove(ab);
599fa9e4066Sahrens 	}
600fa9e4066Sahrens 
601fa9e4066Sahrens 	/*
602fa9e4066Sahrens 	 * If this buffer isn't being transferred to one of the MRU
603fa9e4066Sahrens 	 * states, it's safe to clear its prefetch flag.
604fa9e4066Sahrens 	 */
605fa9e4066Sahrens 	if ((new_state != arc.mru_top) && (new_state != arc.mru_bot)) {
606fa9e4066Sahrens 		ab->b_flags &= ~ARC_PREFETCH;
607fa9e4066Sahrens 	}
608fa9e4066Sahrens 
609fa9e4066Sahrens 	buf = ab->b_buf;
610fa9e4066Sahrens 	if (buf == NULL) {
611fa9e4066Sahrens 		ASSERT3U(ab->b_state->size, >=, ab->b_size);
612fa9e4066Sahrens 		atomic_add_64(&ab->b_state->size, -ab->b_size);
613fa9e4066Sahrens 		/* we should only be here if we are deleting state */
614fa9e4066Sahrens 		ASSERT(new_state == arc.anon &&
615fa9e4066Sahrens 		    (ab->b_state == arc.mru_bot || ab->b_state == arc.mfu_bot));
616fa9e4066Sahrens 	} else while (buf) {
617fa9e4066Sahrens 		ASSERT3U(ab->b_state->size, >=, ab->b_size);
618fa9e4066Sahrens 		atomic_add_64(&ab->b_state->size, -ab->b_size);
619fa9e4066Sahrens 		atomic_add_64(&new_state->size, ab->b_size);
620fa9e4066Sahrens 		buf = buf->b_next;
621fa9e4066Sahrens 	}
622fa9e4066Sahrens 	ab->b_state = new_state;
623fa9e4066Sahrens }
624fa9e4066Sahrens 
625fa9e4066Sahrens arc_buf_t *
626fa9e4066Sahrens arc_buf_alloc(spa_t *spa, int size, void *tag)
627fa9e4066Sahrens {
628fa9e4066Sahrens 	arc_buf_hdr_t *hdr;
629fa9e4066Sahrens 	arc_buf_t *buf;
630fa9e4066Sahrens 
631fa9e4066Sahrens 	ASSERT3U(size, >, 0);
632fa9e4066Sahrens 	hdr = kmem_cache_alloc(hdr_cache, KM_SLEEP);
633fa9e4066Sahrens 	ASSERT(BUF_EMPTY(hdr));
634fa9e4066Sahrens 	hdr->b_size = size;
635fa9e4066Sahrens 	hdr->b_spa = spa;
636fa9e4066Sahrens 	hdr->b_state = arc.anon;
637fa9e4066Sahrens 	hdr->b_arc_access = 0;
638fa9e4066Sahrens 	buf = kmem_cache_alloc(buf_cache, KM_SLEEP);
639fa9e4066Sahrens 	buf->b_hdr = hdr;
640fa9e4066Sahrens 	buf->b_next = NULL;
641fa9e4066Sahrens 	buf->b_data = zio_buf_alloc(size);
642fa9e4066Sahrens 	hdr->b_buf = buf;
643fa9e4066Sahrens 	hdr->b_flags = 0;
644fa9e4066Sahrens 	ASSERT(refcount_is_zero(&hdr->b_refcnt));
645fa9e4066Sahrens 	(void) refcount_add(&hdr->b_refcnt, tag);
646fa9e4066Sahrens 
647fa9e4066Sahrens 	atomic_add_64(&arc.size, size);
648fa9e4066Sahrens 	atomic_add_64(&arc.anon->size, size);
649fa9e4066Sahrens 
650fa9e4066Sahrens 	return (buf);
651fa9e4066Sahrens }
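/*
 * Illustrative use of the allocation interface (the tag is an arbitrary
 * pointer identifying the reference holder and must be the same value in
 * both calls; "db" below is just a placeholder):
 *
 *	arc_buf_t *abuf = arc_buf_alloc(spa, size, db);
 *	... fill abuf->b_data with up to size bytes ...
 *	arc_buf_free(abuf, db);
 */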
652fa9e4066Sahrens 
653fa9e4066Sahrens static void
654fa9e4066Sahrens arc_hdr_free(arc_buf_hdr_t *hdr)
655fa9e4066Sahrens {
656fa9e4066Sahrens 	ASSERT(refcount_is_zero(&hdr->b_refcnt));
657fa9e4066Sahrens 	ASSERT3P(hdr->b_state, ==, arc.anon);
658fa9e4066Sahrens 
659fa9e4066Sahrens 	if (!BUF_EMPTY(hdr)) {
660fa9e4066Sahrens 		/*
661fa9e4066Sahrens 		 * We can be called with an arc state lock held,
662fa9e4066Sahrens 		 * so we can't hold a hash lock here.
663fa9e4066Sahrens 		 * ASSERT(not in hash table)
664fa9e4066Sahrens 		 */
665fa9e4066Sahrens 		ASSERT(!HDR_IO_IN_PROGRESS(hdr));
666fa9e4066Sahrens 		bzero(&hdr->b_dva, sizeof (dva_t));
667fa9e4066Sahrens 		hdr->b_birth = 0;
668fa9e4066Sahrens 		hdr->b_cksum0 = 0;
669fa9e4066Sahrens 	}
670fa9e4066Sahrens 	if (hdr->b_buf) {
671fa9e4066Sahrens 		arc_buf_t *buf = hdr->b_buf;
672fa9e4066Sahrens 
673fa9e4066Sahrens 		ASSERT3U(hdr->b_size, >, 0);
674fa9e4066Sahrens 		zio_buf_free(buf->b_data, hdr->b_size);
675fa9e4066Sahrens 		atomic_add_64(&arc.size, -hdr->b_size);
676fa9e4066Sahrens 		ASSERT3U(arc.anon->size, >=, hdr->b_size);
677fa9e4066Sahrens 		atomic_add_64(&arc.anon->size, -hdr->b_size);
678fa9e4066Sahrens 		ASSERT3P(buf->b_next, ==, NULL);
679fa9e4066Sahrens 		kmem_cache_free(buf_cache, buf);
680fa9e4066Sahrens 		hdr->b_buf = NULL;
681fa9e4066Sahrens 	}
682fa9e4066Sahrens 	ASSERT(!list_link_active(&hdr->b_arc_node));
683fa9e4066Sahrens 	ASSERT3P(hdr->b_hash_next, ==, NULL);
684fa9e4066Sahrens 	ASSERT3P(hdr->b_acb, ==, NULL);
685fa9e4066Sahrens 	kmem_cache_free(hdr_cache, hdr);
686fa9e4066Sahrens }
687fa9e4066Sahrens 
688fa9e4066Sahrens void
689fa9e4066Sahrens arc_buf_free(arc_buf_t *buf, void *tag)
690fa9e4066Sahrens {
691fa9e4066Sahrens 	arc_buf_hdr_t *hdr = buf->b_hdr;
692fa9e4066Sahrens 	kmutex_t *hash_lock = HDR_LOCK(hdr);
693fa9e4066Sahrens 	int freeable;
694fa9e4066Sahrens 
695fa9e4066Sahrens 	mutex_enter(hash_lock);
696fa9e4066Sahrens 	if (remove_reference(hdr, hash_lock, tag) > 0) {
697fa9e4066Sahrens 		arc_buf_t **bufp = &hdr->b_buf;
698fa9e4066Sahrens 		arc_state_t *state = hdr->b_state;
699fa9e4066Sahrens 		uint64_t size = hdr->b_size;
700fa9e4066Sahrens 
701fa9e4066Sahrens 		ASSERT(hdr->b_state != arc.anon || HDR_IO_ERROR(hdr));
702fa9e4066Sahrens 		while (*bufp != buf) {
703fa9e4066Sahrens 			ASSERT(*bufp);
704fa9e4066Sahrens 			bufp = &(*bufp)->b_next;
705fa9e4066Sahrens 		}
706fa9e4066Sahrens 		*bufp = buf->b_next;
707fa9e4066Sahrens 		mutex_exit(hash_lock);
708fa9e4066Sahrens 		zio_buf_free(buf->b_data, size);
709fa9e4066Sahrens 		atomic_add_64(&arc.size, -size);
710fa9e4066Sahrens 		kmem_cache_free(buf_cache, buf);
711fa9e4066Sahrens 		ASSERT3U(state->size, >=, size);
712fa9e4066Sahrens 		atomic_add_64(&state->size, -size);
713fa9e4066Sahrens 		return;
714fa9e4066Sahrens 	}
715fa9e4066Sahrens 
716fa9e4066Sahrens 	/* don't free buffers that are in the middle of an async write */
717fa9e4066Sahrens 	freeable = (hdr->b_state == arc.anon && hdr->b_acb == NULL);
718fa9e4066Sahrens 	mutex_exit(hash_lock);
719fa9e4066Sahrens 
720fa9e4066Sahrens 	if (freeable)
721fa9e4066Sahrens 		arc_hdr_free(hdr);
722fa9e4066Sahrens }
723fa9e4066Sahrens 
724fa9e4066Sahrens int
725fa9e4066Sahrens arc_buf_size(arc_buf_t *buf)
726fa9e4066Sahrens {
727fa9e4066Sahrens 	return (buf->b_hdr->b_size);
728fa9e4066Sahrens }
729fa9e4066Sahrens 
730fa9e4066Sahrens /*
731fa9e4066Sahrens  * Evict buffers from list until we've removed the specified number of
732fa9e4066Sahrens  * bytes.  Move the removed buffers to the appropriate evict state.
733fa9e4066Sahrens  */
734fa9e4066Sahrens static uint64_t
735fa9e4066Sahrens arc_evict_state(arc_state_t *state, int64_t bytes)
736fa9e4066Sahrens {
737fa9e4066Sahrens 	arc_state_t *evicted_state;
738fa9e4066Sahrens 	uint64_t bytes_evicted = 0;
739fa9e4066Sahrens 	arc_buf_hdr_t *ab, *ab_prev;
740fa9e4066Sahrens 	kmutex_t *hash_lock;
741fa9e4066Sahrens 
742fa9e4066Sahrens 	ASSERT(state == arc.mru_top || state == arc.mfu_top);
743fa9e4066Sahrens 
744fa9e4066Sahrens 	if (state == arc.mru_top)
745fa9e4066Sahrens 		evicted_state = arc.mru_bot;
746fa9e4066Sahrens 	else
747fa9e4066Sahrens 		evicted_state = arc.mfu_bot;
748fa9e4066Sahrens 
749fa9e4066Sahrens 	mutex_enter(&state->mtx);
750fa9e4066Sahrens 	mutex_enter(&evicted_state->mtx);
751fa9e4066Sahrens 
752fa9e4066Sahrens 	for (ab = list_tail(&state->list); ab; ab = ab_prev) {
753fa9e4066Sahrens 		ab_prev = list_prev(&state->list, ab);
754fa9e4066Sahrens 		hash_lock = HDR_LOCK(ab);
755fa9e4066Sahrens 		if (mutex_tryenter(hash_lock)) {
756fa9e4066Sahrens 			ASSERT3U(refcount_count(&ab->b_refcnt), ==, 0);
757fa9e4066Sahrens 			arc_change_state(evicted_state, ab, hash_lock);
758fa9e4066Sahrens 			zio_buf_free(ab->b_buf->b_data, ab->b_size);
759fa9e4066Sahrens 			atomic_add_64(&arc.size, -ab->b_size);
760fa9e4066Sahrens 			ASSERT3P(ab->b_buf->b_next, ==, NULL);
761fa9e4066Sahrens 			kmem_cache_free(buf_cache, ab->b_buf);
762fa9e4066Sahrens 			ab->b_buf = NULL;
763fa9e4066Sahrens 			DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, ab);
764fa9e4066Sahrens 			bytes_evicted += ab->b_size;
765fa9e4066Sahrens 			mutex_exit(hash_lock);
766fa9e4066Sahrens 			if (bytes_evicted >= bytes)
767fa9e4066Sahrens 				break;
768fa9e4066Sahrens 		} else {
769fa9e4066Sahrens 			atomic_add_64(&arc.skipped, 1);
770fa9e4066Sahrens 		}
771fa9e4066Sahrens 	}
772fa9e4066Sahrens 	mutex_exit(&evicted_state->mtx);
773fa9e4066Sahrens 	mutex_exit(&state->mtx);
774fa9e4066Sahrens 
775fa9e4066Sahrens 	if (bytes_evicted < bytes)
776fa9e4066Sahrens 		dprintf("only evicted %lld bytes from %p",
777fa9e4066Sahrens 		    (longlong_t)bytes_evicted, state);
778fa9e4066Sahrens 
779fa9e4066Sahrens 	return (bytes_evicted);
780fa9e4066Sahrens }
781fa9e4066Sahrens 
782fa9e4066Sahrens /*
783fa9e4066Sahrens  * Remove buffers from list until we've removed the specified number of
784fa9e4066Sahrens  * bytes.  Destroy the buffers that are removed.
785fa9e4066Sahrens  */
786fa9e4066Sahrens static void
787fa9e4066Sahrens arc_delete_state(arc_state_t *state, int64_t bytes)
788fa9e4066Sahrens {
789fa9e4066Sahrens 	uint_t bufs_skipped = 0;
790fa9e4066Sahrens 	uint64_t bytes_deleted = 0;
791fa9e4066Sahrens 	arc_buf_hdr_t *ab, *ab_prev;
792fa9e4066Sahrens 	kmutex_t *hash_lock;
793fa9e4066Sahrens 
794fa9e4066Sahrens top:
795fa9e4066Sahrens 	mutex_enter(&state->mtx);
796fa9e4066Sahrens 	for (ab = list_tail(&state->list); ab; ab = ab_prev) {
797fa9e4066Sahrens 		ab_prev = list_prev(&state->list, ab);
798fa9e4066Sahrens 		hash_lock = HDR_LOCK(ab);
799fa9e4066Sahrens 		if (mutex_tryenter(hash_lock)) {
800fa9e4066Sahrens 			arc_change_state(arc.anon, ab, hash_lock);
801fa9e4066Sahrens 			mutex_exit(hash_lock);
802fa9e4066Sahrens 			atomic_add_64(&arc.deleted, 1);
803fa9e4066Sahrens 			DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, ab);
804fa9e4066Sahrens 			bytes_deleted += ab->b_size;
805fa9e4066Sahrens 			arc_hdr_free(ab);
806fa9e4066Sahrens 			if (bytes >= 0 && bytes_deleted >= bytes)
807fa9e4066Sahrens 				break;
808fa9e4066Sahrens 		} else {
809fa9e4066Sahrens 			if (bytes < 0) {
810fa9e4066Sahrens 				mutex_exit(&state->mtx);
811fa9e4066Sahrens 				mutex_enter(hash_lock);
812fa9e4066Sahrens 				mutex_exit(hash_lock);
813fa9e4066Sahrens 				goto top;
814fa9e4066Sahrens 			}
815fa9e4066Sahrens 			bufs_skipped += 1;
816fa9e4066Sahrens 		}
817fa9e4066Sahrens 	}
818fa9e4066Sahrens 	mutex_exit(&state->mtx);
819fa9e4066Sahrens 
820fa9e4066Sahrens 	if (bufs_skipped) {
821fa9e4066Sahrens 		atomic_add_64(&arc.skipped, bufs_skipped);
822fa9e4066Sahrens 		ASSERT(bytes >= 0);
823fa9e4066Sahrens 	}
824fa9e4066Sahrens 
825fa9e4066Sahrens 	if (bytes_deleted < bytes)
826fa9e4066Sahrens 		dprintf("only deleted %lld bytes from %p",
827fa9e4066Sahrens 		    (longlong_t)bytes_deleted, state);
828fa9e4066Sahrens }
829fa9e4066Sahrens 
830fa9e4066Sahrens static void
831fa9e4066Sahrens arc_adjust(void)
832fa9e4066Sahrens {
833fa9e4066Sahrens 	int64_t top_sz, mru_over, arc_over;
834fa9e4066Sahrens 
835fa9e4066Sahrens 	top_sz = arc.anon->size + arc.mru_top->size;
836fa9e4066Sahrens 
837fa9e4066Sahrens 	if (top_sz > arc.p && arc.mru_top->lsize > 0) {
838fa9e4066Sahrens 		int64_t toevict = MIN(arc.mru_top->lsize, top_sz-arc.p);
839fa9e4066Sahrens 		(void) arc_evict_state(arc.mru_top, toevict);
840fa9e4066Sahrens 		top_sz = arc.anon->size + arc.mru_top->size;
841fa9e4066Sahrens 	}
842fa9e4066Sahrens 
843fa9e4066Sahrens 	mru_over = top_sz + arc.mru_bot->size - arc.c;
844fa9e4066Sahrens 
845fa9e4066Sahrens 	if (mru_over > 0) {
846fa9e4066Sahrens 		if (arc.mru_bot->lsize > 0) {
847fa9e4066Sahrens 			int64_t todelete = MIN(arc.mru_bot->lsize, mru_over);
848fa9e4066Sahrens 			arc_delete_state(arc.mru_bot, todelete);
849fa9e4066Sahrens 		}
850fa9e4066Sahrens 	}
851fa9e4066Sahrens 
852fa9e4066Sahrens 	if ((arc_over = arc.size - arc.c) > 0) {
853fa9e4066Sahrens 		int64_t table_over;
854fa9e4066Sahrens 
855fa9e4066Sahrens 		if (arc.mfu_top->lsize > 0) {
856fa9e4066Sahrens 			int64_t toevict = MIN(arc.mfu_top->lsize, arc_over);
857fa9e4066Sahrens 			(void) arc_evict_state(arc.mfu_top, toevict);
858fa9e4066Sahrens 		}
859fa9e4066Sahrens 
860fa9e4066Sahrens 		table_over = arc.size + arc.mru_bot->lsize + arc.mfu_bot->lsize
861fa9e4066Sahrens 		    - arc.c*2;
862fa9e4066Sahrens 
863fa9e4066Sahrens 		if (table_over > 0 && arc.mfu_bot->lsize > 0) {
864fa9e4066Sahrens 			int64_t todelete = MIN(arc.mfu_bot->lsize, table_over);
865fa9e4066Sahrens 			arc_delete_state(arc.mfu_bot, todelete);
866fa9e4066Sahrens 		}
867fa9e4066Sahrens 	}
868fa9e4066Sahrens }
869fa9e4066Sahrens 
870fa9e4066Sahrens /*
871fa9e4066Sahrens  * Flush all *evictable* data from the cache.
872fa9e4066Sahrens  * NOTE: this will not touch "active" (i.e. referenced) data.
873fa9e4066Sahrens  */
874fa9e4066Sahrens void
875fa9e4066Sahrens arc_flush(void)
876fa9e4066Sahrens {
877fa9e4066Sahrens 	arc_delete_state(arc.mru_top, -1);
878fa9e4066Sahrens 	arc_delete_state(arc.mfu_top, -1);
879fa9e4066Sahrens 
880fa9e4066Sahrens 	arc_delete_state(arc.mru_bot, -1);
881fa9e4066Sahrens 	arc_delete_state(arc.mfu_bot, -1);
882fa9e4066Sahrens }
883fa9e4066Sahrens 
884fa9e4066Sahrens void
885fa9e4066Sahrens arc_kmem_reclaim(void)
886fa9e4066Sahrens {
887fa9e4066Sahrens 	/* Remove 6.25% */
888fa9e4066Sahrens 	/*
889fa9e4066Sahrens 	 * We need arc_reclaim_lock because we don't want multiple
890fa9e4066Sahrens 	 * threads trying to reclaim concurrently.
891fa9e4066Sahrens 	 */
892fa9e4066Sahrens 
893fa9e4066Sahrens 	/*
894fa9e4066Sahrens 	 * umem calls the reclaim func when we destroy the buf cache,
895fa9e4066Sahrens 	 * which is after we do arc_fini().  So we set a flag to prevent
896fa9e4066Sahrens 	 * accessing the destroyed mutexes and lists.
897fa9e4066Sahrens 	 */
898fa9e4066Sahrens 	if (arc_dead)
899fa9e4066Sahrens 		return;
900fa9e4066Sahrens 
901fa9e4066Sahrens 	mutex_enter(&arc_reclaim_lock);
902fa9e4066Sahrens 
903fa9e4066Sahrens 	atomic_add_64(&arc.c, -(arc.c >> 4));
904fa9e4066Sahrens 	if (arc.c < arc.c_min)
905fa9e4066Sahrens 		arc.c = arc.c_min;
906fa9e4066Sahrens 	atomic_add_64(&arc.p, -(arc.p >> 4));
907fa9e4066Sahrens 
908fa9e4066Sahrens 	arc_adjust();
909fa9e4066Sahrens 
910fa9e4066Sahrens 	/* Cool it for a while */
911fa9e4066Sahrens 	arc.incr = 0;
912fa9e4066Sahrens 	arc.size_check = arc_size_check_default << 3;
913fa9e4066Sahrens 
914fa9e4066Sahrens 	mutex_exit(&arc_reclaim_lock);
915fa9e4066Sahrens }
916fa9e4066Sahrens 
917fa9e4066Sahrens static int
918fa9e4066Sahrens arc_reclaim_needed(void)
919fa9e4066Sahrens {
920fa9e4066Sahrens 	uint64_t extra;
921fa9e4066Sahrens 
922fa9e4066Sahrens #ifdef _KERNEL
923fa9e4066Sahrens 	/*
924fa9e4066Sahrens 	 * take 'desfree' extra pages, so we reclaim sooner, rather than later
925fa9e4066Sahrens 	 */
926fa9e4066Sahrens 	extra = desfree;
927fa9e4066Sahrens 
928fa9e4066Sahrens 	/*
929fa9e4066Sahrens 	 * check that we're out of range of the pageout scanner.  It starts to
930fa9e4066Sahrens 	 * schedule paging if freemem is less than lotsfree and needfree.
931fa9e4066Sahrens 	 * lotsfree is the high-water mark for pageout, and needfree is the
932fa9e4066Sahrens 	 * number of needed free pages.  We add extra pages here to make sure
933fa9e4066Sahrens 	 * the scanner doesn't start up while we're freeing memory.
934fa9e4066Sahrens 	 */
935fa9e4066Sahrens 	if (freemem < lotsfree + needfree + extra)
936fa9e4066Sahrens 		return (1);
937fa9e4066Sahrens 
938fa9e4066Sahrens 	/*
939fa9e4066Sahrens 	 * check to make sure that swapfs has enough space so that anon
940fa9e4066Sahrens 	 * reservations can still succeed. anon_resvmem() checks that the
941fa9e4066Sahrens 	 * availrmem is greater than swapfs_minfree, and the number of reserved
942fa9e4066Sahrens 	 * swap pages.  We also add a bit of extra here just to prevent
943fa9e4066Sahrens 	 * circumstances from getting really dire.
944fa9e4066Sahrens 	 */
945fa9e4066Sahrens 	if (availrmem < swapfs_minfree + swapfs_reserve + extra)
946fa9e4066Sahrens 		return (1);
947fa9e4066Sahrens 
948fa9e4066Sahrens 	/*
949fa9e4066Sahrens 	 * If we're on an i386 platform, it's possible that we'll exhaust the
950fa9e4066Sahrens 	 * kernel heap space before we ever run out of available physical
951fa9e4066Sahrens 	 * memory.  Most checks of the size of the heap_area compare against
952fa9e4066Sahrens 	 * tune.t_minarmem, which is the minimum available real memory that we
953fa9e4066Sahrens 	 * can have in the system.  However, this is generally fixed at 25 pages
954fa9e4066Sahrens 	 * which is so low that it's useless.  In this comparison, we seek to
955fa9e4066Sahrens 	 * calculate the total heap-size, and reclaim if more than 3/4ths of the
956fa9e4066Sahrens 	 * heap is allocated.  (Or, in the calculation, if less than 1/4th is
957fa9e4066Sahrens 	 * free)
958fa9e4066Sahrens 	 */
959fa9e4066Sahrens #if defined(__i386)
960fa9e4066Sahrens 	if (btop(vmem_size(heap_arena, VMEM_FREE)) <
961fa9e4066Sahrens 	    (btop(vmem_size(heap_arena, VMEM_FREE | VMEM_ALLOC)) >> 2))
962fa9e4066Sahrens 		return (1);
963fa9e4066Sahrens #endif
964fa9e4066Sahrens 
965fa9e4066Sahrens #else
966fa9e4066Sahrens 	if (spa_get_random(100) == 0)
967fa9e4066Sahrens 		return (1);
968fa9e4066Sahrens #endif
969fa9e4066Sahrens 	return (0);
970fa9e4066Sahrens }
971fa9e4066Sahrens 
972fa9e4066Sahrens static void
973fa9e4066Sahrens arc_kmem_reap_now(arc_reclaim_strategy_t strat)
974fa9e4066Sahrens {
975fa9e4066Sahrens 	size_t			i;
976fa9e4066Sahrens 	kmem_cache_t		*prev_cache = NULL;
977fa9e4066Sahrens 	extern kmem_cache_t	*zio_buf_cache[];
978fa9e4066Sahrens 
979*033f9833Sek #ifdef _KERNEL
980*033f9833Sek 	/*
981*033f9833Sek 	 * First purge some DNLC entries, in case the DNLC is using
982*033f9833Sek 	 * up too much memory.
983*033f9833Sek 	 */
984*033f9833Sek 	dnlc_reduce_cache((void *)arc_reduce_dnlc_percent);
985*033f9833Sek #endif
986*033f9833Sek 
987fa9e4066Sahrens 	/*
988fa9e4066Sahrens 	 * an aggressive reclamation will shrink the cache size as well as reap
989fa9e4066Sahrens 	 * free kmem buffers.  The arc_kmem_reclaim function is called when the
990fa9e4066Sahrens 	 * header-cache is reaped, so we only reap the header cache if we're
991fa9e4066Sahrens 	 * performing an aggressive reclaim.  If we're not, just clean the kmem
992fa9e4066Sahrens 	 * buffer caches.
993fa9e4066Sahrens 	 */
994fa9e4066Sahrens 	if (strat == ARC_RECLAIM_AGGR)
995fa9e4066Sahrens 		kmem_cache_reap_now(hdr_cache);
996fa9e4066Sahrens 
997fa9e4066Sahrens 	kmem_cache_reap_now(buf_cache);
998fa9e4066Sahrens 
999fa9e4066Sahrens 	for (i = 0; i < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; i++) {
1000fa9e4066Sahrens 		if (zio_buf_cache[i] != prev_cache) {
1001fa9e4066Sahrens 			prev_cache = zio_buf_cache[i];
1002fa9e4066Sahrens 			kmem_cache_reap_now(zio_buf_cache[i]);
1003fa9e4066Sahrens 		}
1004fa9e4066Sahrens 	}
1005fa9e4066Sahrens }
1006fa9e4066Sahrens 
1007fa9e4066Sahrens static void
1008fa9e4066Sahrens arc_reclaim_thread(void)
1009fa9e4066Sahrens {
1010fa9e4066Sahrens 	clock_t			growtime = 0;
1011fa9e4066Sahrens 	arc_reclaim_strategy_t	last_reclaim = ARC_RECLAIM_CONS;
1012fa9e4066Sahrens 	callb_cpr_t		cpr;
1013fa9e4066Sahrens 
1014fa9e4066Sahrens 	CALLB_CPR_INIT(&cpr, &arc_reclaim_thr_lock, callb_generic_cpr, FTAG);
1015fa9e4066Sahrens 
1016fa9e4066Sahrens 	mutex_enter(&arc_reclaim_thr_lock);
1017fa9e4066Sahrens 	while (arc_thread_exit == 0) {
1018fa9e4066Sahrens 		if (arc_reclaim_needed()) {
1019fa9e4066Sahrens 
1020fa9e4066Sahrens 			if (arc.no_grow) {
1021fa9e4066Sahrens 				if (last_reclaim == ARC_RECLAIM_CONS) {
1022fa9e4066Sahrens 					last_reclaim = ARC_RECLAIM_AGGR;
1023fa9e4066Sahrens 				} else {
1024fa9e4066Sahrens 					last_reclaim = ARC_RECLAIM_CONS;
1025fa9e4066Sahrens 				}
1026fa9e4066Sahrens 			} else {
1027fa9e4066Sahrens 				arc.no_grow = TRUE;
1028fa9e4066Sahrens 				last_reclaim = ARC_RECLAIM_AGGR;
1029fa9e4066Sahrens 				membar_producer();
1030fa9e4066Sahrens 			}
1031fa9e4066Sahrens 
1032fa9e4066Sahrens 			/* reset the growth delay for every reclaim */
1033fa9e4066Sahrens 			growtime = lbolt + (arc_grow_retry * hz);
1034fa9e4066Sahrens 
1035fa9e4066Sahrens 			arc_kmem_reap_now(last_reclaim);
1036fa9e4066Sahrens 
1037fa9e4066Sahrens 		} else if ((growtime > 0) && ((growtime - lbolt) <= 0)) {
1038fa9e4066Sahrens 			arc.no_grow = FALSE;
1039fa9e4066Sahrens 		}
1040fa9e4066Sahrens 
1041fa9e4066Sahrens 		/* block until needed, or one second, whichever is shorter */
1042fa9e4066Sahrens 		CALLB_CPR_SAFE_BEGIN(&cpr);
1043fa9e4066Sahrens 		(void) cv_timedwait(&arc_reclaim_thr_cv,
1044fa9e4066Sahrens 		    &arc_reclaim_thr_lock, (lbolt + hz));
1045fa9e4066Sahrens 		CALLB_CPR_SAFE_END(&cpr, &arc_reclaim_thr_lock);
1046fa9e4066Sahrens 	}
1047fa9e4066Sahrens 
1048fa9e4066Sahrens 	arc_thread_exit = 0;
1049fa9e4066Sahrens 	cv_broadcast(&arc_reclaim_thr_cv);
1050fa9e4066Sahrens 	CALLB_CPR_EXIT(&cpr);		/* drops arc_reclaim_thr_lock */
1051fa9e4066Sahrens 	thread_exit();
1052fa9e4066Sahrens }
1053fa9e4066Sahrens 
1054fa9e4066Sahrens static void
1055fa9e4066Sahrens arc_try_grow(int64_t bytes)
1056fa9e4066Sahrens {
1057fa9e4066Sahrens 	/*
1058fa9e4066Sahrens 	 * If we're within (2 * maxblocksize) bytes of the target
1059fa9e4066Sahrens 	 * cache size, increment the target cache size
1060fa9e4066Sahrens 	 */
1061fa9e4066Sahrens 	atomic_add_64((uint64_t *)&arc.size_check, 1);
1062fa9e4066Sahrens 
1063fa9e4066Sahrens 	if (arc_reclaim_needed()) {
1064fa9e4066Sahrens 		cv_signal(&arc_reclaim_thr_cv);
1065fa9e4066Sahrens 		return;
1066fa9e4066Sahrens 	}
1067fa9e4066Sahrens 
1068fa9e4066Sahrens 	if (arc.no_grow)
1069fa9e4066Sahrens 		return;
1070fa9e4066Sahrens 
1071fa9e4066Sahrens 	/*
1072fa9e4066Sahrens 	 * Grow arc.c (and arc.p) if we're within (2 * maxblocksize) of the
1073fa9e4066Sahrens 	 * target cache size, or if the cache has already overflowed its
1074fa9e4066Sahrens 	 * target.  (We bailed out above if growth is disallowed.)
1075fa9e4066Sahrens 	 */
1076fa9e4066Sahrens 	if ((arc.c - arc.size) <= (2ULL << SPA_MAXBLOCKSHIFT)) {
1077fa9e4066Sahrens 		if (arc.size_check > 0) {
1078fa9e4066Sahrens 			arc.size_check = arc_size_check_default;
1079fa9e4066Sahrens 			atomic_add_64(&arc.incr, arc_incr_size);
1080fa9e4066Sahrens 		}
1081fa9e4066Sahrens 		atomic_add_64(&arc.c, MIN(bytes, arc.incr));
1082fa9e4066Sahrens 		if (arc.c > arc.c_max)
1083fa9e4066Sahrens 			arc.c = arc.c_max;
1084fa9e4066Sahrens 		else
1085fa9e4066Sahrens 			atomic_add_64(&arc.p, MIN(bytes, arc.incr));
1086fa9e4066Sahrens 	} else if (arc.size > arc.c) {
1087fa9e4066Sahrens 		if (arc.size_check > 0) {
1088fa9e4066Sahrens 			arc.size_check = arc_size_check_default;
1089fa9e4066Sahrens 			atomic_add_64(&arc.incr, arc_incr_size);
1090fa9e4066Sahrens 		}
1091fa9e4066Sahrens 		atomic_add_64(&arc.c, MIN(bytes, arc.incr));
1092fa9e4066Sahrens 		if (arc.c > arc.c_max)
1093fa9e4066Sahrens 			arc.c = arc.c_max;
1094fa9e4066Sahrens 		else
1095fa9e4066Sahrens 			atomic_add_64(&arc.p, MIN(bytes, arc.incr));
1096fa9e4066Sahrens 	}
1097fa9e4066Sahrens }
1098fa9e4066Sahrens 
1099fa9e4066Sahrens /*
1100fa9e4066Sahrens  * check if the cache has reached its limits and eviction is required prior to
1101fa9e4066Sahrens  * insert.  In this situation, we want to evict if no_grow is set.  Otherwise, the
1102fa9e4066Sahrens  * cache is either big enough that we can insert, or an arc_try_grow will result
1103fa9e4066Sahrens  * in more space being made available.
1104fa9e4066Sahrens  */
1105fa9e4066Sahrens 
1106fa9e4066Sahrens static int
1107fa9e4066Sahrens arc_evict_needed()
1108fa9e4066Sahrens {
1109fa9e4066Sahrens 
1110fa9e4066Sahrens 	if (arc_reclaim_needed())
1111fa9e4066Sahrens 		return (1);
1112fa9e4066Sahrens 
1113fa9e4066Sahrens 	if (arc.no_grow || (arc.c > arc.c_max) || (arc.size > arc.c))
1114fa9e4066Sahrens 		return (1);
1115fa9e4066Sahrens 
1116fa9e4066Sahrens 	return (0);
1117fa9e4066Sahrens }
1118fa9e4066Sahrens 
1119fa9e4066Sahrens /*
1120fa9e4066Sahrens  * The state, supplied as the first argument, is going to have something
1121fa9e4066Sahrens  * inserted on its behalf. So, determine which cache must be victimized to
1122fa9e4066Sahrens  * satisfy an insertion for this state.  We have the following cases:
1123fa9e4066Sahrens  *
1124fa9e4066Sahrens  * 1. Insert for MRU, p > sizeof(arc.anon + arc.mru_top) ->
1125fa9e4066Sahrens  * In this situation if we're out of space, but the resident size of the MFU is
1126fa9e4066Sahrens  * under the limit, victimize the MFU cache to satisfy this insertion request.
1127fa9e4066Sahrens  *
1128fa9e4066Sahrens  * 2. Insert for MRU, p <= sizeof(arc.anon + arc.mru_top) ->
1129fa9e4066Sahrens  * Here, we've used up all of the available space for the MRU, so we need to
1130fa9e4066Sahrens  * evict from our own cache instead.  Evict from the set of resident MRU
1131fa9e4066Sahrens  * entries.
1132fa9e4066Sahrens  *
1133fa9e4066Sahrens  * 3. Insert for MFU (c - p) > sizeof(arc.mfu_top) ->
1134fa9e4066Sahrens  * c minus p represents the MFU space in the cache, since p is the size of the
1135fa9e4066Sahrens  * cache that is dedicated to the MRU.  In this situation there's still space on
1136fa9e4066Sahrens  * the MFU side, so the MRU side needs to be victimized.
1137fa9e4066Sahrens  *
1138fa9e4066Sahrens  * 4. Insert for MFU (c - p) < sizeof(arc.mfu_top) ->
1139fa9e4066Sahrens  * MFU's resident set is consuming more space than it has been allotted.  In
1140fa9e4066Sahrens  * this situation, we must victimize our own cache, the MFU, for this insertion.
1141fa9e4066Sahrens  */
1142fa9e4066Sahrens static void
1143fa9e4066Sahrens arc_evict_for_state(arc_state_t *state, uint64_t bytes)
1144fa9e4066Sahrens {
1145fa9e4066Sahrens 	uint64_t	mru_used;
1146fa9e4066Sahrens 	uint64_t	mfu_space;
1147fa9e4066Sahrens 	uint64_t	evicted;
1148fa9e4066Sahrens 
1149fa9e4066Sahrens 	ASSERT(state == arc.mru_top || state == arc.mfu_top);
1150fa9e4066Sahrens 
1151fa9e4066Sahrens 	if (state == arc.mru_top) {
1152fa9e4066Sahrens 		mru_used = arc.anon->size + arc.mru_top->size;
1153fa9e4066Sahrens 		if (arc.p > mru_used) {
1154fa9e4066Sahrens 			/* case 1 */
1155fa9e4066Sahrens 			evicted = arc_evict_state(arc.mfu_top, bytes);
1156fa9e4066Sahrens 			if (evicted < bytes) {
1157fa9e4066Sahrens 				arc_adjust();
1158fa9e4066Sahrens 			}
1159fa9e4066Sahrens 		} else {
1160fa9e4066Sahrens 			/* case 2 */
1161fa9e4066Sahrens 			evicted = arc_evict_state(arc.mru_top, bytes);
1162fa9e4066Sahrens 			if (evicted < bytes) {
1163fa9e4066Sahrens 				arc_adjust();
1164fa9e4066Sahrens 			}
1165fa9e4066Sahrens 		}
1166fa9e4066Sahrens 	} else {
1167fa9e4066Sahrens 		/* MFU_top case */
1168fa9e4066Sahrens 		mfu_space = arc.c - arc.p;
1169fa9e4066Sahrens 		if (mfu_space > arc.mfu_top->size) {
1170fa9e4066Sahrens 			/* case 3 */
1171fa9e4066Sahrens 			evicted = arc_evict_state(arc.mru_top, bytes);
1172fa9e4066Sahrens 			if (evicted < bytes) {
1173fa9e4066Sahrens 				arc_adjust();
1174fa9e4066Sahrens 			}
1175fa9e4066Sahrens 		} else {
1176fa9e4066Sahrens 			/* case 4 */
1177fa9e4066Sahrens 			evicted = arc_evict_state(arc.mfu_top, bytes);
1178fa9e4066Sahrens 			if (evicted < bytes) {
1179fa9e4066Sahrens 				arc_adjust();
1180fa9e4066Sahrens 			}
1181fa9e4066Sahrens 		}
1182fa9e4066Sahrens 	}
1183fa9e4066Sahrens }
1184fa9e4066Sahrens 
1185fa9e4066Sahrens /*
1186fa9e4066Sahrens  * This routine is called whenever a buffer is accessed.
1187fa9e4066Sahrens  */
1188fa9e4066Sahrens static void
1189fa9e4066Sahrens arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock)
1190fa9e4066Sahrens {
1191fa9e4066Sahrens 	int		blksz, mult;
1192fa9e4066Sahrens 
1193fa9e4066Sahrens 	ASSERT(MUTEX_HELD(hash_lock));
1194fa9e4066Sahrens 
1195fa9e4066Sahrens 	blksz = buf->b_size;
1196fa9e4066Sahrens 
1197fa9e4066Sahrens 	if (buf->b_state == arc.anon) {
1198fa9e4066Sahrens 		/*
1199fa9e4066Sahrens 		 * This buffer is not in the cache, and does not
1200fa9e4066Sahrens 		 * appear in our "ghost" list.  Add the new buffer
1201fa9e4066Sahrens 		 * to the MRU state.
1202fa9e4066Sahrens 		 */
1203fa9e4066Sahrens 
1204fa9e4066Sahrens 		arc_try_grow(blksz);
1205fa9e4066Sahrens 		if (arc_evict_needed()) {
1206fa9e4066Sahrens 			arc_evict_for_state(arc.mru_top, blksz);
1207fa9e4066Sahrens 		}
1208fa9e4066Sahrens 
1209fa9e4066Sahrens 		ASSERT(buf->b_arc_access == 0);
1210fa9e4066Sahrens 		buf->b_arc_access = lbolt;
1211fa9e4066Sahrens 		DTRACE_PROBE1(new_state__mru_top, arc_buf_hdr_t *,
1212fa9e4066Sahrens 		    buf);
1213fa9e4066Sahrens 		arc_change_state(arc.mru_top, buf, hash_lock);
1214fa9e4066Sahrens 
1215fa9e4066Sahrens 		/*
1216fa9e4066Sahrens 		 * If we are using less than 2/3 of our total target
1217fa9e4066Sahrens 		 * cache size, bump up the target size for the MRU
1218fa9e4066Sahrens 		 * list.
1219fa9e4066Sahrens 		 */
1220fa9e4066Sahrens 		if (arc.size < arc.c*2/3) {
1221fa9e4066Sahrens 			arc.p = arc.anon->size + arc.mru_top->size + arc.c/6;
1222fa9e4066Sahrens 		}
1223fa9e4066Sahrens 
1224fa9e4066Sahrens 	} else if (buf->b_state == arc.mru_top) {
1225fa9e4066Sahrens 		/*
1226fa9e4066Sahrens 		 * If this buffer is in the MRU-top state and has the prefetch
1227fa9e4066Sahrens 		 * flag, the first read was actually part of a prefetch.  In
1228fa9e4066Sahrens 		 * this situation, we simply want to clear the flag and return.
1229fa9e4066Sahrens 		 * A subsequent access should bump this into the MFU state.
1230fa9e4066Sahrens 		 */
1231fa9e4066Sahrens 		if ((buf->b_flags & ARC_PREFETCH) != 0) {
1232fa9e4066Sahrens 			buf->b_flags &= ~ARC_PREFETCH;
1233fa9e4066Sahrens 			atomic_add_64(&arc.mru_top->hits, 1);
1234fa9e4066Sahrens 			return;
1235fa9e4066Sahrens 		}
1236fa9e4066Sahrens 
1237fa9e4066Sahrens 		/*
1238fa9e4066Sahrens 		 * This buffer has been "accessed" only once so far,
1239fa9e4066Sahrens 		 * but it is still in the cache.  If enough time has
1240fa9e4066Sahrens 		 * passed since it was added, move it to the MFU state.
1241fa9e4066Sahrens 		 */
1242fa9e4066Sahrens 		if (lbolt > buf->b_arc_access + ARC_MINTIME) {
1243fa9e4066Sahrens 			/*
1244fa9e4066Sahrens 			 * More than 125ms have passed since we
1245fa9e4066Sahrens 			 * instantiated this buffer.  Move it to the
1246fa9e4066Sahrens 			 * most frequently used state.
1247fa9e4066Sahrens 			 */
1248fa9e4066Sahrens 			buf->b_arc_access = lbolt;
1249fa9e4066Sahrens 			DTRACE_PROBE1(new_state__mfu_top,
1250fa9e4066Sahrens 			    arc_buf_hdr_t *, buf);
1251fa9e4066Sahrens 			arc_change_state(arc.mfu_top, buf, hash_lock);
1252fa9e4066Sahrens 		}
1253fa9e4066Sahrens 		atomic_add_64(&arc.mru_top->hits, 1);
1254fa9e4066Sahrens 	} else if (buf->b_state == arc.mru_bot) {
1255fa9e4066Sahrens 		arc_state_t	*new_state;
1256fa9e4066Sahrens 		/*
1257fa9e4066Sahrens 		 * This buffer has been "accessed" recently, but
1258fa9e4066Sahrens 		 * was evicted from the cache.  Move it back in: to the
1259fa9e4066Sahrens 		 * MRU state if it was a prefetch, otherwise to the MFU state.
1260fa9e4066Sahrens 		 */
1261fa9e4066Sahrens 
1262fa9e4066Sahrens 		if (buf->b_flags & ARC_PREFETCH) {
1263fa9e4066Sahrens 			new_state = arc.mru_top;
1264fa9e4066Sahrens 			DTRACE_PROBE1(new_state__mru_top,
1265fa9e4066Sahrens 			    arc_buf_hdr_t *, buf);
1266fa9e4066Sahrens 		} else {
1267fa9e4066Sahrens 			new_state = arc.mfu_top;
1268fa9e4066Sahrens 			DTRACE_PROBE1(new_state__mfu_top,
1269fa9e4066Sahrens 			    arc_buf_hdr_t *, buf);
1270fa9e4066Sahrens 		}
1271fa9e4066Sahrens 
1272fa9e4066Sahrens 		arc_try_grow(blksz);
1273fa9e4066Sahrens 		if (arc_evict_needed()) {
1274fa9e4066Sahrens 			arc_evict_for_state(new_state, blksz);
1275fa9e4066Sahrens 		}
1276fa9e4066Sahrens 
1277fa9e4066Sahrens 		/* Bump up the target size of the MRU list */
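		/*
		 * The adjustment is scaled by the relative sizes of the two
		 * ghost lists: the larger mfu_bot is compared to mru_bot,
		 * the more aggressively the MRU target (arc.p) is grown.
		 */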
1278fa9e4066Sahrens 		mult = ((arc.mru_bot->size >= arc.mfu_bot->size) ?
1279fa9e4066Sahrens 		    1 : (arc.mfu_bot->size/arc.mru_bot->size));
1280fa9e4066Sahrens 		arc.p = MIN(arc.c, arc.p + blksz * mult);
1281fa9e4066Sahrens 
1282fa9e4066Sahrens 		buf->b_arc_access = lbolt;
1283fa9e4066Sahrens 		arc_change_state(new_state, buf, hash_lock);
1284fa9e4066Sahrens 
1285fa9e4066Sahrens 		atomic_add_64(&arc.mru_bot->hits, 1);
1286fa9e4066Sahrens 	} else if (buf->b_state == arc.mfu_top) {
1287fa9e4066Sahrens 		/*
1288fa9e4066Sahrens 		 * This buffer has been accessed more than once and is
1289fa9e4066Sahrens 		 * still in the cache.  Keep it in the MFU state.
1290fa9e4066Sahrens 		 *
1291fa9e4066Sahrens 		 * NOTE: the add_reference() that occurred when we did
1292fa9e4066Sahrens 		 * the arc_read() should have kicked this off the list,
1293fa9e4066Sahrens 		 * so even if it was a prefetch, it will be put back at
1294fa9e4066Sahrens 		 * the head of the list when we remove_reference().
1295fa9e4066Sahrens 		 */
1296fa9e4066Sahrens 		atomic_add_64(&arc.mfu_top->hits, 1);
1297fa9e4066Sahrens 	} else if (buf->b_state == arc.mfu_bot) {
1298fa9e4066Sahrens 		/*
1299fa9e4066Sahrens 		 * This buffer has been accessed more than once but has
1300fa9e4066Sahrens 		 * been evicted from the cache.  Move it back to the
1301fa9e4066Sahrens 		 * MFU state.
1302fa9e4066Sahrens 		 */
1303fa9e4066Sahrens 
1304fa9e4066Sahrens 		arc_try_grow(blksz);
1305fa9e4066Sahrens 		if (arc_evict_needed()) {
1306fa9e4066Sahrens 			arc_evict_for_state(arc.mfu_top, blksz);
1307fa9e4066Sahrens 		}
1308fa9e4066Sahrens 
1309fa9e4066Sahrens 		/* Bump up the target size for the MFU list */
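		/*
		 * "Bumping up" the MFU target means shrinking arc.p, since
		 * the MFU side is implicitly sized at arc.c - arc.p.  The
		 * shrink is scaled by the ratio of the two ghost lists,
		 * mirroring the MRU case above.
		 */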
1310fa9e4066Sahrens 		mult = ((arc.mfu_bot->size >= arc.mru_bot->size) ?
1311fa9e4066Sahrens 		    1 : (arc.mru_bot->size/arc.mfu_bot->size));
1312fa9e4066Sahrens 		arc.p = MAX(0, (int64_t)arc.p - blksz * mult);
1313fa9e4066Sahrens 
1314fa9e4066Sahrens 		buf->b_arc_access = lbolt;
1315fa9e4066Sahrens 		DTRACE_PROBE1(new_state__mfu_top,
1316fa9e4066Sahrens 		    arc_buf_hdr_t *, buf);
1317fa9e4066Sahrens 		arc_change_state(arc.mfu_top, buf, hash_lock);
1318fa9e4066Sahrens 
1319fa9e4066Sahrens 		atomic_add_64(&arc.mfu_bot->hits, 1);
1320fa9e4066Sahrens 	} else {
1321fa9e4066Sahrens 		ASSERT(!"invalid arc state");
1322fa9e4066Sahrens 	}
1323fa9e4066Sahrens 
1324fa9e4066Sahrens }
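
/*
 * Summary of the state transitions performed by arc_access() above:
 *
 *	anon    -> mru_top	first insertion into the cache
 *	mru_top -> mru_top	prefetched buffer seen for the first time
 *				(just clears ARC_PREFETCH), or accessed
 *				again within ARC_MINTIME
 *	mru_top -> mfu_top	accessed again after ARC_MINTIME has passed
 *	mru_bot -> mfu_top	ghost hit (or mru_top if it was a prefetch)
 *	mfu_top -> mfu_top	frequently used buffer stays put
 *	mfu_bot -> mfu_top	ghost hit on the frequently-used side
 */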
1325fa9e4066Sahrens 
1326fa9e4066Sahrens /* a generic arc_done_func_t which you can use */
1327fa9e4066Sahrens /* ARGSUSED */
1328fa9e4066Sahrens void
1329fa9e4066Sahrens arc_bcopy_func(zio_t *zio, arc_buf_t *buf, void *arg)
1330fa9e4066Sahrens {
1331fa9e4066Sahrens 	bcopy(buf->b_data, arg, buf->b_hdr->b_size);
1332fa9e4066Sahrens 	arc_buf_free(buf, arg);
1333fa9e4066Sahrens }
1334fa9e4066Sahrens 
1335fa9e4066Sahrens /* a generic arc_done_func_t which you can use */
1336fa9e4066Sahrens void
1337fa9e4066Sahrens arc_getbuf_func(zio_t *zio, arc_buf_t *buf, void *arg)
1338fa9e4066Sahrens {
1339fa9e4066Sahrens 	arc_buf_t **bufp = arg;
1340fa9e4066Sahrens 	if (zio && zio->io_error) {
1341fa9e4066Sahrens 		arc_buf_free(buf, arg);
1342fa9e4066Sahrens 		*bufp = NULL;
1343fa9e4066Sahrens 	} else {
1344fa9e4066Sahrens 		*bufp = buf;
1345fa9e4066Sahrens 	}
1346fa9e4066Sahrens }
1347fa9e4066Sahrens 
1348fa9e4066Sahrens static void
1349fa9e4066Sahrens arc_read_done(zio_t *zio)
1350fa9e4066Sahrens {
1351fa9e4066Sahrens 	arc_buf_hdr_t	*hdr;
1352fa9e4066Sahrens 	arc_buf_t	*buf;
1353fa9e4066Sahrens 	arc_buf_t	*abuf;	/* buffer we're assigning to callback */
1354fa9e4066Sahrens 	kmutex_t	*hash_lock;
1355fa9e4066Sahrens 	arc_callback_t	*callback_list, *acb;
1356fa9e4066Sahrens 	int		freeable = FALSE;
1357fa9e4066Sahrens 
1358fa9e4066Sahrens 	buf = zio->io_private;
1359fa9e4066Sahrens 	hdr = buf->b_hdr;
1360fa9e4066Sahrens 
1361fa9e4066Sahrens 	if (!HDR_FREED_IN_READ(hdr)) {
1362fa9e4066Sahrens 		arc_buf_hdr_t *found;
1363fa9e4066Sahrens 
1364fa9e4066Sahrens 		found = buf_hash_find(zio->io_spa, &hdr->b_dva, hdr->b_birth,
1365fa9e4066Sahrens 		    &hash_lock);
1366fa9e4066Sahrens 
1367fa9e4066Sahrens 		/*
1368fa9e4066Sahrens 		 * Buffer was inserted into hash-table and removed from lists
1369fa9e4066Sahrens 		 * prior to starting I/O.  We should find this header, since
1370fa9e4066Sahrens 		 * it's in the hash table, and it should be legit since it's
1371fa9e4066Sahrens 		 * not possible to evict it during the I/O.
1372fa9e4066Sahrens 		 */
1373fa9e4066Sahrens 
1374fa9e4066Sahrens 		ASSERT(found);
1375fa9e4066Sahrens 		ASSERT(DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp)));
1376fa9e4066Sahrens 	}
1377fa9e4066Sahrens 
1378fa9e4066Sahrens 	/* byteswap if necessary */
1379fa9e4066Sahrens 	callback_list = hdr->b_acb;
1380fa9e4066Sahrens 	ASSERT(callback_list != NULL);
1381fa9e4066Sahrens 	if (BP_SHOULD_BYTESWAP(zio->io_bp) && callback_list->acb_byteswap)
1382fa9e4066Sahrens 		callback_list->acb_byteswap(buf->b_data, hdr->b_size);
1383fa9e4066Sahrens 
1384fa9e4066Sahrens 	/* create copies of the data buffer for the callers */
1385fa9e4066Sahrens 	abuf = buf;
1386fa9e4066Sahrens 	for (acb = callback_list; acb; acb = acb->acb_next) {
1387fa9e4066Sahrens 		if (acb->acb_done) {
1388fa9e4066Sahrens 			if (abuf == NULL) {
1389fa9e4066Sahrens 				abuf = kmem_cache_alloc(buf_cache, KM_SLEEP);
1390fa9e4066Sahrens 				abuf->b_data = zio_buf_alloc(hdr->b_size);
1391fa9e4066Sahrens 				atomic_add_64(&arc.size, hdr->b_size);
1392fa9e4066Sahrens 				bcopy(buf->b_data, abuf->b_data, hdr->b_size);
1393fa9e4066Sahrens 				abuf->b_hdr = hdr;
1394fa9e4066Sahrens 				abuf->b_next = hdr->b_buf;
1395fa9e4066Sahrens 				hdr->b_buf = abuf;
1396fa9e4066Sahrens 				atomic_add_64(&hdr->b_state->size, hdr->b_size);
1397fa9e4066Sahrens 			}
1398fa9e4066Sahrens 			acb->acb_buf = abuf;
1399fa9e4066Sahrens 			abuf = NULL;
1400fa9e4066Sahrens 		} else {
1401fa9e4066Sahrens 			/*
1402fa9e4066Sahrens 			 * The caller did not provide a callback function.
1403fa9e4066Sahrens 			 * In this case, we should just remove the reference.
1404fa9e4066Sahrens 			 */
1405fa9e4066Sahrens 			if (HDR_FREED_IN_READ(hdr)) {
1406fa9e4066Sahrens 				ASSERT3P(hdr->b_state, ==, arc.anon);
1407fa9e4066Sahrens 				(void) refcount_remove(&hdr->b_refcnt,
1408fa9e4066Sahrens 				    acb->acb_private);
1409fa9e4066Sahrens 			} else {
1410fa9e4066Sahrens 				(void) remove_reference(hdr, hash_lock,
1411fa9e4066Sahrens 				    acb->acb_private);
1412fa9e4066Sahrens 			}
1413fa9e4066Sahrens 		}
1414fa9e4066Sahrens 	}
1415fa9e4066Sahrens 	hdr->b_acb = NULL;
1416fa9e4066Sahrens 	hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
1417fa9e4066Sahrens 
1418fa9e4066Sahrens 	ASSERT(refcount_is_zero(&hdr->b_refcnt) || callback_list != NULL);
1419fa9e4066Sahrens 
1420fa9e4066Sahrens 	if (zio->io_error != 0) {
1421fa9e4066Sahrens 		hdr->b_flags |= ARC_IO_ERROR;
1422fa9e4066Sahrens 		if (hdr->b_state != arc.anon)
1423fa9e4066Sahrens 			arc_change_state(arc.anon, hdr, hash_lock);
1424fa9e4066Sahrens 		freeable = refcount_is_zero(&hdr->b_refcnt);
1425fa9e4066Sahrens 	}
1426fa9e4066Sahrens 
1427fa9e4066Sahrens 	if (!HDR_FREED_IN_READ(hdr)) {
1428fa9e4066Sahrens 		/*
1429fa9e4066Sahrens 		 * Only call arc_access on anonymous buffers.  This is because
1430fa9e4066Sahrens 		 * if we've issued an I/O for an evicted buffer, we've already
1431fa9e4066Sahrens 		 * called arc_access (to prevent any simultaneous readers from
1432fa9e4066Sahrens 		 * getting confused).
1433fa9e4066Sahrens 		 */
1434fa9e4066Sahrens 		if (zio->io_error == 0 && hdr->b_state == arc.anon)
1435fa9e4066Sahrens 			arc_access(hdr, hash_lock);
1436fa9e4066Sahrens 		mutex_exit(hash_lock);
1437fa9e4066Sahrens 	} else {
1438fa9e4066Sahrens 		/*
1439fa9e4066Sahrens 		 * This block was freed while we waited for the read to
1440fa9e4066Sahrens 		 * complete.  It has been removed from the hash table and
1441fa9e4066Sahrens 		 * moved to the anonymous state (so that it won't show up
1442fa9e4066Sahrens 		 * in the cache).
1443fa9e4066Sahrens 		 */
1444fa9e4066Sahrens 		ASSERT3P(hdr->b_state, ==, arc.anon);
1445fa9e4066Sahrens 		freeable = refcount_is_zero(&hdr->b_refcnt);
1446fa9e4066Sahrens 	}
1447fa9e4066Sahrens 
1448fa9e4066Sahrens 	cv_broadcast(&hdr->b_cv);
1449fa9e4066Sahrens 
1450fa9e4066Sahrens 	/* execute each callback and free its structure */
1451fa9e4066Sahrens 	while ((acb = callback_list) != NULL) {
1452fa9e4066Sahrens 		if (acb->acb_done)
1453fa9e4066Sahrens 			acb->acb_done(zio, acb->acb_buf, acb->acb_private);
1454fa9e4066Sahrens 
1455fa9e4066Sahrens 		if (acb->acb_zio_dummy != NULL) {
1456fa9e4066Sahrens 			acb->acb_zio_dummy->io_error = zio->io_error;
1457fa9e4066Sahrens 			zio_nowait(acb->acb_zio_dummy);
1458fa9e4066Sahrens 		}
1459fa9e4066Sahrens 
1460fa9e4066Sahrens 		callback_list = acb->acb_next;
1461fa9e4066Sahrens 		kmem_free(acb, sizeof (arc_callback_t));
1462fa9e4066Sahrens 	}
1463fa9e4066Sahrens 
1464fa9e4066Sahrens 	if (freeable)
1465fa9e4066Sahrens 		arc_hdr_free(hdr);
1466fa9e4066Sahrens }
1467fa9e4066Sahrens 
1468fa9e4066Sahrens /*
1469fa9e4066Sahrens  * "Read" the block at the specified DVA (in bp) via the
1470fa9e4066Sahrens  * cache.  If the block is found in the cache, invoke the provided
1471fa9e4066Sahrens  * callback immediately and return.  Note that the `zio' parameter
1472fa9e4066Sahrens  * in the callback will be NULL in this case, since no IO was
1473fa9e4066Sahrens  * in the callback will be NULL in this case, since no I/O was
1474fa9e4066Sahrens  * on to the spa with a substitute callback function, so that the
1475fa9e4066Sahrens  * requested block will be added to the cache.
1476fa9e4066Sahrens  *
1477fa9e4066Sahrens  * If a read request arrives for a block that has a read in-progress,
1478fa9e4066Sahrens  * either wait for the in-progress read to complete (and return the
1479fa9e4066Sahrens  * results); or, if this is a read with a "done" func, add a record
1480fa9e4066Sahrens  * to the read to invoke the "done" func when the read completes,
1481fa9e4066Sahrens  * and return; or just return.
1482fa9e4066Sahrens  *
1483fa9e4066Sahrens  * arc_read_done() will invoke all the requested "done" functions
1484fa9e4066Sahrens  * for readers of this block.
1485fa9e4066Sahrens  */
1486fa9e4066Sahrens int
1487fa9e4066Sahrens arc_read(zio_t *pio, spa_t *spa, blkptr_t *bp, arc_byteswap_func_t *swap,
1488fa9e4066Sahrens     arc_done_func_t *done, void *private, int priority, int flags,
1489fa9e4066Sahrens     uint32_t arc_flags)
1490fa9e4066Sahrens {
1491fa9e4066Sahrens 	arc_buf_hdr_t *hdr;
1492fa9e4066Sahrens 	arc_buf_t *buf;
1493fa9e4066Sahrens 	kmutex_t *hash_lock;
1494fa9e4066Sahrens 	zio_t	*rzio;
1495fa9e4066Sahrens 
1496fa9e4066Sahrens top:
1497fa9e4066Sahrens 	hdr = buf_hash_find(spa, BP_IDENTITY(bp), bp->blk_birth, &hash_lock);
1498fa9e4066Sahrens 	if (hdr && hdr->b_buf) {
1499fa9e4066Sahrens 
1500fa9e4066Sahrens 		ASSERT((hdr->b_state == arc.mru_top) ||
1501fa9e4066Sahrens 		    (hdr->b_state == arc.mfu_top) ||
1502fa9e4066Sahrens 		    ((hdr->b_state == arc.anon) &&
1503fa9e4066Sahrens 		    (HDR_IO_IN_PROGRESS(hdr))));
1504fa9e4066Sahrens 
1505fa9e4066Sahrens 		if (HDR_IO_IN_PROGRESS(hdr)) {
1506fa9e4066Sahrens 
1507fa9e4066Sahrens 			if ((arc_flags & ARC_NOWAIT) && done) {
1508fa9e4066Sahrens 				arc_callback_t	*acb = NULL;
1509fa9e4066Sahrens 
1510fa9e4066Sahrens 				acb = kmem_zalloc(sizeof (arc_callback_t),
1511fa9e4066Sahrens 				    KM_SLEEP);
1512fa9e4066Sahrens 				acb->acb_done = done;
1513fa9e4066Sahrens 				acb->acb_private = private;
1514fa9e4066Sahrens 				acb->acb_byteswap = swap;
1515fa9e4066Sahrens 				if (pio != NULL)
1516fa9e4066Sahrens 					acb->acb_zio_dummy = zio_null(pio,
1517fa9e4066Sahrens 					    spa, NULL, NULL, flags);
1518fa9e4066Sahrens 
1519fa9e4066Sahrens 				ASSERT(acb->acb_done != NULL);
1520fa9e4066Sahrens 				acb->acb_next = hdr->b_acb;
1521fa9e4066Sahrens 				hdr->b_acb = acb;
1522fa9e4066Sahrens 				add_reference(hdr, hash_lock, private);
1523fa9e4066Sahrens 				mutex_exit(hash_lock);
1524fa9e4066Sahrens 				return (0);
1525fa9e4066Sahrens 			} else if (arc_flags & ARC_WAIT) {
1526fa9e4066Sahrens 				cv_wait(&hdr->b_cv, hash_lock);
1527fa9e4066Sahrens 				mutex_exit(hash_lock);
1528fa9e4066Sahrens 				goto top;
1529fa9e4066Sahrens 			}
1530fa9e4066Sahrens 
1531fa9e4066Sahrens 			mutex_exit(hash_lock);
1532fa9e4066Sahrens 			return (0);
1533fa9e4066Sahrens 		}
1534fa9e4066Sahrens 
1535fa9e4066Sahrens 		/*
1536fa9e4066Sahrens 		 * If there is already a reference on this block, create
1537fa9e4066Sahrens 		 * a new copy of the data so that arc_release() is
1538fa9e4066Sahrens 		 * guaranteed to succeed.
1539fa9e4066Sahrens 		 */
1540fa9e4066Sahrens 
1541fa9e4066Sahrens 		if (done)
1542fa9e4066Sahrens 			add_reference(hdr, hash_lock, private);
1543fa9e4066Sahrens 		if (done && refcount_count(&hdr->b_refcnt) > 1) {
1544fa9e4066Sahrens 			buf = kmem_cache_alloc(buf_cache, KM_SLEEP);
1545fa9e4066Sahrens 			buf->b_data = zio_buf_alloc(hdr->b_size);
1546fa9e4066Sahrens 			ASSERT3U(refcount_count(&hdr->b_refcnt), >, 1);
1547fa9e4066Sahrens 			atomic_add_64(&arc.size, hdr->b_size);
1548fa9e4066Sahrens 			bcopy(hdr->b_buf->b_data, buf->b_data, hdr->b_size);
1549fa9e4066Sahrens 			buf->b_hdr = hdr;
1550fa9e4066Sahrens 			buf->b_next = hdr->b_buf;
1551fa9e4066Sahrens 			hdr->b_buf = buf;
1552fa9e4066Sahrens 			atomic_add_64(&hdr->b_state->size, hdr->b_size);
1553fa9e4066Sahrens 		} else {
1554fa9e4066Sahrens 			buf = hdr->b_buf;
1555fa9e4066Sahrens 		}
1556fa9e4066Sahrens 		DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
1557fa9e4066Sahrens 		arc_access(hdr, hash_lock);
1558fa9e4066Sahrens 		mutex_exit(hash_lock);
1559fa9e4066Sahrens 		atomic_add_64(&arc.hits, 1);
1560fa9e4066Sahrens 		if (done)
1561fa9e4066Sahrens 			done(NULL, buf, private);
1562fa9e4066Sahrens 	} else {
1563fa9e4066Sahrens 		uint64_t size = BP_GET_LSIZE(bp);
1564fa9e4066Sahrens 		arc_callback_t	*acb;
1565fa9e4066Sahrens 
1566fa9e4066Sahrens 		if (hdr == NULL) {
1567fa9e4066Sahrens 			/* this block is not in the cache */
1568fa9e4066Sahrens 			arc_buf_hdr_t	*exists;
1569fa9e4066Sahrens 
1570fa9e4066Sahrens 			buf = arc_buf_alloc(spa, size, private);
1571fa9e4066Sahrens 			hdr = buf->b_hdr;
1572fa9e4066Sahrens 			hdr->b_dva = *BP_IDENTITY(bp);
1573fa9e4066Sahrens 			hdr->b_birth = bp->blk_birth;
1574fa9e4066Sahrens 			hdr->b_cksum0 = bp->blk_cksum.zc_word[0];
1575fa9e4066Sahrens 			exists = buf_hash_insert(hdr, &hash_lock);
1576fa9e4066Sahrens 			if (exists) {
1577fa9e4066Sahrens 				/* somebody beat us to the hash insert */
1578fa9e4066Sahrens 				mutex_exit(hash_lock);
1579fa9e4066Sahrens 				bzero(&hdr->b_dva, sizeof (dva_t));
1580fa9e4066Sahrens 				hdr->b_birth = 0;
1581fa9e4066Sahrens 				hdr->b_cksum0 = 0;
1582fa9e4066Sahrens 				arc_buf_free(buf, private);
1583fa9e4066Sahrens 				goto top; /* restart the IO request */
1584fa9e4066Sahrens 			}
1585fa9e4066Sahrens 
1586fa9e4066Sahrens 		} else {
1587fa9e4066Sahrens 			/* this block is in the ghost cache */
1588fa9e4066Sahrens 			ASSERT((hdr->b_state == arc.mru_bot) ||
1589fa9e4066Sahrens 			    (hdr->b_state == arc.mfu_bot));
1590fa9e4066Sahrens 			add_reference(hdr, hash_lock, private);
1591fa9e4066Sahrens 
1592fa9e4066Sahrens 			buf = kmem_cache_alloc(buf_cache, KM_SLEEP);
1593fa9e4066Sahrens 			buf->b_data = zio_buf_alloc(hdr->b_size);
1594fa9e4066Sahrens 			atomic_add_64(&arc.size, hdr->b_size);
1595fa9e4066Sahrens 			ASSERT(!HDR_IO_IN_PROGRESS(hdr));
1596fa9e4066Sahrens 			ASSERT3U(refcount_count(&hdr->b_refcnt), ==, 1);
1597fa9e4066Sahrens 			buf->b_hdr = hdr;
1598fa9e4066Sahrens 			buf->b_next = NULL;
1599fa9e4066Sahrens 			hdr->b_buf = buf;
1600fa9e4066Sahrens 		}
1601fa9e4066Sahrens 
1602fa9e4066Sahrens 		acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP);
1603fa9e4066Sahrens 		acb->acb_done = done;
1604fa9e4066Sahrens 		acb->acb_private = private;
1605fa9e4066Sahrens 		acb->acb_byteswap = swap;
1606fa9e4066Sahrens 
1607fa9e4066Sahrens 		ASSERT(hdr->b_acb == NULL);
1608fa9e4066Sahrens 		hdr->b_acb = acb;
1609fa9e4066Sahrens 
1610fa9e4066Sahrens 		/*
1611fa9e4066Sahrens 		 * If this DVA is part of a prefetch, mark the buf
1612fa9e4066Sahrens 		 * header with the prefetch flag
1613fa9e4066Sahrens 		 */
1614fa9e4066Sahrens 		if (arc_flags & ARC_PREFETCH)
1615fa9e4066Sahrens 			hdr->b_flags |= ARC_PREFETCH;
1616fa9e4066Sahrens 		hdr->b_flags |= ARC_IO_IN_PROGRESS;
1617fa9e4066Sahrens 
1618fa9e4066Sahrens 		/*
1619fa9e4066Sahrens 		 * If the buffer has been evicted, migrate it to a present state
1620fa9e4066Sahrens 		 * before issuing the I/O.  Once we drop the hash-table lock,
1621fa9e4066Sahrens 		 * the header will be marked as I/O in progress and have an
1622fa9e4066Sahrens 		 * attached buffer.  At this point, anybody who finds this
1623fa9e4066Sahrens 		 * buffer ought to notice that it's legit but has a pending I/O.
1624fa9e4066Sahrens 		 */
1625fa9e4066Sahrens 
1626fa9e4066Sahrens 		if ((hdr->b_state == arc.mru_bot) ||
1627fa9e4066Sahrens 		    (hdr->b_state == arc.mfu_bot))
1628fa9e4066Sahrens 			arc_access(hdr, hash_lock);
1629fa9e4066Sahrens 
1630fa9e4066Sahrens 		mutex_exit(hash_lock);
1631fa9e4066Sahrens 
1632fa9e4066Sahrens 		ASSERT3U(hdr->b_size, ==, size);
1633fa9e4066Sahrens 		DTRACE_PROBE2(arc__miss, blkptr_t *, bp,
1634fa9e4066Sahrens 		    uint64_t, size);
1635fa9e4066Sahrens 		atomic_add_64(&arc.misses, 1);
1636fa9e4066Sahrens 		rzio = zio_read(pio, spa, bp, buf->b_data, size,
1637fa9e4066Sahrens 		    arc_read_done, buf, priority, flags);
1638fa9e4066Sahrens 
1639fa9e4066Sahrens 		if (arc_flags & ARC_WAIT)
1640fa9e4066Sahrens 			return (zio_wait(rzio));
1641fa9e4066Sahrens 
1642fa9e4066Sahrens 		ASSERT(arc_flags & ARC_NOWAIT);
1643fa9e4066Sahrens 		zio_nowait(rzio);
1644fa9e4066Sahrens 	}
1645fa9e4066Sahrens 	return (0);
1646fa9e4066Sahrens }
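
/*
 * Illustrative sketch, not part of the original source: a hypothetical
 * helper showing how a caller might combine arc_read() with the generic
 * arc_getbuf_func() callback to obtain a buffer synchronously.  The
 * helper name and the idea of passing the caller's priority/flags
 * straight through are assumptions made purely for illustration.
 */
static int
arc_read_buf_example(spa_t *spa, blkptr_t *bp, arc_byteswap_func_t *swap,
    int priority, int flags, arc_buf_t **abufp)
{
	int err;

	*abufp = NULL;
	err = arc_read(NULL, spa, bp, swap, arc_getbuf_func, abufp,
	    priority, flags, ARC_WAIT);

	/*
	 * On success *abufp holds a referenced buffer; the caller must
	 * eventually drop it with arc_buf_free(*abufp, abufp).  On I/O
	 * error arc_getbuf_func() has already freed it and *abufp is NULL.
	 */
	return (err);
}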
1647fa9e4066Sahrens 
1648fa9e4066Sahrens /*
1649fa9e4066Sahrens  * arc_read() variant to support pool traversal.  If the block is already
1650fa9e4066Sahrens  * in the ARC, make a copy of it; otherwise, the caller will do the I/O.
1651fa9e4066Sahrens  * The idea is that we don't want pool traversal filling up memory, but
1652fa9e4066Sahrens  * if the ARC already has the data anyway, we shouldn't pay for the I/O.
1653fa9e4066Sahrens  */
1654fa9e4066Sahrens int
1655fa9e4066Sahrens arc_tryread(spa_t *spa, blkptr_t *bp, void *data)
1656fa9e4066Sahrens {
1657fa9e4066Sahrens 	arc_buf_hdr_t *hdr;
1658fa9e4066Sahrens 	kmutex_t *hash_mtx;
1659fa9e4066Sahrens 	int rc = 0;
1660fa9e4066Sahrens 
1661fa9e4066Sahrens 	hdr = buf_hash_find(spa, BP_IDENTITY(bp), bp->blk_birth, &hash_mtx);
1662fa9e4066Sahrens 
1663fa9e4066Sahrens 	if (hdr && hdr->b_buf && !HDR_IO_IN_PROGRESS(hdr))
1664fa9e4066Sahrens 		bcopy(hdr->b_buf->b_data, data, hdr->b_size);
1665fa9e4066Sahrens 	else
1666fa9e4066Sahrens 		rc = ENOENT;
1667fa9e4066Sahrens 
1668fa9e4066Sahrens 	if (hash_mtx)
1669fa9e4066Sahrens 		mutex_exit(hash_mtx);
1670fa9e4066Sahrens 
1671fa9e4066Sahrens 	return (rc);
1672fa9e4066Sahrens }
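
/*
 * Illustrative sketch, not part of the original source: a hypothetical
 * traversal-style caller that consults the cache first via arc_tryread()
 * and falls back to its own read only on a miss.  The helper name and
 * the caller-supplied read function are assumptions for illustration.
 */
static int
arc_tryread_example(spa_t *spa, blkptr_t *bp, void *data,
    int (*fallback_read)(spa_t *, blkptr_t *, void *))
{
	int err;

	err = arc_tryread(spa, bp, data);
	if (err == ENOENT) {
		/* not cached (or read still in progress): do the I/O ourselves */
		err = fallback_read(spa, bp, data);
	}
	return (err);
}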
1673fa9e4066Sahrens 
1674fa9e4066Sahrens /*
1675fa9e4066Sahrens  * Release this buffer from the cache.  This must be done
1676fa9e4066Sahrens  * after a read and prior to modifying the buffer contents.
1677fa9e4066Sahrens  * If the buffer has more than one reference, we must make
1678fa9e4066Sahrens  * a new hdr for the buffer.
1679fa9e4066Sahrens  */
1680fa9e4066Sahrens void
1681fa9e4066Sahrens arc_release(arc_buf_t *buf, void *tag)
1682fa9e4066Sahrens {
1683fa9e4066Sahrens 	arc_buf_hdr_t *hdr = buf->b_hdr;
1684fa9e4066Sahrens 	kmutex_t *hash_lock = HDR_LOCK(hdr);
1685fa9e4066Sahrens 
1686fa9e4066Sahrens 	/* this buffer is not on any list */
1687fa9e4066Sahrens 	ASSERT(refcount_count(&hdr->b_refcnt) > 0);
1688fa9e4066Sahrens 
1689fa9e4066Sahrens 	if (hdr->b_state == arc.anon) {
1690fa9e4066Sahrens 		/* this buffer is already released */
1691fa9e4066Sahrens 		ASSERT3U(refcount_count(&hdr->b_refcnt), ==, 1);
1692fa9e4066Sahrens 		ASSERT(BUF_EMPTY(hdr));
1693fa9e4066Sahrens 		return;
1694fa9e4066Sahrens 	}
1695fa9e4066Sahrens 
1696fa9e4066Sahrens 	mutex_enter(hash_lock);
1697fa9e4066Sahrens 
1698fa9e4066Sahrens 	if (refcount_count(&hdr->b_refcnt) > 1) {
1699fa9e4066Sahrens 		arc_buf_hdr_t *nhdr;
1700fa9e4066Sahrens 		arc_buf_t **bufp;
1701fa9e4066Sahrens 		uint64_t blksz = hdr->b_size;
1702fa9e4066Sahrens 		spa_t *spa = hdr->b_spa;
1703fa9e4066Sahrens 
1704fa9e4066Sahrens 		/*
1705fa9e4066Sahrens 		 * Pull the data off of this buf and attach it to
1706fa9e4066Sahrens 		 * a new anonymous buf.
1707fa9e4066Sahrens 		 */
1708fa9e4066Sahrens 		bufp = &hdr->b_buf;
1709fa9e4066Sahrens 		while (*bufp != buf) {
1710fa9e4066Sahrens 			ASSERT(*bufp);
1711fa9e4066Sahrens 			bufp = &(*bufp)->b_next;
1712fa9e4066Sahrens 		}
1713fa9e4066Sahrens 		*bufp = (*bufp)->b_next;
1714fa9e4066Sahrens 		(void) refcount_remove(&hdr->b_refcnt, tag);
1715fa9e4066Sahrens 		ASSERT3U(hdr->b_state->size, >=, hdr->b_size);
1716fa9e4066Sahrens 		atomic_add_64(&hdr->b_state->size, -hdr->b_size);
1717fa9e4066Sahrens 		mutex_exit(hash_lock);
1718fa9e4066Sahrens 
1719fa9e4066Sahrens 		nhdr = kmem_cache_alloc(hdr_cache, KM_SLEEP);
1720fa9e4066Sahrens 		nhdr->b_size = blksz;
1721fa9e4066Sahrens 		nhdr->b_spa = spa;
1722fa9e4066Sahrens 		nhdr->b_buf = buf;
1723fa9e4066Sahrens 		nhdr->b_state = arc.anon;
1724fa9e4066Sahrens 		nhdr->b_arc_access = 0;
1725fa9e4066Sahrens 		nhdr->b_flags = 0;
1726fa9e4066Sahrens 		buf->b_hdr = nhdr;
1727fa9e4066Sahrens 		buf->b_next = NULL;
1728fa9e4066Sahrens 		(void) refcount_add(&nhdr->b_refcnt, tag);
1729fa9e4066Sahrens 		atomic_add_64(&arc.anon->size, blksz);
1730fa9e4066Sahrens 
1731fa9e4066Sahrens 		hdr = nhdr;
1732fa9e4066Sahrens 	} else {
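		/*
		 * This is the only reference to the buffer, so we can simply
		 * move the existing header to the anonymous state and forget
		 * its on-disk identity (DVA, birth txg, and checksum).
		 */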
1733fa9e4066Sahrens 		ASSERT(!list_link_active(&hdr->b_arc_node));
1734fa9e4066Sahrens 		ASSERT(!HDR_IO_IN_PROGRESS(hdr));
1735fa9e4066Sahrens 		arc_change_state(arc.anon, hdr, hash_lock);
1736fa9e4066Sahrens 		hdr->b_arc_access = 0;
1737fa9e4066Sahrens 		mutex_exit(hash_lock);
1738fa9e4066Sahrens 		bzero(&hdr->b_dva, sizeof (dva_t));
1739fa9e4066Sahrens 		hdr->b_birth = 0;
1740fa9e4066Sahrens 		hdr->b_cksum0 = 0;
1741fa9e4066Sahrens 	}
1742fa9e4066Sahrens }
1743fa9e4066Sahrens 
1744fa9e4066Sahrens int
1745fa9e4066Sahrens arc_released(arc_buf_t *buf)
1746fa9e4066Sahrens {
1747fa9e4066Sahrens 	return (buf->b_hdr->b_state == arc.anon);
1748fa9e4066Sahrens }
1749fa9e4066Sahrens 
1750fa9e4066Sahrens static void
1751fa9e4066Sahrens arc_write_done(zio_t *zio)
1752fa9e4066Sahrens {
1753fa9e4066Sahrens 	arc_buf_t *buf;
1754fa9e4066Sahrens 	arc_buf_hdr_t *hdr;
1755fa9e4066Sahrens 	arc_callback_t *acb;
1756fa9e4066Sahrens 
1757fa9e4066Sahrens 	buf = zio->io_private;
1758fa9e4066Sahrens 	hdr = buf->b_hdr;
1759fa9e4066Sahrens 	acb = hdr->b_acb;
1760fa9e4066Sahrens 	hdr->b_acb = NULL;
1761fa9e4066Sahrens 
1762fa9e4066Sahrens 	/* this buffer is on no lists and is not in the hash table */
1763fa9e4066Sahrens 	ASSERT3P(hdr->b_state, ==, arc.anon);
1764fa9e4066Sahrens 
1765fa9e4066Sahrens 	hdr->b_dva = *BP_IDENTITY(zio->io_bp);
1766fa9e4066Sahrens 	hdr->b_birth = zio->io_bp->blk_birth;
1767fa9e4066Sahrens 	hdr->b_cksum0 = zio->io_bp->blk_cksum.zc_word[0];
1768fa9e4066Sahrens 	/* clear the "in-write" flag */
1769fa9e4066Sahrens 	hdr->b_hash_next = NULL;
1770fa9e4066Sahrens 	/* This write may be all-zero */
1771fa9e4066Sahrens 	if (!BUF_EMPTY(hdr)) {
1772fa9e4066Sahrens 		arc_buf_hdr_t *exists;
1773fa9e4066Sahrens 		kmutex_t *hash_lock;
1774fa9e4066Sahrens 
1775fa9e4066Sahrens 		exists = buf_hash_insert(hdr, &hash_lock);
1776fa9e4066Sahrens 		if (exists) {
1777fa9e4066Sahrens 			/*
1778fa9e4066Sahrens 			 * This can only happen if we overwrite for
1779fa9e4066Sahrens 			 * sync-to-convergence, because we remove
1780fa9e4066Sahrens 			 * buffers from the hash table when we arc_free().
1781fa9e4066Sahrens 			 */
1782fa9e4066Sahrens 			ASSERT(DVA_EQUAL(BP_IDENTITY(&zio->io_bp_orig),
1783fa9e4066Sahrens 			    BP_IDENTITY(zio->io_bp)));
1784fa9e4066Sahrens 			ASSERT3U(zio->io_bp_orig.blk_birth, ==,
1785fa9e4066Sahrens 			    zio->io_bp->blk_birth);
1786fa9e4066Sahrens 
1787fa9e4066Sahrens 			ASSERT(refcount_is_zero(&exists->b_refcnt));
1788fa9e4066Sahrens 			arc_change_state(arc.anon, exists, hash_lock);
1789fa9e4066Sahrens 			mutex_exit(hash_lock);
1790fa9e4066Sahrens 			arc_hdr_free(exists);
1791fa9e4066Sahrens 			exists = buf_hash_insert(hdr, &hash_lock);
1792fa9e4066Sahrens 			ASSERT3P(exists, ==, NULL);
1793fa9e4066Sahrens 		}
1794fa9e4066Sahrens 		arc_access(hdr, hash_lock);
1795fa9e4066Sahrens 		mutex_exit(hash_lock);
1796fa9e4066Sahrens 	}
1797fa9e4066Sahrens 	if (acb && acb->acb_done) {
1798fa9e4066Sahrens 		ASSERT(!refcount_is_zero(&hdr->b_refcnt));
1799fa9e4066Sahrens 		acb->acb_done(zio, buf, acb->acb_private);
1800fa9e4066Sahrens 	}
1801fa9e4066Sahrens 
1802fa9e4066Sahrens 	if (acb)
1803fa9e4066Sahrens 		kmem_free(acb, sizeof (arc_callback_t));
1804fa9e4066Sahrens }
1805fa9e4066Sahrens 
1806fa9e4066Sahrens int
1807fa9e4066Sahrens arc_write(zio_t *pio, spa_t *spa, int checksum, int compress,
1808fa9e4066Sahrens     uint64_t txg, blkptr_t *bp, arc_buf_t *buf,
1809fa9e4066Sahrens     arc_done_func_t *done, void *private, int priority, int flags,
1810fa9e4066Sahrens     uint32_t arc_flags)
1811fa9e4066Sahrens {
1812fa9e4066Sahrens 	arc_buf_hdr_t *hdr = buf->b_hdr;
1813fa9e4066Sahrens 	arc_callback_t	*acb;
1814fa9e4066Sahrens 	zio_t	*rzio;
1815fa9e4066Sahrens 
1816fa9e4066Sahrens 	/* this is a private buffer - no locking required */
1817fa9e4066Sahrens 	ASSERT3P(hdr->b_state, ==, arc.anon);
1818fa9e4066Sahrens 	ASSERT(BUF_EMPTY(hdr));
1819fa9e4066Sahrens 	ASSERT(!HDR_IO_ERROR(hdr));
1820fa9e4066Sahrens 	acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP);
1821fa9e4066Sahrens 	acb->acb_done = done;
1822fa9e4066Sahrens 	acb->acb_private = private;
1823fa9e4066Sahrens 	acb->acb_byteswap = (arc_byteswap_func_t *)-1;
1824fa9e4066Sahrens 	hdr->b_acb = acb;
1825fa9e4066Sahrens 	rzio = zio_write(pio, spa, checksum, compress, txg, bp,
1826fa9e4066Sahrens 	    buf->b_data, hdr->b_size, arc_write_done, buf, priority, flags);
1827fa9e4066Sahrens 
1828fa9e4066Sahrens 	if (arc_flags & ARC_WAIT)
1829fa9e4066Sahrens 		return (zio_wait(rzio));
1830fa9e4066Sahrens 
1831fa9e4066Sahrens 	ASSERT(arc_flags & ARC_NOWAIT);
1832fa9e4066Sahrens 	zio_nowait(rzio);
1833fa9e4066Sahrens 
1834fa9e4066Sahrens 	return (0);
1835fa9e4066Sahrens }
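
/*
 * Illustrative sketch, not part of the original source: a hypothetical
 * helper that writes out a buffer which the caller has already made
 * private with arc_release().  The helper name is an assumption; the
 * checksum, compress, priority, and flags values are simply passed
 * through from the caller.
 */
static int
arc_write_buf_example(zio_t *pio, spa_t *spa, int checksum, int compress,
    uint64_t txg, blkptr_t *bp, arc_buf_t *buf, int priority, int flags)
{
	/* arc_write() requires an anonymous (released) buffer */
	ASSERT(arc_released(buf));

	return (arc_write(pio, spa, checksum, compress, txg, bp, buf,
	    NULL, NULL, priority, flags, ARC_WAIT));
}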
1836fa9e4066Sahrens 
1837fa9e4066Sahrens int
1838fa9e4066Sahrens arc_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
1839fa9e4066Sahrens     zio_done_func_t *done, void *private, uint32_t arc_flags)
1840fa9e4066Sahrens {
1841fa9e4066Sahrens 	arc_buf_hdr_t *ab;
1842fa9e4066Sahrens 	kmutex_t *hash_lock;
1843fa9e4066Sahrens 	zio_t	*zio;
1844fa9e4066Sahrens 
1845fa9e4066Sahrens 	/*
1846fa9e4066Sahrens 	 * If this buffer is in the cache, release it, so it
1847fa9e4066Sahrens 	 * can be re-used.
1848fa9e4066Sahrens 	 */
1849fa9e4066Sahrens 	ab = buf_hash_find(spa, BP_IDENTITY(bp), bp->blk_birth, &hash_lock);
1850fa9e4066Sahrens 	if (ab != NULL) {
1851fa9e4066Sahrens 		/*
1852fa9e4066Sahrens 		 * The checksum of blocks to free is not always
1853fa9e4066Sahrens 		 * preserved (eg. on the deadlist).  However, if it is
1854fa9e4066Sahrens 		 * preserved (e.g. on the deadlist).  However, if it is
1855fa9e4066Sahrens 		 */
1856fa9e4066Sahrens 		ASSERT(bp->blk_cksum.zc_word[0] == 0 ||
1857fa9e4066Sahrens 		    ab->b_cksum0 == bp->blk_cksum.zc_word[0]);
1858fa9e4066Sahrens 		arc_change_state(arc.anon, ab, hash_lock);
1859fa9e4066Sahrens 		if (refcount_is_zero(&ab->b_refcnt)) {
1860fa9e4066Sahrens 			mutex_exit(hash_lock);
1861fa9e4066Sahrens 			arc_hdr_free(ab);
1862fa9e4066Sahrens 			atomic_add_64(&arc.deleted, 1);
1863fa9e4066Sahrens 		} else {
1864fa9e4066Sahrens 			ASSERT3U(refcount_count(&ab->b_refcnt), ==, 1);
1865fa9e4066Sahrens 			if (HDR_IO_IN_PROGRESS(ab))
1866fa9e4066Sahrens 				ab->b_flags |= ARC_FREED_IN_READ;
1867fa9e4066Sahrens 			ab->b_arc_access = 0;
1868fa9e4066Sahrens 			bzero(&ab->b_dva, sizeof (dva_t));
1869fa9e4066Sahrens 			ab->b_birth = 0;
1870fa9e4066Sahrens 			ab->b_cksum0 = 0;
1871fa9e4066Sahrens 			mutex_exit(hash_lock);
1872fa9e4066Sahrens 		}
1873fa9e4066Sahrens 	}
1874fa9e4066Sahrens 
1875fa9e4066Sahrens 	zio = zio_free(pio, spa, txg, bp, done, private);
1876fa9e4066Sahrens 
1877fa9e4066Sahrens 	if (arc_flags & ARC_WAIT)
1878fa9e4066Sahrens 		return (zio_wait(zio));
1879fa9e4066Sahrens 
1880fa9e4066Sahrens 	ASSERT(arc_flags & ARC_NOWAIT);
1881fa9e4066Sahrens 	zio_nowait(zio);
1882fa9e4066Sahrens 
1883fa9e4066Sahrens 	return (0);
1884fa9e4066Sahrens }
1885fa9e4066Sahrens 
1886fa9e4066Sahrens void
1887fa9e4066Sahrens arc_tempreserve_clear(uint64_t tempreserve)
1888fa9e4066Sahrens {
1889fa9e4066Sahrens 	atomic_add_64(&arc_tempreserve, -tempreserve);
1890fa9e4066Sahrens 	ASSERT((int64_t)arc_tempreserve >= 0);
1891fa9e4066Sahrens }
1892fa9e4066Sahrens 
1893fa9e4066Sahrens int
1894fa9e4066Sahrens arc_tempreserve_space(uint64_t tempreserve)
1895fa9e4066Sahrens {
1896fa9e4066Sahrens #ifdef ZFS_DEBUG
1897fa9e4066Sahrens 	/*
1898fa9e4066Sahrens 	 * Once in a while, fail for no reason.  Everything should cope.
1899fa9e4066Sahrens 	 */
1900fa9e4066Sahrens 	if (spa_get_random(10000) == 0) {
1901fa9e4066Sahrens 		dprintf("forcing random failure\n");
1902fa9e4066Sahrens 		return (ERESTART);
1903fa9e4066Sahrens 	}
1904fa9e4066Sahrens #endif
1905112fe045Smaybee 	if (tempreserve > arc.c/4 && !arc.no_grow)
1906112fe045Smaybee 		arc.c = MIN(arc.c_max, tempreserve * 4);
1907112fe045Smaybee 	if (tempreserve > arc.c)
1908112fe045Smaybee 		return (ENOMEM);
1909112fe045Smaybee 
1910fa9e4066Sahrens 	/*
1911112fe045Smaybee 	 * Throttle writes when the amount of dirty data in the cache
1912112fe045Smaybee 	 * gets too large.  We try to keep the cache less than half full
1913112fe045Smaybee 	 * of dirty blocks so that our sync times don't grow too large.
1914112fe045Smaybee 	 * Note: if two requests come in concurrently, we might let them
1915112fe045Smaybee 	 * both succeed, when one of them should fail.  Not a huge deal.
1916112fe045Smaybee 	 *
1917112fe045Smaybee 	 * XXX The limit should be adjusted dynamically to keep the time
1918112fe045Smaybee 	 * to sync a dataset fixed (around 1-5 seconds?).
1919fa9e4066Sahrens 	 */
1920fa9e4066Sahrens 
1921112fe045Smaybee 	if (tempreserve + arc_tempreserve + arc.anon->size > arc.c / 2 &&
1922112fe045Smaybee 	    arc_tempreserve + arc.anon->size > arc.c / 4) {
1923fa9e4066Sahrens 		dprintf("failing, arc_tempreserve=%lluK anon=%lluK "
1924fa9e4066Sahrens 		    "tempreserve=%lluK arc.c=%lluK\n",
1925fa9e4066Sahrens 		    arc_tempreserve>>10, arc.anon->lsize>>10,
1926fa9e4066Sahrens 		    tempreserve>>10, arc.c>>10);
1927fa9e4066Sahrens 		return (ERESTART);
1928fa9e4066Sahrens 	}
1929fa9e4066Sahrens 	atomic_add_64(&arc_tempreserve, tempreserve);
1930fa9e4066Sahrens 	return (0);
1931fa9e4066Sahrens }
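
/*
 * Illustrative sketch, not part of the original source: the reserve/clear
 * pair above is intended to bracket the act of dirtying data.  A
 * hypothetical caller might do:
 *
 *	error = arc_tempreserve_space(nbytes);
 *	if (error == 0) {
 *		... dirty up to nbytes of data ...
 *		arc_tempreserve_clear(nbytes);
 *	}
 *
 * where ERESTART or ENOMEM from arc_tempreserve_space() means "back off
 * and retry the operation later".
 */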
1932fa9e4066Sahrens 
1933fa9e4066Sahrens void
1934fa9e4066Sahrens arc_init(void)
1935fa9e4066Sahrens {
1936fa9e4066Sahrens 	mutex_init(&arc_reclaim_lock, NULL, MUTEX_DEFAULT, NULL);
1937fa9e4066Sahrens 	mutex_init(&arc_reclaim_thr_lock, NULL, MUTEX_DEFAULT, NULL);
1938fa9e4066Sahrens 	cv_init(&arc_reclaim_thr_cv, NULL, CV_DEFAULT, NULL);
1939fa9e4066Sahrens 
1940fa9e4066Sahrens 	/* Start out with 1/8 of all memory */
1941fa9e4066Sahrens 	arc.c = physmem * PAGESIZE / 8;
1942fa9e4066Sahrens 
1943fa9e4066Sahrens #ifdef _KERNEL
1944fa9e4066Sahrens 	/*
1945fa9e4066Sahrens 	 * On architectures where the physical memory can be larger
1946fa9e4066Sahrens 	 * than the addressable space (intel in 32-bit mode), we may
1947fa9e4066Sahrens 	 * than the addressable space (Intel in 32-bit mode), we may
1948fa9e4066Sahrens 	 */
1949fa9e4066Sahrens 	arc.c = MIN(arc.c, vmem_size(heap_arena, VMEM_ALLOC | VMEM_FREE) / 8);
1950fa9e4066Sahrens #endif
1951fa9e4066Sahrens 
1952112fe045Smaybee 	/* set min cache to 1/32 of all memory, or 64MB, whichever is more */
1953fa9e4066Sahrens 	arc.c_min = MAX(arc.c / 4, 64<<20);
1954112fe045Smaybee 	/* set max to 3/4 of all memory, or all but 1GB, whichever is more */
1955fa9e4066Sahrens 	if (arc.c * 8 >= 1<<30)
1956fa9e4066Sahrens 		arc.c_max = (arc.c * 8) - (1<<30);
1957fa9e4066Sahrens 	else
1958fa9e4066Sahrens 		arc.c_max = arc.c_min;
1959fa9e4066Sahrens 	arc.c_max = MAX(arc.c * 6, arc.c_max);
1960fa9e4066Sahrens 	arc.c = arc.c_max;
1961fa9e4066Sahrens 	arc.p = (arc.c >> 1);
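
	/*
	 * Worked example of the sizing above (illustrative only, ignoring
	 * the 32-bit VM limit and the kmem-debugging adjustment below):
	 * with 4GB of physical memory, arc.c starts at 512MB (1/8 of
	 * memory), arc.c_min becomes MAX(128MB, 64MB) = 128MB, arc.c_max
	 * becomes MAX(6 * 512MB, 4GB - 1GB) = 3GB, arc.c is raised to
	 * that 3GB maximum, and arc.p starts at half of arc.c, i.e. 1.5GB.
	 */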
1962fa9e4066Sahrens 
1963fa9e4066Sahrens 	/* if kmem_flags are set, let's try to use less memory */
1964fa9e4066Sahrens 	if (kmem_debugging())
1965fa9e4066Sahrens 		arc.c = arc.c / 2;
1966fa9e4066Sahrens 	if (arc.c < arc.c_min)
1967fa9e4066Sahrens 		arc.c = arc.c_min;
1968fa9e4066Sahrens 
1969fa9e4066Sahrens 	arc.anon = &ARC_anon;
1970fa9e4066Sahrens 	arc.mru_top = &ARC_mru_top;
1971fa9e4066Sahrens 	arc.mru_bot = &ARC_mru_bot;
1972fa9e4066Sahrens 	arc.mfu_top = &ARC_mfu_top;
1973fa9e4066Sahrens 	arc.mfu_bot = &ARC_mfu_bot;
1974fa9e4066Sahrens 
1975fa9e4066Sahrens 	list_create(&arc.mru_top->list, sizeof (arc_buf_hdr_t),
1976fa9e4066Sahrens 	    offsetof(arc_buf_hdr_t, b_arc_node));
1977fa9e4066Sahrens 	list_create(&arc.mru_bot->list, sizeof (arc_buf_hdr_t),
1978fa9e4066Sahrens 	    offsetof(arc_buf_hdr_t, b_arc_node));
1979fa9e4066Sahrens 	list_create(&arc.mfu_top->list, sizeof (arc_buf_hdr_t),
1980fa9e4066Sahrens 	    offsetof(arc_buf_hdr_t, b_arc_node));
1981fa9e4066Sahrens 	list_create(&arc.mfu_bot->list, sizeof (arc_buf_hdr_t),
1982fa9e4066Sahrens 	    offsetof(arc_buf_hdr_t, b_arc_node));
1983fa9e4066Sahrens 
1984fa9e4066Sahrens 	buf_init();
1985fa9e4066Sahrens 
1986fa9e4066Sahrens 	arc_thread_exit = 0;
1987fa9e4066Sahrens 
1988fa9e4066Sahrens 	(void) thread_create(NULL, 0, arc_reclaim_thread, NULL, 0, &p0,
1989fa9e4066Sahrens 	    TS_RUN, minclsyspri);
1990fa9e4066Sahrens }
1991fa9e4066Sahrens 
1992fa9e4066Sahrens void
1993fa9e4066Sahrens arc_fini(void)
1994fa9e4066Sahrens {
1995fa9e4066Sahrens 	mutex_enter(&arc_reclaim_thr_lock);
1996fa9e4066Sahrens 	arc_thread_exit = 1;
1997fa9e4066Sahrens 	while (arc_thread_exit != 0)
1998fa9e4066Sahrens 		cv_wait(&arc_reclaim_thr_cv, &arc_reclaim_thr_lock);
1999fa9e4066Sahrens 	mutex_exit(&arc_reclaim_thr_lock);
2000fa9e4066Sahrens 
2001fa9e4066Sahrens 	arc_flush();
2002fa9e4066Sahrens 
2003fa9e4066Sahrens 	arc_dead = TRUE;
2004fa9e4066Sahrens 
2005fa9e4066Sahrens 	mutex_destroy(&arc_reclaim_lock);
2006fa9e4066Sahrens 	mutex_destroy(&arc_reclaim_thr_lock);
2007fa9e4066Sahrens 	cv_destroy(&arc_reclaim_thr_cv);
2008fa9e4066Sahrens 
2009fa9e4066Sahrens 	list_destroy(&arc.mru_top->list);
2010fa9e4066Sahrens 	list_destroy(&arc.mru_bot->list);
2011fa9e4066Sahrens 	list_destroy(&arc.mfu_top->list);
2012fa9e4066Sahrens 	list_destroy(&arc.mfu_bot->list);
2013fa9e4066Sahrens 
2014fa9e4066Sahrens 	buf_fini();
2015fa9e4066Sahrens }
2016