xref: /illumos-gate/usr/src/uts/common/fs/zfs/arc.c (revision 112fe045)
1fa9e4066Sahrens /*
2fa9e4066Sahrens  * CDDL HEADER START
3fa9e4066Sahrens  *
4fa9e4066Sahrens  * The contents of this file are subject to the terms of the
5fa9e4066Sahrens  * Common Development and Distribution License, Version 1.0 only
6fa9e4066Sahrens  * (the "License").  You may not use this file except in compliance
7fa9e4066Sahrens  * with the License.
8fa9e4066Sahrens  *
9fa9e4066Sahrens  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10fa9e4066Sahrens  * or http://www.opensolaris.org/os/licensing.
11fa9e4066Sahrens  * See the License for the specific language governing permissions
12fa9e4066Sahrens  * and limitations under the License.
13fa9e4066Sahrens  *
14fa9e4066Sahrens  * When distributing Covered Code, include this CDDL HEADER in each
15fa9e4066Sahrens  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16fa9e4066Sahrens  * If applicable, add the following below this CDDL HEADER, with the
17fa9e4066Sahrens  * fields enclosed by brackets "[]" replaced with your own identifying
18fa9e4066Sahrens  * information: Portions Copyright [yyyy] [name of copyright owner]
19fa9e4066Sahrens  *
20fa9e4066Sahrens  * CDDL HEADER END
21fa9e4066Sahrens  */
22fa9e4066Sahrens /*
23fa9e4066Sahrens  * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
24fa9e4066Sahrens  * Use is subject to license terms.
25fa9e4066Sahrens  */
26fa9e4066Sahrens 
27fa9e4066Sahrens #pragma ident	"%Z%%M%	%I%	%E% SMI"
28fa9e4066Sahrens 
29fa9e4066Sahrens /*
30fa9e4066Sahrens  * DVA-based Adjustable Replacement Cache
31fa9e4066Sahrens  *
32fa9e4066Sahrens  * While much of the theory of operation and algorithms used here
33fa9e4066Sahrens  * are based on the self-tuning, low overhead replacement cache
34fa9e4066Sahrens  * presented by Megiddo and Modha at FAST 2003, there are some
35fa9e4066Sahrens  * significant differences:
36fa9e4066Sahrens  *
37fa9e4066Sahrens  * 1. The Megiddo and Modha model assumes any page is evictable.
38fa9e4066Sahrens  * Pages in its cache cannot be "locked" into memory.  This makes
39fa9e4066Sahrens  * the eviction algorithm simple: evict the last page in the list.
40fa9e4066Sahrens  * This also makes the performance characteristics easy to reason
41fa9e4066Sahrens  * about.  Our cache is not so simple.  At any given moment, some
42fa9e4066Sahrens  * subset of the blocks in the cache are un-evictable because we
43fa9e4066Sahrens  * have handed out a reference to them.  Blocks are only evictable
44fa9e4066Sahrens  * when there are no external references active.  This makes
45fa9e4066Sahrens  * eviction far more problematic:  we choose to evict the evictable
46fa9e4066Sahrens  * blocks that are the "lowest" in the list.
47fa9e4066Sahrens  *
48fa9e4066Sahrens  * There are times when it is not possible to evict the requested
49fa9e4066Sahrens  * space.  In these circumstances we are unable to adjust the cache
50fa9e4066Sahrens  * size.  To prevent the cache growing unbounded at these times we
51fa9e4066Sahrens  * implement a "cache throttle" that slows the flow of new data
52fa9e4066Sahrens  * into the cache until we can make space available.
53fa9e4066Sahrens  *
54fa9e4066Sahrens  * 2. The Megiddo and Modha model assumes a fixed cache size.
55fa9e4066Sahrens  * Pages are evicted when the cache is full and there is a cache
56fa9e4066Sahrens  * miss.  Our model has a variable sized cache.  It grows with
57fa9e4066Sahrens  * high use, but also tries to react to memory pressure from the
58fa9e4066Sahrens  * operating system: decreasing its size when system memory is
59fa9e4066Sahrens  * tight.
60fa9e4066Sahrens  *
61fa9e4066Sahrens  * 3. The Megiddo and Modha model assumes a fixed page size. All
62fa9e4066Sahrens  * elements of the cache are therefore exactly the same size.  So
63fa9e4066Sahrens  * when adjusting the cache size following a cache miss, it's simply
64fa9e4066Sahrens  * a matter of choosing a single page to evict.  In our model, we
65fa9e4066Sahrens  * have variable sized cache blocks (ranging from 512 bytes to
66fa9e4066Sahrens  * 128K bytes).  We therefore choose a set of blocks to evict to make
67fa9e4066Sahrens  * space for a cache miss that approximates as closely as possible
68fa9e4066Sahrens  * the space used by the new block.
69fa9e4066Sahrens  *
70fa9e4066Sahrens  * See also:  "ARC: A Self-Tuning, Low Overhead Replacement Cache"
71fa9e4066Sahrens  * by N. Megiddo & D. Modha, FAST 2003
72fa9e4066Sahrens  */
73fa9e4066Sahrens 
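/*
 * For example (an illustrative sketch, not code from this file): a miss
 * for a 128K block may be satisfied by evicting a single 128K block or
 * thirty-two 4K blocks.  The eviction code simply walks a list and
 * accumulates freed space until the request is covered:
 *
 *	freed = 0;
 *	while (freed < needed && (ab = next_evictable_buffer()) != NULL)
 *		freed += ab->b_size;
 *
 * where next_evictable_buffer() is a hypothetical stand-in for the
 * tail-to-head list walk actually done in arc_evict_state() below.
 */
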
74fa9e4066Sahrens /*
75fa9e4066Sahrens  * The locking model:
76fa9e4066Sahrens  *
77fa9e4066Sahrens  * A new reference to a cache buffer can be obtained in two
78fa9e4066Sahrens  * ways: 1) via a hash table lookup using the DVA as a key,
79fa9e4066Sahrens  * or 2) via one of the ARC lists.  The arc_read() interface
80fa9e4066Sahrens  * uses method 1, while the internal arc algorithms for
81fa9e4066Sahrens  * adjusting the cache use method 2.  We therefore provide two
82fa9e4066Sahrens  * types of locks: 1) the hash table lock array, and 2) the
83fa9e4066Sahrens  * arc list locks.
84fa9e4066Sahrens  *
85fa9e4066Sahrens  * Buffers do not have their own mutexes; rather, they rely on the
86fa9e4066Sahrens  * hash table mutexes for the bulk of their protection (i.e. most
87fa9e4066Sahrens  * fields in the arc_buf_hdr_t are protected by these mutexes).
88fa9e4066Sahrens  *
89fa9e4066Sahrens  * buf_hash_find() returns the appropriate mutex (held) when it
90fa9e4066Sahrens  * locates the requested buffer in the hash table.  It returns
91fa9e4066Sahrens  * NULL for the mutex if the buffer was not in the table.
92fa9e4066Sahrens  *
93fa9e4066Sahrens  * buf_hash_remove() expects the appropriate hash mutex to be
94fa9e4066Sahrens  * already held before it is invoked.
95fa9e4066Sahrens  *
96fa9e4066Sahrens  * Each arc state also has a mutex which is used to protect the
97fa9e4066Sahrens  * buffer list associated with the state.  When attempting to
98fa9e4066Sahrens  * obtain a hash table lock while holding an arc list lock, you
99fa9e4066Sahrens  * must use mutex_tryenter() to avoid deadlock.  Also note that
100fa9e4066Sahrens  * the "top" state mutex must be held before the "bot" state mutex.
101fa9e4066Sahrens  *
102fa9e4066Sahrens  * Note that the majority of the performance stats are manipulated
103fa9e4066Sahrens  * with atomic operations.
104fa9e4066Sahrens  */
105fa9e4066Sahrens 
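/*
 * A minimal sketch of the two lookup paths described above (illustrative
 * only; error handling and failure paths omitted):
 *
 *	kmutex_t *hash_lock;
 *	arc_buf_hdr_t *hdr;
 *
 *	1) DVA lookup: the hash mutex comes back held on a hit.
 *	hdr = buf_hash_find(spa, dva, birth, &hash_lock);
 *	if (hdr != NULL) {
 *		... use hdr, protected by hash_lock ...
 *		mutex_exit(hash_lock);
 *	}
 *
 *	2) List walk: while holding a state's list mutex, only
 *	mutex_tryenter() may be used on a hash lock (see arc_evict_state()).
 *	mutex_enter(&state->mtx);
 *	if (mutex_tryenter(HDR_LOCK(ab))) {
 *		... evict or move ab ...
 *		mutex_exit(HDR_LOCK(ab));
 *	}
 *	mutex_exit(&state->mtx);
 */
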
106fa9e4066Sahrens #include <sys/spa.h>
107fa9e4066Sahrens #include <sys/zio.h>
108fa9e4066Sahrens #include <sys/zfs_context.h>
109fa9e4066Sahrens #include <sys/arc.h>
110fa9e4066Sahrens #include <sys/refcount.h>
111fa9e4066Sahrens #ifdef _KERNEL
112fa9e4066Sahrens #include <sys/vmsystm.h>
113fa9e4066Sahrens #include <vm/anon.h>
114fa9e4066Sahrens #include <sys/fs/swapnode.h>
115fa9e4066Sahrens #endif
116fa9e4066Sahrens #include <sys/callb.h>
117fa9e4066Sahrens 
118fa9e4066Sahrens static kmutex_t		arc_reclaim_thr_lock;
119fa9e4066Sahrens static kcondvar_t	arc_reclaim_thr_cv;	/* used to signal reclaim thr */
120fa9e4066Sahrens static uint8_t		arc_thread_exit;
121fa9e4066Sahrens 
122fa9e4066Sahrens typedef enum arc_reclaim_strategy {
123fa9e4066Sahrens 	ARC_RECLAIM_AGGR,		/* Aggressive reclaim strategy */
124fa9e4066Sahrens 	ARC_RECLAIM_CONS		/* Conservative reclaim strategy */
125fa9e4066Sahrens } arc_reclaim_strategy_t;
126fa9e4066Sahrens 
127fa9e4066Sahrens /* number of seconds before growing cache again */
128fa9e4066Sahrens static int		arc_grow_retry = 60;
129fa9e4066Sahrens 
130fa9e4066Sahrens static kmutex_t arc_reclaim_lock;
131fa9e4066Sahrens static int arc_dead;
132fa9e4066Sahrens 
133fa9e4066Sahrens /*
134fa9e4066Sahrens  * Note that buffers can be in one of 5 states:
135fa9e4066Sahrens  *	ARC_anon	- anonymous (discussed below)
136fa9e4066Sahrens  *	ARC_mru_top	- recently used, currently cached
137fa9e4066Sahrens  *	ARC_mru_bot	- recently used, no longer in cache
138fa9e4066Sahrens  *	ARC_mfu_top	- frequently used, currently cached
139fa9e4066Sahrens  *	ARC_mfu_bot	- frequently used, no longer in cache
140fa9e4066Sahrens  * When there are no active references to a buffer, it is
141fa9e4066Sahrens  * linked onto one of the lists in arc.  These are the
142fa9e4066Sahrens  * only buffers that can be evicted or deleted.
143fa9e4066Sahrens  *
144fa9e4066Sahrens  * Anonymous buffers are buffers that are not associated with
145fa9e4066Sahrens  * a DVA.  These are buffers that hold dirty block copies
146fa9e4066Sahrens  * before they are written to stable storage.  By definition,
147fa9e4066Sahrens  * they are "ref'd" and are considered part of arc_mru_top
148fa9e4066Sahrens  * that cannot be freed.  Generally, they will acquire a DVA
149fa9e4066Sahrens  * as they are written and migrate onto the arc_mru_top list.
150fa9e4066Sahrens  */
151fa9e4066Sahrens 
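/*
 * A typical buffer lifecycle through these states (an illustrative
 * sketch only; the transitions are driven by arc_access() and the
 * eviction code below):
 *
 *	buf = arc_buf_alloc(spa, size, tag);	   ARC_anon (dirty, ref'd)
 *	(write completes, DVA assigned)		-> ARC_mru_top
 *	arc_buf_free(buf, tag);			   (now evictable)
 *	(evicted for space)			-> ARC_mru_bot (ghost)
 *	(hit again via arc_read())		-> ARC_mfu_top
 *	(evicted again)				-> ARC_mfu_bot (ghost)
 */
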
152fa9e4066Sahrens typedef struct arc_state {
153fa9e4066Sahrens 	list_t	list;	/* linked list of evictable buffers in state */
154fa9e4066Sahrens 	uint64_t lsize;	/* total size of buffers in the linked list */
155fa9e4066Sahrens 	uint64_t size;	/* total size of all buffers in this state */
156fa9e4066Sahrens 	uint64_t hits;
157fa9e4066Sahrens 	kmutex_t mtx;
158fa9e4066Sahrens } arc_state_t;
159fa9e4066Sahrens 
160fa9e4066Sahrens /* The 5 states: */
161fa9e4066Sahrens static arc_state_t ARC_anon;
162fa9e4066Sahrens static arc_state_t ARC_mru_top;
163fa9e4066Sahrens static arc_state_t ARC_mru_bot;
164fa9e4066Sahrens static arc_state_t ARC_mfu_top;
165fa9e4066Sahrens static arc_state_t ARC_mfu_bot;
166fa9e4066Sahrens 
167fa9e4066Sahrens static struct arc {
168fa9e4066Sahrens 	arc_state_t 	*anon;
169fa9e4066Sahrens 	arc_state_t	*mru_top;
170fa9e4066Sahrens 	arc_state_t	*mru_bot;
171fa9e4066Sahrens 	arc_state_t	*mfu_top;
172fa9e4066Sahrens 	arc_state_t	*mfu_bot;
173fa9e4066Sahrens 	uint64_t	size;		/* Actual total arc size */
174fa9e4066Sahrens 	uint64_t	p;		/* Target size (in bytes) of mru_top */
175fa9e4066Sahrens 	uint64_t	c;		/* Target size of cache (in bytes) */
176fa9e4066Sahrens 	uint64_t	c_min;		/* Minimum target cache size */
177fa9e4066Sahrens 	uint64_t	c_max;		/* Maximum target cache size */
178fa9e4066Sahrens 	uint64_t	incr;		/* Size by which to increment arc.c */
179fa9e4066Sahrens 	int64_t		size_check;
180fa9e4066Sahrens 
181fa9e4066Sahrens 	/* performance stats */
182fa9e4066Sahrens 	uint64_t	hits;
183fa9e4066Sahrens 	uint64_t	misses;
184fa9e4066Sahrens 	uint64_t	deleted;
185fa9e4066Sahrens 	uint64_t	skipped;
186fa9e4066Sahrens 	uint64_t	hash_elements;
187fa9e4066Sahrens 	uint64_t	hash_elements_max;
188fa9e4066Sahrens 	uint64_t	hash_collisions;
189fa9e4066Sahrens 	uint64_t	hash_chains;
190fa9e4066Sahrens 	uint32_t	hash_chain_max;
191fa9e4066Sahrens 
192fa9e4066Sahrens 	int		no_grow;	/* Don't try to grow cache size */
193fa9e4066Sahrens } arc;
194fa9e4066Sahrens 
195fa9e4066Sahrens /* Default amount to grow arc.incr */
196fa9e4066Sahrens static int64_t arc_incr_size = 1024;
197fa9e4066Sahrens 
198fa9e4066Sahrens /* > 0 ==> time to increment arc.c */
199fa9e4066Sahrens static int64_t arc_size_check_default = -1000;
200fa9e4066Sahrens 
201fa9e4066Sahrens static uint64_t arc_tempreserve;
202fa9e4066Sahrens 
203fa9e4066Sahrens typedef struct arc_callback arc_callback_t;
204fa9e4066Sahrens 
205fa9e4066Sahrens struct arc_callback {
206fa9e4066Sahrens 	arc_done_func_t		*acb_done;
207fa9e4066Sahrens 	void			*acb_private;
208fa9e4066Sahrens 	arc_byteswap_func_t	*acb_byteswap;
209fa9e4066Sahrens 	arc_buf_t		*acb_buf;
210fa9e4066Sahrens 	zio_t			*acb_zio_dummy;
211fa9e4066Sahrens 	arc_callback_t		*acb_next;
212fa9e4066Sahrens };
213fa9e4066Sahrens 
214fa9e4066Sahrens struct arc_buf_hdr {
215fa9e4066Sahrens 	/* immutable */
216fa9e4066Sahrens 	uint64_t		b_size;
217fa9e4066Sahrens 	spa_t			*b_spa;
218fa9e4066Sahrens 
219fa9e4066Sahrens 	/* protected by hash lock */
220fa9e4066Sahrens 	dva_t			b_dva;
221fa9e4066Sahrens 	uint64_t		b_birth;
222fa9e4066Sahrens 	uint64_t		b_cksum0;
223fa9e4066Sahrens 
224fa9e4066Sahrens 	arc_buf_hdr_t		*b_hash_next;
225fa9e4066Sahrens 	arc_buf_t		*b_buf;
226fa9e4066Sahrens 	uint32_t		b_flags;
227fa9e4066Sahrens 
228fa9e4066Sahrens 	kcondvar_t		b_cv;
229fa9e4066Sahrens 	arc_callback_t		*b_acb;
230fa9e4066Sahrens 
231fa9e4066Sahrens 	/* protected by arc state mutex */
232fa9e4066Sahrens 	arc_state_t		*b_state;
233fa9e4066Sahrens 	list_node_t		b_arc_node;
234fa9e4066Sahrens 
235fa9e4066Sahrens 	/* updated atomically */
236fa9e4066Sahrens 	clock_t			b_arc_access;
237fa9e4066Sahrens 
238fa9e4066Sahrens 	/* self protecting */
239fa9e4066Sahrens 	refcount_t		b_refcnt;
240fa9e4066Sahrens };
241fa9e4066Sahrens 
242fa9e4066Sahrens /*
243fa9e4066Sahrens  * Private ARC flags.  These flags are private ARC only flags that will show up
244fa9e4066Sahrens  * in b_flags in the arc_buf_hdr_t.  Some flags are publicly declared, and can
245fa9e4066Sahrens  * be passed in as arc_flags in things like arc_read.  However, these private
246fa9e4066Sahrens  * flags should never be passed in and are only set by ARC code.  When adding
247fa9e4066Sahrens  * new public flags, make sure not to smash the private ones.
248fa9e4066Sahrens  */
249fa9e4066Sahrens 
250fa9e4066Sahrens #define	ARC_IO_IN_PROGRESS	(1 << 10)	/* I/O in progress for buf */
251fa9e4066Sahrens #define	ARC_IO_ERROR		(1 << 11)	/* I/O failed for buf */
252fa9e4066Sahrens #define	ARC_FREED_IN_READ	(1 << 12)	/* buf freed while in read */
253fa9e4066Sahrens 
254fa9e4066Sahrens #define	HDR_IO_IN_PROGRESS(hdr)	((hdr)->b_flags & ARC_IO_IN_PROGRESS)
255fa9e4066Sahrens #define	HDR_IO_ERROR(hdr)	((hdr)->b_flags & ARC_IO_ERROR)
256fa9e4066Sahrens #define	HDR_FREED_IN_READ(hdr)	((hdr)->b_flags & ARC_FREED_IN_READ)
257fa9e4066Sahrens 
258fa9e4066Sahrens /*
259fa9e4066Sahrens  * Hash table routines
260fa9e4066Sahrens  */
261fa9e4066Sahrens 
262fa9e4066Sahrens #define	HT_LOCK_PAD	64
263fa9e4066Sahrens 
264fa9e4066Sahrens struct ht_lock {
265fa9e4066Sahrens 	kmutex_t	ht_lock;
266fa9e4066Sahrens #ifdef _KERNEL
267fa9e4066Sahrens 	unsigned char	pad[(HT_LOCK_PAD - sizeof (kmutex_t))];
268fa9e4066Sahrens #endif
269fa9e4066Sahrens };
270fa9e4066Sahrens 
271fa9e4066Sahrens #define	BUF_LOCKS 256
272fa9e4066Sahrens typedef struct buf_hash_table {
273fa9e4066Sahrens 	uint64_t ht_mask;
274fa9e4066Sahrens 	arc_buf_hdr_t **ht_table;
275fa9e4066Sahrens 	struct ht_lock ht_locks[BUF_LOCKS];
276fa9e4066Sahrens } buf_hash_table_t;
277fa9e4066Sahrens 
278fa9e4066Sahrens static buf_hash_table_t buf_hash_table;
279fa9e4066Sahrens 
280fa9e4066Sahrens #define	BUF_HASH_INDEX(spa, dva, birth) \
281fa9e4066Sahrens 	(buf_hash(spa, dva, birth) & buf_hash_table.ht_mask)
282fa9e4066Sahrens #define	BUF_HASH_LOCK_NTRY(idx) (buf_hash_table.ht_locks[idx & (BUF_LOCKS-1)])
283fa9e4066Sahrens #define	BUF_HASH_LOCK(idx)	(&(BUF_HASH_LOCK_NTRY(idx).ht_lock))
284fa9e4066Sahrens #define	HDR_LOCK(buf) \
285fa9e4066Sahrens 	(BUF_HASH_LOCK(BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth)))
286fa9e4066Sahrens 
287fa9e4066Sahrens uint64_t zfs_crc64_table[256];
288fa9e4066Sahrens 
289fa9e4066Sahrens static uint64_t
290fa9e4066Sahrens buf_hash(spa_t *spa, dva_t *dva, uint64_t birth)
291fa9e4066Sahrens {
292fa9e4066Sahrens 	uintptr_t spav = (uintptr_t)spa;
293fa9e4066Sahrens 	uint8_t *vdva = (uint8_t *)dva;
294fa9e4066Sahrens 	uint64_t crc = -1ULL;
295fa9e4066Sahrens 	int i;
296fa9e4066Sahrens 
297fa9e4066Sahrens 	ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
298fa9e4066Sahrens 
299fa9e4066Sahrens 	for (i = 0; i < sizeof (dva_t); i++)
300fa9e4066Sahrens 		crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ vdva[i]) & 0xFF];
301fa9e4066Sahrens 
302fa9e4066Sahrens 	crc ^= (spav>>8) ^ birth;
303fa9e4066Sahrens 
304fa9e4066Sahrens 	return (crc);
305fa9e4066Sahrens }
306fa9e4066Sahrens 
307fa9e4066Sahrens #define	BUF_EMPTY(buf)						\
308fa9e4066Sahrens 	((buf)->b_dva.dva_word[0] == 0 &&			\
309fa9e4066Sahrens 	(buf)->b_dva.dva_word[1] == 0 &&			\
310fa9e4066Sahrens 	(buf)->b_birth == 0)
311fa9e4066Sahrens 
312fa9e4066Sahrens #define	BUF_EQUAL(spa, dva, birth, buf)				\
313fa9e4066Sahrens 	((buf)->b_dva.dva_word[0] == (dva)->dva_word[0]) &&	\
314fa9e4066Sahrens 	((buf)->b_dva.dva_word[1] == (dva)->dva_word[1]) &&	\
315fa9e4066Sahrens 	((buf)->b_birth == birth) && ((buf)->b_spa == spa)
316fa9e4066Sahrens 
317fa9e4066Sahrens static arc_buf_hdr_t *
318fa9e4066Sahrens buf_hash_find(spa_t *spa, dva_t *dva, uint64_t birth, kmutex_t **lockp)
319fa9e4066Sahrens {
320fa9e4066Sahrens 	uint64_t idx = BUF_HASH_INDEX(spa, dva, birth);
321fa9e4066Sahrens 	kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
322fa9e4066Sahrens 	arc_buf_hdr_t *buf;
323fa9e4066Sahrens 
324fa9e4066Sahrens 	mutex_enter(hash_lock);
325fa9e4066Sahrens 	for (buf = buf_hash_table.ht_table[idx]; buf != NULL;
326fa9e4066Sahrens 	    buf = buf->b_hash_next) {
327fa9e4066Sahrens 		if (BUF_EQUAL(spa, dva, birth, buf)) {
328fa9e4066Sahrens 			*lockp = hash_lock;
329fa9e4066Sahrens 			return (buf);
330fa9e4066Sahrens 		}
331fa9e4066Sahrens 	}
332fa9e4066Sahrens 	mutex_exit(hash_lock);
333fa9e4066Sahrens 	*lockp = NULL;
334fa9e4066Sahrens 	return (NULL);
335fa9e4066Sahrens }
336fa9e4066Sahrens 
337fa9e4066Sahrens /*
338fa9e4066Sahrens  * Insert an entry into the hash table.  If there is already an element
339fa9e4066Sahrens  * equal to elem in the hash table, then the already existing element
340fa9e4066Sahrens  * will be returned and the new element will not be inserted.
341fa9e4066Sahrens  * Otherwise returns NULL.
342fa9e4066Sahrens  */
343fa9e4066Sahrens static arc_buf_hdr_t *fbufs[4]; /* XXX to find 6341326 */
344fa9e4066Sahrens static kthread_t *fbufs_lastthread;
345fa9e4066Sahrens static arc_buf_hdr_t *
346fa9e4066Sahrens buf_hash_insert(arc_buf_hdr_t *buf, kmutex_t **lockp)
347fa9e4066Sahrens {
348fa9e4066Sahrens 	uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth);
349fa9e4066Sahrens 	kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
350fa9e4066Sahrens 	arc_buf_hdr_t *fbuf;
351fa9e4066Sahrens 	uint32_t max, i;
352fa9e4066Sahrens 
353fa9e4066Sahrens 	fbufs_lastthread = curthread;
354fa9e4066Sahrens 	*lockp = hash_lock;
355fa9e4066Sahrens 	mutex_enter(hash_lock);
356fa9e4066Sahrens 	for (fbuf = buf_hash_table.ht_table[idx], i = 0; fbuf != NULL;
357fa9e4066Sahrens 	    fbuf = fbuf->b_hash_next, i++) {
358fa9e4066Sahrens 		if (i < sizeof (fbufs) / sizeof (fbufs[0]))
359fa9e4066Sahrens 			fbufs[i] = fbuf;
360fa9e4066Sahrens 		if (BUF_EQUAL(buf->b_spa, &buf->b_dva, buf->b_birth, fbuf))
361fa9e4066Sahrens 			return (fbuf);
362fa9e4066Sahrens 	}
363fa9e4066Sahrens 
364fa9e4066Sahrens 	buf->b_hash_next = buf_hash_table.ht_table[idx];
365fa9e4066Sahrens 	buf_hash_table.ht_table[idx] = buf;
366fa9e4066Sahrens 
367fa9e4066Sahrens 	/* collect some hash table performance data */
368fa9e4066Sahrens 	if (i > 0) {
369fa9e4066Sahrens 		atomic_add_64(&arc.hash_collisions, 1);
370fa9e4066Sahrens 		if (i == 1)
371fa9e4066Sahrens 			atomic_add_64(&arc.hash_chains, 1);
372fa9e4066Sahrens 	}
373fa9e4066Sahrens 	while (i > (max = arc.hash_chain_max) &&
374fa9e4066Sahrens 	    max != atomic_cas_32(&arc.hash_chain_max, max, i)) {
375fa9e4066Sahrens 		continue;
376fa9e4066Sahrens 	}
377fa9e4066Sahrens 	atomic_add_64(&arc.hash_elements, 1);
378fa9e4066Sahrens 	if (arc.hash_elements > arc.hash_elements_max)
379fa9e4066Sahrens 		atomic_add_64(&arc.hash_elements_max, 1);
380fa9e4066Sahrens 
381fa9e4066Sahrens 	return (NULL);
382fa9e4066Sahrens }
383fa9e4066Sahrens 
384fa9e4066Sahrens static void
385fa9e4066Sahrens buf_hash_remove(arc_buf_hdr_t *buf)
386fa9e4066Sahrens {
387fa9e4066Sahrens 	arc_buf_hdr_t *fbuf, **bufp;
388fa9e4066Sahrens 	uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth);
389fa9e4066Sahrens 
390fa9e4066Sahrens 	ASSERT(MUTEX_HELD(BUF_HASH_LOCK(idx)));
391fa9e4066Sahrens 
392fa9e4066Sahrens 	bufp = &buf_hash_table.ht_table[idx];
393fa9e4066Sahrens 	while ((fbuf = *bufp) != buf) {
394fa9e4066Sahrens 		ASSERT(fbuf != NULL);
395fa9e4066Sahrens 		bufp = &fbuf->b_hash_next;
396fa9e4066Sahrens 	}
397fa9e4066Sahrens 	*bufp = buf->b_hash_next;
398fa9e4066Sahrens 	buf->b_hash_next = NULL;
399fa9e4066Sahrens 
400fa9e4066Sahrens 	/* collect some hash table performance data */
401fa9e4066Sahrens 	atomic_add_64(&arc.hash_elements, -1);
402fa9e4066Sahrens 	if (buf_hash_table.ht_table[idx] &&
403fa9e4066Sahrens 	    buf_hash_table.ht_table[idx]->b_hash_next == NULL)
404fa9e4066Sahrens 		atomic_add_64(&arc.hash_chains, -1);
405fa9e4066Sahrens }
406fa9e4066Sahrens 
407fa9e4066Sahrens /*
408fa9e4066Sahrens  * Global data structures and functions for the buf kmem cache.
409fa9e4066Sahrens  */
410fa9e4066Sahrens static kmem_cache_t *hdr_cache;
411fa9e4066Sahrens static kmem_cache_t *buf_cache;
412fa9e4066Sahrens 
413fa9e4066Sahrens static void
414fa9e4066Sahrens buf_fini(void)
415fa9e4066Sahrens {
416fa9e4066Sahrens 	int i;
417fa9e4066Sahrens 
418fa9e4066Sahrens 	kmem_free(buf_hash_table.ht_table,
419fa9e4066Sahrens 	    (buf_hash_table.ht_mask + 1) * sizeof (void *));
420fa9e4066Sahrens 	for (i = 0; i < BUF_LOCKS; i++)
421fa9e4066Sahrens 		mutex_destroy(&buf_hash_table.ht_locks[i].ht_lock);
422fa9e4066Sahrens 	kmem_cache_destroy(hdr_cache);
423fa9e4066Sahrens 	kmem_cache_destroy(buf_cache);
424fa9e4066Sahrens }
425fa9e4066Sahrens 
426fa9e4066Sahrens /*
427fa9e4066Sahrens  * Constructor callback - called when the cache is empty
428fa9e4066Sahrens  * and a new buf is requested.
429fa9e4066Sahrens  */
430fa9e4066Sahrens /* ARGSUSED */
431fa9e4066Sahrens static int
432fa9e4066Sahrens hdr_cons(void *vbuf, void *unused, int kmflag)
433fa9e4066Sahrens {
434fa9e4066Sahrens 	arc_buf_hdr_t *buf = vbuf;
435fa9e4066Sahrens 
436fa9e4066Sahrens 	bzero(buf, sizeof (arc_buf_hdr_t));
437fa9e4066Sahrens 	refcount_create(&buf->b_refcnt);
438fa9e4066Sahrens 	cv_init(&buf->b_cv, NULL, CV_DEFAULT, NULL);
439fa9e4066Sahrens 	return (0);
440fa9e4066Sahrens }
441fa9e4066Sahrens 
442fa9e4066Sahrens /*
443fa9e4066Sahrens  * Destructor callback - called when a cached buf is
444fa9e4066Sahrens  * no longer required.
445fa9e4066Sahrens  */
446fa9e4066Sahrens /* ARGSUSED */
447fa9e4066Sahrens static void
448fa9e4066Sahrens hdr_dest(void *vbuf, void *unused)
449fa9e4066Sahrens {
450fa9e4066Sahrens 	arc_buf_hdr_t *buf = vbuf;
451fa9e4066Sahrens 
452fa9e4066Sahrens 	refcount_destroy(&buf->b_refcnt);
453fa9e4066Sahrens 	cv_destroy(&buf->b_cv);
454fa9e4066Sahrens }
455fa9e4066Sahrens 
456fa9e4066Sahrens void arc_kmem_reclaim(void);
457fa9e4066Sahrens 
458fa9e4066Sahrens /*
459fa9e4066Sahrens  * Reclaim callback -- invoked when memory is low.
460fa9e4066Sahrens  */
461fa9e4066Sahrens /* ARGSUSED */
462fa9e4066Sahrens static void
463fa9e4066Sahrens hdr_recl(void *unused)
464fa9e4066Sahrens {
465fa9e4066Sahrens 	dprintf("hdr_recl called\n");
466fa9e4066Sahrens 	arc_kmem_reclaim();
467fa9e4066Sahrens }
468fa9e4066Sahrens 
469fa9e4066Sahrens static void
470fa9e4066Sahrens buf_init(void)
471fa9e4066Sahrens {
472fa9e4066Sahrens 	uint64_t *ct;
473fa9e4066Sahrens 	uint64_t hsize = 1ULL << 10;
474fa9e4066Sahrens 	int i, j;
475fa9e4066Sahrens 
476fa9e4066Sahrens 	/*
477fa9e4066Sahrens 	 * The hash table is big enough to fill all of physical memory
478fa9e4066Sahrens 	 * with an average 4k block size.  The table will take up
479fa9e4066Sahrens 	 * totalmem*sizeof(void*)/4k bytes (eg. 2MB/GB with 8-byte
480fa9e4066Sahrens 	 * pointers).
481fa9e4066Sahrens 	 */
482fa9e4066Sahrens 	while (hsize * 4096 < physmem * PAGESIZE)
483fa9e4066Sahrens 		hsize <<= 1;
484fa9e4066Sahrens 
485fa9e4066Sahrens 	buf_hash_table.ht_mask = hsize - 1;
486fa9e4066Sahrens 	buf_hash_table.ht_table = kmem_zalloc(hsize * sizeof (void*), KM_SLEEP);
487fa9e4066Sahrens 
488fa9e4066Sahrens 	hdr_cache = kmem_cache_create("arc_buf_hdr_t", sizeof (arc_buf_hdr_t),
489fa9e4066Sahrens 	    0, hdr_cons, hdr_dest, hdr_recl, NULL, NULL, 0);
490fa9e4066Sahrens 	buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t),
491fa9e4066Sahrens 	    0, NULL, NULL, NULL, NULL, NULL, 0);
492fa9e4066Sahrens 
493fa9e4066Sahrens 	for (i = 0; i < 256; i++)
494fa9e4066Sahrens 		for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--)
495fa9e4066Sahrens 			*ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY);
496fa9e4066Sahrens 
497fa9e4066Sahrens 	for (i = 0; i < BUF_LOCKS; i++) {
498fa9e4066Sahrens 		mutex_init(&buf_hash_table.ht_locks[i].ht_lock,
499fa9e4066Sahrens 		    NULL, MUTEX_DEFAULT, NULL);
500fa9e4066Sahrens 	}
501fa9e4066Sahrens }
502fa9e4066Sahrens 
503fa9e4066Sahrens #define	ARC_MINTIME	(hz>>4) /* 62 ms */
504fa9e4066Sahrens 
505fa9e4066Sahrens #define	ARC_TAG		(void *)0x05201962
506fa9e4066Sahrens 
507fa9e4066Sahrens static void
508fa9e4066Sahrens add_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag)
509fa9e4066Sahrens {
510fa9e4066Sahrens 	ASSERT(MUTEX_HELD(hash_lock));
511fa9e4066Sahrens 
512fa9e4066Sahrens 	if ((refcount_add(&ab->b_refcnt, tag) == 1) &&
513fa9e4066Sahrens 	    (ab->b_state != arc.anon)) {
514fa9e4066Sahrens 
515fa9e4066Sahrens 		ASSERT(!MUTEX_HELD(&ab->b_state->mtx));
516fa9e4066Sahrens 		mutex_enter(&ab->b_state->mtx);
517fa9e4066Sahrens 		ASSERT(!refcount_is_zero(&ab->b_refcnt));
518fa9e4066Sahrens 		ASSERT(list_link_active(&ab->b_arc_node));
519fa9e4066Sahrens 		list_remove(&ab->b_state->list, ab);
520fa9e4066Sahrens 		ASSERT3U(ab->b_state->lsize, >=, ab->b_size);
521fa9e4066Sahrens 		ab->b_state->lsize -= ab->b_size;
522fa9e4066Sahrens 		mutex_exit(&ab->b_state->mtx);
523fa9e4066Sahrens 	}
524fa9e4066Sahrens }
525fa9e4066Sahrens 
526fa9e4066Sahrens static int
527fa9e4066Sahrens remove_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag)
528fa9e4066Sahrens {
529fa9e4066Sahrens 	int cnt;
530fa9e4066Sahrens 
531fa9e4066Sahrens 	ASSERT(MUTEX_HELD(hash_lock));
532fa9e4066Sahrens 
533fa9e4066Sahrens 	if (((cnt = refcount_remove(&ab->b_refcnt, tag)) == 0) &&
534fa9e4066Sahrens 	    (ab->b_state != arc.anon)) {
535fa9e4066Sahrens 
536fa9e4066Sahrens 		ASSERT(!MUTEX_HELD(&ab->b_state->mtx));
537fa9e4066Sahrens 		mutex_enter(&ab->b_state->mtx);
538fa9e4066Sahrens 		ASSERT(!list_link_active(&ab->b_arc_node));
539fa9e4066Sahrens 		list_insert_head(&ab->b_state->list, ab);
540fa9e4066Sahrens 		ASSERT(ab->b_buf != NULL);
541fa9e4066Sahrens 		ab->b_state->lsize += ab->b_size;
542fa9e4066Sahrens 		mutex_exit(&ab->b_state->mtx);
543fa9e4066Sahrens 	}
544fa9e4066Sahrens 	return (cnt);
545fa9e4066Sahrens }
546fa9e4066Sahrens 
547fa9e4066Sahrens /*
548fa9e4066Sahrens  * Move the supplied buffer to the indicated state.  The mutex
549fa9e4066Sahrens  * for the buffer must be held by the caller.
550fa9e4066Sahrens  */
551fa9e4066Sahrens static void
552fa9e4066Sahrens arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab,
553fa9e4066Sahrens     kmutex_t *hash_lock)
554fa9e4066Sahrens {
555fa9e4066Sahrens 	arc_buf_t *buf;
556fa9e4066Sahrens 
557fa9e4066Sahrens 	ASSERT(MUTEX_HELD(hash_lock));
558fa9e4066Sahrens 
559fa9e4066Sahrens 	/*
560fa9e4066Sahrens 	 * If this buffer is evictable, transfer it from the
561fa9e4066Sahrens 	 * old state list to the new state list.
562fa9e4066Sahrens 	 */
563fa9e4066Sahrens 	if (refcount_is_zero(&ab->b_refcnt)) {
564fa9e4066Sahrens 		if (ab->b_state != arc.anon) {
565fa9e4066Sahrens 			int drop_mutex = FALSE;
566fa9e4066Sahrens 
567fa9e4066Sahrens 			if (!MUTEX_HELD(&ab->b_state->mtx)) {
568fa9e4066Sahrens 				mutex_enter(&ab->b_state->mtx);
569fa9e4066Sahrens 				drop_mutex = TRUE;
570fa9e4066Sahrens 			}
571fa9e4066Sahrens 			ASSERT(list_link_active(&ab->b_arc_node));
572fa9e4066Sahrens 			list_remove(&ab->b_state->list, ab);
573fa9e4066Sahrens 			ASSERT3U(ab->b_state->lsize, >=, ab->b_size);
574fa9e4066Sahrens 			ab->b_state->lsize -= ab->b_size;
575fa9e4066Sahrens 			if (drop_mutex)
576fa9e4066Sahrens 				mutex_exit(&ab->b_state->mtx);
577fa9e4066Sahrens 		}
578fa9e4066Sahrens 		if (new_state != arc.anon) {
579fa9e4066Sahrens 			int drop_mutex = FALSE;
580fa9e4066Sahrens 
581fa9e4066Sahrens 			if (!MUTEX_HELD(&new_state->mtx)) {
582fa9e4066Sahrens 				mutex_enter(&new_state->mtx);
583fa9e4066Sahrens 				drop_mutex = TRUE;
584fa9e4066Sahrens 			}
585fa9e4066Sahrens 			list_insert_head(&new_state->list, ab);
586fa9e4066Sahrens 			ASSERT(ab->b_buf != NULL);
587fa9e4066Sahrens 			new_state->lsize += ab->b_size;
588fa9e4066Sahrens 			if (drop_mutex)
589fa9e4066Sahrens 				mutex_exit(&new_state->mtx);
590fa9e4066Sahrens 		}
591fa9e4066Sahrens 	}
592fa9e4066Sahrens 
593fa9e4066Sahrens 	ASSERT(!BUF_EMPTY(ab));
594fa9e4066Sahrens 	if (new_state == arc.anon && ab->b_state != arc.anon) {
595fa9e4066Sahrens 		buf_hash_remove(ab);
596fa9e4066Sahrens 	}
597fa9e4066Sahrens 
598fa9e4066Sahrens 	/*
599fa9e4066Sahrens 	 * If this buffer isn't being transferred to the MRU-top
600fa9e4066Sahrens 	 * state, it's safe to clear its prefetch flag
601fa9e4066Sahrens 	 */
602fa9e4066Sahrens 	if ((new_state != arc.mru_top) && (new_state != arc.mru_bot)) {
603fa9e4066Sahrens 		ab->b_flags &= ~ARC_PREFETCH;
604fa9e4066Sahrens 	}
605fa9e4066Sahrens 
606fa9e4066Sahrens 	buf = ab->b_buf;
607fa9e4066Sahrens 	if (buf == NULL) {
608fa9e4066Sahrens 		ASSERT3U(ab->b_state->size, >=, ab->b_size);
609fa9e4066Sahrens 		atomic_add_64(&ab->b_state->size, -ab->b_size);
610fa9e4066Sahrens 		/* we should only be here if we are deleting state */
611fa9e4066Sahrens 		ASSERT(new_state == arc.anon &&
612fa9e4066Sahrens 		    (ab->b_state == arc.mru_bot || ab->b_state == arc.mfu_bot));
613fa9e4066Sahrens 	} else while (buf) {
614fa9e4066Sahrens 		ASSERT3U(ab->b_state->size, >=, ab->b_size);
615fa9e4066Sahrens 		atomic_add_64(&ab->b_state->size, -ab->b_size);
616fa9e4066Sahrens 		atomic_add_64(&new_state->size, ab->b_size);
617fa9e4066Sahrens 		buf = buf->b_next;
618fa9e4066Sahrens 	}
619fa9e4066Sahrens 	ab->b_state = new_state;
620fa9e4066Sahrens }
621fa9e4066Sahrens 
622fa9e4066Sahrens arc_buf_t *
623fa9e4066Sahrens arc_buf_alloc(spa_t *spa, int size, void *tag)
624fa9e4066Sahrens {
625fa9e4066Sahrens 	arc_buf_hdr_t *hdr;
626fa9e4066Sahrens 	arc_buf_t *buf;
627fa9e4066Sahrens 
628fa9e4066Sahrens 	ASSERT3U(size, >, 0);
629fa9e4066Sahrens 	hdr = kmem_cache_alloc(hdr_cache, KM_SLEEP);
630fa9e4066Sahrens 	ASSERT(BUF_EMPTY(hdr));
631fa9e4066Sahrens 	hdr->b_size = size;
632fa9e4066Sahrens 	hdr->b_spa = spa;
633fa9e4066Sahrens 	hdr->b_state = arc.anon;
634fa9e4066Sahrens 	hdr->b_arc_access = 0;
635fa9e4066Sahrens 	buf = kmem_cache_alloc(buf_cache, KM_SLEEP);
636fa9e4066Sahrens 	buf->b_hdr = hdr;
637fa9e4066Sahrens 	buf->b_next = NULL;
638fa9e4066Sahrens 	buf->b_data = zio_buf_alloc(size);
639fa9e4066Sahrens 	hdr->b_buf = buf;
640fa9e4066Sahrens 	hdr->b_flags = 0;
641fa9e4066Sahrens 	ASSERT(refcount_is_zero(&hdr->b_refcnt));
642fa9e4066Sahrens 	(void) refcount_add(&hdr->b_refcnt, tag);
643fa9e4066Sahrens 
644fa9e4066Sahrens 	atomic_add_64(&arc.size, size);
645fa9e4066Sahrens 	atomic_add_64(&arc.anon->size, size);
646fa9e4066Sahrens 
647fa9e4066Sahrens 	return (buf);
648fa9e4066Sahrens }
649fa9e4066Sahrens 
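/*
 * Example usage of the allocation interface above (an illustrative
 * sketch only; "src" is a hypothetical source buffer, and the tag
 * passed to arc_buf_free() must match the one used at allocation):
 *
 *	arc_buf_t *buf = arc_buf_alloc(spa, SPA_MINBLOCKSIZE, FTAG);
 *	ASSERT3U(arc_buf_size(buf), ==, SPA_MINBLOCKSIZE);
 *	bcopy(src, buf->b_data, arc_buf_size(buf));
 *	...
 *	arc_buf_free(buf, FTAG);
 */
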
650fa9e4066Sahrens static void
651fa9e4066Sahrens arc_hdr_free(arc_buf_hdr_t *hdr)
652fa9e4066Sahrens {
653fa9e4066Sahrens 	ASSERT(refcount_is_zero(&hdr->b_refcnt));
654fa9e4066Sahrens 	ASSERT3P(hdr->b_state, ==, arc.anon);
655fa9e4066Sahrens 
656fa9e4066Sahrens 	if (!BUF_EMPTY(hdr)) {
657fa9e4066Sahrens 		/*
658fa9e4066Sahrens 		 * We can be called with an arc state lock held,
659fa9e4066Sahrens 		 * so we can't hold a hash lock here.
660fa9e4066Sahrens 		 * ASSERT(not in hash table)
661fa9e4066Sahrens 		 */
662fa9e4066Sahrens 		ASSERT(!HDR_IO_IN_PROGRESS(hdr));
663fa9e4066Sahrens 		bzero(&hdr->b_dva, sizeof (dva_t));
664fa9e4066Sahrens 		hdr->b_birth = 0;
665fa9e4066Sahrens 		hdr->b_cksum0 = 0;
666fa9e4066Sahrens 	}
667fa9e4066Sahrens 	if (hdr->b_buf) {
668fa9e4066Sahrens 		arc_buf_t *buf = hdr->b_buf;
669fa9e4066Sahrens 
670fa9e4066Sahrens 		ASSERT3U(hdr->b_size, >, 0);
671fa9e4066Sahrens 		zio_buf_free(buf->b_data, hdr->b_size);
672fa9e4066Sahrens 		atomic_add_64(&arc.size, -hdr->b_size);
673fa9e4066Sahrens 		ASSERT3U(arc.anon->size, >=, hdr->b_size);
674fa9e4066Sahrens 		atomic_add_64(&arc.anon->size, -hdr->b_size);
675fa9e4066Sahrens 		ASSERT3P(buf->b_next, ==, NULL);
676fa9e4066Sahrens 		kmem_cache_free(buf_cache, buf);
677fa9e4066Sahrens 		hdr->b_buf = NULL;
678fa9e4066Sahrens 	}
679fa9e4066Sahrens 	ASSERT(!list_link_active(&hdr->b_arc_node));
680fa9e4066Sahrens 	ASSERT3P(hdr->b_hash_next, ==, NULL);
681fa9e4066Sahrens 	ASSERT3P(hdr->b_acb, ==, NULL);
682fa9e4066Sahrens 	kmem_cache_free(hdr_cache, hdr);
683fa9e4066Sahrens }
684fa9e4066Sahrens 
685fa9e4066Sahrens void
686fa9e4066Sahrens arc_buf_free(arc_buf_t *buf, void *tag)
687fa9e4066Sahrens {
688fa9e4066Sahrens 	arc_buf_hdr_t *hdr = buf->b_hdr;
689fa9e4066Sahrens 	kmutex_t *hash_lock = HDR_LOCK(hdr);
690fa9e4066Sahrens 	int freeable;
691fa9e4066Sahrens 
692fa9e4066Sahrens 	mutex_enter(hash_lock);
693fa9e4066Sahrens 	if (remove_reference(hdr, hash_lock, tag) > 0) {
694fa9e4066Sahrens 		arc_buf_t **bufp = &hdr->b_buf;
695fa9e4066Sahrens 		arc_state_t *state = hdr->b_state;
696fa9e4066Sahrens 		uint64_t size = hdr->b_size;
697fa9e4066Sahrens 
698fa9e4066Sahrens 		ASSERT(hdr->b_state != arc.anon || HDR_IO_ERROR(hdr));
699fa9e4066Sahrens 		while (*bufp != buf) {
700fa9e4066Sahrens 			ASSERT(*bufp);
701fa9e4066Sahrens 			bufp = &(*bufp)->b_next;
702fa9e4066Sahrens 		}
703fa9e4066Sahrens 		*bufp = buf->b_next;
704fa9e4066Sahrens 		mutex_exit(hash_lock);
705fa9e4066Sahrens 		zio_buf_free(buf->b_data, size);
706fa9e4066Sahrens 		atomic_add_64(&arc.size, -size);
707fa9e4066Sahrens 		kmem_cache_free(buf_cache, buf);
708fa9e4066Sahrens 		ASSERT3U(state->size, >=, size);
709fa9e4066Sahrens 		atomic_add_64(&state->size, -size);
710fa9e4066Sahrens 		return;
711fa9e4066Sahrens 	}
712fa9e4066Sahrens 
713fa9e4066Sahrens 	/* don't free buffers that are in the middle of an async write */
714fa9e4066Sahrens 	freeable = (hdr->b_state == arc.anon && hdr->b_acb == NULL);
715fa9e4066Sahrens 	mutex_exit(hash_lock);
716fa9e4066Sahrens 
717fa9e4066Sahrens 	if (freeable)
718fa9e4066Sahrens 		arc_hdr_free(hdr);
719fa9e4066Sahrens }
720fa9e4066Sahrens 
721fa9e4066Sahrens int
722fa9e4066Sahrens arc_buf_size(arc_buf_t *buf)
723fa9e4066Sahrens {
724fa9e4066Sahrens 	return (buf->b_hdr->b_size);
725fa9e4066Sahrens }
726fa9e4066Sahrens 
727fa9e4066Sahrens /*
728fa9e4066Sahrens  * Evict buffers from list until we've removed the specified number of
729fa9e4066Sahrens  * bytes.  Move the removed buffers to the appropriate evict state.
730fa9e4066Sahrens  */
731fa9e4066Sahrens static uint64_t
732fa9e4066Sahrens arc_evict_state(arc_state_t *state, int64_t bytes)
733fa9e4066Sahrens {
734fa9e4066Sahrens 	arc_state_t *evicted_state;
735fa9e4066Sahrens 	uint64_t bytes_evicted = 0;
736fa9e4066Sahrens 	arc_buf_hdr_t *ab, *ab_prev;
737fa9e4066Sahrens 	kmutex_t *hash_lock;
738fa9e4066Sahrens 
739fa9e4066Sahrens 	ASSERT(state == arc.mru_top || state == arc.mfu_top);
740fa9e4066Sahrens 
741fa9e4066Sahrens 	if (state == arc.mru_top)
742fa9e4066Sahrens 		evicted_state = arc.mru_bot;
743fa9e4066Sahrens 	else
744fa9e4066Sahrens 		evicted_state = arc.mfu_bot;
745fa9e4066Sahrens 
746fa9e4066Sahrens 	mutex_enter(&state->mtx);
747fa9e4066Sahrens 	mutex_enter(&evicted_state->mtx);
748fa9e4066Sahrens 
749fa9e4066Sahrens 	for (ab = list_tail(&state->list); ab; ab = ab_prev) {
750fa9e4066Sahrens 		ab_prev = list_prev(&state->list, ab);
751fa9e4066Sahrens 		hash_lock = HDR_LOCK(ab);
752fa9e4066Sahrens 		if (mutex_tryenter(hash_lock)) {
753fa9e4066Sahrens 			ASSERT3U(refcount_count(&ab->b_refcnt), ==, 0);
754fa9e4066Sahrens 			arc_change_state(evicted_state, ab, hash_lock);
755fa9e4066Sahrens 			zio_buf_free(ab->b_buf->b_data, ab->b_size);
756fa9e4066Sahrens 			atomic_add_64(&arc.size, -ab->b_size);
757fa9e4066Sahrens 			ASSERT3P(ab->b_buf->b_next, ==, NULL);
758fa9e4066Sahrens 			kmem_cache_free(buf_cache, ab->b_buf);
759fa9e4066Sahrens 			ab->b_buf = NULL;
760fa9e4066Sahrens 			DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, ab);
761fa9e4066Sahrens 			bytes_evicted += ab->b_size;
762fa9e4066Sahrens 			mutex_exit(hash_lock);
763fa9e4066Sahrens 			if (bytes_evicted >= bytes)
764fa9e4066Sahrens 				break;
765fa9e4066Sahrens 		} else {
766fa9e4066Sahrens 			atomic_add_64(&arc.skipped, 1);
767fa9e4066Sahrens 		}
768fa9e4066Sahrens 	}
769fa9e4066Sahrens 	mutex_exit(&evicted_state->mtx);
770fa9e4066Sahrens 	mutex_exit(&state->mtx);
771fa9e4066Sahrens 
772fa9e4066Sahrens 	if (bytes_evicted < bytes)
773fa9e4066Sahrens 		dprintf("only evicted %lld bytes from %x",
774fa9e4066Sahrens 		    (longlong_t)bytes_evicted, state);
775fa9e4066Sahrens 
776fa9e4066Sahrens 	return (bytes_evicted);
777fa9e4066Sahrens }
778fa9e4066Sahrens 
779fa9e4066Sahrens /*
780fa9e4066Sahrens  * Remove buffers from list until we've removed the specified number of
781fa9e4066Sahrens  * bytes.  Destroy the buffers that are removed.
782fa9e4066Sahrens  */
783fa9e4066Sahrens static void
784fa9e4066Sahrens arc_delete_state(arc_state_t *state, int64_t bytes)
785fa9e4066Sahrens {
786fa9e4066Sahrens 	uint_t bufs_skipped = 0;
787fa9e4066Sahrens 	uint64_t bytes_deleted = 0;
788fa9e4066Sahrens 	arc_buf_hdr_t *ab, *ab_prev;
789fa9e4066Sahrens 	kmutex_t *hash_lock;
790fa9e4066Sahrens 
791fa9e4066Sahrens top:
792fa9e4066Sahrens 	mutex_enter(&state->mtx);
793fa9e4066Sahrens 	for (ab = list_tail(&state->list); ab; ab = ab_prev) {
794fa9e4066Sahrens 		ab_prev = list_prev(&state->list, ab);
795fa9e4066Sahrens 		hash_lock = HDR_LOCK(ab);
796fa9e4066Sahrens 		if (mutex_tryenter(hash_lock)) {
797fa9e4066Sahrens 			arc_change_state(arc.anon, ab, hash_lock);
798fa9e4066Sahrens 			mutex_exit(hash_lock);
799fa9e4066Sahrens 			atomic_add_64(&arc.deleted, 1);
800fa9e4066Sahrens 			DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, ab);
801fa9e4066Sahrens 			bytes_deleted += ab->b_size;
802fa9e4066Sahrens 			arc_hdr_free(ab);
803fa9e4066Sahrens 			if (bytes >= 0 && bytes_deleted >= bytes)
804fa9e4066Sahrens 				break;
805fa9e4066Sahrens 		} else {
806fa9e4066Sahrens 			if (bytes < 0) {
807fa9e4066Sahrens 				mutex_exit(&state->mtx);
808fa9e4066Sahrens 				mutex_enter(hash_lock);
809fa9e4066Sahrens 				mutex_exit(hash_lock);
810fa9e4066Sahrens 				goto top;
811fa9e4066Sahrens 			}
812fa9e4066Sahrens 			bufs_skipped += 1;
813fa9e4066Sahrens 		}
814fa9e4066Sahrens 	}
815fa9e4066Sahrens 	mutex_exit(&state->mtx);
816fa9e4066Sahrens 
817fa9e4066Sahrens 	if (bufs_skipped) {
818fa9e4066Sahrens 		atomic_add_64(&arc.skipped, bufs_skipped);
819fa9e4066Sahrens 		ASSERT(bytes >= 0);
820fa9e4066Sahrens 	}
821fa9e4066Sahrens 
822fa9e4066Sahrens 	if (bytes_deleted < bytes)
823fa9e4066Sahrens 		dprintf("only deleted %lld bytes from %p",
824fa9e4066Sahrens 		    (longlong_t)bytes_deleted, state);
825fa9e4066Sahrens }
826fa9e4066Sahrens 
827fa9e4066Sahrens static void
828fa9e4066Sahrens arc_adjust(void)
829fa9e4066Sahrens {
830fa9e4066Sahrens 	int64_t top_sz, mru_over, arc_over;
831fa9e4066Sahrens 
832fa9e4066Sahrens 	top_sz = arc.anon->size + arc.mru_top->size;
833fa9e4066Sahrens 
834fa9e4066Sahrens 	if (top_sz > arc.p && arc.mru_top->lsize > 0) {
835fa9e4066Sahrens 		int64_t toevict = MIN(arc.mru_top->lsize, top_sz-arc.p);
836fa9e4066Sahrens 		(void) arc_evict_state(arc.mru_top, toevict);
837fa9e4066Sahrens 		top_sz = arc.anon->size + arc.mru_top->size;
838fa9e4066Sahrens 	}
839fa9e4066Sahrens 
840fa9e4066Sahrens 	mru_over = top_sz + arc.mru_bot->size - arc.c;
841fa9e4066Sahrens 
842fa9e4066Sahrens 	if (mru_over > 0) {
843fa9e4066Sahrens 		if (arc.mru_bot->lsize > 0) {
844fa9e4066Sahrens 			int64_t todelete = MIN(arc.mru_bot->lsize, mru_over);
845fa9e4066Sahrens 			arc_delete_state(arc.mru_bot, todelete);
846fa9e4066Sahrens 		}
847fa9e4066Sahrens 	}
848fa9e4066Sahrens 
849fa9e4066Sahrens 	if ((arc_over = arc.size - arc.c) > 0) {
850fa9e4066Sahrens 		int64_t table_over;
851fa9e4066Sahrens 
852fa9e4066Sahrens 		if (arc.mfu_top->lsize > 0) {
853fa9e4066Sahrens 			int64_t toevict = MIN(arc.mfu_top->lsize, arc_over);
854fa9e4066Sahrens 			(void) arc_evict_state(arc.mfu_top, toevict);
855fa9e4066Sahrens 		}
856fa9e4066Sahrens 
857fa9e4066Sahrens 		table_over = arc.size + arc.mru_bot->lsize + arc.mfu_bot->lsize
858fa9e4066Sahrens 		    - arc.c*2;
859fa9e4066Sahrens 
860fa9e4066Sahrens 		if (table_over > 0 && arc.mfu_bot->lsize > 0) {
861fa9e4066Sahrens 			int64_t todelete = MIN(arc.mfu_bot->lsize, table_over);
862fa9e4066Sahrens 			arc_delete_state(arc.mfu_bot, todelete);
863fa9e4066Sahrens 		}
864fa9e4066Sahrens 	}
865fa9e4066Sahrens }
866fa9e4066Sahrens 
867fa9e4066Sahrens /*
868fa9e4066Sahrens  * Flush all *evictable* data from the cache.
869fa9e4066Sahrens  * NOTE: this will not touch "active" (i.e. referenced) data.
870fa9e4066Sahrens  */
871fa9e4066Sahrens void
872fa9e4066Sahrens arc_flush(void)
873fa9e4066Sahrens {
874fa9e4066Sahrens 	arc_delete_state(arc.mru_top, -1);
875fa9e4066Sahrens 	arc_delete_state(arc.mfu_top, -1);
876fa9e4066Sahrens 
877fa9e4066Sahrens 	arc_delete_state(arc.mru_bot, -1);
878fa9e4066Sahrens 	arc_delete_state(arc.mfu_bot, -1);
879fa9e4066Sahrens }
880fa9e4066Sahrens 
881fa9e4066Sahrens void
882fa9e4066Sahrens arc_kmem_reclaim(void)
883fa9e4066Sahrens {
884fa9e4066Sahrens 	/* Remove 6.25% */
885fa9e4066Sahrens 	/*
886fa9e4066Sahrens 	 * We need arc_reclaim_lock because we don't want multiple
887fa9e4066Sahrens 	 * threads trying to reclaim concurrently.
888fa9e4066Sahrens 	 */
889fa9e4066Sahrens 
890fa9e4066Sahrens 	/*
891fa9e4066Sahrens 	 * umem calls the reclaim func when we destroy the buf cache,
892fa9e4066Sahrens 	 * which is after we do arc_fini().  So we set a flag to prevent
893fa9e4066Sahrens 	 * accessing the destroyed mutexes and lists.
894fa9e4066Sahrens 	 */
895fa9e4066Sahrens 	if (arc_dead)
896fa9e4066Sahrens 		return;
897fa9e4066Sahrens 
898fa9e4066Sahrens 	mutex_enter(&arc_reclaim_lock);
899fa9e4066Sahrens 
900fa9e4066Sahrens 	atomic_add_64(&arc.c, -(arc.c >> 4));
901fa9e4066Sahrens 	if (arc.c < arc.c_min)
902fa9e4066Sahrens 		arc.c = arc.c_min;
903fa9e4066Sahrens 	atomic_add_64(&arc.p, -(arc.p >> 4));
904fa9e4066Sahrens 
905fa9e4066Sahrens 	arc_adjust();
906fa9e4066Sahrens 
907fa9e4066Sahrens 	/* Cool it for a while */
908fa9e4066Sahrens 	arc.incr = 0;
909fa9e4066Sahrens 	arc.size_check = arc_size_check_default << 3;
910fa9e4066Sahrens 
911fa9e4066Sahrens 	mutex_exit(&arc_reclaim_lock);
912fa9e4066Sahrens }
913fa9e4066Sahrens 
914fa9e4066Sahrens static int
915fa9e4066Sahrens arc_reclaim_needed(void)
916fa9e4066Sahrens {
917fa9e4066Sahrens 	uint64_t extra;
918fa9e4066Sahrens 
919fa9e4066Sahrens #ifdef _KERNEL
920fa9e4066Sahrens 	/*
921fa9e4066Sahrens 	 * take 'desfree' extra pages, so we reclaim sooner, rather than later
922fa9e4066Sahrens 	 */
923fa9e4066Sahrens 	extra = desfree;
924fa9e4066Sahrens 
925fa9e4066Sahrens 	/*
926fa9e4066Sahrens 	 * check that we're out of range of the pageout scanner.  It starts to
927fa9e4066Sahrens 	 * schedule paging if freemem is less than lotsfree and needfree.
928fa9e4066Sahrens 	 * lotsfree is the high-water mark for pageout, and needfree is the
929fa9e4066Sahrens 	 * number of needed free pages.  We add extra pages here to make sure
930fa9e4066Sahrens 	 * the scanner doesn't start up while we're freeing memory.
931fa9e4066Sahrens 	 */
932fa9e4066Sahrens 	if (freemem < lotsfree + needfree + extra)
933fa9e4066Sahrens 		return (1);
934fa9e4066Sahrens 
935fa9e4066Sahrens 	/*
936fa9e4066Sahrens 	 * check to make sure that swapfs has enough space so that anon
937fa9e4066Sahrens 	 * reservations can still succeed. anon_resvmem() checks that the
938fa9e4066Sahrens 	 * availrmem is greater than swapfs_minfree, and the number of reserved
939fa9e4066Sahrens 	 * swap pages.  We also add a bit of extra here just to prevent
940fa9e4066Sahrens 	 * circumstances from getting really dire.
941fa9e4066Sahrens 	 */
942fa9e4066Sahrens 	if (availrmem < swapfs_minfree + swapfs_reserve + extra)
943fa9e4066Sahrens 		return (1);
944fa9e4066Sahrens 
945fa9e4066Sahrens 	/*
946fa9e4066Sahrens 	 * If we're on an i386 platform, it's possible that we'll exhaust the
947fa9e4066Sahrens 	 * kernel heap space before we ever run out of available physical
948fa9e4066Sahrens 	 * memory.  Most checks of the size of the heap_area compare against
949fa9e4066Sahrens 	 * tune.t_minarmem, which is the minimum available real memory that we
950fa9e4066Sahrens 	 * can have in the system.  However, this is generally fixed at 25 pages
951fa9e4066Sahrens 	 * which is so low that it's useless.  In this comparison, we seek to
952fa9e4066Sahrens 	 * calculate the total heap-size, and reclaim if more than 3/4ths of the
953fa9e4066Sahrens 	 * heap is allocated.  (Or, in the calculation, if less than 1/4th is
954fa9e4066Sahrens 	 * free)
955fa9e4066Sahrens 	 */
956fa9e4066Sahrens #if defined(__i386)
957fa9e4066Sahrens 	if (btop(vmem_size(heap_arena, VMEM_FREE)) <
958fa9e4066Sahrens 	    (btop(vmem_size(heap_arena, VMEM_FREE | VMEM_ALLOC)) >> 2))
959fa9e4066Sahrens 		return (1);
960fa9e4066Sahrens #endif
961fa9e4066Sahrens 
962fa9e4066Sahrens #else
963fa9e4066Sahrens 	if (spa_get_random(100) == 0)
964fa9e4066Sahrens 		return (1);
965fa9e4066Sahrens #endif
966fa9e4066Sahrens 	return (0);
967fa9e4066Sahrens }
968fa9e4066Sahrens 
969fa9e4066Sahrens static void
970fa9e4066Sahrens arc_kmem_reap_now(arc_reclaim_strategy_t strat)
971fa9e4066Sahrens {
972fa9e4066Sahrens 	size_t			i;
973fa9e4066Sahrens 	kmem_cache_t		*prev_cache = NULL;
974fa9e4066Sahrens 	extern kmem_cache_t	*zio_buf_cache[];
975fa9e4066Sahrens 
976fa9e4066Sahrens 	/*
977fa9e4066Sahrens 	 * an aggressive reclamation will shrink the cache size as well as reap
978fa9e4066Sahrens 	 * free kmem buffers.  The arc_kmem_reclaim function is called when the
979fa9e4066Sahrens 	 * header-cache is reaped, so we only reap the header cache if we're
980fa9e4066Sahrens 	 * performing an aggressive reclaim.  If we're not, just clean the kmem
981fa9e4066Sahrens 	 * buffer caches.
982fa9e4066Sahrens 	 */
983fa9e4066Sahrens 	if (strat == ARC_RECLAIM_AGGR)
984fa9e4066Sahrens 		kmem_cache_reap_now(hdr_cache);
985fa9e4066Sahrens 
986fa9e4066Sahrens 	kmem_cache_reap_now(buf_cache);
987fa9e4066Sahrens 
988fa9e4066Sahrens 	for (i = 0; i < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; i++) {
989fa9e4066Sahrens 		if (zio_buf_cache[i] != prev_cache) {
990fa9e4066Sahrens 			prev_cache = zio_buf_cache[i];
991fa9e4066Sahrens 			kmem_cache_reap_now(zio_buf_cache[i]);
992fa9e4066Sahrens 		}
993fa9e4066Sahrens 	}
994fa9e4066Sahrens }
995fa9e4066Sahrens 
996fa9e4066Sahrens static void
997fa9e4066Sahrens arc_reclaim_thread(void)
998fa9e4066Sahrens {
999fa9e4066Sahrens 	clock_t			growtime = 0;
1000fa9e4066Sahrens 	arc_reclaim_strategy_t	last_reclaim = ARC_RECLAIM_CONS;
1001fa9e4066Sahrens 	callb_cpr_t		cpr;
1002fa9e4066Sahrens 
1003fa9e4066Sahrens 	CALLB_CPR_INIT(&cpr, &arc_reclaim_thr_lock, callb_generic_cpr, FTAG);
1004fa9e4066Sahrens 
1005fa9e4066Sahrens 	mutex_enter(&arc_reclaim_thr_lock);
1006fa9e4066Sahrens 	while (arc_thread_exit == 0) {
1007fa9e4066Sahrens 		if (arc_reclaim_needed()) {
1008fa9e4066Sahrens 
1009fa9e4066Sahrens 			if (arc.no_grow) {
1010fa9e4066Sahrens 				if (last_reclaim == ARC_RECLAIM_CONS) {
1011fa9e4066Sahrens 					last_reclaim = ARC_RECLAIM_AGGR;
1012fa9e4066Sahrens 				} else {
1013fa9e4066Sahrens 					last_reclaim = ARC_RECLAIM_CONS;
1014fa9e4066Sahrens 				}
1015fa9e4066Sahrens 			} else {
1016fa9e4066Sahrens 				arc.no_grow = TRUE;
1017fa9e4066Sahrens 				last_reclaim = ARC_RECLAIM_AGGR;
1018fa9e4066Sahrens 				membar_producer();
1019fa9e4066Sahrens 			}
1020fa9e4066Sahrens 
1021fa9e4066Sahrens 			/* reset the growth delay for every reclaim */
1022fa9e4066Sahrens 			growtime = lbolt + (arc_grow_retry * hz);
1023fa9e4066Sahrens 
1024fa9e4066Sahrens 			arc_kmem_reap_now(last_reclaim);
1025fa9e4066Sahrens 
1026fa9e4066Sahrens 		} else if ((growtime > 0) && ((growtime - lbolt) <= 0)) {
1027fa9e4066Sahrens 			arc.no_grow = FALSE;
1028fa9e4066Sahrens 		}
1029fa9e4066Sahrens 
1030fa9e4066Sahrens 		/* block until needed, or one second, whichever is shorter */
1031fa9e4066Sahrens 		CALLB_CPR_SAFE_BEGIN(&cpr);
1032fa9e4066Sahrens 		(void) cv_timedwait(&arc_reclaim_thr_cv,
1033fa9e4066Sahrens 		    &arc_reclaim_thr_lock, (lbolt + hz));
1034fa9e4066Sahrens 		CALLB_CPR_SAFE_END(&cpr, &arc_reclaim_thr_lock);
1035fa9e4066Sahrens 	}
1036fa9e4066Sahrens 
1037fa9e4066Sahrens 	arc_thread_exit = 0;
1038fa9e4066Sahrens 	cv_broadcast(&arc_reclaim_thr_cv);
1039fa9e4066Sahrens 	CALLB_CPR_EXIT(&cpr);		/* drops arc_reclaim_thr_lock */
1040fa9e4066Sahrens 	thread_exit();
1041fa9e4066Sahrens }
1042fa9e4066Sahrens 
1043fa9e4066Sahrens static void
1044fa9e4066Sahrens arc_try_grow(int64_t bytes)
1045fa9e4066Sahrens {
1046fa9e4066Sahrens 	/*
1047fa9e4066Sahrens 	 * If we're within (2 * maxblocksize) bytes of the target
1048fa9e4066Sahrens 	 * cache size, increment the target cache size
1049fa9e4066Sahrens 	 */
1050fa9e4066Sahrens 	atomic_add_64((uint64_t *)&arc.size_check, 1);
1051fa9e4066Sahrens 
1052fa9e4066Sahrens 	if (arc_reclaim_needed()) {
1053fa9e4066Sahrens 		cv_signal(&arc_reclaim_thr_cv);
1054fa9e4066Sahrens 		return;
1055fa9e4066Sahrens 	}
1056fa9e4066Sahrens 
1057fa9e4066Sahrens 	if (arc.no_grow)
1058fa9e4066Sahrens 		return;
1059fa9e4066Sahrens 
1060fa9e4066Sahrens 	/*
1061fa9e4066Sahrens 	 * We grow here if we're within (2 * maxblocksize) of the target size,
1062fa9e4066Sahrens 	 * or if the cache is already over its target.  Above, we return early
1063fa9e4066Sahrens 	 * if we can't grow, or if we shouldn't because a reclaim is in progress.
1064fa9e4066Sahrens 	 */
1065fa9e4066Sahrens 	if ((arc.c - arc.size) <= (2ULL << SPA_MAXBLOCKSHIFT)) {
1066fa9e4066Sahrens 		if (arc.size_check > 0) {
1067fa9e4066Sahrens 			arc.size_check = arc_size_check_default;
1068fa9e4066Sahrens 			atomic_add_64(&arc.incr, arc_incr_size);
1069fa9e4066Sahrens 		}
1070fa9e4066Sahrens 		atomic_add_64(&arc.c, MIN(bytes, arc.incr));
1071fa9e4066Sahrens 		if (arc.c > arc.c_max)
1072fa9e4066Sahrens 			arc.c = arc.c_max;
1073fa9e4066Sahrens 		else
1074fa9e4066Sahrens 			atomic_add_64(&arc.p, MIN(bytes, arc.incr));
1075fa9e4066Sahrens 	} else if (arc.size > arc.c) {
1076fa9e4066Sahrens 		if (arc.size_check > 0) {
1077fa9e4066Sahrens 			arc.size_check = arc_size_check_default;
1078fa9e4066Sahrens 			atomic_add_64(&arc.incr, arc_incr_size);
1079fa9e4066Sahrens 		}
1080fa9e4066Sahrens 		atomic_add_64(&arc.c, MIN(bytes, arc.incr));
1081fa9e4066Sahrens 		if (arc.c > arc.c_max)
1082fa9e4066Sahrens 			arc.c = arc.c_max;
1083fa9e4066Sahrens 		else
1084fa9e4066Sahrens 			atomic_add_64(&arc.p, MIN(bytes, arc.incr));
1085fa9e4066Sahrens 	}
1086fa9e4066Sahrens }
1087fa9e4066Sahrens 
1088fa9e4066Sahrens /*
1089fa9e4066Sahrens  * check if the cache has reached its limits and eviction is required prior to
1090fa9e4066Sahrens  * insert.  In this situation, we want to evict if no_grow is set.  Otherwise, the
1091fa9e4066Sahrens  * cache is either big enough that we can insert, or an arc_try_grow will result
1092fa9e4066Sahrens  * in more space being made available.
1093fa9e4066Sahrens  */
1094fa9e4066Sahrens 
1095fa9e4066Sahrens static int
1096fa9e4066Sahrens arc_evict_needed()
1097fa9e4066Sahrens {
1098fa9e4066Sahrens 
1099fa9e4066Sahrens 	if (arc_reclaim_needed())
1100fa9e4066Sahrens 		return (1);
1101fa9e4066Sahrens 
1102fa9e4066Sahrens 	if (arc.no_grow || (arc.c > arc.c_max) || (arc.size > arc.c))
1103fa9e4066Sahrens 		return (1);
1104fa9e4066Sahrens 
1105fa9e4066Sahrens 	return (0);
1106fa9e4066Sahrens }
1107fa9e4066Sahrens 
1108fa9e4066Sahrens /*
1109fa9e4066Sahrens  * The state, supplied as the first argument, is going to have something
1110fa9e4066Sahrens  * inserted on its behalf. So, determine which cache must be victimized to
1111fa9e4066Sahrens  * satisfy an insertion for this state.  We have the following cases:
1112fa9e4066Sahrens  *
1113fa9e4066Sahrens  * 1. Insert for MRU, p > sizeof(arc.anon + arc.mru_top) ->
1114fa9e4066Sahrens  * In this situation if we're out of space, but the resident size of the MFU is
1115fa9e4066Sahrens  * under the limit, victimize the MFU cache to satisfy this insertion request.
1116fa9e4066Sahrens  *
1117fa9e4066Sahrens  * 2. Insert for MRU, p <= sizeof(arc.anon + arc.mru_top) ->
1118fa9e4066Sahrens  * Here, we've used up all of the available space for the MRU, so we need to
1119fa9e4066Sahrens  * evict from our own cache instead.  Evict from the set of resident MRU
1120fa9e4066Sahrens  * entries.
1121fa9e4066Sahrens  *
1122fa9e4066Sahrens  * 3. Insert for MFU (c - p) > sizeof(arc.mfu_top) ->
1123fa9e4066Sahrens  * c minus p represents the MFU space in the cache, since p is the size of the
1124fa9e4066Sahrens  * cache that is dedicated to the MRU.  In this situation there's still space on
1125fa9e4066Sahrens  * the MFU side, so the MRU side needs to be victimized.
1126fa9e4066Sahrens  *
1127fa9e4066Sahrens  * 4. Insert for MFU (c - p) < sizeof(arc.mfu_top) ->
1128fa9e4066Sahrens  * MFU's resident set is consuming more space than it has been allotted.  In
1129fa9e4066Sahrens  * this situation, we must victimize our own cache, the MFU, for this insertion.
1130fa9e4066Sahrens  */
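/*
 * A worked example of the cases above (numbers are illustrative only):
 * suppose c = 100M and p = 60M.  An insert for the MRU when
 * anon + mru_top = 50M is case 1 (p > 50M), so the MFU is victimized;
 * at anon + mru_top = 70M it is case 2 and the MRU evicts from itself.
 * For an MFU insert, c - p = 40M of MFU space: with mfu_top = 30M we
 * are in case 3 and victimize the MRU; with mfu_top = 45M, case 4
 * applies and the MFU evicts from itself.
 */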
1131fa9e4066Sahrens static void
1132fa9e4066Sahrens arc_evict_for_state(arc_state_t *state, uint64_t bytes)
1133fa9e4066Sahrens {
1134fa9e4066Sahrens 	uint64_t	mru_used;
1135fa9e4066Sahrens 	uint64_t	mfu_space;
1136fa9e4066Sahrens 	uint64_t	evicted;
1137fa9e4066Sahrens 
1138fa9e4066Sahrens 	ASSERT(state == arc.mru_top || state == arc.mfu_top);
1139fa9e4066Sahrens 
1140fa9e4066Sahrens 	if (state == arc.mru_top) {
1141fa9e4066Sahrens 		mru_used = arc.anon->size + arc.mru_top->size;
1142fa9e4066Sahrens 		if (arc.p > mru_used) {
1143fa9e4066Sahrens 			/* case 1 */
1144fa9e4066Sahrens 			evicted = arc_evict_state(arc.mfu_top, bytes);
1145fa9e4066Sahrens 			if (evicted < bytes) {
1146fa9e4066Sahrens 				arc_adjust();
1147fa9e4066Sahrens 			}
1148fa9e4066Sahrens 		} else {
1149fa9e4066Sahrens 			/* case 2 */
1150fa9e4066Sahrens 			evicted = arc_evict_state(arc.mru_top, bytes);
1151fa9e4066Sahrens 			if (evicted < bytes) {
1152fa9e4066Sahrens 				arc_adjust();
1153fa9e4066Sahrens 			}
1154fa9e4066Sahrens 		}
1155fa9e4066Sahrens 	} else {
1156fa9e4066Sahrens 		/* MFU_top case */
1157fa9e4066Sahrens 		mfu_space = arc.c - arc.p;
1158fa9e4066Sahrens 		if (mfu_space > arc.mfu_top->size) {
1159fa9e4066Sahrens 			/* case 3 */
1160fa9e4066Sahrens 			evicted = arc_evict_state(arc.mru_top, bytes);
1161fa9e4066Sahrens 			if (evicted < bytes) {
1162fa9e4066Sahrens 				arc_adjust();
1163fa9e4066Sahrens 			}
1164fa9e4066Sahrens 		} else {
1165fa9e4066Sahrens 			/* case 4 */
1166fa9e4066Sahrens 			evicted = arc_evict_state(arc.mfu_top, bytes);
1167fa9e4066Sahrens 			if (evicted < bytes) {
1168fa9e4066Sahrens 				arc_adjust();
1169fa9e4066Sahrens 			}
1170fa9e4066Sahrens 		}
1171fa9e4066Sahrens 	}
1172fa9e4066Sahrens }
1173fa9e4066Sahrens 
1174fa9e4066Sahrens /*
1175fa9e4066Sahrens  * This routine is called whenever a buffer is accessed.
1176fa9e4066Sahrens  */
1177fa9e4066Sahrens static void
1178fa9e4066Sahrens arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock)
1179fa9e4066Sahrens {
1180fa9e4066Sahrens 	int		blksz, mult;
1181fa9e4066Sahrens 
1182fa9e4066Sahrens 	ASSERT(MUTEX_HELD(hash_lock));
1183fa9e4066Sahrens 
1184fa9e4066Sahrens 	blksz = buf->b_size;
1185fa9e4066Sahrens 
1186fa9e4066Sahrens 	if (buf->b_state == arc.anon) {
1187fa9e4066Sahrens 		/*
1188fa9e4066Sahrens 		 * This buffer is not in the cache, and does not
1189fa9e4066Sahrens 		 * appear in our "ghost" list.  Add the new buffer
1190fa9e4066Sahrens 		 * to the MRU state.
1191fa9e4066Sahrens 		 */
1192fa9e4066Sahrens 
1193fa9e4066Sahrens 		arc_try_grow(blksz);
1194fa9e4066Sahrens 		if (arc_evict_needed()) {
1195fa9e4066Sahrens 			arc_evict_for_state(arc.mru_top, blksz);
1196fa9e4066Sahrens 		}
1197fa9e4066Sahrens 
1198fa9e4066Sahrens 		ASSERT(buf->b_arc_access == 0);
1199fa9e4066Sahrens 		buf->b_arc_access = lbolt;
1200fa9e4066Sahrens 		DTRACE_PROBE1(new_state__mru_top, arc_buf_hdr_t *,
1201fa9e4066Sahrens 		    buf);
1202fa9e4066Sahrens 		arc_change_state(arc.mru_top, buf, hash_lock);
1203fa9e4066Sahrens 
1204fa9e4066Sahrens 		/*
1205fa9e4066Sahrens 		 * If we are using less than 2/3 of our total target
1206fa9e4066Sahrens 		 * cache size, bump up the target size for the MRU
1207fa9e4066Sahrens 		 * list.
1208fa9e4066Sahrens 		 */
1209fa9e4066Sahrens 		if (arc.size < arc.c*2/3) {
1210fa9e4066Sahrens 			arc.p = arc.anon->size + arc.mru_top->size + arc.c/6;
1211fa9e4066Sahrens 		}
1212fa9e4066Sahrens 
1213fa9e4066Sahrens 	} else if (buf->b_state == arc.mru_top) {
1214fa9e4066Sahrens 		/*
1215fa9e4066Sahrens 		 * If this buffer is in the MRU-top state and has the prefetch
1216fa9e4066Sahrens 		 * flag, the first read was actually part of a prefetch.  In
1217fa9e4066Sahrens 		 * this situation, we simply want to clear the flag and return.
1218fa9e4066Sahrens 		 * A subsequent access should bump this into the MFU state.
1219fa9e4066Sahrens 		 */
1220fa9e4066Sahrens 		if ((buf->b_flags & ARC_PREFETCH) != 0) {
1221fa9e4066Sahrens 			buf->b_flags &= ~ARC_PREFETCH;
1222fa9e4066Sahrens 			atomic_add_64(&arc.mru_top->hits, 1);
1223fa9e4066Sahrens 			return;
1224fa9e4066Sahrens 		}
1225fa9e4066Sahrens 
1226fa9e4066Sahrens 		/*
1227fa9e4066Sahrens 		 * This buffer has been "accessed" only once so far,
1228fa9e4066Sahrens 		 * but it is still in the cache.  If enough time has
1229fa9e4066Sahrens 		 * passed, move it to the MFU state.
1230fa9e4066Sahrens 		 */
1231fa9e4066Sahrens 		if (lbolt > buf->b_arc_access + ARC_MINTIME) {
1232fa9e4066Sahrens 			/*
1233fa9e4066Sahrens 			 * At least ARC_MINTIME has passed since we
1234fa9e4066Sahrens 			 * instantiated this buffer.  Move it to the
1235fa9e4066Sahrens 			 * most frequently used state.
1236fa9e4066Sahrens 			 */
1237fa9e4066Sahrens 			buf->b_arc_access = lbolt;
1238fa9e4066Sahrens 			DTRACE_PROBE1(new_state__mfu_top,
1239fa9e4066Sahrens 			    arc_buf_hdr_t *, buf);
1240fa9e4066Sahrens 			arc_change_state(arc.mfu_top, buf, hash_lock);
1241fa9e4066Sahrens 		}
1242fa9e4066Sahrens 		atomic_add_64(&arc.mru_top->hits, 1);
1243fa9e4066Sahrens 	} else if (buf->b_state == arc.mru_bot) {
1244fa9e4066Sahrens 		arc_state_t	*new_state;
1245fa9e4066Sahrens 		/*
1246fa9e4066Sahrens 		 * This buffer has been "accessed" recently, but
1247fa9e4066Sahrens 		 * was evicted from the cache.  Move it back to the
1248fa9e4066Sahrens 		 * MFU state (or to the MRU state if it was a prefetch).
1249fa9e4066Sahrens 		 */
1250fa9e4066Sahrens 
1251fa9e4066Sahrens 		if (buf->b_flags & ARC_PREFETCH) {
1252fa9e4066Sahrens 			new_state = arc.mru_top;
1253fa9e4066Sahrens 			DTRACE_PROBE1(new_state__mru_top,
1254fa9e4066Sahrens 			    arc_buf_hdr_t *, buf);
1255fa9e4066Sahrens 		} else {
1256fa9e4066Sahrens 			new_state = arc.mfu_top;
1257fa9e4066Sahrens 			DTRACE_PROBE1(new_state__mfu_top,
1258fa9e4066Sahrens 			    arc_buf_hdr_t *, buf);
1259fa9e4066Sahrens 		}
1260fa9e4066Sahrens 
1261fa9e4066Sahrens 		arc_try_grow(blksz);
1262fa9e4066Sahrens 		if (arc_evict_needed()) {
1263fa9e4066Sahrens 			arc_evict_for_state(new_state, blksz);
1264fa9e4066Sahrens 		}
1265fa9e4066Sahrens 
1266fa9e4066Sahrens 		/* Bump up the target size of the MRU list */
1267fa9e4066Sahrens 		mult = ((arc.mru_bot->size >= arc.mfu_bot->size) ?
1268fa9e4066Sahrens 		    1 : (arc.mfu_bot->size/arc.mru_bot->size));
1269fa9e4066Sahrens 		arc.p = MIN(arc.c, arc.p + blksz * mult);
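		/*
		 * For example (illustrative numbers): if the ghost MRU list
		 * holds 100MB and the ghost MFU list holds 400MB, mult is 4,
		 * so a hit on a 128K ghost MRU block grows arc.p by 512K
		 * (capped at arc.c).  The mfu_bot case below is symmetric
		 * and shrinks arc.p instead.
		 */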
1270fa9e4066Sahrens 
1271fa9e4066Sahrens 		buf->b_arc_access = lbolt;
1272fa9e4066Sahrens 		arc_change_state(new_state, buf, hash_lock);
1273fa9e4066Sahrens 
1274fa9e4066Sahrens 		atomic_add_64(&arc.mru_bot->hits, 1);
1275fa9e4066Sahrens 	} else if (buf->b_state == arc.mfu_top) {
1276fa9e4066Sahrens 		/*
1277fa9e4066Sahrens 		 * This buffer has been accessed more than once and is
1278fa9e4066Sahrens 		 * still in the cache.  Keep it in the MFU state.
1279fa9e4066Sahrens 		 *
1280fa9e4066Sahrens 		 * NOTE: the add_reference() that occurred when we did
1281fa9e4066Sahrens 		 * the arc_read() should have kicked this off the list,
1282fa9e4066Sahrens 		 * so even if it was a prefetch, it will be put back at
1283fa9e4066Sahrens 		 * the head of the list when we remove_reference().
1284fa9e4066Sahrens 		 */
1285fa9e4066Sahrens 		atomic_add_64(&arc.mfu_top->hits, 1);
1286fa9e4066Sahrens 	} else if (buf->b_state == arc.mfu_bot) {
1287fa9e4066Sahrens 		/*
1288fa9e4066Sahrens 		 * This buffer has been accessed more than once but has
1289fa9e4066Sahrens 		 * been evicted from the cache.  Move it back to the
1290fa9e4066Sahrens 		 * MFU state.
1291fa9e4066Sahrens 		 */
1292fa9e4066Sahrens 
1293fa9e4066Sahrens 		arc_try_grow(blksz);
1294fa9e4066Sahrens 		if (arc_evict_needed()) {
1295fa9e4066Sahrens 			arc_evict_for_state(arc.mfu_top, blksz);
1296fa9e4066Sahrens 		}
1297fa9e4066Sahrens 
1298fa9e4066Sahrens 		/* Bump up the target size for the MFU list (by shrinking arc.p) */
1299fa9e4066Sahrens 		mult = ((arc.mfu_bot->size >= arc.mru_bot->size) ?
1300fa9e4066Sahrens 		    1 : (arc.mru_bot->size/arc.mfu_bot->size));
1301fa9e4066Sahrens 		arc.p = MAX(0, (int64_t)arc.p - blksz * mult);
1302fa9e4066Sahrens 
1303fa9e4066Sahrens 		buf->b_arc_access = lbolt;
1304fa9e4066Sahrens 		DTRACE_PROBE1(new_state__mfu_top,
1305fa9e4066Sahrens 		    arc_buf_hdr_t *, buf);
1306fa9e4066Sahrens 		arc_change_state(arc.mfu_top, buf, hash_lock);
1307fa9e4066Sahrens 
1308fa9e4066Sahrens 		atomic_add_64(&arc.mfu_bot->hits, 1);
1309fa9e4066Sahrens 	} else {
1310fa9e4066Sahrens 		ASSERT(!"invalid arc state");
1311fa9e4066Sahrens 	}
1312fa9e4066Sahrens 
1313fa9e4066Sahrens }
1314fa9e4066Sahrens 
1315fa9e4066Sahrens /* a generic arc_done_func_t which you can use */
1316fa9e4066Sahrens /* ARGSUSED */
1317fa9e4066Sahrens void
1318fa9e4066Sahrens arc_bcopy_func(zio_t *zio, arc_buf_t *buf, void *arg)
1319fa9e4066Sahrens {
1320fa9e4066Sahrens 	bcopy(buf->b_data, arg, buf->b_hdr->b_size);
1321fa9e4066Sahrens 	arc_buf_free(buf, arg);
1322fa9e4066Sahrens }
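
/*
 * Usage sketch (illustrative; spa, bp and byteswap are assumed from the
 * caller, and the zio priority/flag values are just plausible choices):
 * read a block synchronously and copy it into a caller-owned buffer.
 *
 *	void *data = zio_buf_alloc(BP_GET_LSIZE(bp));
 *	(void) arc_read(NULL, spa, bp, byteswap, arc_bcopy_func, data,
 *	    ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL, ARC_WAIT);
 */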
1323fa9e4066Sahrens 
1324fa9e4066Sahrens /* a generic arc_done_func_t which you can use */
1325fa9e4066Sahrens void
1326fa9e4066Sahrens arc_getbuf_func(zio_t *zio, arc_buf_t *buf, void *arg)
1327fa9e4066Sahrens {
1328fa9e4066Sahrens 	arc_buf_t **bufp = arg;
1329fa9e4066Sahrens 	if (zio && zio->io_error) {
1330fa9e4066Sahrens 		arc_buf_free(buf, arg);
1331fa9e4066Sahrens 		*bufp = NULL;
1332fa9e4066Sahrens 	} else {
1333fa9e4066Sahrens 		*bufp = buf;
1334fa9e4066Sahrens 	}
1335fa9e4066Sahrens }
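
/*
 * Usage sketch (illustrative): with arc_getbuf_func the caller is handed
 * a reference to an ARC buffer rather than a copy; on error the pointer
 * is set to NULL.  The address of the pointer doubles as the reference
 * tag, so it is also used when freeing the buffer.
 *
 *	arc_buf_t *abuf = NULL;
 *	(void) arc_read(NULL, spa, bp, byteswap, arc_getbuf_func, &abuf,
 *	    ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL, ARC_WAIT);
 *	if (abuf != NULL) {
 *		... use abuf->b_data ...
 *		arc_buf_free(abuf, &abuf);
 *	}
 */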
1336fa9e4066Sahrens 
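/*
 * I/O completion callback for arc_read() cache misses.  Re-find the header
 * in the hash table (unless it was freed while the read was in flight),
 * byteswap the data if needed, hand a buffer to every queued callback
 * (copying when more than one caller wants the data), move anonymous
 * buffers into the cache via arc_access(), wake any ARC_WAIT waiters,
 * and finally run the "done" functions.
 */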
1337fa9e4066Sahrens static void
1338fa9e4066Sahrens arc_read_done(zio_t *zio)
1339fa9e4066Sahrens {
1340fa9e4066Sahrens 	arc_buf_hdr_t	*hdr;
1341fa9e4066Sahrens 	arc_buf_t	*buf;
1342fa9e4066Sahrens 	arc_buf_t	*abuf;	/* buffer we're assigning to callback */
1343fa9e4066Sahrens 	kmutex_t	*hash_lock;
1344fa9e4066Sahrens 	arc_callback_t	*callback_list, *acb;
1345fa9e4066Sahrens 	int		freeable = FALSE;
1346fa9e4066Sahrens 
1347fa9e4066Sahrens 	buf = zio->io_private;
1348fa9e4066Sahrens 	hdr = buf->b_hdr;
1349fa9e4066Sahrens 
1350fa9e4066Sahrens 	if (!HDR_FREED_IN_READ(hdr)) {
1351fa9e4066Sahrens 		arc_buf_hdr_t *found;
1352fa9e4066Sahrens 
1353fa9e4066Sahrens 		found = buf_hash_find(zio->io_spa, &hdr->b_dva, hdr->b_birth,
1354fa9e4066Sahrens 		    &hash_lock);
1355fa9e4066Sahrens 
1356fa9e4066Sahrens 		/*
1357fa9e4066Sahrens 		 * Buffer was inserted into hash-table and removed from lists
1358fa9e4066Sahrens 		 * prior to starting I/O.  We should find this header, since
1359fa9e4066Sahrens 		 * it's in the hash table, and it should be legit since it's
1360fa9e4066Sahrens 		 * not possible to evict it during the I/O.
1361fa9e4066Sahrens 		 */
1362fa9e4066Sahrens 
1363fa9e4066Sahrens 		ASSERT(found);
1364fa9e4066Sahrens 		ASSERT(DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp)));
1365fa9e4066Sahrens 	}
1366fa9e4066Sahrens 
1367fa9e4066Sahrens 	/* byteswap if necessary */
1368fa9e4066Sahrens 	callback_list = hdr->b_acb;
1369fa9e4066Sahrens 	ASSERT(callback_list != NULL);
1370fa9e4066Sahrens 	if (BP_SHOULD_BYTESWAP(zio->io_bp) && callback_list->acb_byteswap)
1371fa9e4066Sahrens 		callback_list->acb_byteswap(buf->b_data, hdr->b_size);
1372fa9e4066Sahrens 
1373fa9e4066Sahrens 	/* create copies of the data buffer for the callers */
1374fa9e4066Sahrens 	abuf = buf;
1375fa9e4066Sahrens 	for (acb = callback_list; acb; acb = acb->acb_next) {
1376fa9e4066Sahrens 		if (acb->acb_done) {
1377fa9e4066Sahrens 			if (abuf == NULL) {
1378fa9e4066Sahrens 				abuf = kmem_cache_alloc(buf_cache, KM_SLEEP);
1379fa9e4066Sahrens 				abuf->b_data = zio_buf_alloc(hdr->b_size);
1380fa9e4066Sahrens 				atomic_add_64(&arc.size, hdr->b_size);
1381fa9e4066Sahrens 				bcopy(buf->b_data, abuf->b_data, hdr->b_size);
1382fa9e4066Sahrens 				abuf->b_hdr = hdr;
1383fa9e4066Sahrens 				abuf->b_next = hdr->b_buf;
1384fa9e4066Sahrens 				hdr->b_buf = abuf;
1385fa9e4066Sahrens 				atomic_add_64(&hdr->b_state->size, hdr->b_size);
1386fa9e4066Sahrens 			}
1387fa9e4066Sahrens 			acb->acb_buf = abuf;
1388fa9e4066Sahrens 			abuf = NULL;
1389fa9e4066Sahrens 		} else {
1390fa9e4066Sahrens 			/*
1391fa9e4066Sahrens 			 * The caller did not provide a callback function.
1392fa9e4066Sahrens 			 * In this case, we should just remove the reference.
1393fa9e4066Sahrens 			 */
1394fa9e4066Sahrens 			if (HDR_FREED_IN_READ(hdr)) {
1395fa9e4066Sahrens 				ASSERT3P(hdr->b_state, ==, arc.anon);
1396fa9e4066Sahrens 				(void) refcount_remove(&hdr->b_refcnt,
1397fa9e4066Sahrens 				    acb->acb_private);
1398fa9e4066Sahrens 			} else {
1399fa9e4066Sahrens 				(void) remove_reference(hdr, hash_lock,
1400fa9e4066Sahrens 				    acb->acb_private);
1401fa9e4066Sahrens 			}
1402fa9e4066Sahrens 		}
1403fa9e4066Sahrens 	}
1404fa9e4066Sahrens 	hdr->b_acb = NULL;
1405fa9e4066Sahrens 	hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
1406fa9e4066Sahrens 
1407fa9e4066Sahrens 	ASSERT(refcount_is_zero(&hdr->b_refcnt) || callback_list != NULL);
1408fa9e4066Sahrens 
1409fa9e4066Sahrens 	if (zio->io_error != 0) {
1410fa9e4066Sahrens 		hdr->b_flags |= ARC_IO_ERROR;
1411fa9e4066Sahrens 		if (hdr->b_state != arc.anon)
1412fa9e4066Sahrens 			arc_change_state(arc.anon, hdr, hash_lock);
1413fa9e4066Sahrens 		freeable = refcount_is_zero(&hdr->b_refcnt);
1414fa9e4066Sahrens 	}
1415fa9e4066Sahrens 
1416fa9e4066Sahrens 	if (!HDR_FREED_IN_READ(hdr)) {
1417fa9e4066Sahrens 		/*
1418fa9e4066Sahrens 		 * Only call arc_access on anonymous buffers.  This is because
1419fa9e4066Sahrens 		 * if we've issued an I/O for an evicted buffer, we've already
1420fa9e4066Sahrens 		 * called arc_access (to prevent any simultaneous readers from
1421fa9e4066Sahrens 		 * getting confused).
1422fa9e4066Sahrens 		 */
1423fa9e4066Sahrens 		if (zio->io_error == 0 && hdr->b_state == arc.anon)
1424fa9e4066Sahrens 			arc_access(hdr, hash_lock);
1425fa9e4066Sahrens 		mutex_exit(hash_lock);
1426fa9e4066Sahrens 	} else {
1427fa9e4066Sahrens 		/*
1428fa9e4066Sahrens 		 * This block was freed while we waited for the read to
1429fa9e4066Sahrens 		 * complete.  It has been removed from the hash table and
1430fa9e4066Sahrens 		 * moved to the anonymous state (so that it won't show up
1431fa9e4066Sahrens 		 * in the cache).
1432fa9e4066Sahrens 		 */
1433fa9e4066Sahrens 		ASSERT3P(hdr->b_state, ==, arc.anon);
1434fa9e4066Sahrens 		freeable = refcount_is_zero(&hdr->b_refcnt);
1435fa9e4066Sahrens 	}
1436fa9e4066Sahrens 
1437fa9e4066Sahrens 	cv_broadcast(&hdr->b_cv);
1438fa9e4066Sahrens 
1439fa9e4066Sahrens 	/* execute each callback and free its structure */
1440fa9e4066Sahrens 	while ((acb = callback_list) != NULL) {
1441fa9e4066Sahrens 		if (acb->acb_done)
1442fa9e4066Sahrens 			acb->acb_done(zio, acb->acb_buf, acb->acb_private);
1443fa9e4066Sahrens 
1444fa9e4066Sahrens 		if (acb->acb_zio_dummy != NULL) {
1445fa9e4066Sahrens 			acb->acb_zio_dummy->io_error = zio->io_error;
1446fa9e4066Sahrens 			zio_nowait(acb->acb_zio_dummy);
1447fa9e4066Sahrens 		}
1448fa9e4066Sahrens 
1449fa9e4066Sahrens 		callback_list = acb->acb_next;
1450fa9e4066Sahrens 		kmem_free(acb, sizeof (arc_callback_t));
1451fa9e4066Sahrens 	}
1452fa9e4066Sahrens 
1453fa9e4066Sahrens 	if (freeable)
1454fa9e4066Sahrens 		arc_hdr_free(hdr);
1455fa9e4066Sahrens }
1456fa9e4066Sahrens 
1457fa9e4066Sahrens /*
1458fa9e4066Sahrens  * "Read" the block at the specified DVA (in bp) via the
1459fa9e4066Sahrens  * cache.  If the block is found in the cache, invoke the provided
1460fa9e4066Sahrens  * callback immediately and return.  Note that the `zio' parameter
1461fa9e4066Sahrens  * in the callback will be NULL in this case, since no IO was
1462fa9e4066Sahrens  * required.  If the block is not in the cache, pass the read request
1463fa9e4066Sahrens  * on to the spa with a substitute callback function, so that the
1464fa9e4066Sahrens  * requested block will be added to the cache.
1465fa9e4066Sahrens  *
1466fa9e4066Sahrens  * If a read request arrives for a block that has a read in-progress,
1467fa9e4066Sahrens  * either wait for the in-progress read to complete (and return the
1468fa9e4066Sahrens  * results); or, if this is a read with a "done" func, add a record
1469fa9e4066Sahrens  * to the read to invoke the "done" func when the read completes,
1470fa9e4066Sahrens  * and return; or just return.
1471fa9e4066Sahrens  *
1472fa9e4066Sahrens  * arc_read_done() will invoke all the requested "done" functions
1473fa9e4066Sahrens  * for readers of this block.
1474fa9e4066Sahrens  */
1475fa9e4066Sahrens int
1476fa9e4066Sahrens arc_read(zio_t *pio, spa_t *spa, blkptr_t *bp, arc_byteswap_func_t *swap,
1477fa9e4066Sahrens     arc_done_func_t *done, void *private, int priority, int flags,
1478fa9e4066Sahrens     uint32_t arc_flags)
1479fa9e4066Sahrens {
1480fa9e4066Sahrens 	arc_buf_hdr_t *hdr;
1481fa9e4066Sahrens 	arc_buf_t *buf;
1482fa9e4066Sahrens 	kmutex_t *hash_lock;
1483fa9e4066Sahrens 	zio_t	*rzio;
1484fa9e4066Sahrens 
1485fa9e4066Sahrens top:
1486fa9e4066Sahrens 	hdr = buf_hash_find(spa, BP_IDENTITY(bp), bp->blk_birth, &hash_lock);
1487fa9e4066Sahrens 	if (hdr && hdr->b_buf) {
1488fa9e4066Sahrens 
1489fa9e4066Sahrens 		ASSERT((hdr->b_state == arc.mru_top) ||
1490fa9e4066Sahrens 		    (hdr->b_state == arc.mfu_top) ||
1491fa9e4066Sahrens 		    ((hdr->b_state == arc.anon) &&
1492fa9e4066Sahrens 		    (HDR_IO_IN_PROGRESS(hdr))));
1493fa9e4066Sahrens 
1494fa9e4066Sahrens 		if (HDR_IO_IN_PROGRESS(hdr)) {
1495fa9e4066Sahrens 
1496fa9e4066Sahrens 			if ((arc_flags & ARC_NOWAIT) && done) {
1497fa9e4066Sahrens 				arc_callback_t	*acb = NULL;
1498fa9e4066Sahrens 
1499fa9e4066Sahrens 				acb = kmem_zalloc(sizeof (arc_callback_t),
1500fa9e4066Sahrens 				    KM_SLEEP);
1501fa9e4066Sahrens 				acb->acb_done = done;
1502fa9e4066Sahrens 				acb->acb_private = private;
1503fa9e4066Sahrens 				acb->acb_byteswap = swap;
1504fa9e4066Sahrens 				if (pio != NULL)
1505fa9e4066Sahrens 					acb->acb_zio_dummy = zio_null(pio,
1506fa9e4066Sahrens 					    spa, NULL, NULL, flags);
1507fa9e4066Sahrens 
1508fa9e4066Sahrens 				ASSERT(acb->acb_done != NULL);
1509fa9e4066Sahrens 				acb->acb_next = hdr->b_acb;
1510fa9e4066Sahrens 				hdr->b_acb = acb;
1511fa9e4066Sahrens 				add_reference(hdr, hash_lock, private);
1512fa9e4066Sahrens 				mutex_exit(hash_lock);
1513fa9e4066Sahrens 				return (0);
1514fa9e4066Sahrens 			} else if (arc_flags & ARC_WAIT) {
1515fa9e4066Sahrens 				cv_wait(&hdr->b_cv, hash_lock);
1516fa9e4066Sahrens 				mutex_exit(hash_lock);
1517fa9e4066Sahrens 				goto top;
1518fa9e4066Sahrens 			}
1519fa9e4066Sahrens 
1520fa9e4066Sahrens 			mutex_exit(hash_lock);
1521fa9e4066Sahrens 			return (0);
1522fa9e4066Sahrens 		}
1523fa9e4066Sahrens 
1524fa9e4066Sahrens 		/*
1525fa9e4066Sahrens 		 * If there is already a reference on this block, create
1526fa9e4066Sahrens 		 * a new copy of the data so that arc_release() is
1527fa9e4066Sahrens 		 * guaranteed to succeed.
1528fa9e4066Sahrens 		 */
1529fa9e4066Sahrens 
1530fa9e4066Sahrens 		if (done)
1531fa9e4066Sahrens 			add_reference(hdr, hash_lock, private);
1532fa9e4066Sahrens 		if (done && refcount_count(&hdr->b_refcnt) > 1) {
1533fa9e4066Sahrens 			buf = kmem_cache_alloc(buf_cache, KM_SLEEP);
1534fa9e4066Sahrens 			buf->b_data = zio_buf_alloc(hdr->b_size);
1535fa9e4066Sahrens 			ASSERT3U(refcount_count(&hdr->b_refcnt), >, 1);
1536fa9e4066Sahrens 			atomic_add_64(&arc.size, hdr->b_size);
1537fa9e4066Sahrens 			bcopy(hdr->b_buf->b_data, buf->b_data, hdr->b_size);
1538fa9e4066Sahrens 			buf->b_hdr = hdr;
1539fa9e4066Sahrens 			buf->b_next = hdr->b_buf;
1540fa9e4066Sahrens 			hdr->b_buf = buf;
1541fa9e4066Sahrens 			atomic_add_64(&hdr->b_state->size, hdr->b_size);
1542fa9e4066Sahrens 		} else {
1543fa9e4066Sahrens 			buf = hdr->b_buf;
1544fa9e4066Sahrens 		}
1545fa9e4066Sahrens 		DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
1546fa9e4066Sahrens 		arc_access(hdr, hash_lock);
1547fa9e4066Sahrens 		mutex_exit(hash_lock);
1548fa9e4066Sahrens 		atomic_add_64(&arc.hits, 1);
1549fa9e4066Sahrens 		if (done)
1550fa9e4066Sahrens 			done(NULL, buf, private);
1551fa9e4066Sahrens 	} else {
1552fa9e4066Sahrens 		uint64_t size = BP_GET_LSIZE(bp);
1553fa9e4066Sahrens 		arc_callback_t	*acb;
1554fa9e4066Sahrens 
1555fa9e4066Sahrens 		if (hdr == NULL) {
1556fa9e4066Sahrens 			/* this block is not in the cache */
1557fa9e4066Sahrens 			arc_buf_hdr_t	*exists;
1558fa9e4066Sahrens 
1559fa9e4066Sahrens 			buf = arc_buf_alloc(spa, size, private);
1560fa9e4066Sahrens 			hdr = buf->b_hdr;
1561fa9e4066Sahrens 			hdr->b_dva = *BP_IDENTITY(bp);
1562fa9e4066Sahrens 			hdr->b_birth = bp->blk_birth;
1563fa9e4066Sahrens 			hdr->b_cksum0 = bp->blk_cksum.zc_word[0];
1564fa9e4066Sahrens 			exists = buf_hash_insert(hdr, &hash_lock);
1565fa9e4066Sahrens 			if (exists) {
1566fa9e4066Sahrens 				/* somebody beat us to the hash insert */
1567fa9e4066Sahrens 				mutex_exit(hash_lock);
1568fa9e4066Sahrens 				bzero(&hdr->b_dva, sizeof (dva_t));
1569fa9e4066Sahrens 				hdr->b_birth = 0;
1570fa9e4066Sahrens 				hdr->b_cksum0 = 0;
1571fa9e4066Sahrens 				arc_buf_free(buf, private);
1572fa9e4066Sahrens 				goto top; /* restart the IO request */
1573fa9e4066Sahrens 			}
1574fa9e4066Sahrens 
1575fa9e4066Sahrens 		} else {
1576fa9e4066Sahrens 			/* this block is in the ghost cache */
1577fa9e4066Sahrens 			ASSERT((hdr->b_state == arc.mru_bot) ||
1578fa9e4066Sahrens 			    (hdr->b_state == arc.mfu_bot));
1579fa9e4066Sahrens 			add_reference(hdr, hash_lock, private);
1580fa9e4066Sahrens 
1581fa9e4066Sahrens 			buf = kmem_cache_alloc(buf_cache, KM_SLEEP);
1582fa9e4066Sahrens 			buf->b_data = zio_buf_alloc(hdr->b_size);
1583fa9e4066Sahrens 			atomic_add_64(&arc.size, hdr->b_size);
1584fa9e4066Sahrens 			ASSERT(!HDR_IO_IN_PROGRESS(hdr));
1585fa9e4066Sahrens 			ASSERT3U(refcount_count(&hdr->b_refcnt), ==, 1);
1586fa9e4066Sahrens 			buf->b_hdr = hdr;
1587fa9e4066Sahrens 			buf->b_next = NULL;
1588fa9e4066Sahrens 			hdr->b_buf = buf;
1589fa9e4066Sahrens 		}
1590fa9e4066Sahrens 
1591fa9e4066Sahrens 		acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP);
1592fa9e4066Sahrens 		acb->acb_done = done;
1593fa9e4066Sahrens 		acb->acb_private = private;
1594fa9e4066Sahrens 		acb->acb_byteswap = swap;
1595fa9e4066Sahrens 
1596fa9e4066Sahrens 		ASSERT(hdr->b_acb == NULL);
1597fa9e4066Sahrens 		hdr->b_acb = acb;
1598fa9e4066Sahrens 
1599fa9e4066Sahrens 		/*
1600fa9e4066Sahrens 		 * If this DVA is part of a prefetch, mark the buf
1601fa9e4066Sahrens 		 * header with the prefetch flag
1602fa9e4066Sahrens 		 */
1603fa9e4066Sahrens 		if (arc_flags & ARC_PREFETCH)
1604fa9e4066Sahrens 			hdr->b_flags |= ARC_PREFETCH;
1605fa9e4066Sahrens 		hdr->b_flags |= ARC_IO_IN_PROGRESS;
1606fa9e4066Sahrens 
1607fa9e4066Sahrens 		/*
1608fa9e4066Sahrens 		 * If the buffer has been evicted, migrate it to a present state
1609fa9e4066Sahrens 		 * before issuing the I/O.  Once we drop the hash-table lock,
1610fa9e4066Sahrens 		 * the header will be marked as I/O in progress and have an
1611fa9e4066Sahrens 		 * attached buffer.  At this point, anybody who finds this
1612fa9e4066Sahrens 		 * buffer ought to notice that it's legit but has a pending I/O.
1613fa9e4066Sahrens 		 */
1614fa9e4066Sahrens 
1615fa9e4066Sahrens 		if ((hdr->b_state == arc.mru_bot) ||
1616fa9e4066Sahrens 		    (hdr->b_state == arc.mfu_bot))
1617fa9e4066Sahrens 			arc_access(hdr, hash_lock);
1618fa9e4066Sahrens 
1619fa9e4066Sahrens 		mutex_exit(hash_lock);
1620fa9e4066Sahrens 
1621fa9e4066Sahrens 		ASSERT3U(hdr->b_size, ==, size);
1622fa9e4066Sahrens 		DTRACE_PROBE2(arc__miss, blkptr_t *, bp,
1623fa9e4066Sahrens 		    uint64_t, size);
1624fa9e4066Sahrens 		atomic_add_64(&arc.misses, 1);
1625fa9e4066Sahrens 		rzio = zio_read(pio, spa, bp, buf->b_data, size,
1626fa9e4066Sahrens 		    arc_read_done, buf, priority, flags);
1627fa9e4066Sahrens 
1628fa9e4066Sahrens 		if (arc_flags & ARC_WAIT)
1629fa9e4066Sahrens 			return (zio_wait(rzio));
1630fa9e4066Sahrens 
1631fa9e4066Sahrens 		ASSERT(arc_flags & ARC_NOWAIT);
1632fa9e4066Sahrens 		zio_nowait(rzio);
1633fa9e4066Sahrens 	}
1634fa9e4066Sahrens 	return (0);
1635fa9e4066Sahrens }
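
/*
 * Note on the two completion modes above: with ARC_WAIT a cache miss
 * blocks in zio_wait() and the "done" callback has already run by the
 * time arc_read() returns; with ARC_NOWAIT the read is issued
 * asynchronously and the callback fires later from arc_read_done().
 */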
1636fa9e4066Sahrens 
1637fa9e4066Sahrens /*
1638fa9e4066Sahrens  * arc_read() variant to support pool traversal.  If the block is already
1639fa9e4066Sahrens  * in the ARC, make a copy of it; otherwise, the caller will do the I/O.
1640fa9e4066Sahrens  * The idea is that we don't want pool traversal filling up memory, but
1641fa9e4066Sahrens  * if the ARC already has the data anyway, we shouldn't pay for the I/O.
1642fa9e4066Sahrens  */
1643fa9e4066Sahrens int
1644fa9e4066Sahrens arc_tryread(spa_t *spa, blkptr_t *bp, void *data)
1645fa9e4066Sahrens {
1646fa9e4066Sahrens 	arc_buf_hdr_t *hdr;
1647fa9e4066Sahrens 	kmutex_t *hash_mtx;
1648fa9e4066Sahrens 	int rc = 0;
1649fa9e4066Sahrens 
1650fa9e4066Sahrens 	hdr = buf_hash_find(spa, BP_IDENTITY(bp), bp->blk_birth, &hash_mtx);
1651fa9e4066Sahrens 
1652fa9e4066Sahrens 	if (hdr && hdr->b_buf && !HDR_IO_IN_PROGRESS(hdr))
1653fa9e4066Sahrens 		bcopy(hdr->b_buf->b_data, data, hdr->b_size);
1654fa9e4066Sahrens 	else
1655fa9e4066Sahrens 		rc = ENOENT;
1656fa9e4066Sahrens 
1657fa9e4066Sahrens 	if (hash_mtx)
1658fa9e4066Sahrens 		mutex_exit(hash_mtx);
1659fa9e4066Sahrens 
1660fa9e4066Sahrens 	return (rc);
1661fa9e4066Sahrens }
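
/*
 * Usage sketch (illustrative; assumes a caller-owned buffer sized to the
 * block's logical size):
 *
 *	void *data = zio_buf_alloc(BP_GET_LSIZE(bp));
 *	if (arc_tryread(spa, bp, data) == ENOENT) {
 *		... the block was not cached; issue the read ourselves ...
 *	}
 */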
1662fa9e4066Sahrens 
1663fa9e4066Sahrens /*
1664fa9e4066Sahrens  * Release this buffer from the cache.  This must be done
1665fa9e4066Sahrens  * after a read and prior to modifying the buffer contents.
1666fa9e4066Sahrens  * If the buffer has more than one reference, we must make
1667fa9e4066Sahrens  * a new hdr for the buffer.
1668fa9e4066Sahrens  */
1669fa9e4066Sahrens void
1670fa9e4066Sahrens arc_release(arc_buf_t *buf, void *tag)
1671fa9e4066Sahrens {
1672fa9e4066Sahrens 	arc_buf_hdr_t *hdr = buf->b_hdr;
1673fa9e4066Sahrens 	kmutex_t *hash_lock = HDR_LOCK(hdr);
1674fa9e4066Sahrens 
1675fa9e4066Sahrens 	/* this buffer is not on any list */
1676fa9e4066Sahrens 	ASSERT(refcount_count(&hdr->b_refcnt) > 0);
1677fa9e4066Sahrens 
1678fa9e4066Sahrens 	if (hdr->b_state == arc.anon) {
1679fa9e4066Sahrens 		/* this buffer is already released */
1680fa9e4066Sahrens 		ASSERT3U(refcount_count(&hdr->b_refcnt), ==, 1);
1681fa9e4066Sahrens 		ASSERT(BUF_EMPTY(hdr));
1682fa9e4066Sahrens 		return;
1683fa9e4066Sahrens 	}
1684fa9e4066Sahrens 
1685fa9e4066Sahrens 	mutex_enter(hash_lock);
1686fa9e4066Sahrens 
1687fa9e4066Sahrens 	if (refcount_count(&hdr->b_refcnt) > 1) {
1688fa9e4066Sahrens 		arc_buf_hdr_t *nhdr;
1689fa9e4066Sahrens 		arc_buf_t **bufp;
1690fa9e4066Sahrens 		uint64_t blksz = hdr->b_size;
1691fa9e4066Sahrens 		spa_t *spa = hdr->b_spa;
1692fa9e4066Sahrens 
1693fa9e4066Sahrens 		/*
1694fa9e4066Sahrens 		 * Pull the data off of this buf and attach it to
1695fa9e4066Sahrens 		 * a new anonymous buf.
1696fa9e4066Sahrens 		 */
1697fa9e4066Sahrens 		bufp = &hdr->b_buf;
1698fa9e4066Sahrens 		while (*bufp != buf) {
1699fa9e4066Sahrens 			ASSERT(*bufp);
1700fa9e4066Sahrens 			bufp = &(*bufp)->b_next;
1701fa9e4066Sahrens 		}
1702fa9e4066Sahrens 		*bufp = (*bufp)->b_next;
1703fa9e4066Sahrens 		(void) refcount_remove(&hdr->b_refcnt, tag);
1704fa9e4066Sahrens 		ASSERT3U(hdr->b_state->size, >=, hdr->b_size);
1705fa9e4066Sahrens 		atomic_add_64(&hdr->b_state->size, -hdr->b_size);
1706fa9e4066Sahrens 		mutex_exit(hash_lock);
1707fa9e4066Sahrens 
1708fa9e4066Sahrens 		nhdr = kmem_cache_alloc(hdr_cache, KM_SLEEP);
1709fa9e4066Sahrens 		nhdr->b_size = blksz;
1710fa9e4066Sahrens 		nhdr->b_spa = spa;
1711fa9e4066Sahrens 		nhdr->b_buf = buf;
1712fa9e4066Sahrens 		nhdr->b_state = arc.anon;
1713fa9e4066Sahrens 		nhdr->b_arc_access = 0;
1714fa9e4066Sahrens 		nhdr->b_flags = 0;
1715fa9e4066Sahrens 		buf->b_hdr = nhdr;
1716fa9e4066Sahrens 		buf->b_next = NULL;
1717fa9e4066Sahrens 		(void) refcount_add(&nhdr->b_refcnt, tag);
1718fa9e4066Sahrens 		atomic_add_64(&arc.anon->size, blksz);
1719fa9e4066Sahrens 
1720fa9e4066Sahrens 		hdr = nhdr;
1721fa9e4066Sahrens 	} else {
1722fa9e4066Sahrens 		ASSERT(!list_link_active(&hdr->b_arc_node));
1723fa9e4066Sahrens 		ASSERT(!HDR_IO_IN_PROGRESS(hdr));
1724fa9e4066Sahrens 		arc_change_state(arc.anon, hdr, hash_lock);
1725fa9e4066Sahrens 		hdr->b_arc_access = 0;
1726fa9e4066Sahrens 		mutex_exit(hash_lock);
1727fa9e4066Sahrens 		bzero(&hdr->b_dva, sizeof (dva_t));
1728fa9e4066Sahrens 		hdr->b_birth = 0;
1729fa9e4066Sahrens 		hdr->b_cksum0 = 0;
1730fa9e4066Sahrens 	}
1731fa9e4066Sahrens }
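
/*
 * Typical write-path sketch (illustrative): a buffer obtained from the
 * cache must be released before its contents are modified, after which
 * it can be handed to arc_write() (declared below).
 *
 *	arc_release(buf, tag);
 *	... modify buf->b_data ...
 *	error = arc_write(pio, spa, checksum, compress, txg, bp, buf,
 *	    done, private, priority, flags, ARC_WAIT);
 */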
1732fa9e4066Sahrens 
1733fa9e4066Sahrens int
1734fa9e4066Sahrens arc_released(arc_buf_t *buf)
1735fa9e4066Sahrens {
1736fa9e4066Sahrens 	return (buf->b_hdr->b_state == arc.anon);
1737fa9e4066Sahrens }
1738fa9e4066Sahrens 
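/*
 * I/O completion callback for arc_write().  Record the newly written
 * block's DVA, birth txg and checksum word in the header, insert the
 * header into the hash table (freeing a stale copy if this was a
 * sync-to-convergence overwrite), link it into the cache via
 * arc_access(), and finally invoke the caller's "done" callback.
 */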
1739fa9e4066Sahrens static void
1740fa9e4066Sahrens arc_write_done(zio_t *zio)
1741fa9e4066Sahrens {
1742fa9e4066Sahrens 	arc_buf_t *buf;
1743fa9e4066Sahrens 	arc_buf_hdr_t *hdr;
1744fa9e4066Sahrens 	arc_callback_t *acb;
1745fa9e4066Sahrens 
1746fa9e4066Sahrens 	buf = zio->io_private;
1747fa9e4066Sahrens 	hdr = buf->b_hdr;
1748fa9e4066Sahrens 	acb = hdr->b_acb;
1749fa9e4066Sahrens 	hdr->b_acb = NULL;
1750fa9e4066Sahrens 
1751fa9e4066Sahrens 	/* this buffer is on no lists and is not in the hash table */
1752fa9e4066Sahrens 	ASSERT3P(hdr->b_state, ==, arc.anon);
1753fa9e4066Sahrens 
1754fa9e4066Sahrens 	hdr->b_dva = *BP_IDENTITY(zio->io_bp);
1755fa9e4066Sahrens 	hdr->b_birth = zio->io_bp->blk_birth;
1756fa9e4066Sahrens 	hdr->b_cksum0 = zio->io_bp->blk_cksum.zc_word[0];
1757fa9e4066Sahrens 	/* clear the "in-write" flag */
1758fa9e4066Sahrens 	hdr->b_hash_next = NULL;
1759fa9e4066Sahrens 	/* This write may be all-zero */
1760fa9e4066Sahrens 	if (!BUF_EMPTY(hdr)) {
1761fa9e4066Sahrens 		arc_buf_hdr_t *exists;
1762fa9e4066Sahrens 		kmutex_t *hash_lock;
1763fa9e4066Sahrens 
1764fa9e4066Sahrens 		exists = buf_hash_insert(hdr, &hash_lock);
1765fa9e4066Sahrens 		if (exists) {
1766fa9e4066Sahrens 			/*
1767fa9e4066Sahrens 			 * This can only happen if we overwrite for
1768fa9e4066Sahrens 			 * sync-to-convergence, because we remove
1769fa9e4066Sahrens 			 * buffers from the hash table when we arc_free().
1770fa9e4066Sahrens 			 */
1771fa9e4066Sahrens 			ASSERT(DVA_EQUAL(BP_IDENTITY(&zio->io_bp_orig),
1772fa9e4066Sahrens 			    BP_IDENTITY(zio->io_bp)));
1773fa9e4066Sahrens 			ASSERT3U(zio->io_bp_orig.blk_birth, ==,
1774fa9e4066Sahrens 			    zio->io_bp->blk_birth);
1775fa9e4066Sahrens 
1776fa9e4066Sahrens 			ASSERT(refcount_is_zero(&exists->b_refcnt));
1777fa9e4066Sahrens 			arc_change_state(arc.anon, exists, hash_lock);
1778fa9e4066Sahrens 			mutex_exit(hash_lock);
1779fa9e4066Sahrens 			arc_hdr_free(exists);
1780fa9e4066Sahrens 			exists = buf_hash_insert(hdr, &hash_lock);
1781fa9e4066Sahrens 			ASSERT3P(exists, ==, NULL);
1782fa9e4066Sahrens 		}
1783fa9e4066Sahrens 		arc_access(hdr, hash_lock);
1784fa9e4066Sahrens 		mutex_exit(hash_lock);
1785fa9e4066Sahrens 	}
1786fa9e4066Sahrens 	if (acb && acb->acb_done) {
1787fa9e4066Sahrens 		ASSERT(!refcount_is_zero(&hdr->b_refcnt));
1788fa9e4066Sahrens 		acb->acb_done(zio, buf, acb->acb_private);
1789fa9e4066Sahrens 	}
1790fa9e4066Sahrens 
1791fa9e4066Sahrens 	if (acb)
1792fa9e4066Sahrens 		kmem_free(acb, sizeof (arc_callback_t));
1793fa9e4066Sahrens }
1794fa9e4066Sahrens 
1795fa9e4066Sahrens int
1796fa9e4066Sahrens arc_write(zio_t *pio, spa_t *spa, int checksum, int compress,
1797fa9e4066Sahrens     uint64_t txg, blkptr_t *bp, arc_buf_t *buf,
1798fa9e4066Sahrens     arc_done_func_t *done, void *private, int priority, int flags,
1799fa9e4066Sahrens     uint32_t arc_flags)
1800fa9e4066Sahrens {
1801fa9e4066Sahrens 	arc_buf_hdr_t *hdr = buf->b_hdr;
1802fa9e4066Sahrens 	arc_callback_t	*acb;
1803fa9e4066Sahrens 	zio_t	*rzio;
1804fa9e4066Sahrens 
1805fa9e4066Sahrens 	/* this is a private buffer - no locking required */
1806fa9e4066Sahrens 	ASSERT3P(hdr->b_state, ==, arc.anon);
1807fa9e4066Sahrens 	ASSERT(BUF_EMPTY(hdr));
1808fa9e4066Sahrens 	ASSERT(!HDR_IO_ERROR(hdr));
1809fa9e4066Sahrens 	acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP);
1810fa9e4066Sahrens 	acb->acb_done = done;
1811fa9e4066Sahrens 	acb->acb_private = private;
1812fa9e4066Sahrens 	acb->acb_byteswap = (arc_byteswap_func_t *)-1;
1813fa9e4066Sahrens 	hdr->b_acb = acb;
1814fa9e4066Sahrens 	rzio = zio_write(pio, spa, checksum, compress, txg, bp,
1815fa9e4066Sahrens 	    buf->b_data, hdr->b_size, arc_write_done, buf, priority, flags);
1816fa9e4066Sahrens 
1817fa9e4066Sahrens 	if (arc_flags & ARC_WAIT)
1818fa9e4066Sahrens 		return (zio_wait(rzio));
1819fa9e4066Sahrens 
1820fa9e4066Sahrens 	ASSERT(arc_flags & ARC_NOWAIT);
1821fa9e4066Sahrens 	zio_nowait(rzio);
1822fa9e4066Sahrens 
1823fa9e4066Sahrens 	return (0);
1824fa9e4066Sahrens }
1825fa9e4066Sahrens 
1826fa9e4066Sahrens int
1827fa9e4066Sahrens arc_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
1828fa9e4066Sahrens     zio_done_func_t *done, void *private, uint32_t arc_flags)
1829fa9e4066Sahrens {
1830fa9e4066Sahrens 	arc_buf_hdr_t *ab;
1831fa9e4066Sahrens 	kmutex_t *hash_lock;
1832fa9e4066Sahrens 	zio_t	*zio;
1833fa9e4066Sahrens 
1834fa9e4066Sahrens 	/*
1835fa9e4066Sahrens 	 * If this buffer is in the cache, release it so that
1836fa9e4066Sahrens 	 * it can be re-used.
1837fa9e4066Sahrens 	 */
1838fa9e4066Sahrens 	ab = buf_hash_find(spa, BP_IDENTITY(bp), bp->blk_birth, &hash_lock);
1839fa9e4066Sahrens 	if (ab != NULL) {
1840fa9e4066Sahrens 		/*
1841fa9e4066Sahrens 		 * The checksum of blocks to free is not always
1842fa9e4066Sahrens 		 * preserved (e.g. on the deadlist).  However, if it is
1843fa9e4066Sahrens 		 * nonzero, it should match what we have in the cache.
1844fa9e4066Sahrens 		 */
1845fa9e4066Sahrens 		ASSERT(bp->blk_cksum.zc_word[0] == 0 ||
1846fa9e4066Sahrens 		    ab->b_cksum0 == bp->blk_cksum.zc_word[0]);
1847fa9e4066Sahrens 		arc_change_state(arc.anon, ab, hash_lock);
1848fa9e4066Sahrens 		if (refcount_is_zero(&ab->b_refcnt)) {
1849fa9e4066Sahrens 			mutex_exit(hash_lock);
1850fa9e4066Sahrens 			arc_hdr_free(ab);
1851fa9e4066Sahrens 			atomic_add_64(&arc.deleted, 1);
1852fa9e4066Sahrens 		} else {
1853fa9e4066Sahrens 			ASSERT3U(refcount_count(&ab->b_refcnt), ==, 1);
1854fa9e4066Sahrens 			if (HDR_IO_IN_PROGRESS(ab))
1855fa9e4066Sahrens 				ab->b_flags |= ARC_FREED_IN_READ;
1856fa9e4066Sahrens 			ab->b_arc_access = 0;
1857fa9e4066Sahrens 			bzero(&ab->b_dva, sizeof (dva_t));
1858fa9e4066Sahrens 			ab->b_birth = 0;
1859fa9e4066Sahrens 			ab->b_cksum0 = 0;
1860fa9e4066Sahrens 			mutex_exit(hash_lock);
1861fa9e4066Sahrens 		}
1862fa9e4066Sahrens 	}
1863fa9e4066Sahrens 
1864fa9e4066Sahrens 	zio = zio_free(pio, spa, txg, bp, done, private);
1865fa9e4066Sahrens 
1866fa9e4066Sahrens 	if (arc_flags & ARC_WAIT)
1867fa9e4066Sahrens 		return (zio_wait(zio));
1868fa9e4066Sahrens 
1869fa9e4066Sahrens 	ASSERT(arc_flags & ARC_NOWAIT);
1870fa9e4066Sahrens 	zio_nowait(zio);
1871fa9e4066Sahrens 
1872fa9e4066Sahrens 	return (0);
1873fa9e4066Sahrens }
1874fa9e4066Sahrens 
1875fa9e4066Sahrens void
1876fa9e4066Sahrens arc_tempreserve_clear(uint64_t tempreserve)
1877fa9e4066Sahrens {
1878fa9e4066Sahrens 	atomic_add_64(&arc_tempreserve, -tempreserve);
1879fa9e4066Sahrens 	ASSERT((int64_t)arc_tempreserve >= 0);
1880fa9e4066Sahrens }
1881fa9e4066Sahrens 
1882fa9e4066Sahrens int
1883fa9e4066Sahrens arc_tempreserve_space(uint64_t tempreserve)
1884fa9e4066Sahrens {
1885fa9e4066Sahrens #ifdef ZFS_DEBUG
1886fa9e4066Sahrens 	/*
1887fa9e4066Sahrens 	 * Once in a while, fail for no reason.  Everything should cope.
1888fa9e4066Sahrens 	 */
1889fa9e4066Sahrens 	if (spa_get_random(10000) == 0) {
1890fa9e4066Sahrens 		dprintf("forcing random failure\n");
1891fa9e4066Sahrens 		return (ERESTART);
1892fa9e4066Sahrens 	}
1893fa9e4066Sahrens #endif
1894*112fe045Smaybee 	if (tempreserve > arc.c/4 && !arc.no_grow)
1895*112fe045Smaybee 		arc.c = MIN(arc.c_max, tempreserve * 4);
1896*112fe045Smaybee 	if (tempreserve > arc.c)
1897*112fe045Smaybee 		return (ENOMEM);
1898*112fe045Smaybee 
1899fa9e4066Sahrens 	/*
1900*112fe045Smaybee 	 * Throttle writes when the amount of dirty data in the cache
1901*112fe045Smaybee 	 * gets too large.  We try to keep the cache less than half full
1902*112fe045Smaybee 	 * of dirty blocks so that our sync times don't grow too large.
1903*112fe045Smaybee 	 * Note: if two requests come in concurrently, we might let them
1904*112fe045Smaybee 	 * both succeed, when one of them should fail.  Not a huge deal.
1905*112fe045Smaybee 	 *
1906*112fe045Smaybee 	 * XXX The limit should be adjusted dynamically to keep the time
1907*112fe045Smaybee 	 * to sync a dataset fixed (around 1-5 seconds?).
1908fa9e4066Sahrens 	 */
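	/*
	 * Worked example of the limits below (illustrative numbers): with
	 * arc.c = 800MB a new reservation is refused with ERESTART once the
	 * already-reserved plus anonymous (dirty) data exceeds 200MB
	 * (arc.c/4) and the total including the new request would exceed
	 * 400MB (arc.c/2).
	 */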
1909fa9e4066Sahrens 
1910*112fe045Smaybee 	if (tempreserve + arc_tempreserve + arc.anon->size > arc.c / 2 &&
1911*112fe045Smaybee 	    arc_tempreserve + arc.anon->size > arc.c / 4) {
1912fa9e4066Sahrens 		dprintf("failing, arc_tempreserve=%lluK anon=%lluK "
1913fa9e4066Sahrens 		    "tempreserve=%lluK arc.c=%lluK\n",
1914fa9e4066Sahrens 		    arc_tempreserve>>10, arc.anon->lsize>>10,
1915fa9e4066Sahrens 		    tempreserve>>10, arc.c>>10);
1916fa9e4066Sahrens 		return (ERESTART);
1917fa9e4066Sahrens 	}
1918fa9e4066Sahrens 	atomic_add_64(&arc_tempreserve, tempreserve);
1919fa9e4066Sahrens 	return (0);
1920fa9e4066Sahrens }
1921fa9e4066Sahrens 
1922fa9e4066Sahrens void
1923fa9e4066Sahrens arc_init(void)
1924fa9e4066Sahrens {
1925fa9e4066Sahrens 	mutex_init(&arc_reclaim_lock, NULL, MUTEX_DEFAULT, NULL);
1926fa9e4066Sahrens 	mutex_init(&arc_reclaim_thr_lock, NULL, MUTEX_DEFAULT, NULL);
1927fa9e4066Sahrens 	cv_init(&arc_reclaim_thr_cv, NULL, CV_DEFAULT, NULL);
1928fa9e4066Sahrens 
1929fa9e4066Sahrens 	/* Start out with 1/8 of all memory */
1930fa9e4066Sahrens 	arc.c = physmem * PAGESIZE / 8;
1931fa9e4066Sahrens 
1932fa9e4066Sahrens #ifdef _KERNEL
1933fa9e4066Sahrens 	/*
1934fa9e4066Sahrens 	 * On architectures where the physical memory can be larger
1935fa9e4066Sahrens 	 * than the addressable space (e.g. x86 in 32-bit mode), we may
1936fa9e4066Sahrens 	 * need to limit the cache to 1/8 of VM size.
1937fa9e4066Sahrens 	 */
1938fa9e4066Sahrens 	arc.c = MIN(arc.c, vmem_size(heap_arena, VMEM_ALLOC | VMEM_FREE) / 8);
1939fa9e4066Sahrens #endif
1940fa9e4066Sahrens 
1941*112fe045Smaybee 	/* set min cache to 1/32 of all memory, or 64MB, whichever is more */
1942fa9e4066Sahrens 	arc.c_min = MAX(arc.c / 4, 64<<20);
1943*112fe045Smaybee 	/* set max to 3/4 of all memory, or all but 1GB, whichever is more */
1944fa9e4066Sahrens 	if (arc.c * 8 >= 1<<30)
1945fa9e4066Sahrens 		arc.c_max = (arc.c * 8) - (1<<30);
1946fa9e4066Sahrens 	else
1947fa9e4066Sahrens 		arc.c_max = arc.c_min;
1948fa9e4066Sahrens 	arc.c_max = MAX(arc.c * 6, arc.c_max);
1949fa9e4066Sahrens 	arc.c = arc.c_max;
1950fa9e4066Sahrens 	arc.p = (arc.c >> 1);
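
	/*
	 * Worked example (illustrative, 64-bit kernel, no kmem debugging):
	 * with 4GB of physical memory arc.c starts at 512MB (1/8), arc.c_min
	 * becomes 128MB, and arc.c_max becomes 3GB (both "all but 1GB" and
	 * 6 * 512MB), so the cache initially targets c = 3GB with p = 1.5GB.
	 */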
1951fa9e4066Sahrens 
1952fa9e4066Sahrens 	/* if kmem_flags are set, let's try to use less memory */
1953fa9e4066Sahrens 	if (kmem_debugging())
1954fa9e4066Sahrens 		arc.c = arc.c / 2;
1955fa9e4066Sahrens 	if (arc.c < arc.c_min)
1956fa9e4066Sahrens 		arc.c = arc.c_min;
1957fa9e4066Sahrens 
1958fa9e4066Sahrens 	arc.anon = &ARC_anon;
1959fa9e4066Sahrens 	arc.mru_top = &ARC_mru_top;
1960fa9e4066Sahrens 	arc.mru_bot = &ARC_mru_bot;
1961fa9e4066Sahrens 	arc.mfu_top = &ARC_mfu_top;
1962fa9e4066Sahrens 	arc.mfu_bot = &ARC_mfu_bot;
1963fa9e4066Sahrens 
1964fa9e4066Sahrens 	list_create(&arc.mru_top->list, sizeof (arc_buf_hdr_t),
1965fa9e4066Sahrens 	    offsetof(arc_buf_hdr_t, b_arc_node));
1966fa9e4066Sahrens 	list_create(&arc.mru_bot->list, sizeof (arc_buf_hdr_t),
1967fa9e4066Sahrens 	    offsetof(arc_buf_hdr_t, b_arc_node));
1968fa9e4066Sahrens 	list_create(&arc.mfu_top->list, sizeof (arc_buf_hdr_t),
1969fa9e4066Sahrens 	    offsetof(arc_buf_hdr_t, b_arc_node));
1970fa9e4066Sahrens 	list_create(&arc.mfu_bot->list, sizeof (arc_buf_hdr_t),
1971fa9e4066Sahrens 	    offsetof(arc_buf_hdr_t, b_arc_node));
1972fa9e4066Sahrens 
1973fa9e4066Sahrens 	buf_init();
1974fa9e4066Sahrens 
1975fa9e4066Sahrens 	arc_thread_exit = 0;
1976fa9e4066Sahrens 
1977fa9e4066Sahrens 	(void) thread_create(NULL, 0, arc_reclaim_thread, NULL, 0, &p0,
1978fa9e4066Sahrens 	    TS_RUN, minclsyspri);
1979fa9e4066Sahrens }
1980fa9e4066Sahrens 
1981fa9e4066Sahrens void
1982fa9e4066Sahrens arc_fini(void)
1983fa9e4066Sahrens {
1984fa9e4066Sahrens 	mutex_enter(&arc_reclaim_thr_lock);
1985fa9e4066Sahrens 	arc_thread_exit = 1;
1986fa9e4066Sahrens 	while (arc_thread_exit != 0)
1987fa9e4066Sahrens 		cv_wait(&arc_reclaim_thr_cv, &arc_reclaim_thr_lock);
1988fa9e4066Sahrens 	mutex_exit(&arc_reclaim_thr_lock);
1989fa9e4066Sahrens 
1990fa9e4066Sahrens 	arc_flush();
1991fa9e4066Sahrens 
1992fa9e4066Sahrens 	arc_dead = TRUE;
1993fa9e4066Sahrens 
1994fa9e4066Sahrens 	mutex_destroy(&arc_reclaim_lock);
1995fa9e4066Sahrens 	mutex_destroy(&arc_reclaim_thr_lock);
1996fa9e4066Sahrens 	cv_destroy(&arc_reclaim_thr_cv);
1997fa9e4066Sahrens 
1998fa9e4066Sahrens 	list_destroy(&arc.mru_top->list);
1999fa9e4066Sahrens 	list_destroy(&arc.mru_bot->list);
2000fa9e4066Sahrens 	list_destroy(&arc.mfu_top->list);
2001fa9e4066Sahrens 	list_destroy(&arc.mfu_bot->list);
2002fa9e4066Sahrens 
2003fa9e4066Sahrens 	buf_fini();
2004fa9e4066Sahrens }
2005