/* xref: /illumos-gate/usr/src/uts/common/fs/zfs/arc.c (revision aad02571) */
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
 * Copyright (c) 2013 by Delphix. All rights reserved.
 * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
 */

/*
 * DVA-based Adjustable Replacement Cache
 *
 * While much of the theory of operation used here is
 * based on the self-tuning, low overhead replacement cache
 * presented by Megiddo and Modha at FAST 2003, there are some
 * significant differences:
 *
 * 1. The Megiddo and Modha model assumes any page is evictable.
 * Pages in its cache cannot be "locked" into memory.  This makes
 * the eviction algorithm simple: evict the last page in the list.
 * This also makes the performance characteristics easy to reason
 * about.  Our cache is not so simple.  At any given moment, some
 * subset of the blocks in the cache are un-evictable because we
 * have handed out a reference to them.  Blocks are only evictable
 * when there are no external references active.  This makes
 * eviction far more problematic:  we choose to evict the evictable
 * blocks that are the "lowest" in the list.
 *
 * There are times when it is not possible to evict the requested
 * space.  In these circumstances we are unable to adjust the cache
 * size.  To prevent the cache growing unbounded at these times we
 * implement a "cache throttle" that slows the flow of new data
 * into the cache until we can make space available.
 *
 * 2. The Megiddo and Modha model assumes a fixed cache size.
 * Pages are evicted when the cache is full and there is a cache
 * miss.  Our model has a variable sized cache.  It grows with
 * high use, but also tries to react to memory pressure from the
 * operating system: decreasing its size when system memory is
 * tight.
 *
 * 3. The Megiddo and Modha model assumes a fixed page size. All
 * elements of the cache are therefore exactly the same size.  So
 * when adjusting the cache size following a cache miss, it's simply
 * a matter of choosing a single page to evict.  In our model, we
 * have variable sized cache blocks (ranging from 512 bytes to
 * 128K bytes).  We therefore choose a set of blocks to evict to make
 * space for a cache miss that approximates as closely as possible
 * the space used by the new block.
 *
 * See also:  "ARC: A Self-Tuning, Low Overhead Replacement Cache"
 * by N. Megiddo & D. Modha, FAST 2003
 */

/*
 * The locking model:
 *
 * A new reference to a cache buffer can be obtained in two
 * ways: 1) via a hash table lookup using the DVA as a key,
 * or 2) via one of the ARC lists.  The arc_read() interface
 * uses method 1, while the internal arc algorithms for
 * adjusting the cache use method 2.  We therefore provide two
 * types of locks: 1) the hash table lock array, and 2) the
 * arc list locks.
 *
 * Buffers do not have their own mutexes, rather they rely on the
 * hash table mutexes for the bulk of their protection (i.e. most
 * fields in the arc_buf_hdr_t are protected by these mutexes).
 *
 * buf_hash_find() returns the appropriate mutex (held) when it
 * locates the requested buffer in the hash table.  It returns
 * NULL for the mutex if the buffer was not in the table.
 *
 * buf_hash_remove() expects the appropriate hash mutex to be
 * already held before it is invoked.
 *
 * Each arc state also has a mutex which is used to protect the
 * buffer list associated with the state.  When attempting to
 * obtain a hash table lock while holding an arc list lock you
 * must use mutex_tryenter() to avoid deadlock.  Also note that
 * the active state mutex must be held before the ghost state mutex.
 *
 * Arc buffers may have an associated eviction callback function.
 * This function will be invoked prior to removing the buffer (e.g.
 * in arc_do_user_evicts()).  Note however that the data associated
 * with the buffer may be evicted prior to the callback.  The callback
 * must be made with *no locks held* (to prevent deadlock).  Additionally,
 * the users of callbacks must ensure that their private data is
 * protected from simultaneous callbacks from arc_buf_evict()
 * and arc_do_user_evicts().
 *
 * Note that the majority of the performance stats are manipulated
 * with atomic operations.
 *
 * The L2ARC uses the l2arc_buflist_mtx global mutex for the following:
 *
 *	- L2ARC buflist creation
 *	- L2ARC buflist eviction
 *	- L2ARC write completion, which walks L2ARC buflists
 *	- ARC header destruction, as it removes from L2ARC buflists
 *	- ARC header release, as it removes from L2ARC buflists
 */
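/*
 * Illustrative sketch (not in the original source) of the
 * mutex_tryenter() rule described above, as it would appear in an
 * eviction-style list walker: arcs_mtx is held, the hash lock must be
 * tried rather than taken, and the buffer is skipped on contention:
 *
 *	ASSERT(MUTEX_HELD(&state->arcs_mtx));
 *	if (!mutex_tryenter(hash_lock)) {
 *		ARCSTAT_BUMP(arcstat_mutex_miss);
 *		continue;
 *	}
 */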

#include <sys/spa.h>
#include <sys/zio.h>
#include <sys/zio_compress.h>
#include <sys/zfs_context.h>
#include <sys/arc.h>
#include <sys/refcount.h>
#include <sys/vdev.h>
#include <sys/vdev_impl.h>
#ifdef _KERNEL
#include <sys/vmsystm.h>
#include <vm/anon.h>
#include <sys/fs/swapnode.h>
#include <sys/dnlc.h>
#endif
#include <sys/callb.h>
#include <sys/kstat.h>
#include <zfs_fletcher.h>

#ifndef _KERNEL
/* set with ZFS_DEBUG=watch, to enable watchpoints on frozen buffers */
boolean_t arc_watch = B_FALSE;
int arc_procfd;
#endif

static kmutex_t		arc_reclaim_thr_lock;
static kcondvar_t	arc_reclaim_thr_cv;	/* used to signal reclaim thr */
static uint8_t		arc_thread_exit;

extern int zfs_write_limit_shift;
extern uint64_t zfs_write_limit_max;
extern kmutex_t zfs_write_limit_lock;

#define	ARC_REDUCE_DNLC_PERCENT	3
uint_t arc_reduce_dnlc_percent = ARC_REDUCE_DNLC_PERCENT;

typedef enum arc_reclaim_strategy {
	ARC_RECLAIM_AGGR,		/* Aggressive reclaim strategy */
	ARC_RECLAIM_CONS		/* Conservative reclaim strategy */
} arc_reclaim_strategy_t;

/* number of seconds before growing cache again */
static int		arc_grow_retry = 60;

/* shift of arc_c for calculating both min and max arc_p */
static int		arc_p_min_shift = 4;

/* log2(fraction of arc to reclaim) */
static int		arc_shrink_shift = 5;

/*
 * minimum lifespan of a prefetch block in clock ticks
 * (initialized in arc_init())
 */
static int		arc_min_prefetch_lifespan;

static int arc_dead;

/*
 * The arc has filled available memory and has now warmed up.
 */
static boolean_t arc_warm;

/*
 * These tunables are for performance analysis.
 */
uint64_t zfs_arc_max;
uint64_t zfs_arc_min;
uint64_t zfs_arc_meta_limit = 0;
int zfs_arc_grow_retry = 0;
int zfs_arc_shrink_shift = 0;
int zfs_arc_p_min_shift = 0;
int zfs_disable_dup_eviction = 0;

/*
 * Note that buffers can be in one of 6 states:
 *	ARC_anon	- anonymous (discussed below)
 *	ARC_mru		- recently used, currently cached
 *	ARC_mru_ghost	- recently used, no longer in cache
 *	ARC_mfu		- frequently used, currently cached
 *	ARC_mfu_ghost	- frequently used, no longer in cache
 *	ARC_l2c_only	- exists in L2ARC but not other states
 * When there are no active references to a buffer, it is linked
 * onto a list in one of these arc states.  These are the only
 * buffers that can be evicted or deleted.  Within each state
 * there are multiple lists, one for meta-data and one for
 * non-meta-data.  Meta-data (indirect blocks, blocks of dnodes,
 * etc.) is tracked separately so that it can be managed more
 * explicitly: favored over data, limited explicitly.
 *
 * Anonymous buffers are buffers that are not associated with
 * a DVA.  These are buffers that hold dirty block copies
 * before they are written to stable storage.  By definition,
 * they are "ref'd" and are considered part of arc_mru
 * that cannot be freed.  Generally, they will acquire a DVA
 * as they are written and migrate onto the arc_mru list.
 *
 * The ARC_l2c_only state is for buffers that are in the second
 * level ARC but no longer in any of the ARC_m* lists.  The second
 * level ARC itself may also contain buffers that are in any of
 * the ARC_m* states - meaning that a buffer can exist in two
 * places.  The reason for the ARC_l2c_only state is to keep the
 * buffer header in the hash table, so that reads that hit the
 * second level ARC benefit from these fast lookups.
 */

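/*
 * Illustrative lifecycle sketch (not in the original source) tying the
 * states above together for a typical buffer; each arrow is one of the
 * transitions described in the comment above:
 *
 *	arc_anon (dirty, ref'd)  --write completes, DVA assigned-->
 *	arc_mru  --accessed again while cached-->  arc_mfu
 *	arc_mfu  --data evicted, header kept-->    arc_mfu_ghost
 *	arc_mfu_ghost  --copy remains on L2ARC-->  arc_l2c_only
 */
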
typedef struct arc_state {
	list_t	arcs_list[ARC_BUFC_NUMTYPES];	/* list of evictable buffers */
	uint64_t arcs_lsize[ARC_BUFC_NUMTYPES];	/* amount of evictable data */
	uint64_t arcs_size;	/* total amount of data in this state */
	kmutex_t arcs_mtx;
} arc_state_t;

/* The 6 states: */
static arc_state_t ARC_anon;
static arc_state_t ARC_mru;
static arc_state_t ARC_mru_ghost;
static arc_state_t ARC_mfu;
static arc_state_t ARC_mfu_ghost;
static arc_state_t ARC_l2c_only;

typedef struct arc_stats {
	kstat_named_t arcstat_hits;
	kstat_named_t arcstat_misses;
	kstat_named_t arcstat_demand_data_hits;
	kstat_named_t arcstat_demand_data_misses;
	kstat_named_t arcstat_demand_metadata_hits;
	kstat_named_t arcstat_demand_metadata_misses;
	kstat_named_t arcstat_prefetch_data_hits;
	kstat_named_t arcstat_prefetch_data_misses;
	kstat_named_t arcstat_prefetch_metadata_hits;
	kstat_named_t arcstat_prefetch_metadata_misses;
	kstat_named_t arcstat_mru_hits;
	kstat_named_t arcstat_mru_ghost_hits;
	kstat_named_t arcstat_mfu_hits;
	kstat_named_t arcstat_mfu_ghost_hits;
	kstat_named_t arcstat_deleted;
	kstat_named_t arcstat_recycle_miss;
	kstat_named_t arcstat_mutex_miss;
	kstat_named_t arcstat_evict_skip;
	kstat_named_t arcstat_evict_l2_cached;
	kstat_named_t arcstat_evict_l2_eligible;
	kstat_named_t arcstat_evict_l2_ineligible;
	kstat_named_t arcstat_hash_elements;
	kstat_named_t arcstat_hash_elements_max;
	kstat_named_t arcstat_hash_collisions;
	kstat_named_t arcstat_hash_chains;
	kstat_named_t arcstat_hash_chain_max;
	kstat_named_t arcstat_p;
	kstat_named_t arcstat_c;
	kstat_named_t arcstat_c_min;
	kstat_named_t arcstat_c_max;
	kstat_named_t arcstat_size;
	kstat_named_t arcstat_hdr_size;
	kstat_named_t arcstat_data_size;
	kstat_named_t arcstat_other_size;
	kstat_named_t arcstat_l2_hits;
	kstat_named_t arcstat_l2_misses;
	kstat_named_t arcstat_l2_feeds;
	kstat_named_t arcstat_l2_rw_clash;
	kstat_named_t arcstat_l2_read_bytes;
	kstat_named_t arcstat_l2_write_bytes;
	kstat_named_t arcstat_l2_writes_sent;
	kstat_named_t arcstat_l2_writes_done;
	kstat_named_t arcstat_l2_writes_error;
	kstat_named_t arcstat_l2_writes_hdr_miss;
	kstat_named_t arcstat_l2_evict_lock_retry;
	kstat_named_t arcstat_l2_evict_reading;
	kstat_named_t arcstat_l2_free_on_write;
	kstat_named_t arcstat_l2_abort_lowmem;
	kstat_named_t arcstat_l2_cksum_bad;
	kstat_named_t arcstat_l2_io_error;
	kstat_named_t arcstat_l2_size;
	kstat_named_t arcstat_l2_asize;
	kstat_named_t arcstat_l2_hdr_size;
	kstat_named_t arcstat_l2_compress_successes;
	kstat_named_t arcstat_l2_compress_zeros;
	kstat_named_t arcstat_l2_compress_failures;
	kstat_named_t arcstat_memory_throttle_count;
	kstat_named_t arcstat_duplicate_buffers;
	kstat_named_t arcstat_duplicate_buffers_size;
	kstat_named_t arcstat_duplicate_reads;
	kstat_named_t arcstat_meta_used;
	kstat_named_t arcstat_meta_limit;
	kstat_named_t arcstat_meta_max;
} arc_stats_t;

static arc_stats_t arc_stats = {
	{ "hits",			KSTAT_DATA_UINT64 },
	{ "misses",			KSTAT_DATA_UINT64 },
	{ "demand_data_hits",		KSTAT_DATA_UINT64 },
	{ "demand_data_misses",		KSTAT_DATA_UINT64 },
	{ "demand_metadata_hits",	KSTAT_DATA_UINT64 },
	{ "demand_metadata_misses",	KSTAT_DATA_UINT64 },
	{ "prefetch_data_hits",		KSTAT_DATA_UINT64 },
	{ "prefetch_data_misses",	KSTAT_DATA_UINT64 },
	{ "prefetch_metadata_hits",	KSTAT_DATA_UINT64 },
	{ "prefetch_metadata_misses",	KSTAT_DATA_UINT64 },
	{ "mru_hits",			KSTAT_DATA_UINT64 },
	{ "mru_ghost_hits",		KSTAT_DATA_UINT64 },
	{ "mfu_hits",			KSTAT_DATA_UINT64 },
	{ "mfu_ghost_hits",		KSTAT_DATA_UINT64 },
	{ "deleted",			KSTAT_DATA_UINT64 },
	{ "recycle_miss",		KSTAT_DATA_UINT64 },
	{ "mutex_miss",			KSTAT_DATA_UINT64 },
	{ "evict_skip",			KSTAT_DATA_UINT64 },
	{ "evict_l2_cached",		KSTAT_DATA_UINT64 },
	{ "evict_l2_eligible",		KSTAT_DATA_UINT64 },
	{ "evict_l2_ineligible",	KSTAT_DATA_UINT64 },
	{ "hash_elements",		KSTAT_DATA_UINT64 },
	{ "hash_elements_max",		KSTAT_DATA_UINT64 },
	{ "hash_collisions",		KSTAT_DATA_UINT64 },
	{ "hash_chains",		KSTAT_DATA_UINT64 },
	{ "hash_chain_max",		KSTAT_DATA_UINT64 },
	{ "p",				KSTAT_DATA_UINT64 },
	{ "c",				KSTAT_DATA_UINT64 },
	{ "c_min",			KSTAT_DATA_UINT64 },
	{ "c_max",			KSTAT_DATA_UINT64 },
	{ "size",			KSTAT_DATA_UINT64 },
	{ "hdr_size",			KSTAT_DATA_UINT64 },
	{ "data_size",			KSTAT_DATA_UINT64 },
	{ "other_size",			KSTAT_DATA_UINT64 },
	{ "l2_hits",			KSTAT_DATA_UINT64 },
	{ "l2_misses",			KSTAT_DATA_UINT64 },
	{ "l2_feeds",			KSTAT_DATA_UINT64 },
	{ "l2_rw_clash",		KSTAT_DATA_UINT64 },
	{ "l2_read_bytes",		KSTAT_DATA_UINT64 },
	{ "l2_write_bytes",		KSTAT_DATA_UINT64 },
	{ "l2_writes_sent",		KSTAT_DATA_UINT64 },
	{ "l2_writes_done",		KSTAT_DATA_UINT64 },
	{ "l2_writes_error",		KSTAT_DATA_UINT64 },
	{ "l2_writes_hdr_miss",		KSTAT_DATA_UINT64 },
	{ "l2_evict_lock_retry",	KSTAT_DATA_UINT64 },
	{ "l2_evict_reading",		KSTAT_DATA_UINT64 },
	{ "l2_free_on_write",		KSTAT_DATA_UINT64 },
	{ "l2_abort_lowmem",		KSTAT_DATA_UINT64 },
	{ "l2_cksum_bad",		KSTAT_DATA_UINT64 },
	{ "l2_io_error",		KSTAT_DATA_UINT64 },
	{ "l2_size",			KSTAT_DATA_UINT64 },
	{ "l2_asize",			KSTAT_DATA_UINT64 },
	{ "l2_hdr_size",		KSTAT_DATA_UINT64 },
	{ "l2_compress_successes",	KSTAT_DATA_UINT64 },
	{ "l2_compress_zeros",		KSTAT_DATA_UINT64 },
	{ "l2_compress_failures",	KSTAT_DATA_UINT64 },
	{ "memory_throttle_count",	KSTAT_DATA_UINT64 },
	{ "duplicate_buffers",		KSTAT_DATA_UINT64 },
	{ "duplicate_buffers_size",	KSTAT_DATA_UINT64 },
	{ "duplicate_reads",		KSTAT_DATA_UINT64 },
	{ "arc_meta_used",		KSTAT_DATA_UINT64 },
	{ "arc_meta_limit",		KSTAT_DATA_UINT64 },
	{ "arc_meta_max",		KSTAT_DATA_UINT64 }
};

#define	ARCSTAT(stat)	(arc_stats.stat.value.ui64)

#define	ARCSTAT_INCR(stat, val) \
	atomic_add_64(&arc_stats.stat.value.ui64, (val));

#define	ARCSTAT_BUMP(stat)	ARCSTAT_INCR(stat, 1)
#define	ARCSTAT_BUMPDOWN(stat)	ARCSTAT_INCR(stat, -1)

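/*
 * Explanatory note (added): ARCSTAT_MAX is a lock-free maximum update.
 * The loop re-reads the current maximum m and retries the
 * compare-and-swap until either val is no longer greater than m
 * (another thread stored something bigger) or the CAS succeeds in
 * installing val.
 */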
#define	ARCSTAT_MAX(stat, val) {					\
	uint64_t m;							\
	while ((val) > (m = arc_stats.stat.value.ui64) &&		\
	    (m != atomic_cas_64(&arc_stats.stat.value.ui64, m, (val))))	\
		continue;						\
}

#define	ARCSTAT_MAXSTAT(stat) \
	ARCSTAT_MAX(stat##_max, arc_stats.stat.value.ui64)

/*
 * We define a macro to allow ARC hits/misses to be easily broken down by
 * two separate conditions, giving a total of four different subtypes for
 * each of hits and misses (so eight statistics total).
 */
#define	ARCSTAT_CONDSTAT(cond1, stat1, notstat1, cond2, stat2, notstat2, stat) \
	if (cond1) {							\
		if (cond2) {						\
			ARCSTAT_BUMP(arcstat_##stat1##_##stat2##_##stat); \
		} else {						\
			ARCSTAT_BUMP(arcstat_##stat1##_##notstat2##_##stat); \
		}							\
	} else {							\
		if (cond2) {						\
			ARCSTAT_BUMP(arcstat_##notstat1##_##stat2##_##stat); \
		} else {						\
			ARCSTAT_BUMP(arcstat_##notstat1##_##notstat2##_##stat);\
		}							\
	}
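
/*
 * Usage sketch (added; mirrors how the macro is invoked elsewhere in
 * arc.c) recording a hit broken down by demand/prefetch and
 * data/metadata:
 *
 *	ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH), demand, prefetch,
 *	    hdr->b_type != ARC_BUFC_METADATA, data, metadata, hits);
 *
 * This bumps exactly one of arcstat_demand_data_hits,
 * arcstat_demand_metadata_hits, arcstat_prefetch_data_hits or
 * arcstat_prefetch_metadata_hits.
 */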

kstat_t			*arc_ksp;
static arc_state_t	*arc_anon;
static arc_state_t	*arc_mru;
static arc_state_t	*arc_mru_ghost;
static arc_state_t	*arc_mfu;
static arc_state_t	*arc_mfu_ghost;
static arc_state_t	*arc_l2c_only;

/*
 * There are several ARC variables that are critical to export as kstats --
 * but we don't want to have to grovel around in the kstat whenever we wish to
 * manipulate them.  For these variables, we therefore define them to be in
 * terms of the statistic variable.  This ensures that we are not introducing
 * the possibility of inconsistency by having shadow copies of the variables,
 * while still allowing the code to be readable.
 */
#define	arc_size	ARCSTAT(arcstat_size)	/* actual total arc size */
#define	arc_p		ARCSTAT(arcstat_p)	/* target size of MRU */
#define	arc_c		ARCSTAT(arcstat_c)	/* target size of cache */
#define	arc_c_min	ARCSTAT(arcstat_c_min)	/* min target cache size */
#define	arc_c_max	ARCSTAT(arcstat_c_max)	/* max target cache size */
#define	arc_meta_limit	ARCSTAT(arcstat_meta_limit) /* max size for metadata */
#define	arc_meta_used	ARCSTAT(arcstat_meta_used) /* size of metadata */
#define	arc_meta_max	ARCSTAT(arcstat_meta_max) /* max size of metadata */
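
/*
 * Usage note (added): because these are plain aliases for the kstat
 * values, updates elsewhere in this file look like ordinary variable
 * updates yet are immediately visible through the kstat, e.g.:
 *
 *	atomic_add_64(&arc_size, space);
 */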

#define	L2ARC_IS_VALID_COMPRESS(_c_) \
	((_c_) == ZIO_COMPRESS_LZ4 || (_c_) == ZIO_COMPRESS_EMPTY)

static int		arc_no_grow;	/* Don't try to grow cache size */
static uint64_t		arc_tempreserve;
static uint64_t		arc_loaned_bytes;

typedef struct l2arc_buf_hdr l2arc_buf_hdr_t;

typedef struct arc_callback arc_callback_t;

struct arc_callback {
	void			*acb_private;
	arc_done_func_t		*acb_done;
	arc_buf_t		*acb_buf;
	zio_t			*acb_zio_dummy;
	arc_callback_t		*acb_next;
};

typedef struct arc_write_callback arc_write_callback_t;

struct arc_write_callback {
	void		*awcb_private;
	arc_done_func_t	*awcb_ready;
	arc_done_func_t	*awcb_done;
	arc_buf_t	*awcb_buf;
};

struct arc_buf_hdr {
	/* protected by hash lock */
	dva_t			b_dva;
	uint64_t		b_birth;
	uint64_t		b_cksum0;

	kmutex_t		b_freeze_lock;
	zio_cksum_t		*b_freeze_cksum;
	void			*b_thawed;

	arc_buf_hdr_t		*b_hash_next;
	arc_buf_t		*b_buf;
	uint32_t		b_flags;
	uint32_t		b_datacnt;

	arc_callback_t		*b_acb;
	kcondvar_t		b_cv;

	/* immutable */
	arc_buf_contents_t	b_type;
	uint64_t		b_size;
	uint64_t		b_spa;

	/* protected by arc state mutex */
	arc_state_t		*b_state;
	list_node_t		b_arc_node;

	/* updated atomically */
	clock_t			b_arc_access;

	/* self protecting */
	refcount_t		b_refcnt;

	l2arc_buf_hdr_t		*b_l2hdr;
	list_node_t		b_l2node;
};

static arc_buf_t *arc_eviction_list;
static kmutex_t arc_eviction_mtx;
static arc_buf_hdr_t arc_eviction_hdr;
static void arc_get_data_buf(arc_buf_t *buf);
static void arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock);
static int arc_evict_needed(arc_buf_contents_t type);
static void arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes);
static void arc_buf_watch(arc_buf_t *buf);

static boolean_t l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *ab);

#define	GHOST_STATE(state)	\
	((state) == arc_mru_ghost || (state) == arc_mfu_ghost ||	\
	(state) == arc_l2c_only)

/*
 * Private ARC flags.  These flags are private ARC only flags that will show up
 * in b_flags in the arc_buf_hdr_t.  Some flags are publicly declared, and can
 * be passed in as arc_flags in things like arc_read.  However, these private
 * flags should never be passed and should only be set by ARC code.  When
 * adding new public flags, make sure not to smash the private ones.
 */

#define	ARC_IN_HASH_TABLE	(1 << 9)	/* this buffer is hashed */
#define	ARC_IO_IN_PROGRESS	(1 << 10)	/* I/O in progress for buf */
#define	ARC_IO_ERROR		(1 << 11)	/* I/O failed for buf */
#define	ARC_FREED_IN_READ	(1 << 12)	/* buf freed while in read */
#define	ARC_BUF_AVAILABLE	(1 << 13)	/* block not in active use */
#define	ARC_INDIRECT		(1 << 14)	/* this is an indirect block */
#define	ARC_FREE_IN_PROGRESS	(1 << 15)	/* hdr about to be freed */
#define	ARC_L2_WRITING		(1 << 16)	/* L2ARC write in progress */
#define	ARC_L2_EVICTED		(1 << 17)	/* evicted during I/O */
#define	ARC_L2_WRITE_HEAD	(1 << 18)	/* head of write list */

#define	HDR_IN_HASH_TABLE(hdr)	((hdr)->b_flags & ARC_IN_HASH_TABLE)
#define	HDR_IO_IN_PROGRESS(hdr)	((hdr)->b_flags & ARC_IO_IN_PROGRESS)
#define	HDR_IO_ERROR(hdr)	((hdr)->b_flags & ARC_IO_ERROR)
#define	HDR_PREFETCH(hdr)	((hdr)->b_flags & ARC_PREFETCH)
#define	HDR_FREED_IN_READ(hdr)	((hdr)->b_flags & ARC_FREED_IN_READ)
#define	HDR_BUF_AVAILABLE(hdr)	((hdr)->b_flags & ARC_BUF_AVAILABLE)
#define	HDR_FREE_IN_PROGRESS(hdr)	((hdr)->b_flags & ARC_FREE_IN_PROGRESS)
#define	HDR_L2CACHE(hdr)	((hdr)->b_flags & ARC_L2CACHE)
#define	HDR_L2_READING(hdr)	((hdr)->b_flags & ARC_IO_IN_PROGRESS &&	\
				    (hdr)->b_l2hdr != NULL)
#define	HDR_L2_WRITING(hdr)	((hdr)->b_flags & ARC_L2_WRITING)
#define	HDR_L2_EVICTED(hdr)	((hdr)->b_flags & ARC_L2_EVICTED)
#define	HDR_L2_WRITE_HEAD(hdr)	((hdr)->b_flags & ARC_L2_WRITE_HEAD)

/*
 * Other sizes
 */

#define	HDR_SIZE ((int64_t)sizeof (arc_buf_hdr_t))
#define	L2HDR_SIZE ((int64_t)sizeof (l2arc_buf_hdr_t))

/*
 * Hash table routines
 */

#define	HT_LOCK_PAD	64

struct ht_lock {
	kmutex_t	ht_lock;
#ifdef _KERNEL
	unsigned char	pad[(HT_LOCK_PAD - sizeof (kmutex_t))];
#endif
};

#define	BUF_LOCKS 256
typedef struct buf_hash_table {
	uint64_t ht_mask;
	arc_buf_hdr_t **ht_table;
	struct ht_lock ht_locks[BUF_LOCKS];
} buf_hash_table_t;

static buf_hash_table_t buf_hash_table;

#define	BUF_HASH_INDEX(spa, dva, birth) \
	(buf_hash(spa, dva, birth) & buf_hash_table.ht_mask)
#define	BUF_HASH_LOCK_NTRY(idx) (buf_hash_table.ht_locks[idx & (BUF_LOCKS-1)])
#define	BUF_HASH_LOCK(idx)	(&(BUF_HASH_LOCK_NTRY(idx).ht_lock))
#define	HDR_LOCK(hdr) \
	(BUF_HASH_LOCK(BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth)))

uint64_t zfs_crc64_table[256];

/*
 * Level 2 ARC
 */

#define	L2ARC_WRITE_SIZE	(8 * 1024 * 1024)	/* initial write max */
#define	L2ARC_HEADROOM		2			/* num of writes */
/*
 * If we discover during ARC scan any buffers to be compressed, we boost
 * our headroom for the next scanning cycle by this percentage multiple.
 */
#define	L2ARC_HEADROOM_BOOST	200
#define	L2ARC_FEED_SECS		1		/* caching interval secs */
#define	L2ARC_FEED_MIN_MS	200		/* min caching interval ms */
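
/*
 * Worked example (added, under the defaults above): with a target write
 * size of 8MB the feed thread scans L2ARC_HEADROOM * 8MB == 16MB ahead
 * of the device hand; if compressible buffers were seen, the next
 * cycle's headroom is scaled to headroom * L2ARC_HEADROOM_BOOST / 100,
 * i.e. doubled at the default of 200.
 */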

#define	l2arc_writes_sent	ARCSTAT(arcstat_l2_writes_sent)
#define	l2arc_writes_done	ARCSTAT(arcstat_l2_writes_done)

/*
 * L2ARC Performance Tunables
 */
uint64_t l2arc_write_max = L2ARC_WRITE_SIZE;	/* default max write size */
uint64_t l2arc_write_boost = L2ARC_WRITE_SIZE;	/* extra write during warmup */
uint64_t l2arc_headroom = L2ARC_HEADROOM;	/* number of dev writes */
uint64_t l2arc_headroom_boost = L2ARC_HEADROOM_BOOST;
uint64_t l2arc_feed_secs = L2ARC_FEED_SECS;	/* interval seconds */
uint64_t l2arc_feed_min_ms = L2ARC_FEED_MIN_MS;	/* min interval milliseconds */
boolean_t l2arc_noprefetch = B_TRUE;		/* don't cache prefetch bufs */
boolean_t l2arc_feed_again = B_TRUE;		/* turbo warmup */
boolean_t l2arc_norw = B_TRUE;			/* no reads during writes */

/*
 * L2ARC Internals
 */
typedef struct l2arc_dev {
	vdev_t			*l2ad_vdev;	/* vdev */
	spa_t			*l2ad_spa;	/* spa */
	uint64_t		l2ad_hand;	/* next write location */
	uint64_t		l2ad_start;	/* first addr on device */
	uint64_t		l2ad_end;	/* last addr on device */
	uint64_t		l2ad_evict;	/* last addr eviction reached */
	boolean_t		l2ad_first;	/* first sweep through */
	boolean_t		l2ad_writing;	/* currently writing */
	list_t			*l2ad_buflist;	/* buffer list */
	list_node_t		l2ad_node;	/* device list node */
} l2arc_dev_t;

static list_t L2ARC_dev_list;			/* device list */
static list_t *l2arc_dev_list;			/* device list pointer */
static kmutex_t l2arc_dev_mtx;			/* device list mutex */
static l2arc_dev_t *l2arc_dev_last;		/* last device used */
static kmutex_t l2arc_buflist_mtx;		/* mutex for all buflists */
static list_t L2ARC_free_on_write;		/* free after write buf list */
static list_t *l2arc_free_on_write;		/* free after write list ptr */
static kmutex_t l2arc_free_on_write_mtx;	/* mutex for list */
static uint64_t l2arc_ndev;			/* number of devices */

typedef struct l2arc_read_callback {
	arc_buf_t		*l2rcb_buf;		/* read buffer */
	spa_t			*l2rcb_spa;		/* spa */
	blkptr_t		l2rcb_bp;		/* original blkptr */
	zbookmark_t		l2rcb_zb;		/* original bookmark */
	int			l2rcb_flags;		/* original flags */
	enum zio_compress	l2rcb_compress;		/* applied compress */
} l2arc_read_callback_t;

typedef struct l2arc_write_callback {
	l2arc_dev_t	*l2wcb_dev;		/* device info */
	arc_buf_hdr_t	*l2wcb_head;		/* head of write buflist */
} l2arc_write_callback_t;

struct l2arc_buf_hdr {
	/* protected by arc_buf_hdr mutex */
	l2arc_dev_t		*b_dev;		/* L2ARC device */
	uint64_t		b_daddr;	/* disk address, offset byte */
	/* compression applied to buffer data */
	enum zio_compress	b_compress;
	/* real alloc'd buffer size depending on b_compress applied */
	int			b_asize;
	/* temporary buffer holder for in-flight compressed data */
	void			*b_tmp_cdata;
};

typedef struct l2arc_data_free {
	/* protected by l2arc_free_on_write_mtx */
	void		*l2df_data;
	size_t		l2df_size;
	void		(*l2df_func)(void *, size_t);
	list_node_t	l2df_list_node;
} l2arc_data_free_t;

static kmutex_t l2arc_feed_thr_lock;
static kcondvar_t l2arc_feed_thr_cv;
static uint8_t l2arc_thread_exit;

static void l2arc_read_done(zio_t *zio);
static void l2arc_hdr_stat_add(void);
static void l2arc_hdr_stat_remove(void);

static boolean_t l2arc_compress_buf(l2arc_buf_hdr_t *l2hdr);
static void l2arc_decompress_zio(zio_t *zio, arc_buf_hdr_t *hdr,
    enum zio_compress c);
static void l2arc_release_cdata_buf(arc_buf_hdr_t *ab);

static uint64_t
buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth)
{
	uint8_t *vdva = (uint8_t *)dva;
	uint64_t crc = -1ULL;
	int i;

	ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);

	for (i = 0; i < sizeof (dva_t); i++)
		crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ vdva[i]) & 0xFF];

	crc ^= (spa>>8) ^ birth;

	return (crc);
}

#define	BUF_EMPTY(buf)						\
	((buf)->b_dva.dva_word[0] == 0 &&			\
	(buf)->b_dva.dva_word[1] == 0 &&			\
	(buf)->b_birth == 0)

#define	BUF_EQUAL(spa, dva, birth, buf)				\
	((buf)->b_dva.dva_word[0] == (dva)->dva_word[0]) &&	\
	((buf)->b_dva.dva_word[1] == (dva)->dva_word[1]) &&	\
	((buf)->b_birth == birth) && ((buf)->b_spa == spa)

static void
buf_discard_identity(arc_buf_hdr_t *hdr)
{
	hdr->b_dva.dva_word[0] = 0;
	hdr->b_dva.dva_word[1] = 0;
	hdr->b_birth = 0;
	hdr->b_cksum0 = 0;
}

static arc_buf_hdr_t *
buf_hash_find(uint64_t spa, const dva_t *dva, uint64_t birth, kmutex_t **lockp)
{
	uint64_t idx = BUF_HASH_INDEX(spa, dva, birth);
	kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
	arc_buf_hdr_t *buf;

	mutex_enter(hash_lock);
	for (buf = buf_hash_table.ht_table[idx]; buf != NULL;
	    buf = buf->b_hash_next) {
		if (BUF_EQUAL(spa, dva, birth, buf)) {
			*lockp = hash_lock;
			return (buf);
		}
	}
	mutex_exit(hash_lock);
	*lockp = NULL;
	return (NULL);
}

/*
 * Insert an entry into the hash table.  If there is already an element
 * equal to elem in the hash table, then the already existing element
 * will be returned and the new element will not be inserted.
 * Otherwise returns NULL.
 */
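/*
 * Caller-side sketch (added, illustrative only): the hash lock is held
 * on return in either case; a non-NULL result means an equal header
 * already won the race and the caller must use it instead:
 *
 *	fhdr = buf_hash_insert(hdr, &hash_lock);
 *	if (fhdr != NULL)
 *		(use fhdr; do not insert hdr)
 *	mutex_exit(hash_lock);
 */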
static arc_buf_hdr_t *
buf_hash_insert(arc_buf_hdr_t *buf, kmutex_t **lockp)
{
	uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth);
	kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
	arc_buf_hdr_t *fbuf;
	uint32_t i;

	ASSERT(!HDR_IN_HASH_TABLE(buf));
	*lockp = hash_lock;
	mutex_enter(hash_lock);
	for (fbuf = buf_hash_table.ht_table[idx], i = 0; fbuf != NULL;
	    fbuf = fbuf->b_hash_next, i++) {
		if (BUF_EQUAL(buf->b_spa, &buf->b_dva, buf->b_birth, fbuf))
			return (fbuf);
	}

	buf->b_hash_next = buf_hash_table.ht_table[idx];
	buf_hash_table.ht_table[idx] = buf;
	buf->b_flags |= ARC_IN_HASH_TABLE;

	/* collect some hash table performance data */
	if (i > 0) {
		ARCSTAT_BUMP(arcstat_hash_collisions);
		if (i == 1)
			ARCSTAT_BUMP(arcstat_hash_chains);

		ARCSTAT_MAX(arcstat_hash_chain_max, i);
	}

	ARCSTAT_BUMP(arcstat_hash_elements);
	ARCSTAT_MAXSTAT(arcstat_hash_elements);

	return (NULL);
}

static void
buf_hash_remove(arc_buf_hdr_t *buf)
{
	arc_buf_hdr_t *fbuf, **bufp;
	uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth);

	ASSERT(MUTEX_HELD(BUF_HASH_LOCK(idx)));
	ASSERT(HDR_IN_HASH_TABLE(buf));

	bufp = &buf_hash_table.ht_table[idx];
	while ((fbuf = *bufp) != buf) {
		ASSERT(fbuf != NULL);
		bufp = &fbuf->b_hash_next;
	}
	*bufp = buf->b_hash_next;
	buf->b_hash_next = NULL;
	buf->b_flags &= ~ARC_IN_HASH_TABLE;

	/* collect some hash table performance data */
	ARCSTAT_BUMPDOWN(arcstat_hash_elements);

	if (buf_hash_table.ht_table[idx] &&
	    buf_hash_table.ht_table[idx]->b_hash_next == NULL)
		ARCSTAT_BUMPDOWN(arcstat_hash_chains);
}

/*
 * Global data structures and functions for the buf kmem cache.
 */
static kmem_cache_t *hdr_cache;
static kmem_cache_t *buf_cache;

static void
buf_fini(void)
{
	int i;

	kmem_free(buf_hash_table.ht_table,
	    (buf_hash_table.ht_mask + 1) * sizeof (void *));
	for (i = 0; i < BUF_LOCKS; i++)
		mutex_destroy(&buf_hash_table.ht_locks[i].ht_lock);
	kmem_cache_destroy(hdr_cache);
	kmem_cache_destroy(buf_cache);
}

/*
 * Constructor callback - called when the cache is empty
 * and a new buf is requested.
 */
/* ARGSUSED */
static int
hdr_cons(void *vbuf, void *unused, int kmflag)
{
	arc_buf_hdr_t *buf = vbuf;

	bzero(buf, sizeof (arc_buf_hdr_t));
	refcount_create(&buf->b_refcnt);
	cv_init(&buf->b_cv, NULL, CV_DEFAULT, NULL);
	mutex_init(&buf->b_freeze_lock, NULL, MUTEX_DEFAULT, NULL);
	arc_space_consume(sizeof (arc_buf_hdr_t), ARC_SPACE_HDRS);

	return (0);
}

/* ARGSUSED */
static int
buf_cons(void *vbuf, void *unused, int kmflag)
{
	arc_buf_t *buf = vbuf;

	bzero(buf, sizeof (arc_buf_t));
	mutex_init(&buf->b_evict_lock, NULL, MUTEX_DEFAULT, NULL);
	arc_space_consume(sizeof (arc_buf_t), ARC_SPACE_HDRS);

	return (0);
}

/*
 * Destructor callback - called when a cached buf is
 * no longer required.
 */
/* ARGSUSED */
static void
hdr_dest(void *vbuf, void *unused)
{
	arc_buf_hdr_t *buf = vbuf;

	ASSERT(BUF_EMPTY(buf));
	refcount_destroy(&buf->b_refcnt);
	cv_destroy(&buf->b_cv);
	mutex_destroy(&buf->b_freeze_lock);
	arc_space_return(sizeof (arc_buf_hdr_t), ARC_SPACE_HDRS);
}

/* ARGSUSED */
static void
buf_dest(void *vbuf, void *unused)
{
	arc_buf_t *buf = vbuf;

	mutex_destroy(&buf->b_evict_lock);
	arc_space_return(sizeof (arc_buf_t), ARC_SPACE_HDRS);
}

/*
 * Reclaim callback -- invoked when memory is low.
 */
/* ARGSUSED */
static void
hdr_recl(void *unused)
{
	dprintf("hdr_recl called\n");
	/*
	 * umem calls the reclaim func when we destroy the buf cache,
	 * which is after we do arc_fini().
	 */
	if (!arc_dead)
		cv_signal(&arc_reclaim_thr_cv);
}

static void
buf_init(void)
{
	uint64_t *ct;
	uint64_t hsize = 1ULL << 12;
	int i, j;

	/*
	 * The hash table is big enough to fill all of physical memory
	 * with an average 64K block size.  The table will take up
	 * totalmem*sizeof(void*)/64K (e.g. 128KB/GB with 8-byte pointers).
	 */
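	/*
	 * Worked example (added): with 8GB of physical memory
	 * (physmem * PAGESIZE == 1 << 33), the loop below stops at the
	 * first hsize satisfying hsize * 65536 >= 1 << 33, i.e.
	 * hsize == 1 << 17 (128K buckets), which costs 1MB of table
	 * with 8-byte pointers -- matching the 128KB/GB rule above.
	 */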
921ea8dc4b6Seschrock 	while (hsize * 65536 < physmem * PAGESIZE)
922fa9e4066Sahrens 		hsize <<= 1;
923ea8dc4b6Seschrock retry:
924fa9e4066Sahrens 	buf_hash_table.ht_mask = hsize - 1;
925ea8dc4b6Seschrock 	buf_hash_table.ht_table =
926ea8dc4b6Seschrock 	    kmem_zalloc(hsize * sizeof (void*), KM_NOSLEEP);
927ea8dc4b6Seschrock 	if (buf_hash_table.ht_table == NULL) {
928ea8dc4b6Seschrock 		ASSERT(hsize > (1ULL << 8));
929ea8dc4b6Seschrock 		hsize >>= 1;
930ea8dc4b6Seschrock 		goto retry;
931ea8dc4b6Seschrock 	}
932fa9e4066Sahrens 
933fa9e4066Sahrens 	hdr_cache = kmem_cache_create("arc_buf_hdr_t", sizeof (arc_buf_hdr_t),
934fa9e4066Sahrens 	    0, hdr_cons, hdr_dest, hdr_recl, NULL, NULL, 0);
935fa9e4066Sahrens 	buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t),
9366f83844dSMark Maybee 	    0, buf_cons, buf_dest, NULL, NULL, NULL, 0);
937fa9e4066Sahrens 
938fa9e4066Sahrens 	for (i = 0; i < 256; i++)
939fa9e4066Sahrens 		for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--)
940fa9e4066Sahrens 			*ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY);
941fa9e4066Sahrens 
942fa9e4066Sahrens 	for (i = 0; i < BUF_LOCKS; i++) {
943fa9e4066Sahrens 		mutex_init(&buf_hash_table.ht_locks[i].ht_lock,
944fa9e4066Sahrens 		    NULL, MUTEX_DEFAULT, NULL);
945fa9e4066Sahrens 	}
946fa9e4066Sahrens }
947fa9e4066Sahrens 
948fa9e4066Sahrens #define	ARC_MINTIME	(hz>>4) /* 62 ms */
949fa9e4066Sahrens 
9506b4acc8bSahrens static void
9516b4acc8bSahrens arc_cksum_verify(arc_buf_t *buf)
9526b4acc8bSahrens {
9536b4acc8bSahrens 	zio_cksum_t zc;
9546b4acc8bSahrens 
955cc60fd72Sahrens 	if (!(zfs_flags & ZFS_DEBUG_MODIFY))
9566b4acc8bSahrens 		return;
9576b4acc8bSahrens 
9586b4acc8bSahrens 	mutex_enter(&buf->b_hdr->b_freeze_lock);
9593ccfa83cSahrens 	if (buf->b_hdr->b_freeze_cksum == NULL ||
9603ccfa83cSahrens 	    (buf->b_hdr->b_flags & ARC_IO_ERROR)) {
9616b4acc8bSahrens 		mutex_exit(&buf->b_hdr->b_freeze_lock);
9626b4acc8bSahrens 		return;
9636b4acc8bSahrens 	}
9646b4acc8bSahrens 	fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc);
9656b4acc8bSahrens 	if (!ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc))
9666b4acc8bSahrens 		panic("buffer modified while frozen!");
9676b4acc8bSahrens 	mutex_exit(&buf->b_hdr->b_freeze_lock);
9686b4acc8bSahrens }
9696b4acc8bSahrens 
970fa94a07fSbrendan static int
971fa94a07fSbrendan arc_cksum_equal(arc_buf_t *buf)
972fa94a07fSbrendan {
973fa94a07fSbrendan 	zio_cksum_t zc;
974fa94a07fSbrendan 	int equal;
975fa94a07fSbrendan 
976fa94a07fSbrendan 	mutex_enter(&buf->b_hdr->b_freeze_lock);
977fa94a07fSbrendan 	fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc);
978fa94a07fSbrendan 	equal = ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc);
979fa94a07fSbrendan 	mutex_exit(&buf->b_hdr->b_freeze_lock);
980fa94a07fSbrendan 
981fa94a07fSbrendan 	return (equal);
982fa94a07fSbrendan }
983fa94a07fSbrendan 
9846b4acc8bSahrens static void
985fa94a07fSbrendan arc_cksum_compute(arc_buf_t *buf, boolean_t force)
9866b4acc8bSahrens {
987fa94a07fSbrendan 	if (!force && !(zfs_flags & ZFS_DEBUG_MODIFY))
9886b4acc8bSahrens 		return;
9896b4acc8bSahrens 
9906b4acc8bSahrens 	mutex_enter(&buf->b_hdr->b_freeze_lock);
9916b4acc8bSahrens 	if (buf->b_hdr->b_freeze_cksum != NULL) {
9926b4acc8bSahrens 		mutex_exit(&buf->b_hdr->b_freeze_lock);
9936b4acc8bSahrens 		return;
9946b4acc8bSahrens 	}
9956b4acc8bSahrens 	buf->b_hdr->b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t), KM_SLEEP);
9966b4acc8bSahrens 	fletcher_2_native(buf->b_data, buf->b_hdr->b_size,
9976b4acc8bSahrens 	    buf->b_hdr->b_freeze_cksum);
9986b4acc8bSahrens 	mutex_exit(&buf->b_hdr->b_freeze_lock);
999cd1c8b85SMatthew Ahrens 	arc_buf_watch(buf);
1000cd1c8b85SMatthew Ahrens }
1001cd1c8b85SMatthew Ahrens 
1002cd1c8b85SMatthew Ahrens #ifndef _KERNEL
1003cd1c8b85SMatthew Ahrens typedef struct procctl {
1004cd1c8b85SMatthew Ahrens 	long cmd;
1005cd1c8b85SMatthew Ahrens 	prwatch_t prwatch;
1006cd1c8b85SMatthew Ahrens } procctl_t;
1007cd1c8b85SMatthew Ahrens #endif
1008cd1c8b85SMatthew Ahrens 
1009cd1c8b85SMatthew Ahrens /* ARGSUSED */
1010cd1c8b85SMatthew Ahrens static void
1011cd1c8b85SMatthew Ahrens arc_buf_unwatch(arc_buf_t *buf)
1012cd1c8b85SMatthew Ahrens {
1013cd1c8b85SMatthew Ahrens #ifndef _KERNEL
1014cd1c8b85SMatthew Ahrens 	if (arc_watch) {
1015cd1c8b85SMatthew Ahrens 		int result;
1016cd1c8b85SMatthew Ahrens 		procctl_t ctl;
1017cd1c8b85SMatthew Ahrens 		ctl.cmd = PCWATCH;
1018cd1c8b85SMatthew Ahrens 		ctl.prwatch.pr_vaddr = (uintptr_t)buf->b_data;
1019cd1c8b85SMatthew Ahrens 		ctl.prwatch.pr_size = 0;
1020cd1c8b85SMatthew Ahrens 		ctl.prwatch.pr_wflags = 0;
1021cd1c8b85SMatthew Ahrens 		result = write(arc_procfd, &ctl, sizeof (ctl));
1022cd1c8b85SMatthew Ahrens 		ASSERT3U(result, ==, sizeof (ctl));
1023cd1c8b85SMatthew Ahrens 	}
1024cd1c8b85SMatthew Ahrens #endif
1025cd1c8b85SMatthew Ahrens }
1026cd1c8b85SMatthew Ahrens 
1027cd1c8b85SMatthew Ahrens /* ARGSUSED */
1028cd1c8b85SMatthew Ahrens static void
1029cd1c8b85SMatthew Ahrens arc_buf_watch(arc_buf_t *buf)
1030cd1c8b85SMatthew Ahrens {
1031cd1c8b85SMatthew Ahrens #ifndef _KERNEL
1032cd1c8b85SMatthew Ahrens 	if (arc_watch) {
1033cd1c8b85SMatthew Ahrens 		int result;
1034cd1c8b85SMatthew Ahrens 		procctl_t ctl;
1035cd1c8b85SMatthew Ahrens 		ctl.cmd = PCWATCH;
1036cd1c8b85SMatthew Ahrens 		ctl.prwatch.pr_vaddr = (uintptr_t)buf->b_data;
1037cd1c8b85SMatthew Ahrens 		ctl.prwatch.pr_size = buf->b_hdr->b_size;
1038cd1c8b85SMatthew Ahrens 		ctl.prwatch.pr_wflags = WA_WRITE;
1039cd1c8b85SMatthew Ahrens 		result = write(arc_procfd, &ctl, sizeof (ctl));
1040cd1c8b85SMatthew Ahrens 		ASSERT3U(result, ==, sizeof (ctl));
1041cd1c8b85SMatthew Ahrens 	}
1042cd1c8b85SMatthew Ahrens #endif
10436b4acc8bSahrens }
10446b4acc8bSahrens 
10456b4acc8bSahrens void
10466b4acc8bSahrens arc_buf_thaw(arc_buf_t *buf)
10476b4acc8bSahrens {
1048fa94a07fSbrendan 	if (zfs_flags & ZFS_DEBUG_MODIFY) {
1049fa94a07fSbrendan 		if (buf->b_hdr->b_state != arc_anon)
1050fa94a07fSbrendan 			panic("modifying non-anon buffer!");
1051fa94a07fSbrendan 		if (buf->b_hdr->b_flags & ARC_IO_IN_PROGRESS)
1052fa94a07fSbrendan 			panic("modifying buffer while i/o in progress!");
1053fa94a07fSbrendan 		arc_cksum_verify(buf);
1054fa94a07fSbrendan 	}
10556b4acc8bSahrens 
10566b4acc8bSahrens 	mutex_enter(&buf->b_hdr->b_freeze_lock);
10576b4acc8bSahrens 	if (buf->b_hdr->b_freeze_cksum != NULL) {
10586b4acc8bSahrens 		kmem_free(buf->b_hdr->b_freeze_cksum, sizeof (zio_cksum_t));
10596b4acc8bSahrens 		buf->b_hdr->b_freeze_cksum = NULL;
10606b4acc8bSahrens 	}
10613f9d6ad7SLin Ling 
10623f9d6ad7SLin Ling 	if (zfs_flags & ZFS_DEBUG_MODIFY) {
10633f9d6ad7SLin Ling 		if (buf->b_hdr->b_thawed)
10643f9d6ad7SLin Ling 			kmem_free(buf->b_hdr->b_thawed, 1);
10653f9d6ad7SLin Ling 		buf->b_hdr->b_thawed = kmem_alloc(1, KM_SLEEP);
10663f9d6ad7SLin Ling 	}
10673f9d6ad7SLin Ling 
10686b4acc8bSahrens 	mutex_exit(&buf->b_hdr->b_freeze_lock);
1069cd1c8b85SMatthew Ahrens 
1070cd1c8b85SMatthew Ahrens 	arc_buf_unwatch(buf);
10716b4acc8bSahrens }
10726b4acc8bSahrens 
10736b4acc8bSahrens void
10746b4acc8bSahrens arc_buf_freeze(arc_buf_t *buf)
10756b4acc8bSahrens {
10763f9d6ad7SLin Ling 	kmutex_t *hash_lock;
10773f9d6ad7SLin Ling 
1078cc60fd72Sahrens 	if (!(zfs_flags & ZFS_DEBUG_MODIFY))
1079cc60fd72Sahrens 		return;
1080cc60fd72Sahrens 
10813f9d6ad7SLin Ling 	hash_lock = HDR_LOCK(buf->b_hdr);
10823f9d6ad7SLin Ling 	mutex_enter(hash_lock);
10833f9d6ad7SLin Ling 
10846b4acc8bSahrens 	ASSERT(buf->b_hdr->b_freeze_cksum != NULL ||
108544cb6abcSbmc 	    buf->b_hdr->b_state == arc_anon);
1086fa94a07fSbrendan 	arc_cksum_compute(buf, B_FALSE);
10873f9d6ad7SLin Ling 	mutex_exit(hash_lock);
10896b4acc8bSahrens }
10906b4acc8bSahrens 
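/*
 * Illustrative sketch (hypothetical userland caller, not part of the
 * ARC): the ZFS_DEBUG_MODIFY lifecycle that ties the watchpoint hooks
 * above to freeze/thaw.  Thawing discards the checksum and (via
 * arc_buf_unwatch()) disarms the write watchpoint so the buffer may be
 * modified; freezing recomputes the checksum, which re-arms the
 * watchpoint.  'spa' and 'tag' are assumed to come from the caller.
 */
#ifndef _KERNEL
static void
arc_debug_modify_example(spa_t *spa, void *tag)
{
	arc_buf_t *buf = arc_buf_alloc(spa, SPA_MINBLOCKSIZE, tag,
	    ARC_BUFC_DATA);

	arc_buf_thaw(buf);	/* cksum dropped, watchpoint disarmed */
	bzero(buf->b_data, SPA_MINBLOCKSIZE);
	arc_buf_freeze(buf);	/* cksum recomputed, watchpoint re-armed */
	arc_buf_free(buf, tag);
}
#endif
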
1091fa9e4066Sahrens static void
1092fa9e4066Sahrens add_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag)
1093fa9e4066Sahrens {
1094fa9e4066Sahrens 	ASSERT(MUTEX_HELD(hash_lock));
1095fa9e4066Sahrens 
1096fa9e4066Sahrens 	if ((refcount_add(&ab->b_refcnt, tag) == 1) &&
109744cb6abcSbmc 	    (ab->b_state != arc_anon)) {
1098c0a81264Sek 		uint64_t delta = ab->b_size * ab->b_datacnt;
10990e8c6158Smaybee 		list_t *list = &ab->b_state->arcs_list[ab->b_type];
11000e8c6158Smaybee 		uint64_t *size = &ab->b_state->arcs_lsize[ab->b_type];
1101fa9e4066Sahrens 
110244cb6abcSbmc 		ASSERT(!MUTEX_HELD(&ab->b_state->arcs_mtx));
110344cb6abcSbmc 		mutex_enter(&ab->b_state->arcs_mtx);
1104fa9e4066Sahrens 		ASSERT(list_link_active(&ab->b_arc_node));
11050e8c6158Smaybee 		list_remove(list, ab);
1106ea8dc4b6Seschrock 		if (GHOST_STATE(ab->b_state)) {
1107fb09f5aaSMadhav Suresh 			ASSERT0(ab->b_datacnt);
1108ea8dc4b6Seschrock 			ASSERT3P(ab->b_buf, ==, NULL);
1109ea8dc4b6Seschrock 			delta = ab->b_size;
1110ea8dc4b6Seschrock 		}
1111ea8dc4b6Seschrock 		ASSERT(delta > 0);
11120e8c6158Smaybee 		ASSERT3U(*size, >=, delta);
11130e8c6158Smaybee 		atomic_add_64(size, -delta);
111444cb6abcSbmc 		mutex_exit(&ab->b_state->arcs_mtx);
1115088f3894Sahrens 		/* remove the prefetch flag if we get a reference */
111613506d1eSmaybee 		if (ab->b_flags & ARC_PREFETCH)
111713506d1eSmaybee 			ab->b_flags &= ~ARC_PREFETCH;
1118fa9e4066Sahrens 	}
1119fa9e4066Sahrens }
1120fa9e4066Sahrens 
1121fa9e4066Sahrens static int
1122fa9e4066Sahrens remove_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag)
1123fa9e4066Sahrens {
1124fa9e4066Sahrens 	int cnt;
112544cb6abcSbmc 	arc_state_t *state = ab->b_state;
1126fa9e4066Sahrens 
112744cb6abcSbmc 	ASSERT(state == arc_anon || MUTEX_HELD(hash_lock));
112844cb6abcSbmc 	ASSERT(!GHOST_STATE(state));
1129fa9e4066Sahrens 
1130fa9e4066Sahrens 	if (((cnt = refcount_remove(&ab->b_refcnt, tag)) == 0) &&
113144cb6abcSbmc 	    (state != arc_anon)) {
11320e8c6158Smaybee 		uint64_t *size = &state->arcs_lsize[ab->b_type];
11330e8c6158Smaybee 
113444cb6abcSbmc 		ASSERT(!MUTEX_HELD(&state->arcs_mtx));
113544cb6abcSbmc 		mutex_enter(&state->arcs_mtx);
1136fa9e4066Sahrens 		ASSERT(!list_link_active(&ab->b_arc_node));
11370e8c6158Smaybee 		list_insert_head(&state->arcs_list[ab->b_type], ab);
1138ea8dc4b6Seschrock 		ASSERT(ab->b_datacnt > 0);
11390e8c6158Smaybee 		atomic_add_64(size, ab->b_size * ab->b_datacnt);
114044cb6abcSbmc 		mutex_exit(&state->arcs_mtx);
1141fa9e4066Sahrens 	}
1142fa9e4066Sahrens 	return (cnt);
1143fa9e4066Sahrens }
1144fa9e4066Sahrens 
1145fa9e4066Sahrens /*
1146fa9e4066Sahrens  * Move the supplied buffer to the indicated state.  The hash lock
1147fa9e4066Sahrens  * for the buffer must be held by the caller.
1148fa9e4066Sahrens  */
1149fa9e4066Sahrens static void
1150ea8dc4b6Seschrock arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock)
1151fa9e4066Sahrens {
1152ea8dc4b6Seschrock 	arc_state_t *old_state = ab->b_state;
1153c0a81264Sek 	int64_t refcnt = refcount_count(&ab->b_refcnt);
1154c0a81264Sek 	uint64_t from_delta, to_delta;
1155fa9e4066Sahrens 
1156fa9e4066Sahrens 	ASSERT(MUTEX_HELD(hash_lock));
1157ea8dc4b6Seschrock 	ASSERT(new_state != old_state);
1158ea8dc4b6Seschrock 	ASSERT(refcnt == 0 || ab->b_datacnt > 0);
1159ea8dc4b6Seschrock 	ASSERT(ab->b_datacnt == 0 || !GHOST_STATE(new_state));
1160b24ab676SJeff Bonwick 	ASSERT(ab->b_datacnt <= 1 || old_state != arc_anon);
1161ea8dc4b6Seschrock 
1162ea8dc4b6Seschrock 	from_delta = to_delta = ab->b_datacnt * ab->b_size;
1163fa9e4066Sahrens 
1164fa9e4066Sahrens 	/*
1165fa9e4066Sahrens 	 * If this buffer is evictable, transfer it from the
1166fa9e4066Sahrens 	 * old state list to the new state list.
1167fa9e4066Sahrens 	 */
1168ea8dc4b6Seschrock 	if (refcnt == 0) {
116944cb6abcSbmc 		if (old_state != arc_anon) {
117044cb6abcSbmc 			int use_mutex = !MUTEX_HELD(&old_state->arcs_mtx);
11710e8c6158Smaybee 			uint64_t *size = &old_state->arcs_lsize[ab->b_type];
1172ea8dc4b6Seschrock 
1173ea8dc4b6Seschrock 			if (use_mutex)
117444cb6abcSbmc 				mutex_enter(&old_state->arcs_mtx);
1175fa9e4066Sahrens 
1176fa9e4066Sahrens 			ASSERT(list_link_active(&ab->b_arc_node));
11770e8c6158Smaybee 			list_remove(&old_state->arcs_list[ab->b_type], ab);
1178ea8dc4b6Seschrock 
117913506d1eSmaybee 			/*
118013506d1eSmaybee 			 * If prefetching out of the ghost cache,
11813f9d6ad7SLin Ling 			 * we will have a non-zero datacnt.
118213506d1eSmaybee 			 */
118313506d1eSmaybee 			if (GHOST_STATE(old_state) && ab->b_datacnt == 0) {
118413506d1eSmaybee 				/* ghost elements have a ghost size */
1185ea8dc4b6Seschrock 				ASSERT(ab->b_buf == NULL);
1186ea8dc4b6Seschrock 				from_delta = ab->b_size;
1187ea8dc4b6Seschrock 			}
11880e8c6158Smaybee 			ASSERT3U(*size, >=, from_delta);
11890e8c6158Smaybee 			atomic_add_64(size, -from_delta);
1190ea8dc4b6Seschrock 
1191ea8dc4b6Seschrock 			if (use_mutex)
119244cb6abcSbmc 				mutex_exit(&old_state->arcs_mtx);
1193fa9e4066Sahrens 		}
119444cb6abcSbmc 		if (new_state != arc_anon) {
119544cb6abcSbmc 			int use_mutex = !MUTEX_HELD(&new_state->arcs_mtx);
11960e8c6158Smaybee 			uint64_t *size = &new_state->arcs_lsize[ab->b_type];
1197fa9e4066Sahrens 
1198ea8dc4b6Seschrock 			if (use_mutex)
119944cb6abcSbmc 				mutex_enter(&new_state->arcs_mtx);
1200ea8dc4b6Seschrock 
12010e8c6158Smaybee 			list_insert_head(&new_state->arcs_list[ab->b_type], ab);
1202ea8dc4b6Seschrock 
1203ea8dc4b6Seschrock 			/* ghost elements have a ghost size */
1204ea8dc4b6Seschrock 			if (GHOST_STATE(new_state)) {
1205ea8dc4b6Seschrock 				ASSERT(ab->b_datacnt == 0);
1206ea8dc4b6Seschrock 				ASSERT(ab->b_buf == NULL);
1207ea8dc4b6Seschrock 				to_delta = ab->b_size;
1208ea8dc4b6Seschrock 			}
12090e8c6158Smaybee 			atomic_add_64(size, to_delta);
1210ea8dc4b6Seschrock 
1211ea8dc4b6Seschrock 			if (use_mutex)
121244cb6abcSbmc 				mutex_exit(&new_state->arcs_mtx);
1213fa9e4066Sahrens 		}
1214fa9e4066Sahrens 	}
1215fa9e4066Sahrens 
1216fa9e4066Sahrens 	ASSERT(!BUF_EMPTY(ab));
12173f9d6ad7SLin Ling 	if (new_state == arc_anon && HDR_IN_HASH_TABLE(ab))
1218fa9e4066Sahrens 		buf_hash_remove(ab);
1219fa9e4066Sahrens 
1220ea8dc4b6Seschrock 	/* adjust state sizes */
1221ea8dc4b6Seschrock 	if (to_delta)
122244cb6abcSbmc 		atomic_add_64(&new_state->arcs_size, to_delta);
1223ea8dc4b6Seschrock 	if (from_delta) {
122444cb6abcSbmc 		ASSERT3U(old_state->arcs_size, >=, from_delta);
122544cb6abcSbmc 		atomic_add_64(&old_state->arcs_size, -from_delta);
1226fa9e4066Sahrens 	}
1227fa9e4066Sahrens 	ab->b_state = new_state;
1228fa94a07fSbrendan 
1229fa94a07fSbrendan 	/* adjust l2arc hdr stats */
1230fa94a07fSbrendan 	if (new_state == arc_l2c_only)
1231fa94a07fSbrendan 		l2arc_hdr_stat_add();
1232fa94a07fSbrendan 	else if (old_state == arc_l2c_only)
1233fa94a07fSbrendan 		l2arc_hdr_stat_remove();
1234fa9e4066Sahrens }
1235fa9e4066Sahrens 
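/*
 * Illustrative sketch (hypothetical helper, not part of the ARC): how
 * arc_access()-style code would promote a buffer that is seen again
 * from the MRU to the MFU state.  The buffer's hash lock must already
 * be held, exactly as arc_change_state() asserts.
 */
static void
arc_promote_example(arc_buf_hdr_t *ab, kmutex_t *hash_lock)
{
	ASSERT(MUTEX_HELD(hash_lock));
	ASSERT(ab->b_state == arc_mru);

	ab->b_arc_access = ddi_get_lbolt();
	arc_change_state(arc_mfu, ab, hash_lock);
}
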
12360e8c6158Smaybee void
12375a98e54bSBrendan Gregg - Sun Microsystems arc_space_consume(uint64_t space, arc_space_type_t type)
12380e8c6158Smaybee {
12395a98e54bSBrendan Gregg - Sun Microsystems 	ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES);
12405a98e54bSBrendan Gregg - Sun Microsystems 
12415a98e54bSBrendan Gregg - Sun Microsystems 	switch (type) {
12425a98e54bSBrendan Gregg - Sun Microsystems 	case ARC_SPACE_DATA:
12435a98e54bSBrendan Gregg - Sun Microsystems 		ARCSTAT_INCR(arcstat_data_size, space);
12445a98e54bSBrendan Gregg - Sun Microsystems 		break;
12455a98e54bSBrendan Gregg - Sun Microsystems 	case ARC_SPACE_OTHER:
12465a98e54bSBrendan Gregg - Sun Microsystems 		ARCSTAT_INCR(arcstat_other_size, space);
12475a98e54bSBrendan Gregg - Sun Microsystems 		break;
12485a98e54bSBrendan Gregg - Sun Microsystems 	case ARC_SPACE_HDRS:
12495a98e54bSBrendan Gregg - Sun Microsystems 		ARCSTAT_INCR(arcstat_hdr_size, space);
12505a98e54bSBrendan Gregg - Sun Microsystems 		break;
12515a98e54bSBrendan Gregg - Sun Microsystems 	case ARC_SPACE_L2HDRS:
12525a98e54bSBrendan Gregg - Sun Microsystems 		ARCSTAT_INCR(arcstat_l2_hdr_size, space);
12535a98e54bSBrendan Gregg - Sun Microsystems 		break;
12545a98e54bSBrendan Gregg - Sun Microsystems 	}
12555a98e54bSBrendan Gregg - Sun Microsystems 
125620128a08SGeorge Wilson 	ARCSTAT_INCR(arcstat_meta_used, space);
12570e8c6158Smaybee 	atomic_add_64(&arc_size, space);
12580e8c6158Smaybee }
12590e8c6158Smaybee 
12600e8c6158Smaybee void
12615a98e54bSBrendan Gregg - Sun Microsystems arc_space_return(uint64_t space, arc_space_type_t type)
12620e8c6158Smaybee {
12635a98e54bSBrendan Gregg - Sun Microsystems 	ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES);
12645a98e54bSBrendan Gregg - Sun Microsystems 
12655a98e54bSBrendan Gregg - Sun Microsystems 	switch (type) {
12665a98e54bSBrendan Gregg - Sun Microsystems 	case ARC_SPACE_DATA:
12675a98e54bSBrendan Gregg - Sun Microsystems 		ARCSTAT_INCR(arcstat_data_size, -space);
12685a98e54bSBrendan Gregg - Sun Microsystems 		break;
12695a98e54bSBrendan Gregg - Sun Microsystems 	case ARC_SPACE_OTHER:
12705a98e54bSBrendan Gregg - Sun Microsystems 		ARCSTAT_INCR(arcstat_other_size, -space);
12715a98e54bSBrendan Gregg - Sun Microsystems 		break;
12725a98e54bSBrendan Gregg - Sun Microsystems 	case ARC_SPACE_HDRS:
12735a98e54bSBrendan Gregg - Sun Microsystems 		ARCSTAT_INCR(arcstat_hdr_size, -space);
12745a98e54bSBrendan Gregg - Sun Microsystems 		break;
12755a98e54bSBrendan Gregg - Sun Microsystems 	case ARC_SPACE_L2HDRS:
12765a98e54bSBrendan Gregg - Sun Microsystems 		ARCSTAT_INCR(arcstat_l2_hdr_size, -space);
12775a98e54bSBrendan Gregg - Sun Microsystems 		break;
12785a98e54bSBrendan Gregg - Sun Microsystems 	}
12795a98e54bSBrendan Gregg - Sun Microsystems 
12800e8c6158Smaybee 	ASSERT(arc_meta_used >= space);
12810e8c6158Smaybee 	if (arc_meta_max < arc_meta_used)
12820e8c6158Smaybee 		arc_meta_max = arc_meta_used;
128320128a08SGeorge Wilson 	ARCSTAT_INCR(arcstat_meta_used, -space);
12840e8c6158Smaybee 	ASSERT(arc_size >= space);
12850e8c6158Smaybee 	atomic_add_64(&arc_size, -space);
12860e8c6158Smaybee }
12870e8c6158Smaybee 
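/*
 * Illustrative sketch (hypothetical consumer): metadata users such as
 * the dbuf layer account for auxiliary allocations by pairing every
 * arc_space_consume() with an equal arc_space_return().  The 128-byte
 * allocation here is made up; the matched accounting is the point.
 */
static void *
arc_space_example_alloc(void)
{
	void *meta = kmem_alloc(128, KM_SLEEP);

	arc_space_consume(128, ARC_SPACE_OTHER);
	return (meta);
}

static void
arc_space_example_free(void *meta)
{
	arc_space_return(128, ARC_SPACE_OTHER);
	kmem_free(meta, 128);
}
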
12880e8c6158Smaybee void *
12890e8c6158Smaybee arc_data_buf_alloc(uint64_t size)
12900e8c6158Smaybee {
12910e8c6158Smaybee 	if (arc_evict_needed(ARC_BUFC_DATA))
12920e8c6158Smaybee 		cv_signal(&arc_reclaim_thr_cv);
12930e8c6158Smaybee 	atomic_add_64(&arc_size, size);
12940e8c6158Smaybee 	return (zio_data_buf_alloc(size));
12950e8c6158Smaybee }
12960e8c6158Smaybee 
12970e8c6158Smaybee void
12980e8c6158Smaybee arc_data_buf_free(void *buf, uint64_t size)
12990e8c6158Smaybee {
13000e8c6158Smaybee 	zio_data_buf_free(buf, size);
13010e8c6158Smaybee 	ASSERT(arc_size >= size);
13020e8c6158Smaybee 	atomic_add_64(&arc_size, -size);
13030e8c6158Smaybee }
13040e8c6158Smaybee 
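/*
 * Illustrative sketch: arc_data_buf_alloc() and arc_data_buf_free()
 * must be called in matched pairs with the same size so that arc_size
 * stays balanced.
 */
static void
arc_data_buf_example(uint64_t size)
{
	void *data = arc_data_buf_alloc(size);

	/* ... fill and consume the buffer ... */
	arc_data_buf_free(data, size);
}
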
1305fa9e4066Sahrens arc_buf_t *
1306ad23a2dbSjohansen arc_buf_alloc(spa_t *spa, int size, void *tag, arc_buf_contents_t type)
1307fa9e4066Sahrens {
1308fa9e4066Sahrens 	arc_buf_hdr_t *hdr;
1309fa9e4066Sahrens 	arc_buf_t *buf;
1310fa9e4066Sahrens 
1311fa9e4066Sahrens 	ASSERT3U(size, >, 0);
13121ab7f2deSmaybee 	hdr = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE);
1313fa9e4066Sahrens 	ASSERT(BUF_EMPTY(hdr));
1314fa9e4066Sahrens 	hdr->b_size = size;
1315ad23a2dbSjohansen 	hdr->b_type = type;
1316e9103aaeSGarrett D'Amore 	hdr->b_spa = spa_load_guid(spa);
131744cb6abcSbmc 	hdr->b_state = arc_anon;
1318fa9e4066Sahrens 	hdr->b_arc_access = 0;
13191ab7f2deSmaybee 	buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
1320fa9e4066Sahrens 	buf->b_hdr = hdr;
132144eda4d7Smaybee 	buf->b_data = NULL;
1322ea8dc4b6Seschrock 	buf->b_efunc = NULL;
1323ea8dc4b6Seschrock 	buf->b_private = NULL;
1324fa9e4066Sahrens 	buf->b_next = NULL;
1325fa9e4066Sahrens 	hdr->b_buf = buf;
132644eda4d7Smaybee 	arc_get_data_buf(buf);
1327ea8dc4b6Seschrock 	hdr->b_datacnt = 1;
1328fa9e4066Sahrens 	hdr->b_flags = 0;
1329fa9e4066Sahrens 	ASSERT(refcount_is_zero(&hdr->b_refcnt));
1330fa9e4066Sahrens 	(void) refcount_add(&hdr->b_refcnt, tag);
1331fa9e4066Sahrens 
1332fa9e4066Sahrens 	return (buf);
1333fa9e4066Sahrens }
1334fa9e4066Sahrens 
13352fdbea25SAleksandr Guzovskiy static char *arc_onloan_tag = "onloan";
13362fdbea25SAleksandr Guzovskiy 
13372fdbea25SAleksandr Guzovskiy /*
13382fdbea25SAleksandr Guzovskiy  * Loan out an anonymous arc buffer. Loaned buffers are not counted as
13392fdbea25SAleksandr Guzovskiy  * in-flight data by arc_tempreserve_space() until they are "returned". Loaned
13402fdbea25SAleksandr Guzovskiy  * buffers must be returned to the arc before they can be used by the DMU or
13412fdbea25SAleksandr Guzovskiy  * freed.
13422fdbea25SAleksandr Guzovskiy  */
13432fdbea25SAleksandr Guzovskiy arc_buf_t *
13442fdbea25SAleksandr Guzovskiy arc_loan_buf(spa_t *spa, int size)
13452fdbea25SAleksandr Guzovskiy {
13462fdbea25SAleksandr Guzovskiy 	arc_buf_t *buf;
13472fdbea25SAleksandr Guzovskiy 
13482fdbea25SAleksandr Guzovskiy 	buf = arc_buf_alloc(spa, size, arc_onloan_tag, ARC_BUFC_DATA);
13492fdbea25SAleksandr Guzovskiy 
13502fdbea25SAleksandr Guzovskiy 	atomic_add_64(&arc_loaned_bytes, size);
13512fdbea25SAleksandr Guzovskiy 	return (buf);
13522fdbea25SAleksandr Guzovskiy }
13532fdbea25SAleksandr Guzovskiy 
13542fdbea25SAleksandr Guzovskiy /*
13552fdbea25SAleksandr Guzovskiy  * Return a loaned arc buffer to the arc.
13562fdbea25SAleksandr Guzovskiy  */
13572fdbea25SAleksandr Guzovskiy void
13582fdbea25SAleksandr Guzovskiy arc_return_buf(arc_buf_t *buf, void *tag)
13592fdbea25SAleksandr Guzovskiy {
13602fdbea25SAleksandr Guzovskiy 	arc_buf_hdr_t *hdr = buf->b_hdr;
13612fdbea25SAleksandr Guzovskiy 
13622fdbea25SAleksandr Guzovskiy 	ASSERT(buf->b_data != NULL);
1363c242f9a0Schunli zhang - Sun Microsystems - Irvine United States 	(void) refcount_add(&hdr->b_refcnt, tag);
1364c242f9a0Schunli zhang - Sun Microsystems - Irvine United States 	(void) refcount_remove(&hdr->b_refcnt, arc_onloan_tag);
13652fdbea25SAleksandr Guzovskiy 
13662fdbea25SAleksandr Guzovskiy 	atomic_add_64(&arc_loaned_bytes, -hdr->b_size);
13672fdbea25SAleksandr Guzovskiy }
13682fdbea25SAleksandr Guzovskiy 
1369c242f9a0Schunli zhang - Sun Microsystems - Irvine United States /* Detach an arc_buf from a dbuf (tag) */
1370c242f9a0Schunli zhang - Sun Microsystems - Irvine United States void
1371c242f9a0Schunli zhang - Sun Microsystems - Irvine United States arc_loan_inuse_buf(arc_buf_t *buf, void *tag)
1372c242f9a0Schunli zhang - Sun Microsystems - Irvine United States {
1373c242f9a0Schunli zhang - Sun Microsystems - Irvine United States 	arc_buf_hdr_t *hdr;
1374c242f9a0Schunli zhang - Sun Microsystems - Irvine United States 
1375c242f9a0Schunli zhang - Sun Microsystems - Irvine United States 	ASSERT(buf->b_data != NULL);
1376c242f9a0Schunli zhang - Sun Microsystems - Irvine United States 	hdr = buf->b_hdr;
1377c242f9a0Schunli zhang - Sun Microsystems - Irvine United States 	(void) refcount_add(&hdr->b_refcnt, arc_onloan_tag);
1378c242f9a0Schunli zhang - Sun Microsystems - Irvine United States 	(void) refcount_remove(&hdr->b_refcnt, tag);
1379c242f9a0Schunli zhang - Sun Microsystems - Irvine United States 	buf->b_efunc = NULL;
1380c242f9a0Schunli zhang - Sun Microsystems - Irvine United States 	buf->b_private = NULL;
1381c242f9a0Schunli zhang - Sun Microsystems - Irvine United States 
1382c242f9a0Schunli zhang - Sun Microsystems - Irvine United States 	atomic_add_64(&arc_loaned_bytes, hdr->b_size);
1383c242f9a0Schunli zhang - Sun Microsystems - Irvine United States }
1384c242f9a0Schunli zhang - Sun Microsystems - Irvine United States 
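/*
 * Illustrative sketch (hypothetical caller): the loan protocol
 * described above.  'spa' and 'tag' are assumed to come from the
 * caller's context.
 */
static void
arc_loan_example(spa_t *spa, void *tag)
{
	/* Borrow an anonymous buffer; it is charged to arc_loaned_bytes. */
	arc_buf_t *buf = arc_loan_buf(spa, SPA_MINBLOCKSIZE);

	bzero(buf->b_data, SPA_MINBLOCKSIZE);

	/* Return the loan, transferring the hold to 'tag' ... */
	arc_return_buf(buf, tag);
	/* ... after which the buffer may be used by the DMU or freed. */
	arc_buf_free(buf, tag);
}
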
138544eda4d7Smaybee static arc_buf_t *
138644eda4d7Smaybee arc_buf_clone(arc_buf_t *from)
1387ea8dc4b6Seschrock {
138844eda4d7Smaybee 	arc_buf_t *buf;
138944eda4d7Smaybee 	arc_buf_hdr_t *hdr = from->b_hdr;
139044eda4d7Smaybee 	uint64_t size = hdr->b_size;
1391ea8dc4b6Seschrock 
1392b24ab676SJeff Bonwick 	ASSERT(hdr->b_state != arc_anon);
1393b24ab676SJeff Bonwick 
13941ab7f2deSmaybee 	buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
139544eda4d7Smaybee 	buf->b_hdr = hdr;
139644eda4d7Smaybee 	buf->b_data = NULL;
139744eda4d7Smaybee 	buf->b_efunc = NULL;
139844eda4d7Smaybee 	buf->b_private = NULL;
139944eda4d7Smaybee 	buf->b_next = hdr->b_buf;
140044eda4d7Smaybee 	hdr->b_buf = buf;
140144eda4d7Smaybee 	arc_get_data_buf(buf);
140244eda4d7Smaybee 	bcopy(from->b_data, buf->b_data, size);
14039253d63dSGeorge Wilson 
14049253d63dSGeorge Wilson 	/*
14059253d63dSGeorge Wilson 	 * This buffer already exists in the arc, so create a duplicate
14069253d63dSGeorge Wilson 	 * copy for the caller.  If the buffer is associated with user data
14079253d63dSGeorge Wilson 	 * then track the size and number of duplicates.  These stats will be
14089253d63dSGeorge Wilson 	 * updated as duplicate buffers are created and destroyed.
14099253d63dSGeorge Wilson 	 */
14109253d63dSGeorge Wilson 	if (hdr->b_type == ARC_BUFC_DATA) {
14119253d63dSGeorge Wilson 		ARCSTAT_BUMP(arcstat_duplicate_buffers);
14129253d63dSGeorge Wilson 		ARCSTAT_INCR(arcstat_duplicate_buffers_size, size);
14139253d63dSGeorge Wilson 	}
141444eda4d7Smaybee 	hdr->b_datacnt += 1;
141544eda4d7Smaybee 	return (buf);
1416ea8dc4b6Seschrock }
1417ea8dc4b6Seschrock 
1418ea8dc4b6Seschrock void
1419ea8dc4b6Seschrock arc_buf_add_ref(arc_buf_t *buf, void* tag)
1420ea8dc4b6Seschrock {
142140d7d650Smaybee 	arc_buf_hdr_t *hdr;
1422ea8dc4b6Seschrock 	kmutex_t *hash_lock;
1423ea8dc4b6Seschrock 
14249b23f181Smaybee 	/*
14256f83844dSMark Maybee 	 * Check to see if this buffer is evicted.  Callers
14266f83844dSMark Maybee 	 * must verify b_data != NULL to know if the add_ref
14276f83844dSMark Maybee 	 * was successful.
14289b23f181Smaybee 	 */
14293f9d6ad7SLin Ling 	mutex_enter(&buf->b_evict_lock);
14306f83844dSMark Maybee 	if (buf->b_data == NULL) {
14313f9d6ad7SLin Ling 		mutex_exit(&buf->b_evict_lock);
14329b23f181Smaybee 		return;
143340d7d650Smaybee 	}
14343f9d6ad7SLin Ling 	hash_lock = HDR_LOCK(buf->b_hdr);
14359b23f181Smaybee 	mutex_enter(hash_lock);
14363f9d6ad7SLin Ling 	hdr = buf->b_hdr;
14373f9d6ad7SLin Ling 	ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
14383f9d6ad7SLin Ling 	mutex_exit(&buf->b_evict_lock);
1439ea8dc4b6Seschrock 
144044cb6abcSbmc 	ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
1441ea8dc4b6Seschrock 	add_reference(hdr, hash_lock, tag);
14425a98e54bSBrendan Gregg - Sun Microsystems 	DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
144344eda4d7Smaybee 	arc_access(hdr, hash_lock);
144444eda4d7Smaybee 	mutex_exit(hash_lock);
144544cb6abcSbmc 	ARCSTAT_BUMP(arcstat_hits);
144644cb6abcSbmc 	ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
144744cb6abcSbmc 	    demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
144844cb6abcSbmc 	    data, metadata, hits);
1449ea8dc4b6Seschrock }
1450ea8dc4b6Seschrock 
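/*
 * Illustrative sketch (hypothetical wrapper, not part of the ARC):
 * arc_buf_add_ref() can race with eviction, so a caller learns whether
 * its hold succeeded by re-checking b_data afterwards.
 */
static boolean_t
arc_buf_try_hold(arc_buf_t *buf, void *tag)
{
	arc_buf_add_ref(buf, tag);

	/* A NULL b_data means the buffer had already been evicted. */
	return (buf->b_data != NULL);
}
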
1451fa94a07fSbrendan /*
1452fa94a07fSbrendan  * Free the arc data buffer.  If an l2arc write to this buffer is in
1453fa94a07fSbrendan  * progress, the buffer is placed on l2arc_free_on_write to be freed later.
1454fa94a07fSbrendan  */
1455fa94a07fSbrendan static void
1456cd1c8b85SMatthew Ahrens arc_buf_data_free(arc_buf_t *buf, void (*free_func)(void *, size_t))
1457fa94a07fSbrendan {
1458cd1c8b85SMatthew Ahrens 	arc_buf_hdr_t *hdr = buf->b_hdr;
1459cd1c8b85SMatthew Ahrens 
1460fa94a07fSbrendan 	if (HDR_L2_WRITING(hdr)) {
1461fa94a07fSbrendan 		l2arc_data_free_t *df;
1462fa94a07fSbrendan 		df = kmem_alloc(sizeof (l2arc_data_free_t), KM_SLEEP);
1463cd1c8b85SMatthew Ahrens 		df->l2df_data = buf->b_data;
1464cd1c8b85SMatthew Ahrens 		df->l2df_size = hdr->b_size;
1465fa94a07fSbrendan 		df->l2df_func = free_func;
1466fa94a07fSbrendan 		mutex_enter(&l2arc_free_on_write_mtx);
1467fa94a07fSbrendan 		list_insert_head(l2arc_free_on_write, df);
1468fa94a07fSbrendan 		mutex_exit(&l2arc_free_on_write_mtx);
1469fa94a07fSbrendan 		ARCSTAT_BUMP(arcstat_l2_free_on_write);
1470fa94a07fSbrendan 	} else {
1471cd1c8b85SMatthew Ahrens 		free_func(buf->b_data, hdr->b_size);
1472fa94a07fSbrendan 	}
1473fa94a07fSbrendan }
1474fa94a07fSbrendan 
1475ea8dc4b6Seschrock static void
147644eda4d7Smaybee arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t all)
1477ea8dc4b6Seschrock {
1478ea8dc4b6Seschrock 	arc_buf_t **bufp;
1479ea8dc4b6Seschrock 
1480ea8dc4b6Seschrock 	/* free up data associated with the buf */
1481ea8dc4b6Seschrock 	if (buf->b_data) {
1482ea8dc4b6Seschrock 		arc_state_t *state = buf->b_hdr->b_state;
1483ea8dc4b6Seschrock 		uint64_t size = buf->b_hdr->b_size;
1484ad23a2dbSjohansen 		arc_buf_contents_t type = buf->b_hdr->b_type;
1485ea8dc4b6Seschrock 
14866b4acc8bSahrens 		arc_cksum_verify(buf);
1487cd1c8b85SMatthew Ahrens 		arc_buf_unwatch(buf);
1488b24ab676SJeff Bonwick 
148944eda4d7Smaybee 		if (!recycle) {
1490ad23a2dbSjohansen 			if (type == ARC_BUFC_METADATA) {
1491cd1c8b85SMatthew Ahrens 				arc_buf_data_free(buf, zio_buf_free);
14925a98e54bSBrendan Gregg - Sun Microsystems 				arc_space_return(size, ARC_SPACE_DATA);
1493ad23a2dbSjohansen 			} else {
1494ad23a2dbSjohansen 				ASSERT(type == ARC_BUFC_DATA);
1495cd1c8b85SMatthew Ahrens 				arc_buf_data_free(buf, zio_data_buf_free);
14965a98e54bSBrendan Gregg - Sun Microsystems 				ARCSTAT_INCR(arcstat_data_size, -size);
14970e8c6158Smaybee 				atomic_add_64(&arc_size, -size);
1498ad23a2dbSjohansen 			}
149944eda4d7Smaybee 		}
1500ea8dc4b6Seschrock 		if (list_link_active(&buf->b_hdr->b_arc_node)) {
15010e8c6158Smaybee 			uint64_t *cnt = &state->arcs_lsize[type];
15020e8c6158Smaybee 
1503ea8dc4b6Seschrock 			ASSERT(refcount_is_zero(&buf->b_hdr->b_refcnt));
150444cb6abcSbmc 			ASSERT(state != arc_anon);
15050e8c6158Smaybee 
15060e8c6158Smaybee 			ASSERT3U(*cnt, >=, size);
15070e8c6158Smaybee 			atomic_add_64(cnt, -size);
1508ea8dc4b6Seschrock 		}
150944cb6abcSbmc 		ASSERT3U(state->arcs_size, >=, size);
151044cb6abcSbmc 		atomic_add_64(&state->arcs_size, -size);
1511ea8dc4b6Seschrock 		buf->b_data = NULL;
15129253d63dSGeorge Wilson 
15139253d63dSGeorge Wilson 		/*
15149253d63dSGeorge Wilson 		 * If we're destroying a duplicate buffer make sure
15159253d63dSGeorge Wilson 		 * that the appropriate statistics are updated.
15169253d63dSGeorge Wilson 		 */
15179253d63dSGeorge Wilson 		if (buf->b_hdr->b_datacnt > 1 &&
15189253d63dSGeorge Wilson 		    buf->b_hdr->b_type == ARC_BUFC_DATA) {
15199253d63dSGeorge Wilson 			ARCSTAT_BUMPDOWN(arcstat_duplicate_buffers);
15209253d63dSGeorge Wilson 			ARCSTAT_INCR(arcstat_duplicate_buffers_size, -size);
15219253d63dSGeorge Wilson 		}
1522ea8dc4b6Seschrock 		ASSERT(buf->b_hdr->b_datacnt > 0);
1523ea8dc4b6Seschrock 		buf->b_hdr->b_datacnt -= 1;
1524ea8dc4b6Seschrock 	}
1525ea8dc4b6Seschrock 
1526ea8dc4b6Seschrock 	/* only remove the buf if requested */
1527ea8dc4b6Seschrock 	if (!all)
1528ea8dc4b6Seschrock 		return;
1529ea8dc4b6Seschrock 
1530ea8dc4b6Seschrock 	/* remove the buf from the hdr list */
1531ea8dc4b6Seschrock 	for (bufp = &buf->b_hdr->b_buf; *bufp != buf; bufp = &(*bufp)->b_next)
1532ea8dc4b6Seschrock 		continue;
1533ea8dc4b6Seschrock 	*bufp = buf->b_next;
15343f9d6ad7SLin Ling 	buf->b_next = NULL;
1535ea8dc4b6Seschrock 
1536ea8dc4b6Seschrock 	ASSERT(buf->b_efunc == NULL);
1537ea8dc4b6Seschrock 
1538ea8dc4b6Seschrock 	/* clean up the buf */
1539ea8dc4b6Seschrock 	buf->b_hdr = NULL;
1540ea8dc4b6Seschrock 	kmem_cache_free(buf_cache, buf);
1541ea8dc4b6Seschrock }
1542ea8dc4b6Seschrock 
1543fa9e4066Sahrens static void
1544ea8dc4b6Seschrock arc_hdr_destroy(arc_buf_hdr_t *hdr)
1545fa9e4066Sahrens {
1546fa9e4066Sahrens 	ASSERT(refcount_is_zero(&hdr->b_refcnt));
154744cb6abcSbmc 	ASSERT3P(hdr->b_state, ==, arc_anon);
1548ea8dc4b6Seschrock 	ASSERT(!HDR_IO_IN_PROGRESS(hdr));
1549b24ab676SJeff Bonwick 	l2arc_buf_hdr_t *l2hdr = hdr->b_l2hdr;
1550fa9e4066Sahrens 
1551b24ab676SJeff Bonwick 	if (l2hdr != NULL) {
1552b24ab676SJeff Bonwick 		boolean_t buflist_held = MUTEX_HELD(&l2arc_buflist_mtx);
1553b24ab676SJeff Bonwick 		/*
1554b24ab676SJeff Bonwick 		 * To prevent arc_free() and l2arc_evict() from
1555b24ab676SJeff Bonwick 		 * attempting to free the same buffer at the same time,
1556b24ab676SJeff Bonwick 		 * a FREE_IN_PROGRESS flag is given to arc_free() to
1557b24ab676SJeff Bonwick 		 * give it priority.  l2arc_evict() can't destroy this
1558b24ab676SJeff Bonwick 		 * header while we are waiting on l2arc_buflist_mtx.
1559b24ab676SJeff Bonwick 		 *
1560b24ab676SJeff Bonwick 		 * The hdr may be removed from l2ad_buflist before we
1561b24ab676SJeff Bonwick 		 * grab l2arc_buflist_mtx, so b_l2hdr is rechecked.
1562b24ab676SJeff Bonwick 		 */
1563b24ab676SJeff Bonwick 		if (!buflist_held) {
1564fa94a07fSbrendan 			mutex_enter(&l2arc_buflist_mtx);
1565b24ab676SJeff Bonwick 			l2hdr = hdr->b_l2hdr;
1566fa94a07fSbrendan 		}
1567b24ab676SJeff Bonwick 
1568b24ab676SJeff Bonwick 		if (l2hdr != NULL) {
1569b24ab676SJeff Bonwick 			list_remove(l2hdr->b_dev->l2ad_buflist, hdr);
1570b24ab676SJeff Bonwick 			ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size);
1571*aad02571SSaso Kiselkov 			ARCSTAT_INCR(arcstat_l2_asize, -l2hdr->b_asize);
1572b24ab676SJeff Bonwick 			kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t));
1573b24ab676SJeff Bonwick 			if (hdr->b_state == arc_l2c_only)
1574b24ab676SJeff Bonwick 				l2arc_hdr_stat_remove();
1575b24ab676SJeff Bonwick 			hdr->b_l2hdr = NULL;
1576b24ab676SJeff Bonwick 		}
1577b24ab676SJeff Bonwick 
1578b24ab676SJeff Bonwick 		if (!buflist_held)
1579b24ab676SJeff Bonwick 			mutex_exit(&l2arc_buflist_mtx);
1580fa94a07fSbrendan 	}
1581fa94a07fSbrendan 
1582fa9e4066Sahrens 	if (!BUF_EMPTY(hdr)) {
1583ea8dc4b6Seschrock 		ASSERT(!HDR_IN_HASH_TABLE(hdr));
15843f9d6ad7SLin Ling 		buf_discard_identity(hdr);
1585fa9e4066Sahrens 	}
1586ea8dc4b6Seschrock 	while (hdr->b_buf) {
1587fa9e4066Sahrens 		arc_buf_t *buf = hdr->b_buf;
1588fa9e4066Sahrens 
1589ea8dc4b6Seschrock 		if (buf->b_efunc) {
1590ea8dc4b6Seschrock 			mutex_enter(&arc_eviction_mtx);
15913f9d6ad7SLin Ling 			mutex_enter(&buf->b_evict_lock);
1592ea8dc4b6Seschrock 			ASSERT(buf->b_hdr != NULL);
159344eda4d7Smaybee 			arc_buf_destroy(hdr->b_buf, FALSE, FALSE);
1594ea8dc4b6Seschrock 			hdr->b_buf = buf->b_next;
159540d7d650Smaybee 			buf->b_hdr = &arc_eviction_hdr;
1596ea8dc4b6Seschrock 			buf->b_next = arc_eviction_list;
1597ea8dc4b6Seschrock 			arc_eviction_list = buf;
15983f9d6ad7SLin Ling 			mutex_exit(&buf->b_evict_lock);
1599ea8dc4b6Seschrock 			mutex_exit(&arc_eviction_mtx);
1600ea8dc4b6Seschrock 		} else {
160144eda4d7Smaybee 			arc_buf_destroy(hdr->b_buf, FALSE, TRUE);
1602ea8dc4b6Seschrock 		}
1603fa9e4066Sahrens 	}
16046b4acc8bSahrens 	if (hdr->b_freeze_cksum != NULL) {
16056b4acc8bSahrens 		kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t));
16066b4acc8bSahrens 		hdr->b_freeze_cksum = NULL;
16076b4acc8bSahrens 	}
16083f9d6ad7SLin Ling 	if (hdr->b_thawed) {
16093f9d6ad7SLin Ling 		kmem_free(hdr->b_thawed, 1);
16103f9d6ad7SLin Ling 		hdr->b_thawed = NULL;
16113f9d6ad7SLin Ling 	}
1612ea8dc4b6Seschrock 
1613fa9e4066Sahrens 	ASSERT(!list_link_active(&hdr->b_arc_node));
1614fa9e4066Sahrens 	ASSERT3P(hdr->b_hash_next, ==, NULL);
1615fa9e4066Sahrens 	ASSERT3P(hdr->b_acb, ==, NULL);
1616fa9e4066Sahrens 	kmem_cache_free(hdr_cache, hdr);
1617fa9e4066Sahrens }
1618fa9e4066Sahrens 
1619fa9e4066Sahrens void
1620fa9e4066Sahrens arc_buf_free(arc_buf_t *buf, void *tag)
1621fa9e4066Sahrens {
1622fa9e4066Sahrens 	arc_buf_hdr_t *hdr = buf->b_hdr;
162344cb6abcSbmc 	int hashed = hdr->b_state != arc_anon;
1624fa9e4066Sahrens 
1625ea8dc4b6Seschrock 	ASSERT(buf->b_efunc == NULL);
1626ea8dc4b6Seschrock 	ASSERT(buf->b_data != NULL);
1627ea8dc4b6Seschrock 
1628ea8dc4b6Seschrock 	if (hashed) {
1629ea8dc4b6Seschrock 		kmutex_t *hash_lock = HDR_LOCK(hdr);
1630ea8dc4b6Seschrock 
1631ea8dc4b6Seschrock 		mutex_enter(hash_lock);
16323f9d6ad7SLin Ling 		hdr = buf->b_hdr;
16333f9d6ad7SLin Ling 		ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
16343f9d6ad7SLin Ling 
1635ea8dc4b6Seschrock 		(void) remove_reference(hdr, hash_lock, tag);
1636b24ab676SJeff Bonwick 		if (hdr->b_datacnt > 1) {
163744eda4d7Smaybee 			arc_buf_destroy(buf, FALSE, TRUE);
1638b24ab676SJeff Bonwick 		} else {
1639b24ab676SJeff Bonwick 			ASSERT(buf == hdr->b_buf);
1640b24ab676SJeff Bonwick 			ASSERT(buf->b_efunc == NULL);
1641ea8dc4b6Seschrock 			hdr->b_flags |= ARC_BUF_AVAILABLE;
1642b24ab676SJeff Bonwick 		}
1643fa9e4066Sahrens 		mutex_exit(hash_lock);
1644ea8dc4b6Seschrock 	} else if (HDR_IO_IN_PROGRESS(hdr)) {
1645ea8dc4b6Seschrock 		int destroy_hdr;
1646ea8dc4b6Seschrock 		/*
1647ea8dc4b6Seschrock 		 * We are in the middle of an async write.  Don't destroy
1648ea8dc4b6Seschrock 		 * this buffer unless the write completes before we finish
1649ea8dc4b6Seschrock 		 * decrementing the reference count.
1650ea8dc4b6Seschrock 		 */
1651ea8dc4b6Seschrock 		mutex_enter(&arc_eviction_mtx);
1652ea8dc4b6Seschrock 		(void) remove_reference(hdr, NULL, tag);
1653ea8dc4b6Seschrock 		ASSERT(refcount_is_zero(&hdr->b_refcnt));
1654ea8dc4b6Seschrock 		destroy_hdr = !HDR_IO_IN_PROGRESS(hdr);
1655ea8dc4b6Seschrock 		mutex_exit(&arc_eviction_mtx);
1656ea8dc4b6Seschrock 		if (destroy_hdr)
1657ea8dc4b6Seschrock 			arc_hdr_destroy(hdr);
1658ea8dc4b6Seschrock 	} else {
16593f9d6ad7SLin Ling 		if (remove_reference(hdr, NULL, tag) > 0)
166044eda4d7Smaybee 			arc_buf_destroy(buf, FALSE, TRUE);
16613f9d6ad7SLin Ling 		else
1662ea8dc4b6Seschrock 			arc_hdr_destroy(hdr);
1663fa9e4066Sahrens 	}
1664ea8dc4b6Seschrock }
1665fa9e4066Sahrens 
16663b2aab18SMatthew Ahrens boolean_t
1667ea8dc4b6Seschrock arc_buf_remove_ref(arc_buf_t *buf, void* tag)
1668ea8dc4b6Seschrock {
1669ea8dc4b6Seschrock 	arc_buf_hdr_t *hdr = buf->b_hdr;
1670ea8dc4b6Seschrock 	kmutex_t *hash_lock = HDR_LOCK(hdr);
16713b2aab18SMatthew Ahrens 	boolean_t no_callback = (buf->b_efunc == NULL);
1672fa9e4066Sahrens 
167344cb6abcSbmc 	if (hdr->b_state == arc_anon) {
1674b24ab676SJeff Bonwick 		ASSERT(hdr->b_datacnt == 1);
1675ea8dc4b6Seschrock 		arc_buf_free(buf, tag);
1676ea8dc4b6Seschrock 		return (no_callback);
1677ea8dc4b6Seschrock 	}
1678ea8dc4b6Seschrock 
1679ea8dc4b6Seschrock 	mutex_enter(hash_lock);
16803f9d6ad7SLin Ling 	hdr = buf->b_hdr;
16813f9d6ad7SLin Ling 	ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
168244cb6abcSbmc 	ASSERT(hdr->b_state != arc_anon);
1683ea8dc4b6Seschrock 	ASSERT(buf->b_data != NULL);
1684ea8dc4b6Seschrock 
1685ea8dc4b6Seschrock 	(void) remove_reference(hdr, hash_lock, tag);
1686ea8dc4b6Seschrock 	if (hdr->b_datacnt > 1) {
1687ea8dc4b6Seschrock 		if (no_callback)
168844eda4d7Smaybee 			arc_buf_destroy(buf, FALSE, TRUE);
1689ea8dc4b6Seschrock 	} else if (no_callback) {
1690ea8dc4b6Seschrock 		ASSERT(hdr->b_buf == buf && buf->b_next == NULL);
1691b24ab676SJeff Bonwick 		ASSERT(buf->b_efunc == NULL);
1692ea8dc4b6Seschrock 		hdr->b_flags |= ARC_BUF_AVAILABLE;
1693ea8dc4b6Seschrock 	}
1694ea8dc4b6Seschrock 	ASSERT(no_callback || hdr->b_datacnt > 1 ||
1695ea8dc4b6Seschrock 	    refcount_is_zero(&hdr->b_refcnt));
1696ea8dc4b6Seschrock 	mutex_exit(hash_lock);
1697ea8dc4b6Seschrock 	return (no_callback);
1698fa9e4066Sahrens }
1699fa9e4066Sahrens 
1700fa9e4066Sahrens int
1701fa9e4066Sahrens arc_buf_size(arc_buf_t *buf)
1702fa9e4066Sahrens {
1703fa9e4066Sahrens 	return (buf->b_hdr->b_size);
1704fa9e4066Sahrens }
1705fa9e4066Sahrens 
17069253d63dSGeorge Wilson /*
17079253d63dSGeorge Wilson  * Called from the DMU to determine if the current buffer should be
17089253d63dSGeorge Wilson  * evicted. In order to ensure proper locking, the eviction must be initiated
17099253d63dSGeorge Wilson  * from the DMU. Return true if the buffer is associated with user data and
17109253d63dSGeorge Wilson  * duplicate buffers still exist.
17119253d63dSGeorge Wilson  */
17129253d63dSGeorge Wilson boolean_t
17139253d63dSGeorge Wilson arc_buf_eviction_needed(arc_buf_t *buf)
17149253d63dSGeorge Wilson {
17159253d63dSGeorge Wilson 	arc_buf_hdr_t *hdr;
17169253d63dSGeorge Wilson 	boolean_t evict_needed = B_FALSE;
17179253d63dSGeorge Wilson 
17189253d63dSGeorge Wilson 	if (zfs_disable_dup_eviction)
17199253d63dSGeorge Wilson 		return (B_FALSE);
17209253d63dSGeorge Wilson 
17219253d63dSGeorge Wilson 	mutex_enter(&buf->b_evict_lock);
17229253d63dSGeorge Wilson 	hdr = buf->b_hdr;
17239253d63dSGeorge Wilson 	if (hdr == NULL) {
17249253d63dSGeorge Wilson 		/*
17259253d63dSGeorge Wilson 		 * We are in arc_do_user_evicts(); let that function
17269253d63dSGeorge Wilson 		 * perform the eviction.
17279253d63dSGeorge Wilson 		 */
17289253d63dSGeorge Wilson 		ASSERT(buf->b_data == NULL);
17299253d63dSGeorge Wilson 		mutex_exit(&buf->b_evict_lock);
17309253d63dSGeorge Wilson 		return (B_FALSE);
17319253d63dSGeorge Wilson 	} else if (buf->b_data == NULL) {
17329253d63dSGeorge Wilson 		/*
17339253d63dSGeorge Wilson 		 * We have already been added to the arc eviction list;
17349253d63dSGeorge Wilson 		 * recommend eviction.
17359253d63dSGeorge Wilson 		 */
17369253d63dSGeorge Wilson 		ASSERT3P(hdr, ==, &arc_eviction_hdr);
17379253d63dSGeorge Wilson 		mutex_exit(&buf->b_evict_lock);
17389253d63dSGeorge Wilson 		return (B_TRUE);
17399253d63dSGeorge Wilson 	}
17409253d63dSGeorge Wilson 
17419253d63dSGeorge Wilson 	if (hdr->b_datacnt > 1 && hdr->b_type == ARC_BUFC_DATA)
17429253d63dSGeorge Wilson 		evict_needed = B_TRUE;
17439253d63dSGeorge Wilson 
17449253d63dSGeorge Wilson 	mutex_exit(&buf->b_evict_lock);
17459253d63dSGeorge Wilson 	return (evict_needed);
17469253d63dSGeorge Wilson }
17479253d63dSGeorge Wilson 
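/*
 * Illustrative sketch (hypothetical caller, modeled loosely on the
 * dbuf layer): when dropping the last hold on a buffer, the DMU asks
 * whether eviction is needed and, if so, starts it itself to preserve
 * the locking order.  'dbuf_evict_func' stands in for the real dbuf
 * eviction path and is not an actual ARC or DMU symbol.
 */
static void
arc_buf_rele_example(arc_buf_t *buf, void *tag,
    void (*dbuf_evict_func)(arc_buf_t *))
{
	if (arc_buf_eviction_needed(buf))
		dbuf_evict_func(buf);		/* DMU-initiated eviction */
	else
		(void) arc_buf_remove_ref(buf, tag);
}
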
1748fa9e4066Sahrens /*
1749fa9e4066Sahrens  * Evict buffers from the list until we've removed the specified number of
1750fa9e4066Sahrens  * bytes.  Move the removed buffers to the appropriate evict state.
175144eda4d7Smaybee  * If the recycle flag is set, then attempt to "recycle" a buffer:
175244eda4d7Smaybee  * - look for a buffer to evict that is `bytes' long.
175344eda4d7Smaybee  * - return the data block from this buffer rather than freeing it.
175444eda4d7Smaybee  * This flag is used by callers that are trying to make space for a
175544eda4d7Smaybee  * new buffer in a full arc cache.
1756874395d5Smaybee  *
1757874395d5Smaybee  * This function makes a "best effort".  It skips over any buffers
1758874395d5Smaybee  * it can't get a hash_lock on, and so may not catch all candidates.
1759874395d5Smaybee  * It may also return without evicting as much space as requested.
1760fa9e4066Sahrens  */
176144eda4d7Smaybee static void *
1762ac05c741SMark Maybee arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle,
1763ad23a2dbSjohansen     arc_buf_contents_t type)
1764fa9e4066Sahrens {
1765fa9e4066Sahrens 	arc_state_t *evicted_state;
176644eda4d7Smaybee 	uint64_t bytes_evicted = 0, skipped = 0, missed = 0;
17673fa51506Smaybee 	arc_buf_hdr_t *ab, *ab_prev = NULL;
17680e8c6158Smaybee 	list_t *list = &state->arcs_list[type];
1769fa9e4066Sahrens 	kmutex_t *hash_lock;
177044eda4d7Smaybee 	boolean_t have_lock;
17713fa51506Smaybee 	void *stolen = NULL;
1772fa9e4066Sahrens 
177344cb6abcSbmc 	ASSERT(state == arc_mru || state == arc_mfu);
1774fa9e4066Sahrens 
177544cb6abcSbmc 	evicted_state = (state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost;
1776fa9e4066Sahrens 
177744cb6abcSbmc 	mutex_enter(&state->arcs_mtx);
177844cb6abcSbmc 	mutex_enter(&evicted_state->arcs_mtx);
1779fa9e4066Sahrens 
17800e8c6158Smaybee 	for (ab = list_tail(list); ab; ab = ab_prev) {
17810e8c6158Smaybee 		ab_prev = list_prev(list, ab);
178213506d1eSmaybee 		/* prefetch buffers have a minimum lifespan */
178344eda4d7Smaybee 		if (HDR_IO_IN_PROGRESS(ab) ||
1784874395d5Smaybee 		    (spa && ab->b_spa != spa) ||
178544eda4d7Smaybee 		    (ab->b_flags & (ARC_PREFETCH|ARC_INDIRECT) &&
1786d3d50737SRafael Vanoni 		    ddi_get_lbolt() - ab->b_arc_access <
1787d3d50737SRafael Vanoni 		    arc_min_prefetch_lifespan)) {
178813506d1eSmaybee 			skipped++;
178913506d1eSmaybee 			continue;
179013506d1eSmaybee 		}
17913fa51506Smaybee 		/* "lookahead" for better eviction candidate */
17923fa51506Smaybee 		if (recycle && ab->b_size != bytes &&
17933fa51506Smaybee 		    ab_prev && ab_prev->b_size == bytes)
179444eda4d7Smaybee 			continue;
1795fa9e4066Sahrens 		hash_lock = HDR_LOCK(ab);
179644eda4d7Smaybee 		have_lock = MUTEX_HELD(hash_lock);
179744eda4d7Smaybee 		if (have_lock || mutex_tryenter(hash_lock)) {
1798fb09f5aaSMadhav Suresh 			ASSERT0(refcount_count(&ab->b_refcnt));
1799ea8dc4b6Seschrock 			ASSERT(ab->b_datacnt > 0);
1800ea8dc4b6Seschrock 			while (ab->b_buf) {
1801ea8dc4b6Seschrock 				arc_buf_t *buf = ab->b_buf;
18023f9d6ad7SLin Ling 				if (!mutex_tryenter(&buf->b_evict_lock)) {
18036f83844dSMark Maybee 					missed += 1;
18046f83844dSMark Maybee 					break;
18056f83844dSMark Maybee 				}
180644eda4d7Smaybee 				if (buf->b_data) {
1807ea8dc4b6Seschrock 					bytes_evicted += ab->b_size;
1808ad23a2dbSjohansen 					if (recycle && ab->b_type == type &&
1809fa94a07fSbrendan 					    ab->b_size == bytes &&
1810fa94a07fSbrendan 					    !HDR_L2_WRITING(ab)) {
18113fa51506Smaybee 						stolen = buf->b_data;
18123fa51506Smaybee 						recycle = FALSE;
18133fa51506Smaybee 					}
181444eda4d7Smaybee 				}
1815ea8dc4b6Seschrock 				if (buf->b_efunc) {
1816ea8dc4b6Seschrock 					mutex_enter(&arc_eviction_mtx);
18173fa51506Smaybee 					arc_buf_destroy(buf,
18183fa51506Smaybee 					    buf->b_data == stolen, FALSE);
1819ea8dc4b6Seschrock 					ab->b_buf = buf->b_next;
182040d7d650Smaybee 					buf->b_hdr = &arc_eviction_hdr;
1821ea8dc4b6Seschrock 					buf->b_next = arc_eviction_list;
1822ea8dc4b6Seschrock 					arc_eviction_list = buf;
1823ea8dc4b6Seschrock 					mutex_exit(&arc_eviction_mtx);
18243f9d6ad7SLin Ling 					mutex_exit(&buf->b_evict_lock);
1825ea8dc4b6Seschrock 				} else {
18263f9d6ad7SLin Ling 					mutex_exit(&buf->b_evict_lock);
18273fa51506Smaybee 					arc_buf_destroy(buf,
18283fa51506Smaybee 					    buf->b_data == stolen, TRUE);
1829ea8dc4b6Seschrock 				}
1830ea8dc4b6Seschrock 			}
18315ea40c06SBrendan Gregg - Sun Microsystems 
18325ea40c06SBrendan Gregg - Sun Microsystems 			if (ab->b_l2hdr) {
18335ea40c06SBrendan Gregg - Sun Microsystems 				ARCSTAT_INCR(arcstat_evict_l2_cached,
18345ea40c06SBrendan Gregg - Sun Microsystems 				    ab->b_size);
18355ea40c06SBrendan Gregg - Sun Microsystems 			} else {
18365ea40c06SBrendan Gregg - Sun Microsystems 				if (l2arc_write_eligible(ab->b_spa, ab)) {
18375ea40c06SBrendan Gregg - Sun Microsystems 					ARCSTAT_INCR(arcstat_evict_l2_eligible,
18385ea40c06SBrendan Gregg - Sun Microsystems 					    ab->b_size);
18395ea40c06SBrendan Gregg - Sun Microsystems 				} else {
18405ea40c06SBrendan Gregg - Sun Microsystems 					ARCSTAT_INCR(
18415ea40c06SBrendan Gregg - Sun Microsystems 					    arcstat_evict_l2_ineligible,
18425ea40c06SBrendan Gregg - Sun Microsystems 					    ab->b_size);
18435ea40c06SBrendan Gregg - Sun Microsystems 				}
18445ea40c06SBrendan Gregg - Sun Microsystems 			}
18455ea40c06SBrendan Gregg - Sun Microsystems 
18466f83844dSMark Maybee 			if (ab->b_datacnt == 0) {
18476f83844dSMark Maybee 				arc_change_state(evicted_state, ab, hash_lock);
18486f83844dSMark Maybee 				ASSERT(HDR_IN_HASH_TABLE(ab));
18496f83844dSMark Maybee 				ab->b_flags |= ARC_IN_HASH_TABLE;
18506f83844dSMark Maybee 				ab->b_flags &= ~ARC_BUF_AVAILABLE;
18516f83844dSMark Maybee 				DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, ab);
18526f83844dSMark Maybee 			}
185344eda4d7Smaybee 			if (!have_lock)
185444eda4d7Smaybee 				mutex_exit(hash_lock);
1855ea8dc4b6Seschrock 			if (bytes >= 0 && bytes_evicted >= bytes)
1856fa9e4066Sahrens 				break;
1857fa9e4066Sahrens 		} else {
185844eda4d7Smaybee 			missed += 1;
1859fa9e4066Sahrens 		}
1860fa9e4066Sahrens 	}
186144cb6abcSbmc 
186244cb6abcSbmc 	mutex_exit(&evicted_state->arcs_mtx);
186344cb6abcSbmc 	mutex_exit(&state->arcs_mtx);
1864fa9e4066Sahrens 
1865fa9e4066Sahrens 	if (bytes_evicted < bytes)
1866fa9e4066Sahrens 		dprintf("only evicted %lld bytes from %x",
1867fa9e4066Sahrens 		    (longlong_t)bytes_evicted, state);
1868fa9e4066Sahrens 
186944eda4d7Smaybee 	if (skipped)
187044cb6abcSbmc 		ARCSTAT_INCR(arcstat_evict_skip, skipped);
187144cb6abcSbmc 
187244eda4d7Smaybee 	if (missed)
187344cb6abcSbmc 		ARCSTAT_INCR(arcstat_mutex_miss, missed);
1874f4d2e9e6Smaybee 
1875f4d2e9e6Smaybee 	/*
18763b2aab18SMatthew Ahrens 	 * We have just evicted some data into the ghost state; make
1877f4d2e9e6Smaybee 	 * sure we also adjust the ghost state size if necessary.
1878f4d2e9e6Smaybee 	 */
1879f4d2e9e6Smaybee 	if (arc_no_grow &&
1880f4d2e9e6Smaybee 	    arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size > arc_c) {
1881f4d2e9e6Smaybee 		int64_t mru_over = arc_anon->arcs_size + arc_mru->arcs_size +
1882f4d2e9e6Smaybee 		    arc_mru_ghost->arcs_size - arc_c;
1883f4d2e9e6Smaybee 
1884f4d2e9e6Smaybee 		if (mru_over > 0 && arc_mru_ghost->arcs_lsize[type] > 0) {
1885f4d2e9e6Smaybee 			int64_t todelete =
1886f4d2e9e6Smaybee 			    MIN(arc_mru_ghost->arcs_lsize[type], mru_over);
1887874395d5Smaybee 			arc_evict_ghost(arc_mru_ghost, NULL, todelete);
1888f4d2e9e6Smaybee 		} else if (arc_mfu_ghost->arcs_lsize[type] > 0) {
1889f4d2e9e6Smaybee 			int64_t todelete = MIN(arc_mfu_ghost->arcs_lsize[type],
1890f4d2e9e6Smaybee 			    arc_mru_ghost->arcs_size +
1891f4d2e9e6Smaybee 			    arc_mfu_ghost->arcs_size - arc_c);
1892874395d5Smaybee 			arc_evict_ghost(arc_mfu_ghost, NULL, todelete);
1893f4d2e9e6Smaybee 		}
1894f4d2e9e6Smaybee 	}
189544cb6abcSbmc 
18963fa51506Smaybee 	return (stolen);
1897fa9e4066Sahrens }
1898fa9e4066Sahrens 
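/*
 * Illustrative sketch of the "recycle" path described above, modeled
 * on arc_get_data_buf(): a caller that needs a size-byte data block
 * first tries to steal one of exactly that size from the MFU list and
 * falls back to a fresh allocation.  Passing 0 for the spa guid means
 * "any pool".
 */
static void *
arc_recycle_example(uint64_t size)
{
	void *data = arc_evict(arc_mfu, 0, size, TRUE, ARC_BUFC_DATA);

	if (data == NULL)
		data = zio_data_buf_alloc(size);
	return (data);
}
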
1899fa9e4066Sahrens /*
1900fa9e4066Sahrens  * Remove buffers from the list until we've removed the specified number of
1901fa9e4066Sahrens  * bytes.  Destroy the buffers that are removed.
1902fa9e4066Sahrens  */
1903fa9e4066Sahrens static void
1904ac05c741SMark Maybee arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes)
1905fa9e4066Sahrens {
1906fa9e4066Sahrens 	arc_buf_hdr_t *ab, *ab_prev;
1907b802aa8cSSanjeev Bagewadi 	arc_buf_hdr_t marker = { 0 };
19080e8c6158Smaybee 	list_t *list = &state->arcs_list[ARC_BUFC_DATA];
1909fa9e4066Sahrens 	kmutex_t *hash_lock;
1910ea8dc4b6Seschrock 	uint64_t bytes_deleted = 0;
1911c0a81264Sek 	uint64_t bufs_skipped = 0;
1912fa9e4066Sahrens 
1913ea8dc4b6Seschrock 	ASSERT(GHOST_STATE(state));
1914fa9e4066Sahrens top:
191544cb6abcSbmc 	mutex_enter(&state->arcs_mtx);
19160e8c6158Smaybee 	for (ab = list_tail(list); ab; ab = ab_prev) {
19170e8c6158Smaybee 		ab_prev = list_prev(list, ab);
1918874395d5Smaybee 		if (spa && ab->b_spa != spa)
1919874395d5Smaybee 			continue;
1920b802aa8cSSanjeev Bagewadi 
1921b802aa8cSSanjeev Bagewadi 		/* ignore markers */
1922b802aa8cSSanjeev Bagewadi 		if (ab->b_spa == 0)
1923b802aa8cSSanjeev Bagewadi 			continue;
1924b802aa8cSSanjeev Bagewadi 
1925fa9e4066Sahrens 		hash_lock = HDR_LOCK(ab);
19267e453561SWilliam Gorrell 		/* caller may be trying to modify this buffer, skip it */
19277e453561SWilliam Gorrell 		if (MUTEX_HELD(hash_lock))
19287e453561SWilliam Gorrell 			continue;
19297e453561SWilliam Gorrell 		if (mutex_tryenter(hash_lock)) {
193013506d1eSmaybee 			ASSERT(!HDR_IO_IN_PROGRESS(ab));
1931ea8dc4b6Seschrock 			ASSERT(ab->b_buf == NULL);
193244cb6abcSbmc 			ARCSTAT_BUMP(arcstat_deleted);
1933fa9e4066Sahrens 			bytes_deleted += ab->b_size;
1934fa94a07fSbrendan 
1935fa94a07fSbrendan 			if (ab->b_l2hdr != NULL) {
1936fa94a07fSbrendan 				/*
1937fa94a07fSbrendan 				 * This buffer is cached on the 2nd Level ARC;
1938fa94a07fSbrendan 				 * don't destroy the header.
1939fa94a07fSbrendan 				 */
1940fa94a07fSbrendan 				arc_change_state(arc_l2c_only, ab, hash_lock);
19417e453561SWilliam Gorrell 				mutex_exit(hash_lock);
1942fa94a07fSbrendan 			} else {
1943fa94a07fSbrendan 				arc_change_state(arc_anon, ab, hash_lock);
19447e453561SWilliam Gorrell 				mutex_exit(hash_lock);
1945fa94a07fSbrendan 				arc_hdr_destroy(ab);
1946fa94a07fSbrendan 			}
1947fa94a07fSbrendan 
1948ea8dc4b6Seschrock 			DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, ab);
1949fa9e4066Sahrens 			if (bytes >= 0 && bytes_deleted >= bytes)
1950fa9e4066Sahrens 				break;
1951b802aa8cSSanjeev Bagewadi 		} else if (bytes < 0) {
1952b802aa8cSSanjeev Bagewadi 			/*
1953b802aa8cSSanjeev Bagewadi 			 * Insert a list marker and then wait for the
1954b802aa8cSSanjeev Bagewadi 			 * hash lock to become available. Once it's
1955b802aa8cSSanjeev Bagewadi 			 * available, restart from where we left off.
1956b802aa8cSSanjeev Bagewadi 			 */
1957b802aa8cSSanjeev Bagewadi 			list_insert_after(list, ab, &marker);
1958b802aa8cSSanjeev Bagewadi 			mutex_exit(&state->arcs_mtx);
1959b802aa8cSSanjeev Bagewadi 			mutex_enter(hash_lock);
1960b802aa8cSSanjeev Bagewadi 			mutex_exit(hash_lock);
1961b802aa8cSSanjeev Bagewadi 			mutex_enter(&state->arcs_mtx);
1962b802aa8cSSanjeev Bagewadi 			ab_prev = list_prev(list, &marker);
1963b802aa8cSSanjeev Bagewadi 			list_remove(list, &marker);
1964b802aa8cSSanjeev Bagewadi 		} else
1965fa9e4066Sahrens 			bufs_skipped += 1;
1966fa9e4066Sahrens 	}
196744cb6abcSbmc 	mutex_exit(&state->arcs_mtx);
1968fa9e4066Sahrens 
19690e8c6158Smaybee 	if (list == &state->arcs_list[ARC_BUFC_DATA] &&
19700e8c6158Smaybee 	    (bytes < 0 || bytes_deleted < bytes)) {
19710e8c6158Smaybee 		list = &state->arcs_list[ARC_BUFC_METADATA];
19720e8c6158Smaybee 		goto top;
19730e8c6158Smaybee 	}
19740e8c6158Smaybee 
1975fa9e4066Sahrens 	if (bufs_skipped) {
197644cb6abcSbmc 		ARCSTAT_INCR(arcstat_mutex_miss, bufs_skipped);
1977fa9e4066Sahrens 		ASSERT(bytes >= 0);
1978fa9e4066Sahrens 	}
1979fa9e4066Sahrens 
1980fa9e4066Sahrens 	if (bytes_deleted < bytes)
1981fa9e4066Sahrens 		dprintf("only deleted %lld bytes from %p",
1982fa9e4066Sahrens 		    (longlong_t)bytes_deleted, state);
1983fa9e4066Sahrens }
1984fa9e4066Sahrens 
1985fa9e4066Sahrens static void
1986fa9e4066Sahrens arc_adjust(void)
1987fa9e4066Sahrens {
19885a98e54bSBrendan Gregg - Sun Microsystems 	int64_t adjustment, delta;
1989fa9e4066Sahrens 
19905a98e54bSBrendan Gregg - Sun Microsystems 	/*
19915a98e54bSBrendan Gregg - Sun Microsystems 	 * Adjust MRU size
19925a98e54bSBrendan Gregg - Sun Microsystems 	 */
19935a98e54bSBrendan Gregg - Sun Microsystems 
19943e4e8481STom Erickson 	adjustment = MIN((int64_t)(arc_size - arc_c),
19953e4e8481STom Erickson 	    (int64_t)(arc_anon->arcs_size + arc_mru->arcs_size + arc_meta_used -
19963e4e8481STom Erickson 	    arc_p));
1997fa9e4066Sahrens 
19985a98e54bSBrendan Gregg - Sun Microsystems 	if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_DATA] > 0) {
19995a98e54bSBrendan Gregg - Sun Microsystems 		delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_DATA], adjustment);
20005a98e54bSBrendan Gregg - Sun Microsystems 		(void) arc_evict(arc_mru, NULL, delta, FALSE, ARC_BUFC_DATA);
20015a98e54bSBrendan Gregg - Sun Microsystems 		adjustment -= delta;
20020e8c6158Smaybee 	}
20030e8c6158Smaybee 
20045a98e54bSBrendan Gregg - Sun Microsystems 	if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_METADATA] > 0) {
20055a98e54bSBrendan Gregg - Sun Microsystems 		delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_METADATA], adjustment);
20065a98e54bSBrendan Gregg - Sun Microsystems 		(void) arc_evict(arc_mru, NULL, delta, FALSE,
2007874395d5Smaybee 		    ARC_BUFC_METADATA);
2008fa9e4066Sahrens 	}
2009fa9e4066Sahrens 
20105a98e54bSBrendan Gregg - Sun Microsystems 	/*
20115a98e54bSBrendan Gregg - Sun Microsystems 	 * Adjust MFU size
20125a98e54bSBrendan Gregg - Sun Microsystems 	 */
2013fa9e4066Sahrens 
20145a98e54bSBrendan Gregg - Sun Microsystems 	adjustment = arc_size - arc_c;
20155a98e54bSBrendan Gregg - Sun Microsystems 
20165a98e54bSBrendan Gregg - Sun Microsystems 	if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_DATA] > 0) {
20175a98e54bSBrendan Gregg - Sun Microsystems 		delta = MIN(adjustment, arc_mfu->arcs_lsize[ARC_BUFC_DATA]);
20185a98e54bSBrendan Gregg - Sun Microsystems 		(void) arc_evict(arc_mfu, NULL, delta, FALSE, ARC_BUFC_DATA);
20195a98e54bSBrendan Gregg - Sun Microsystems 		adjustment -= delta;
2020fa9e4066Sahrens 	}
2021fa9e4066Sahrens 
20225a98e54bSBrendan Gregg - Sun Microsystems 	if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_METADATA] > 0) {
20235a98e54bSBrendan Gregg - Sun Microsystems 		int64_t delta = MIN(adjustment,
20245a98e54bSBrendan Gregg - Sun Microsystems 		    arc_mfu->arcs_lsize[ARC_BUFC_METADATA]);
20255a98e54bSBrendan Gregg - Sun Microsystems 		(void) arc_evict(arc_mfu, NULL, delta, FALSE,
20265a98e54bSBrendan Gregg - Sun Microsystems 		    ARC_BUFC_METADATA);
20275a98e54bSBrendan Gregg - Sun Microsystems 	}
2028fa9e4066Sahrens 
20295a98e54bSBrendan Gregg - Sun Microsystems 	/*
20305a98e54bSBrendan Gregg - Sun Microsystems 	 * Adjust ghost lists
20315a98e54bSBrendan Gregg - Sun Microsystems 	 */
2032fa9e4066Sahrens 
20335a98e54bSBrendan Gregg - Sun Microsystems 	adjustment = arc_mru->arcs_size + arc_mru_ghost->arcs_size - arc_c;
2034fa9e4066Sahrens 
20355a98e54bSBrendan Gregg - Sun Microsystems 	if (adjustment > 0 && arc_mru_ghost->arcs_size > 0) {
20365a98e54bSBrendan Gregg - Sun Microsystems 		delta = MIN(arc_mru_ghost->arcs_size, adjustment);
20375a98e54bSBrendan Gregg - Sun Microsystems 		arc_evict_ghost(arc_mru_ghost, NULL, delta);
20385a98e54bSBrendan Gregg - Sun Microsystems 	}
20390e8c6158Smaybee 
20405a98e54bSBrendan Gregg - Sun Microsystems 	adjustment =
20415a98e54bSBrendan Gregg - Sun Microsystems 	    arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size - arc_c;
20425a98e54bSBrendan Gregg - Sun Microsystems 
20435a98e54bSBrendan Gregg - Sun Microsystems 	if (adjustment > 0 && arc_mfu_ghost->arcs_size > 0) {
20445a98e54bSBrendan Gregg - Sun Microsystems 		delta = MIN(arc_mfu_ghost->arcs_size, adjustment);
20455a98e54bSBrendan Gregg - Sun Microsystems 		arc_evict_ghost(arc_mfu_ghost, NULL, delta);
2046fa9e4066Sahrens 	}
2047fa9e4066Sahrens }
2048fa9e4066Sahrens 
2049ea8dc4b6Seschrock static void
2050ea8dc4b6Seschrock arc_do_user_evicts(void)
2051ea8dc4b6Seschrock {
2052ea8dc4b6Seschrock 	mutex_enter(&arc_eviction_mtx);
2053ea8dc4b6Seschrock 	while (arc_eviction_list != NULL) {
2054ea8dc4b6Seschrock 		arc_buf_t *buf = arc_eviction_list;
2055ea8dc4b6Seschrock 		arc_eviction_list = buf->b_next;
20563f9d6ad7SLin Ling 		mutex_enter(&buf->b_evict_lock);
2057ea8dc4b6Seschrock 		buf->b_hdr = NULL;
20583f9d6ad7SLin Ling 		mutex_exit(&buf->b_evict_lock);
2059ea8dc4b6Seschrock 		mutex_exit(&arc_eviction_mtx);
2060ea8dc4b6Seschrock 
2061dd6ef538Smaybee 		if (buf->b_efunc != NULL)
2062dd6ef538Smaybee 			VERIFY(buf->b_efunc(buf) == 0);
2063ea8dc4b6Seschrock 
2064ea8dc4b6Seschrock 		buf->b_efunc = NULL;
2065ea8dc4b6Seschrock 		buf->b_private = NULL;
2066ea8dc4b6Seschrock 		kmem_cache_free(buf_cache, buf);
2067ea8dc4b6Seschrock 		mutex_enter(&arc_eviction_mtx);
2068ea8dc4b6Seschrock 	}
2069ea8dc4b6Seschrock 	mutex_exit(&arc_eviction_mtx);
2070ea8dc4b6Seschrock }
2071ea8dc4b6Seschrock 
2072fa9e4066Sahrens /*
2073874395d5Smaybee  * Flush all *evictable* data from the cache for the given spa.
2074fa9e4066Sahrens  * NOTE: this will not touch "active" (i.e. referenced) data.
2075fa9e4066Sahrens  */
2076fa9e4066Sahrens void
2077874395d5Smaybee arc_flush(spa_t *spa)
2078fa9e4066Sahrens {
2079ac05c741SMark Maybee 	uint64_t guid = 0;
2080ac05c741SMark Maybee 
2081ac05c741SMark Maybee 	if (spa)
2082e9103aaeSGarrett D'Amore 		guid = spa_load_guid(spa);
2083ac05c741SMark Maybee 
2084874395d5Smaybee 	while (list_head(&arc_mru->arcs_list[ARC_BUFC_DATA])) {
2085ac05c741SMark Maybee 		(void) arc_evict(arc_mru, guid, -1, FALSE, ARC_BUFC_DATA);
2086874395d5Smaybee 		if (spa)
2087874395d5Smaybee 			break;
2088874395d5Smaybee 	}
2089874395d5Smaybee 	while (list_head(&arc_mru->arcs_list[ARC_BUFC_METADATA])) {
2090ac05c741SMark Maybee 		(void) arc_evict(arc_mru, guid, -1, FALSE, ARC_BUFC_METADATA);
2091874395d5Smaybee 		if (spa)
2092874395d5Smaybee 			break;
2093874395d5Smaybee 	}
2094874395d5Smaybee 	while (list_head(&arc_mfu->arcs_list[ARC_BUFC_DATA])) {
2095ac05c741SMark Maybee 		(void) arc_evict(arc_mfu, guid, -1, FALSE, ARC_BUFC_DATA);
2096874395d5Smaybee 		if (spa)
2097874395d5Smaybee 			break;
2098874395d5Smaybee 	}
2099874395d5Smaybee 	while (list_head(&arc_mfu->arcs_list[ARC_BUFC_METADATA])) {
2100ac05c741SMark Maybee 		(void) arc_evict(arc_mfu, guid, -1, FALSE, ARC_BUFC_METADATA);
2101874395d5Smaybee 		if (spa)
2102874395d5Smaybee 			break;
2103874395d5Smaybee 	}
2104874395d5Smaybee 
2105ac05c741SMark Maybee 	arc_evict_ghost(arc_mru_ghost, guid, -1);
2106ac05c741SMark Maybee 	arc_evict_ghost(arc_mfu_ghost, guid, -1);
2107ea8dc4b6Seschrock 
2108ea8dc4b6Seschrock 	mutex_enter(&arc_reclaim_thr_lock);
2109ea8dc4b6Seschrock 	arc_do_user_evicts();
2110ea8dc4b6Seschrock 	mutex_exit(&arc_reclaim_thr_lock);
2111874395d5Smaybee 	ASSERT(spa || arc_eviction_list == NULL);
2112fa9e4066Sahrens }
2113fa9e4066Sahrens 
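/*
 * Illustrative sketch: arc_flush(spa) drops one pool's evictable
 * buffers when that pool is going away, while arc_flush(NULL) sweeps
 * the entire cache, as is done at arc_fini() time.
 */
static void
arc_flush_example(spa_t *spa)
{
	arc_flush(spa);		/* one pool's evictable buffers */
	arc_flush(NULL);	/* everything evictable, cache-wide */
}
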
2114fa9e4066Sahrens void
211549e3519aSmaybee arc_shrink(void)
2116fa9e4066Sahrens {
211744cb6abcSbmc 	if (arc_c > arc_c_min) {
211849e3519aSmaybee 		uint64_t to_free;
2119fa9e4066Sahrens 
21203cff2f43Sstans #ifdef _KERNEL
212144cb6abcSbmc 		to_free = MAX(arc_c >> arc_shrink_shift, ptob(needfree));
21223cff2f43Sstans #else
212344cb6abcSbmc 		to_free = arc_c >> arc_shrink_shift;
21243cff2f43Sstans #endif
212544cb6abcSbmc 		if (arc_c > arc_c_min + to_free)
212644cb6abcSbmc 			atomic_add_64(&arc_c, -to_free);
212749e3519aSmaybee 		else
212844cb6abcSbmc 			arc_c = arc_c_min;
212944cb6abcSbmc 
213044cb6abcSbmc 		atomic_add_64(&arc_p, -(arc_p >> arc_shrink_shift));
213144cb6abcSbmc 		if (arc_c > arc_size)
213244cb6abcSbmc 			arc_c = MAX(arc_size, arc_c_min);
213344cb6abcSbmc 		if (arc_p > arc_c)
213444cb6abcSbmc 			arc_p = (arc_c >> 1);
213544cb6abcSbmc 		ASSERT(arc_c >= arc_c_min);
213644cb6abcSbmc 		ASSERT((int64_t)arc_p >= 0);
213749e3519aSmaybee 	}
2138fa9e4066Sahrens 
213944cb6abcSbmc 	if (arc_size > arc_c)
214049e3519aSmaybee 		arc_adjust();
2141fa9e4066Sahrens }
2142fa9e4066Sahrens 
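/*
 * Worked example (assuming the default arc_shrink_shift of 5): with
 * arc_c at 4GB, one arc_shrink() pass frees arc_c >> 5 = 128MB (in the
 * kernel, ptob(needfree) if that is larger) and lowers arc_p by the
 * same 1/32nd proportion.
 */
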
214394dd93aeSGeorge Wilson /*
214494dd93aeSGeorge Wilson  * Determine if the system is under memory pressure and is asking
214594dd93aeSGeorge Wilson  * to reclaim memory. A return value of 1 indicates that the system
214694dd93aeSGeorge Wilson  * is under memory pressure and that the arc should adjust accordingly.
214794dd93aeSGeorge Wilson  */
2148fa9e4066Sahrens static int
2149fa9e4066Sahrens arc_reclaim_needed(void)
2150fa9e4066Sahrens {
2151fa9e4066Sahrens 	uint64_t extra;
2152fa9e4066Sahrens 
2153fa9e4066Sahrens #ifdef _KERNEL
21543cff2f43Sstans 
21553cff2f43Sstans 	if (needfree)
21563cff2f43Sstans 		return (1);
21573cff2f43Sstans 
2158fa9e4066Sahrens 	/*
2159fa9e4066Sahrens 	 * take 'desfree' extra pages, so we reclaim sooner rather than later
2160fa9e4066Sahrens 	 */
2161fa9e4066Sahrens 	extra = desfree;
2162fa9e4066Sahrens 
2163fa9e4066Sahrens 	/*
2164fa9e4066Sahrens 	 * check that we're out of range of the pageout scanner.  It starts to
2165fa9e4066Sahrens 	 * schedule paging if freemem is less than lotsfree and needfree.
2166fa9e4066Sahrens 	 * lotsfree is the high-water mark for pageout, and needfree is the
2167fa9e4066Sahrens 	 * number of needed free pages.  We add extra pages here to make sure
2168fa9e4066Sahrens 	 * the scanner doesn't start up while we're freeing memory.
2169fa9e4066Sahrens 	 */
2170fa9e4066Sahrens 	if (freemem < lotsfree + needfree + extra)
2171fa9e4066Sahrens 		return (1);
2172fa9e4066Sahrens 
2173fa9e4066Sahrens 	/*
2174fa9e4066Sahrens 	 * check to make sure that swapfs has enough space so that anon
2175fa94a07fSbrendan 	 * reservations can still succeed. anon_resvmem() checks that the
2176fa9e4066Sahrens 	 * availrmem is greater than swapfs_minfree, and the number of reserved
2177fa9e4066Sahrens 	 * swap pages.  We also add a bit of extra here just to prevent
2178fa9e4066Sahrens 	 * circumstances from getting really dire.
2179fa9e4066Sahrens 	 */
2180fa9e4066Sahrens 	if (availrmem < swapfs_minfree + swapfs_reserve + extra)
2181fa9e4066Sahrens 		return (1);
2182fa9e4066Sahrens 
21835dc8af33Smaybee #if defined(__i386)
2184fa9e4066Sahrens 	/*
2185fa9e4066Sahrens 	 * If we're on an i386 platform, it's possible that we'll exhaust the
2186fa9e4066Sahrens 	 * kernel heap space before we ever run out of available physical
2187fa9e4066Sahrens 	 * memory.  Most checks of the size of the heap_area compare against
2188fa9e4066Sahrens 	 * tune.t_minarmem, which is the minimum available real memory that we
2189fa9e4066Sahrens 	 * can have in the system.  However, this is generally fixed at 25 pages
2190fa9e4066Sahrens 	 * which is so low that it's useless.  In this comparison, we seek to
2191fa9e4066Sahrens 	 * calculate the total heap-size, and reclaim if more than 3/4ths of the
2192fa94a07fSbrendan 	 * heap is allocated.  (Or, in the calculation, if less than 1/4th is
2193fa9e4066Sahrens 	 * free)
2194fa9e4066Sahrens 	 */
219594dd93aeSGeorge Wilson 	if (vmem_size(heap_arena, VMEM_FREE) <
219694dd93aeSGeorge Wilson 	    (vmem_size(heap_arena, VMEM_FREE | VMEM_ALLOC) >> 2))
2197fa9e4066Sahrens 		return (1);
2198fa9e4066Sahrens #endif
2199fa9e4066Sahrens 
220094dd93aeSGeorge Wilson 	/*
220194dd93aeSGeorge Wilson 	 * If zio data pages are being allocated out of a separate heap segment,
220294dd93aeSGeorge Wilson 	 * then enforce that the amount of free vmem in this arena remains
220394dd93aeSGeorge Wilson 	 * above roughly 1/16th of the amount allocated.
220494dd93aeSGeorge Wilson 	 *
220594dd93aeSGeorge Wilson 	 * Note: The 1/16th arena free requirement was put in place
220694dd93aeSGeorge Wilson 	 * to aggressively evict memory from the arc in order to avoid
220794dd93aeSGeorge Wilson 	 * memory fragmentation issues.
220894dd93aeSGeorge Wilson 	 */
220994dd93aeSGeorge Wilson 	if (zio_arena != NULL &&
221094dd93aeSGeorge Wilson 	    vmem_size(zio_arena, VMEM_FREE) <
221194dd93aeSGeorge Wilson 	    (vmem_size(zio_arena, VMEM_ALLOC) >> 4))
221294dd93aeSGeorge Wilson 		return (1);
2213fa9e4066Sahrens #else
2214fa9e4066Sahrens 	if (spa_get_random(100) == 0)
2215fa9e4066Sahrens 		return (1);
2216fa9e4066Sahrens #endif
2217fa9e4066Sahrens 	return (0);
2218fa9e4066Sahrens }
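
/*
 * Editor's note -- illustrative sketch, not original code: the heap and
 * zio_arena checks above share one shape -- reclaim once the free bytes
 * of an arena fall below a power-of-two fraction of a base size.  The
 * i386 check uses shift 2 against FREE+ALLOC (1/4th free); the zio_arena
 * check uses shift 4 against ALLOC (1/16th).
 */
static int
example_arena_pressure(uint64_t free_bytes, uint64_t base_bytes, int shift)
{
	return (free_bytes < (base_bytes >> shift));
}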
2219fa9e4066Sahrens 
2220fa9e4066Sahrens static void
2221fa9e4066Sahrens arc_kmem_reap_now(arc_reclaim_strategy_t strat)
2222fa9e4066Sahrens {
2223fa9e4066Sahrens 	size_t			i;
2224fa9e4066Sahrens 	kmem_cache_t		*prev_cache = NULL;
2225ad23a2dbSjohansen 	kmem_cache_t		*prev_data_cache = NULL;
2226fa9e4066Sahrens 	extern kmem_cache_t	*zio_buf_cache[];
2227ad23a2dbSjohansen 	extern kmem_cache_t	*zio_data_buf_cache[];
2228fa9e4066Sahrens 
2229033f9833Sek #ifdef _KERNEL
22300e8c6158Smaybee 	if (arc_meta_used >= arc_meta_limit) {
22310e8c6158Smaybee 		/*
22320e8c6158Smaybee 		 * We are exceeding our meta-data cache limit.
22330e8c6158Smaybee 		 * Purge some DNLC entries to release holds on meta-data.
22340e8c6158Smaybee 		 */
22350e8c6158Smaybee 		dnlc_reduce_cache((void *)(uintptr_t)arc_reduce_dnlc_percent);
22360e8c6158Smaybee 	}
22375dc8af33Smaybee #if defined(__i386)
22385dc8af33Smaybee 	/*
22395dc8af33Smaybee 	 * Reclaim unused memory from all kmem caches.
22405dc8af33Smaybee 	 */
22415dc8af33Smaybee 	kmem_reap();
22425dc8af33Smaybee #endif
2243033f9833Sek #endif
2244033f9833Sek 
2245fa9e4066Sahrens 	/*
2246fa94a07fSbrendan 	 * An aggressive reclamation will shrink the cache size as well as
2247ea8dc4b6Seschrock 	 * reap free buffers from the arc kmem caches.
2248fa9e4066Sahrens 	 */
2249fa9e4066Sahrens 	if (strat == ARC_RECLAIM_AGGR)
225049e3519aSmaybee 		arc_shrink();
2251fa9e4066Sahrens 
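	/*
	 * Editor's note: SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT is
	 * 128K / 512 == 256 slots, and several adjacent sizes can map to
	 * the same kmem cache.  The prev_cache/prev_data_cache checks
	 * below skip those duplicates instead of reaping one cache
	 * repeatedly.
	 */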
2252fa9e4066Sahrens 	for (i = 0; i < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; i++) {
2253fa9e4066Sahrens 		if (zio_buf_cache[i] != prev_cache) {
2254fa9e4066Sahrens 			prev_cache = zio_buf_cache[i];
2255fa9e4066Sahrens 			kmem_cache_reap_now(zio_buf_cache[i]);
2256fa9e4066Sahrens 		}
2257ad23a2dbSjohansen 		if (zio_data_buf_cache[i] != prev_data_cache) {
2258ad23a2dbSjohansen 			prev_data_cache = zio_data_buf_cache[i];
2259ad23a2dbSjohansen 			kmem_cache_reap_now(zio_data_buf_cache[i]);
2260ad23a2dbSjohansen 		}
2261fa9e4066Sahrens 	}
2262ea8dc4b6Seschrock 	kmem_cache_reap_now(buf_cache);
2263ea8dc4b6Seschrock 	kmem_cache_reap_now(hdr_cache);
226494dd93aeSGeorge Wilson 
226594dd93aeSGeorge Wilson 	/*
226694dd93aeSGeorge Wilson 	 * Ask the vmem arena to reclaim unused memory from its
226794dd93aeSGeorge Wilson 	 * quantum caches.
226894dd93aeSGeorge Wilson 	 */
226994dd93aeSGeorge Wilson 	if (zio_arena != NULL && strat == ARC_RECLAIM_AGGR)
227094dd93aeSGeorge Wilson 		vmem_qcache_reap(zio_arena);
2271fa9e4066Sahrens }
2272fa9e4066Sahrens 
2273fa9e4066Sahrens static void
2274fa9e4066Sahrens arc_reclaim_thread(void)
2275fa9e4066Sahrens {
2276fa9e4066Sahrens 	clock_t			growtime = 0;
2277fa9e4066Sahrens 	arc_reclaim_strategy_t	last_reclaim = ARC_RECLAIM_CONS;
2278fa9e4066Sahrens 	callb_cpr_t		cpr;
2279fa9e4066Sahrens 
2280fa9e4066Sahrens 	CALLB_CPR_INIT(&cpr, &arc_reclaim_thr_lock, callb_generic_cpr, FTAG);
2281fa9e4066Sahrens 
2282fa9e4066Sahrens 	mutex_enter(&arc_reclaim_thr_lock);
2283fa9e4066Sahrens 	while (arc_thread_exit == 0) {
2284fa9e4066Sahrens 		if (arc_reclaim_needed()) {
2285fa9e4066Sahrens 
228644cb6abcSbmc 			if (arc_no_grow) {
2287fa9e4066Sahrens 				if (last_reclaim == ARC_RECLAIM_CONS) {
2288fa9e4066Sahrens 					last_reclaim = ARC_RECLAIM_AGGR;
2289fa9e4066Sahrens 				} else {
2290fa9e4066Sahrens 					last_reclaim = ARC_RECLAIM_CONS;
2291fa9e4066Sahrens 				}
2292fa9e4066Sahrens 			} else {
229344cb6abcSbmc 				arc_no_grow = TRUE;
2294fa9e4066Sahrens 				last_reclaim = ARC_RECLAIM_AGGR;
2295fa9e4066Sahrens 				membar_producer();
2296fa9e4066Sahrens 			}
2297fa9e4066Sahrens 
2298fa9e4066Sahrens 			/* reset the growth delay for every reclaim */
2299d3d50737SRafael Vanoni 			growtime = ddi_get_lbolt() + (arc_grow_retry * hz);
2300fa9e4066Sahrens 
2301fa9e4066Sahrens 			arc_kmem_reap_now(last_reclaim);
23023a737e0dSbrendan 			arc_warm = B_TRUE;
2303fa9e4066Sahrens 
2304d3d50737SRafael Vanoni 		} else if (arc_no_grow && ddi_get_lbolt() >= growtime) {
230544cb6abcSbmc 			arc_no_grow = FALSE;
2306fa9e4066Sahrens 		}
2307fa9e4066Sahrens 
23083e4e8481STom Erickson 		arc_adjust();
2309641fbdaeSmaybee 
2310ea8dc4b6Seschrock 		if (arc_eviction_list != NULL)
2311ea8dc4b6Seschrock 			arc_do_user_evicts();
2312ea8dc4b6Seschrock 
2313fa9e4066Sahrens 		/* block until needed, or one second, whichever is shorter */
2314fa9e4066Sahrens 		CALLB_CPR_SAFE_BEGIN(&cpr);
2315fa9e4066Sahrens 		(void) cv_timedwait(&arc_reclaim_thr_cv,
2316d3d50737SRafael Vanoni 		    &arc_reclaim_thr_lock, (ddi_get_lbolt() + hz));
2317fa9e4066Sahrens 		CALLB_CPR_SAFE_END(&cpr, &arc_reclaim_thr_lock);
2318fa9e4066Sahrens 	}
2319fa9e4066Sahrens 
2320fa9e4066Sahrens 	arc_thread_exit = 0;
2321fa9e4066Sahrens 	cv_broadcast(&arc_reclaim_thr_cv);
2322fa9e4066Sahrens 	CALLB_CPR_EXIT(&cpr);		/* drops arc_reclaim_thr_lock */
2323fa9e4066Sahrens 	thread_exit();
2324fa9e4066Sahrens }
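
/*
 * Editor's note (summary, not original text): the reclaim thread's
 * cadence, distilled -- wake at least once per second; under pressure,
 * set arc_no_grow and alternate conservative and aggressive reaps; once
 * pressure lifts and arc_grow_retry seconds have elapsed, allow the
 * cache to grow again.
 */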
2325fa9e4066Sahrens 
2326ea8dc4b6Seschrock /*
2327ea8dc4b6Seschrock  * Adapt arc info given the number of bytes we are trying to add and
2328ea8dc4b6Seschrock  * the state that we are coming from.  This function is only called
2329ea8dc4b6Seschrock  * when we are adding new content to the cache.
2330ea8dc4b6Seschrock  */
2331fa9e4066Sahrens static void
2332ea8dc4b6Seschrock arc_adapt(int bytes, arc_state_t *state)
2333fa9e4066Sahrens {
2334ea8dc4b6Seschrock 	int mult;
23355a98e54bSBrendan Gregg - Sun Microsystems 	uint64_t arc_p_min = (arc_c >> arc_p_min_shift);
2336ea8dc4b6Seschrock 
2337fa94a07fSbrendan 	if (state == arc_l2c_only)
2338fa94a07fSbrendan 		return;
2339fa94a07fSbrendan 
2340ea8dc4b6Seschrock 	ASSERT(bytes > 0);
2341fa9e4066Sahrens 	/*
2342ea8dc4b6Seschrock 	 * Adapt the target size of the MRU list:
2343ea8dc4b6Seschrock 	 *	- if we just hit in the MRU ghost list, then increase
2344ea8dc4b6Seschrock 	 *	  the target size of the MRU list.
2345ea8dc4b6Seschrock 	 *	- if we just hit in the MFU ghost list, then increase
2346ea8dc4b6Seschrock 	 *	  the target size of the MFU list by decreasing the
2347ea8dc4b6Seschrock 	 *	  target size of the MRU list.
2348fa9e4066Sahrens 	 */
234944cb6abcSbmc 	if (state == arc_mru_ghost) {
235044cb6abcSbmc 		mult = ((arc_mru_ghost->arcs_size >= arc_mfu_ghost->arcs_size) ?
235144cb6abcSbmc 		    1 : (arc_mfu_ghost->arcs_size/arc_mru_ghost->arcs_size));
23523e4e8481STom Erickson 		mult = MIN(mult, 10); /* avoid wild arc_p adjustment */
2353ea8dc4b6Seschrock 
23545a98e54bSBrendan Gregg - Sun Microsystems 		arc_p = MIN(arc_c - arc_p_min, arc_p + bytes * mult);
235544cb6abcSbmc 	} else if (state == arc_mfu_ghost) {
23565a98e54bSBrendan Gregg - Sun Microsystems 		uint64_t delta;
23575a98e54bSBrendan Gregg - Sun Microsystems 
235844cb6abcSbmc 		mult = ((arc_mfu_ghost->arcs_size >= arc_mru_ghost->arcs_size) ?
235944cb6abcSbmc 		    1 : (arc_mru_ghost->arcs_size/arc_mfu_ghost->arcs_size));
23603e4e8481STom Erickson 		mult = MIN(mult, 10);
2361ea8dc4b6Seschrock 
23625a98e54bSBrendan Gregg - Sun Microsystems 		delta = MIN(bytes * mult, arc_p);
23635a98e54bSBrendan Gregg - Sun Microsystems 		arc_p = MAX(arc_p_min, arc_p - delta);
2364ea8dc4b6Seschrock 	}
236544cb6abcSbmc 	ASSERT((int64_t)arc_p >= 0);
2366fa9e4066Sahrens 
2367fa9e4066Sahrens 	if (arc_reclaim_needed()) {
2368fa9e4066Sahrens 		cv_signal(&arc_reclaim_thr_cv);
2369fa9e4066Sahrens 		return;
2370fa9e4066Sahrens 	}
2371fa9e4066Sahrens 
237244cb6abcSbmc 	if (arc_no_grow)
2373fa9e4066Sahrens 		return;
2374fa9e4066Sahrens 
237544cb6abcSbmc 	if (arc_c >= arc_c_max)
2376ea8dc4b6Seschrock 		return;
2377ea8dc4b6Seschrock 
2378fa9e4066Sahrens 	/*
2379ea8dc4b6Seschrock 	 * If we're within (2 * maxblocksize) bytes of the target
2380ea8dc4b6Seschrock 	 * cache size, increment the target cache size
2381fa9e4066Sahrens 	 */
238244cb6abcSbmc 	if (arc_size > arc_c - (2ULL << SPA_MAXBLOCKSHIFT)) {
238344cb6abcSbmc 		atomic_add_64(&arc_c, (int64_t)bytes);
238444cb6abcSbmc 		if (arc_c > arc_c_max)
238544cb6abcSbmc 			arc_c = arc_c_max;
238644cb6abcSbmc 		else if (state == arc_anon)
238744cb6abcSbmc 			atomic_add_64(&arc_p, (int64_t)bytes);
238844cb6abcSbmc 		if (arc_p > arc_c)
238944cb6abcSbmc 			arc_p = arc_c;
2390fa9e4066Sahrens 	}
239144cb6abcSbmc 	ASSERT((int64_t)arc_p >= 0);
2392fa9e4066Sahrens }
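
/*
 * Editor's note -- a minimal sketch, not the kernel implementation: the
 * ghost-hit adaptation above reduced to its arithmetic.  A hit in the
 * MRU ghost list argues the MRU side is too small, so arc_p grows; a
 * hit in the MFU ghost list argues the opposite, so arc_p shrinks.  The
 * multiplier is the ratio of the two ghost list sizes, clamped to 10.
 */
static uint64_t
example_adapt_p(uint64_t p, uint64_t p_min, uint64_t c, uint64_t bytes,
    uint64_t hit_ghost, uint64_t other_ghost, int hit_was_mru_ghost)
{
	uint64_t mult, delta;

	/* hit_ghost > 0: we just hit in that list, so it is non-empty */
	mult = (hit_ghost >= other_ghost) ? 1 : (other_ghost / hit_ghost);
	if (mult > 10)
		mult = 10;		/* avoid wild arc_p adjustment */

	if (hit_was_mru_ghost) {
		delta = bytes * mult;
		p = (p + delta > c - p_min) ? (c - p_min) : (p + delta);
	} else {
		delta = MIN(bytes * mult, p);
		p = (p - delta > p_min) ? (p - delta) : p_min;
	}
	return (p);
}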
2393fa9e4066Sahrens 
2394fa9e4066Sahrens /*
2395ea8dc4b6Seschrock  * Check if the cache has reached its limits and eviction is required
2396ea8dc4b6Seschrock  * prior to insert.
2397fa9e4066Sahrens  */
2398fa9e4066Sahrens static int
23990e8c6158Smaybee arc_evict_needed(arc_buf_contents_t type)
2400fa9e4066Sahrens {
24010e8c6158Smaybee 	if (type == ARC_BUFC_METADATA && arc_meta_used >= arc_meta_limit)
24020e8c6158Smaybee 		return (1);
24030e8c6158Smaybee 
2404fa9e4066Sahrens 	if (arc_reclaim_needed())
2405fa9e4066Sahrens 		return (1);
2406fa9e4066Sahrens 
240744cb6abcSbmc 	return (arc_size > arc_c);
2408fa9e4066Sahrens }
2409fa9e4066Sahrens 
2410fa9e4066Sahrens /*
241144eda4d7Smaybee  * The buffer, supplied as the first argument, needs a data block.
241244eda4d7Smaybee  * So, if we are at cache max, determine which cache should be victimized.
241344eda4d7Smaybee  * We have the following cases:
2414fa9e4066Sahrens  *
241544cb6abcSbmc  * 1. Insert for MRU, p > sizeof(arc_anon + arc_mru) ->
2416fa9e4066Sahrens  * In this situation if we're out of space, but the resident size of the MFU is
2417fa9e4066Sahrens  * under the limit, victimize the MFU cache to satisfy this insertion request.
2418fa9e4066Sahrens  *
241944cb6abcSbmc  * 2. Insert for MRU, p <= sizeof(arc_anon + arc_mru) ->
2420fa9e4066Sahrens  * Here, we've used up all of the available space for the MRU, so we need to
2421fa9e4066Sahrens  * evict from our own cache instead.  Evict from the set of resident MRU
2422fa9e4066Sahrens  * entries.
2423fa9e4066Sahrens  *
242444cb6abcSbmc  * 3. Insert for MFU, (c - p) > sizeof(arc_mfu) ->
2425fa9e4066Sahrens  * c minus p represents the MFU space in the cache, since p is the size of the
2426fa9e4066Sahrens  * cache that is dedicated to the MRU.  In this situation there's still space on
2427fa9e4066Sahrens  * the MFU side, so the MRU side needs to be victimized.
2428fa9e4066Sahrens  *
242944cb6abcSbmc  * 4. Insert for MFU, (c - p) < sizeof(arc_mfu) ->
2430fa9e4066Sahrens  * MFU's resident set is consuming more space than it has been allotted.  In
2431fa9e4066Sahrens  * this situation, we must victimize our own cache, the MFU, for this insertion.
2432fa9e4066Sahrens  */
2433fa9e4066Sahrens static void
243444eda4d7Smaybee arc_get_data_buf(arc_buf_t *buf)
2435fa9e4066Sahrens {
2436ad23a2dbSjohansen 	arc_state_t		*state = buf->b_hdr->b_state;
2437ad23a2dbSjohansen 	uint64_t		size = buf->b_hdr->b_size;
2438ad23a2dbSjohansen 	arc_buf_contents_t	type = buf->b_hdr->b_type;
2439fa9e4066Sahrens 
244044eda4d7Smaybee 	arc_adapt(size, state);
2441fa9e4066Sahrens 
244244eda4d7Smaybee 	/*
244344eda4d7Smaybee 	 * We have not yet reached cache maximum size,
244444eda4d7Smaybee 	 * just allocate a new buffer.
244544eda4d7Smaybee 	 */
24460e8c6158Smaybee 	if (!arc_evict_needed(type)) {
2447ad23a2dbSjohansen 		if (type == ARC_BUFC_METADATA) {
2448ad23a2dbSjohansen 			buf->b_data = zio_buf_alloc(size);
24495a98e54bSBrendan Gregg - Sun Microsystems 			arc_space_consume(size, ARC_SPACE_DATA);
2450ad23a2dbSjohansen 		} else {
2451ad23a2dbSjohansen 			ASSERT(type == ARC_BUFC_DATA);
2452ad23a2dbSjohansen 			buf->b_data = zio_data_buf_alloc(size);
24535a98e54bSBrendan Gregg - Sun Microsystems 			ARCSTAT_INCR(arcstat_data_size, size);
24540e8c6158Smaybee 			atomic_add_64(&arc_size, size);
2455ad23a2dbSjohansen 		}
245644eda4d7Smaybee 		goto out;
245744eda4d7Smaybee 	}
245844eda4d7Smaybee 
245944eda4d7Smaybee 	/*
246044eda4d7Smaybee 	 * If we are prefetching from the mfu ghost list, this buffer
246144eda4d7Smaybee 	 * will end up on the mru list, so steal space from there.
246244eda4d7Smaybee 	 */
246344cb6abcSbmc 	if (state == arc_mfu_ghost)
246444cb6abcSbmc 		state = buf->b_hdr->b_flags & ARC_PREFETCH ? arc_mru : arc_mfu;
246544cb6abcSbmc 	else if (state == arc_mru_ghost)
246644cb6abcSbmc 		state = arc_mru;
246744cb6abcSbmc 
246844cb6abcSbmc 	if (state == arc_mru || state == arc_anon) {
246944cb6abcSbmc 		uint64_t mru_used = arc_anon->arcs_size + arc_mru->arcs_size;
24705a98e54bSBrendan Gregg - Sun Microsystems 		state = (arc_mfu->arcs_lsize[type] >= size &&
24710e8c6158Smaybee 		    arc_p > mru_used) ? arc_mfu : arc_mru;
2472fa9e4066Sahrens 	} else {
247344eda4d7Smaybee 		/* MFU cases */
247444cb6abcSbmc 		uint64_t mfu_space = arc_c - arc_p;
24755a98e54bSBrendan Gregg - Sun Microsystems 		state = (arc_mru->arcs_lsize[type] >= size &&
24760e8c6158Smaybee 		    mfu_space > arc_mfu->arcs_size) ? arc_mru : arc_mfu;
247744eda4d7Smaybee 	}
2478874395d5Smaybee 	if ((buf->b_data = arc_evict(state, NULL, size, TRUE, type)) == NULL) {
2479ad23a2dbSjohansen 		if (type == ARC_BUFC_METADATA) {
2480ad23a2dbSjohansen 			buf->b_data = zio_buf_alloc(size);
24815a98e54bSBrendan Gregg - Sun Microsystems 			arc_space_consume(size, ARC_SPACE_DATA);
2482ad23a2dbSjohansen 		} else {
2483ad23a2dbSjohansen 			ASSERT(type == ARC_BUFC_DATA);
2484ad23a2dbSjohansen 			buf->b_data = zio_data_buf_alloc(size);
24855a98e54bSBrendan Gregg - Sun Microsystems 			ARCSTAT_INCR(arcstat_data_size, size);
24860e8c6158Smaybee 			atomic_add_64(&arc_size, size);
2487ad23a2dbSjohansen 		}
248844cb6abcSbmc 		ARCSTAT_BUMP(arcstat_recycle_miss);
248944eda4d7Smaybee 	}
249044eda4d7Smaybee 	ASSERT(buf->b_data != NULL);
249144eda4d7Smaybee out:
249244eda4d7Smaybee 	/*
249344eda4d7Smaybee 	 * Update the state size.  Note that ghost states have a
249444eda4d7Smaybee 	 * "ghost size" and so don't need to be updated.
249544eda4d7Smaybee 	 */
249644eda4d7Smaybee 	if (!GHOST_STATE(buf->b_hdr->b_state)) {
249744eda4d7Smaybee 		arc_buf_hdr_t *hdr = buf->b_hdr;
249844eda4d7Smaybee 
249944cb6abcSbmc 		atomic_add_64(&hdr->b_state->arcs_size, size);
250044eda4d7Smaybee 		if (list_link_active(&hdr->b_arc_node)) {
250144eda4d7Smaybee 			ASSERT(refcount_is_zero(&hdr->b_refcnt));
25020e8c6158Smaybee 			atomic_add_64(&hdr->b_state->arcs_lsize[type], size);
2503fa9e4066Sahrens 		}
2504641fbdaeSmaybee 		/*
2505641fbdaeSmaybee 		 * If we are growing the cache, and we are adding anonymous
250644cb6abcSbmc 		 * data, and we have outgrown arc_p, update arc_p
2507641fbdaeSmaybee 		 */
250844cb6abcSbmc 		if (arc_size < arc_c && hdr->b_state == arc_anon &&
250944cb6abcSbmc 		    arc_anon->arcs_size + arc_mru->arcs_size > arc_p)
251044cb6abcSbmc 			arc_p = MIN(arc_c, arc_p + size);
2511fa9e4066Sahrens 	}
2512fa9e4066Sahrens }
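
/*
 * Editor's note -- distilled sketch, not original code: the four-case
 * victim selection above as a pure predicate.  "lsize" stands for the
 * evictable bytes on a candidate list; a nonzero return selects the MRU
 * list as the victim, zero selects the MFU list.
 */
static int
example_victim_is_mru(int inserting_for_mru, uint64_t p, uint64_t c,
    uint64_t anon_plus_mru_size, uint64_t mfu_size,
    uint64_t mru_lsize, uint64_t mfu_lsize, uint64_t size)
{
	if (inserting_for_mru) {
		/* cases 1 and 2: steal from the MFU while p permits */
		return (!(mfu_lsize >= size && p > anon_plus_mru_size));
	}
	/* cases 3 and 4: steal from the MRU while the MFU is under c - p */
	return (mru_lsize >= size && (c - p) > mfu_size);
}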
2513fa9e4066Sahrens 
2514fa9e4066Sahrens /*
2515fa9e4066Sahrens  * This routine is called whenever a buffer is accessed.
2516ea8dc4b6Seschrock  * NOTE: the hash lock is dropped in this function.
2517fa9e4066Sahrens  */
2518fa9e4066Sahrens static void
251944eda4d7Smaybee arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock)
2520fa9e4066Sahrens {
2521d3d50737SRafael Vanoni 	clock_t now;
2522d3d50737SRafael Vanoni 
2523fa9e4066Sahrens 	ASSERT(MUTEX_HELD(hash_lock));
2524fa9e4066Sahrens 
252544cb6abcSbmc 	if (buf->b_state == arc_anon) {
2526fa9e4066Sahrens 		/*
2527fa9e4066Sahrens 		 * This buffer is not in the cache, and does not
2528fa9e4066Sahrens 		 * appear in our "ghost" list.  Add the new buffer
2529fa9e4066Sahrens 		 * to the MRU state.
2530fa9e4066Sahrens 		 */
2531fa9e4066Sahrens 
2532fa9e4066Sahrens 		ASSERT(buf->b_arc_access == 0);
2533d3d50737SRafael Vanoni 		buf->b_arc_access = ddi_get_lbolt();
2534ea8dc4b6Seschrock 		DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, buf);
253544cb6abcSbmc 		arc_change_state(arc_mru, buf, hash_lock);
2536fa9e4066Sahrens 
253744cb6abcSbmc 	} else if (buf->b_state == arc_mru) {
2538d3d50737SRafael Vanoni 		now = ddi_get_lbolt();
2539d3d50737SRafael Vanoni 
2540fa9e4066Sahrens 		/*
254113506d1eSmaybee 		 * If this buffer is here because of a prefetch, then either:
254213506d1eSmaybee 		 * - clear the flag if this is a "referencing" read
254313506d1eSmaybee 		 *   (any subsequent access will bump this into the MFU state).
254413506d1eSmaybee 		 * or
254513506d1eSmaybee 		 * - move the buffer to the head of the list if this is
254613506d1eSmaybee 		 *   another prefetch (to make it less likely to be evicted).
2547fa9e4066Sahrens 		 */
2548fa9e4066Sahrens 		if ((buf->b_flags & ARC_PREFETCH) != 0) {
254913506d1eSmaybee 			if (refcount_count(&buf->b_refcnt) == 0) {
255013506d1eSmaybee 				ASSERT(list_link_active(&buf->b_arc_node));
255113506d1eSmaybee 			} else {
255213506d1eSmaybee 				buf->b_flags &= ~ARC_PREFETCH;
255344cb6abcSbmc 				ARCSTAT_BUMP(arcstat_mru_hits);
255413506d1eSmaybee 			}
2555d3d50737SRafael Vanoni 			buf->b_arc_access = now;
2556fa9e4066Sahrens 			return;
2557fa9e4066Sahrens 		}
2558fa9e4066Sahrens 
2559fa9e4066Sahrens 		/*
2560fa9e4066Sahrens 		 * This buffer has been "accessed" only once so far,
2561fa9e4066Sahrens 		 * This buffer has been "accessed" only once so far, but
2562fa9e4066Sahrens 		 * it is still in the cache.  If enough time has passed
2563fa9e4066Sahrens 		 * since the last access, move it to the MFU state.
2563fa9e4066Sahrens 		 */
2564d3d50737SRafael Vanoni 		if (now > buf->b_arc_access + ARC_MINTIME) {
2565fa9e4066Sahrens 			/*
2566fa9e4066Sahrens 			 * More than 125ms have passed since we
2567fa9e4066Sahrens 			 * instantiated this buffer.  Move it to the
2568fa9e4066Sahrens 			 * most frequently used state.
2569fa9e4066Sahrens 			 */
2570d3d50737SRafael Vanoni 			buf->b_arc_access = now;
2571ea8dc4b6Seschrock 			DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
257244cb6abcSbmc 			arc_change_state(arc_mfu, buf, hash_lock);
2573fa9e4066Sahrens 		}
257444cb6abcSbmc 		ARCSTAT_BUMP(arcstat_mru_hits);
257544cb6abcSbmc 	} else if (buf->b_state == arc_mru_ghost) {
2576fa9e4066Sahrens 		arc_state_t	*new_state;
2577fa9e4066Sahrens 		/*
2578fa9e4066Sahrens 		 * This buffer has been "accessed" recently, but
2579fa9e4066Sahrens 		 * was evicted from the cache.  Move it to the
2580fa9e4066Sahrens 		 * MFU state.
2581fa9e4066Sahrens 		 */
2582fa9e4066Sahrens 
2583fa9e4066Sahrens 		if (buf->b_flags & ARC_PREFETCH) {
258444cb6abcSbmc 			new_state = arc_mru;
258513506d1eSmaybee 			if (refcount_count(&buf->b_refcnt) > 0)
258613506d1eSmaybee 				buf->b_flags &= ~ARC_PREFETCH;
2587ea8dc4b6Seschrock 			DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, buf);
2588fa9e4066Sahrens 		} else {
258944cb6abcSbmc 			new_state = arc_mfu;
2590ea8dc4b6Seschrock 			DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
2591fa9e4066Sahrens 		}
2592fa9e4066Sahrens 
2593d3d50737SRafael Vanoni 		buf->b_arc_access = ddi_get_lbolt();
2594fa9e4066Sahrens 		arc_change_state(new_state, buf, hash_lock);
2595fa9e4066Sahrens 
259644cb6abcSbmc 		ARCSTAT_BUMP(arcstat_mru_ghost_hits);
259744cb6abcSbmc 	} else if (buf->b_state == arc_mfu) {
2598fa9e4066Sahrens 		/*
2599fa9e4066Sahrens 		 * This buffer has been accessed more than once and is
2600fa9e4066Sahrens 		 * still in the cache.  Keep it in the MFU state.
2601fa9e4066Sahrens 		 *
260213506d1eSmaybee 		 * NOTE: an add_reference() that occurred when we did
260313506d1eSmaybee 		 * the arc_read() will have kicked this off the list.
260413506d1eSmaybee 		 * If it was a prefetch, we will explicitly move it to
260513506d1eSmaybee 		 * the head of the list now.
2606fa9e4066Sahrens 		 */
260713506d1eSmaybee 		if ((buf->b_flags & ARC_PREFETCH) != 0) {
260813506d1eSmaybee 			ASSERT(refcount_count(&buf->b_refcnt) == 0);
260913506d1eSmaybee 			ASSERT(list_link_active(&buf->b_arc_node));
261013506d1eSmaybee 		}
261144cb6abcSbmc 		ARCSTAT_BUMP(arcstat_mfu_hits);
2612d3d50737SRafael Vanoni 		buf->b_arc_access = ddi_get_lbolt();
261344cb6abcSbmc 	} else if (buf->b_state == arc_mfu_ghost) {
261444cb6abcSbmc 		arc_state_t	*new_state = arc_mfu;
2615fa9e4066Sahrens 		/*
2616fa9e4066Sahrens 		 * This buffer has been accessed more than once but has
2617fa9e4066Sahrens 		 * been evicted from the cache.  Move it back to the
2618fa9e4066Sahrens 		 * MFU state.
2619fa9e4066Sahrens 		 */
2620fa9e4066Sahrens 
262113506d1eSmaybee 		if (buf->b_flags & ARC_PREFETCH) {
262213506d1eSmaybee 			/*
262313506d1eSmaybee 			 * This is a prefetch access...
262413506d1eSmaybee 			 * move this block back to the MRU state.
262513506d1eSmaybee 			 */
2626fb09f5aaSMadhav Suresh 			ASSERT0(refcount_count(&buf->b_refcnt));
262744cb6abcSbmc 			new_state = arc_mru;
262813506d1eSmaybee 		}
262913506d1eSmaybee 
2630d3d50737SRafael Vanoni 		buf->b_arc_access = ddi_get_lbolt();
2631ea8dc4b6Seschrock 		DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
263213506d1eSmaybee 		arc_change_state(new_state, buf, hash_lock);
2633fa9e4066Sahrens 
263444cb6abcSbmc 		ARCSTAT_BUMP(arcstat_mfu_ghost_hits);
2635fa94a07fSbrendan 	} else if (buf->b_state == arc_l2c_only) {
2636fa94a07fSbrendan 		/*
2637fa94a07fSbrendan 		 * This buffer is on the 2nd Level ARC.
2638fa94a07fSbrendan 		 */
2639fa94a07fSbrendan 
2640d3d50737SRafael Vanoni 		buf->b_arc_access = ddi_get_lbolt();
2641fa94a07fSbrendan 		DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
2642fa94a07fSbrendan 		arc_change_state(arc_mfu, buf, hash_lock);
2643fa9e4066Sahrens 	} else {
2644fa9e4066Sahrens 		ASSERT(!"invalid arc state");
2645fa9e4066Sahrens 	}
2646fa9e4066Sahrens }
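
/*
 * Editor's note (summary table, not original text): the transitions in
 * arc_access() above, keyed on the state in which a hit lands:
 *
 *	anon       -> mru	first access
 *	mru        -> mfu	re-accessed more than ARC_MINTIME later
 *	mru_ghost  -> mfu	(-> mru instead if it is a prefetch)
 *	mfu        -> mfu	stays put
 *	mfu_ghost  -> mfu	(-> mru instead if it is a prefetch)
 *	l2c_only   -> mfu	buffer was resident only in the L2ARC
 */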
2647fa9e4066Sahrens 
2648fa9e4066Sahrens /* a generic arc_done_func_t which you can use */
2649fa9e4066Sahrens /* ARGSUSED */
2650fa9e4066Sahrens void
2651fa9e4066Sahrens arc_bcopy_func(zio_t *zio, arc_buf_t *buf, void *arg)
2652fa9e4066Sahrens {
26533f9d6ad7SLin Ling 	if (zio == NULL || zio->io_error == 0)
26543f9d6ad7SLin Ling 		bcopy(buf->b_data, arg, buf->b_hdr->b_size);
26553b2aab18SMatthew Ahrens 	VERIFY(arc_buf_remove_ref(buf, arg));
2656fa9e4066Sahrens }
2657fa9e4066Sahrens 
26580e8c6158Smaybee /* a generic arc_done_func_t */
2659fa9e4066Sahrens void
2660fa9e4066Sahrens arc_getbuf_func(zio_t *zio, arc_buf_t *buf, void *arg)
2661fa9e4066Sahrens {
2662fa9e4066Sahrens 	arc_buf_t **bufp = arg;
2663fa9e4066Sahrens 	if (zio && zio->io_error) {
26643b2aab18SMatthew Ahrens 		VERIFY(arc_buf_remove_ref(buf, arg));
2665fa9e4066Sahrens 		*bufp = NULL;
2666fa9e4066Sahrens 	} else {
2667fa9e4066Sahrens 		*bufp = buf;
26683f9d6ad7SLin Ling 		ASSERT(buf->b_data);
2669fa9e4066Sahrens 	}
2670fa9e4066Sahrens }
2671fa9e4066Sahrens 
2672fa9e4066Sahrens static void
2673fa9e4066Sahrens arc_read_done(zio_t *zio)
2674fa9e4066Sahrens {
2675bbf4a8dfSmaybee 	arc_buf_hdr_t	*hdr, *found;
2676fa9e4066Sahrens 	arc_buf_t	*buf;
2677fa9e4066Sahrens 	arc_buf_t	*abuf;	/* buffer we're assigning to callback */
2678fa9e4066Sahrens 	kmutex_t	*hash_lock;
2679fa9e4066Sahrens 	arc_callback_t	*callback_list, *acb;
2680fa9e4066Sahrens 	int		freeable = FALSE;
2681fa9e4066Sahrens 
2682fa9e4066Sahrens 	buf = zio->io_private;
2683fa9e4066Sahrens 	hdr = buf->b_hdr;
2684fa9e4066Sahrens 
2685bbf4a8dfSmaybee 	/*
2686bbf4a8dfSmaybee 	 * The hdr was inserted into the hash table and removed from lists
2687bbf4a8dfSmaybee 	 * prior to starting I/O.  We should find this header, since
2688bbf4a8dfSmaybee 	 * it's in the hash table, and it should be legit since it's
2689bbf4a8dfSmaybee 	 * not possible to evict it during the I/O.  The only possible
2690bbf4a8dfSmaybee 	 * reason for it not to be found is if we were freed during the
2691bbf4a8dfSmaybee 	 * read.
2692bbf4a8dfSmaybee 	 */
2693ac05c741SMark Maybee 	found = buf_hash_find(hdr->b_spa, &hdr->b_dva, hdr->b_birth,
26946b4acc8bSahrens 	    &hash_lock);
2695fa9e4066Sahrens 
2696bbf4a8dfSmaybee 	ASSERT((found == NULL && HDR_FREED_IN_READ(hdr) && hash_lock == NULL) ||
2697fa94a07fSbrendan 	    (found == hdr && DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp))) ||
2698fa94a07fSbrendan 	    (found == hdr && HDR_L2_READING(hdr)));
2699fa94a07fSbrendan 
27003a737e0dSbrendan 	hdr->b_flags &= ~ARC_L2_EVICTED;
2701fa94a07fSbrendan 	if (l2arc_noprefetch && (hdr->b_flags & ARC_PREFETCH))
27023baa08fcSek 		hdr->b_flags &= ~ARC_L2CACHE;
2703fa9e4066Sahrens 
2704fa9e4066Sahrens 	/* byteswap if necessary */
2705fa9e4066Sahrens 	callback_list = hdr->b_acb;
2706fa9e4066Sahrens 	ASSERT(callback_list != NULL);
27078e0f0d3dSWilliam Gorrell 	if (BP_SHOULD_BYTESWAP(zio->io_bp) && zio->io_error == 0) {
2708ad135b5dSChristopher Siden 		dmu_object_byteswap_t bswap =
2709ad135b5dSChristopher Siden 		    DMU_OT_BYTESWAP(BP_GET_TYPE(zio->io_bp));
2710088f3894Sahrens 		arc_byteswap_func_t *func = BP_GET_LEVEL(zio->io_bp) > 0 ?
2711088f3894Sahrens 		    byteswap_uint64_array :
2712ad135b5dSChristopher Siden 		    dmu_ot_byteswap[bswap].ob_func;
2713088f3894Sahrens 		func(buf->b_data, hdr->b_size);
2714088f3894Sahrens 	}
2715fa9e4066Sahrens 
2716fa94a07fSbrendan 	arc_cksum_compute(buf, B_FALSE);
2717cd1c8b85SMatthew Ahrens 	arc_buf_watch(buf);
27186b4acc8bSahrens 
2719b24ab676SJeff Bonwick 	if (hash_lock && zio->io_error == 0 && hdr->b_state == arc_anon) {
2720b24ab676SJeff Bonwick 		/*
2721b24ab676SJeff Bonwick 		 * Only call arc_access on anonymous buffers.  This is because
2722b24ab676SJeff Bonwick 		 * if we've issued an I/O for an evicted buffer, we've already
2723b24ab676SJeff Bonwick 		 * called arc_access (to prevent any simultaneous readers from
2724b24ab676SJeff Bonwick 		 * getting confused).
2725b24ab676SJeff Bonwick 		 */
2726b24ab676SJeff Bonwick 		arc_access(hdr, hash_lock);
2727b24ab676SJeff Bonwick 	}
2728b24ab676SJeff Bonwick 
2729fa9e4066Sahrens 	/* create copies of the data buffer for the callers */
2730fa9e4066Sahrens 	abuf = buf;
2731fa9e4066Sahrens 	for (acb = callback_list; acb; acb = acb->acb_next) {
2732fa9e4066Sahrens 		if (acb->acb_done) {
27339253d63dSGeorge Wilson 			if (abuf == NULL) {
27349253d63dSGeorge Wilson 				ARCSTAT_BUMP(arcstat_duplicate_reads);
273544eda4d7Smaybee 				abuf = arc_buf_clone(buf);
27369253d63dSGeorge Wilson 			}
2737fa9e4066Sahrens 			acb->acb_buf = abuf;
2738fa9e4066Sahrens 			abuf = NULL;
2739fa9e4066Sahrens 		}
2740fa9e4066Sahrens 	}
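	/*
	 * Editor's note: with, say, three queued callbacks the loop above
	 * hands the original buffer to the first "done" callback and
	 * clones it for the other two, so each caller ends up owning an
	 * independently releasable arc_buf_t.
	 */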
2741fa9e4066Sahrens 	hdr->b_acb = NULL;
2742fa9e4066Sahrens 	hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
2743ea8dc4b6Seschrock 	ASSERT(!HDR_BUF_AVAILABLE(hdr));
2744b24ab676SJeff Bonwick 	if (abuf == buf) {
2745b24ab676SJeff Bonwick 		ASSERT(buf->b_efunc == NULL);
2746b24ab676SJeff Bonwick 		ASSERT(hdr->b_datacnt == 1);
2747ea8dc4b6Seschrock 		hdr->b_flags |= ARC_BUF_AVAILABLE;
2748b24ab676SJeff Bonwick 	}
2749fa9e4066Sahrens 
2750fa9e4066Sahrens 	ASSERT(refcount_is_zero(&hdr->b_refcnt) || callback_list != NULL);
2751fa9e4066Sahrens 
2752fa9e4066Sahrens 	if (zio->io_error != 0) {
2753fa9e4066Sahrens 		hdr->b_flags |= ARC_IO_ERROR;
275444cb6abcSbmc 		if (hdr->b_state != arc_anon)
275544cb6abcSbmc 			arc_change_state(arc_anon, hdr, hash_lock);
2756ea8dc4b6Seschrock 		if (HDR_IN_HASH_TABLE(hdr))
2757ea8dc4b6Seschrock 			buf_hash_remove(hdr);
2758fa9e4066Sahrens 		freeable = refcount_is_zero(&hdr->b_refcnt);
2759fa9e4066Sahrens 	}
2760fa9e4066Sahrens 
2761ea8dc4b6Seschrock 	/*
276213506d1eSmaybee 	 * Broadcast before we drop the hash_lock to avoid the possibility
276313506d1eSmaybee 	 * that the hdr (and hence the cv) might be freed before we get to
276413506d1eSmaybee 	 * the cv_broadcast().
2765ea8dc4b6Seschrock 	 */
2766ea8dc4b6Seschrock 	cv_broadcast(&hdr->b_cv);
2767ea8dc4b6Seschrock 
2768bbf4a8dfSmaybee 	if (hash_lock) {
276944eda4d7Smaybee 		mutex_exit(hash_lock);
2770fa9e4066Sahrens 	} else {
2771fa9e4066Sahrens 		/*
2772fa9e4066Sahrens 		 * This block was freed while we waited for the read to
2773fa9e4066Sahrens 		 * complete.  It has been removed from the hash table and
2774fa9e4066Sahrens 		 * moved to the anonymous state (so that it won't show up
2775fa9e4066Sahrens 		 * in the cache).
2776fa9e4066Sahrens 		 */
277744cb6abcSbmc 		ASSERT3P(hdr->b_state, ==, arc_anon);
2778fa9e4066Sahrens 		freeable = refcount_is_zero(&hdr->b_refcnt);
2779fa9e4066Sahrens 	}
2780fa9e4066Sahrens 
2781fa9e4066Sahrens 	/* execute each callback and free its structure */
2782fa9e4066Sahrens 	while ((acb = callback_list) != NULL) {
2783fa9e4066Sahrens 		if (acb->acb_done)
2784fa9e4066Sahrens 			acb->acb_done(zio, acb->acb_buf, acb->acb_private);
2785fa9e4066Sahrens 
2786fa9e4066Sahrens 		if (acb->acb_zio_dummy != NULL) {
2787fa9e4066Sahrens 			acb->acb_zio_dummy->io_error = zio->io_error;
2788fa9e4066Sahrens 			zio_nowait(acb->acb_zio_dummy);
2789fa9e4066Sahrens 		}
2790fa9e4066Sahrens 
2791fa9e4066Sahrens 		callback_list = acb->acb_next;
2792fa9e4066Sahrens 		kmem_free(acb, sizeof (arc_callback_t));
2793fa9e4066Sahrens 	}
2794fa9e4066Sahrens 
2795fa9e4066Sahrens 	if (freeable)
2796ea8dc4b6Seschrock 		arc_hdr_destroy(hdr);
2797fa9e4066Sahrens }
2798fa9e4066Sahrens 
2799fa9e4066Sahrens /*
2800fc98fea5SBart Coddens  * "Read" the block at the specified DVA (in bp) via the
2801fa9e4066Sahrens  * cache.  If the block is found in the cache, invoke the provided
2802fa9e4066Sahrens  * callback immediately and return.  Note that the `zio' parameter
2803fa9e4066Sahrens  * in the callback will be NULL in this case, since no IO was
2804fa9e4066Sahrens  * required.  If the block is not in the cache pass the read request
2805fa9e4066Sahrens  * on to the spa with a substitute callback function, so that the
2806fa9e4066Sahrens  * requested block will be added to the cache.
2807fa9e4066Sahrens  *
2808fa9e4066Sahrens  * If a read request arrives for a block that has a read in-progress,
2809fa9e4066Sahrens  * either wait for the in-progress read to complete (and return the
2810fa9e4066Sahrens  * results); or, if this is a read with a "done" func, add a record
2811fa9e4066Sahrens  * to the read to invoke the "done" func when the read completes,
2812fa9e4066Sahrens  * and return; or just return.
2813fa9e4066Sahrens  *
2814fa9e4066Sahrens  * arc_read_done() will invoke all the requested "done" functions
2815fa9e4066Sahrens  * for readers of this block.
2816fa9e4066Sahrens  */
2817fa9e4066Sahrens int
28181b912ec7SGeorge Wilson arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done,
28191b912ec7SGeorge Wilson     void *private, int priority, int zio_flags, uint32_t *arc_flags,
28201b912ec7SGeorge Wilson     const zbookmark_t *zb)
2821fa9e4066Sahrens {
2822fa9e4066Sahrens 	arc_buf_hdr_t *hdr;
2823d5285caeSGeorge Wilson 	arc_buf_t *buf = NULL;
2824fa9e4066Sahrens 	kmutex_t *hash_lock;
2825fa94a07fSbrendan 	zio_t *rzio;
2826e9103aaeSGarrett D'Amore 	uint64_t guid = spa_load_guid(spa);
2827fa9e4066Sahrens 
2828fa9e4066Sahrens top:
2829b24ab676SJeff Bonwick 	hdr = buf_hash_find(guid, BP_IDENTITY(bp), BP_PHYSICAL_BIRTH(bp),
2830b24ab676SJeff Bonwick 	    &hash_lock);
2831ea8dc4b6Seschrock 	if (hdr && hdr->b_datacnt > 0) {
2832fa9e4066Sahrens 
283313506d1eSmaybee 		*arc_flags |= ARC_CACHED;
283413506d1eSmaybee 
2835fa9e4066Sahrens 		if (HDR_IO_IN_PROGRESS(hdr)) {
283613506d1eSmaybee 
283713506d1eSmaybee 			if (*arc_flags & ARC_WAIT) {
283813506d1eSmaybee 				cv_wait(&hdr->b_cv, hash_lock);
283913506d1eSmaybee 				mutex_exit(hash_lock);
284013506d1eSmaybee 				goto top;
284113506d1eSmaybee 			}
284213506d1eSmaybee 			ASSERT(*arc_flags & ARC_NOWAIT);
284313506d1eSmaybee 
284413506d1eSmaybee 			if (done) {
2845fa9e4066Sahrens 				arc_callback_t	*acb = NULL;
2846fa9e4066Sahrens 
2847fa9e4066Sahrens 				acb = kmem_zalloc(sizeof (arc_callback_t),
2848fa9e4066Sahrens 				    KM_SLEEP);
2849fa9e4066Sahrens 				acb->acb_done = done;
2850fa9e4066Sahrens 				acb->acb_private = private;
2851fa9e4066Sahrens 				if (pio != NULL)
2852fa9e4066Sahrens 					acb->acb_zio_dummy = zio_null(pio,
2853a3f829aeSBill Moore 					    spa, NULL, NULL, NULL, zio_flags);
2854fa9e4066Sahrens 
2855fa9e4066Sahrens 				ASSERT(acb->acb_done != NULL);
2856fa9e4066Sahrens 				acb->acb_next = hdr->b_acb;
2857fa9e4066Sahrens 				hdr->b_acb = acb;
2858fa9e4066Sahrens 				add_reference(hdr, hash_lock, private);
2859fa9e4066Sahrens 				mutex_exit(hash_lock);
2860fa9e4066Sahrens 				return (0);
2861fa9e4066Sahrens 			}
2862fa9e4066Sahrens 			mutex_exit(hash_lock);
2863fa9e4066Sahrens 			return (0);
2864fa9e4066Sahrens 		}
2865fa9e4066Sahrens 
286644cb6abcSbmc 		ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
2867fa9e4066Sahrens 
2868ea8dc4b6Seschrock 		if (done) {
286944eda4d7Smaybee 			add_reference(hdr, hash_lock, private);
2870ea8dc4b6Seschrock 			/*
2871ea8dc4b6Seschrock 			 * If this block is already in use, create a new
2872ea8dc4b6Seschrock 			 * copy of the data so that we will be guaranteed
2873ea8dc4b6Seschrock 			 * that arc_release() will always succeed.
2874ea8dc4b6Seschrock 			 */
2875fa9e4066Sahrens 			buf = hdr->b_buf;
2876ea8dc4b6Seschrock 			ASSERT(buf);
2877ea8dc4b6Seschrock 			ASSERT(buf->b_data);
287844eda4d7Smaybee 			if (HDR_BUF_AVAILABLE(hdr)) {
2879ea8dc4b6Seschrock 				ASSERT(buf->b_efunc == NULL);
2880ea8dc4b6Seschrock 				hdr->b_flags &= ~ARC_BUF_AVAILABLE;
288144eda4d7Smaybee 			} else {
288244eda4d7Smaybee 				buf = arc_buf_clone(buf);
2883ea8dc4b6Seschrock 			}
2884b24ab676SJeff Bonwick 
288513506d1eSmaybee 		} else if (*arc_flags & ARC_PREFETCH &&
288613506d1eSmaybee 		    refcount_count(&hdr->b_refcnt) == 0) {
288713506d1eSmaybee 			hdr->b_flags |= ARC_PREFETCH;
2888fa9e4066Sahrens 		}
2889fa9e4066Sahrens 		DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
289044eda4d7Smaybee 		arc_access(hdr, hash_lock);
28913baa08fcSek 		if (*arc_flags & ARC_L2CACHE)
28923baa08fcSek 			hdr->b_flags |= ARC_L2CACHE;
2893*aad02571SSaso Kiselkov 		if (*arc_flags & ARC_L2COMPRESS)
2894*aad02571SSaso Kiselkov 			hdr->b_flags |= ARC_L2COMPRESS;
289544eda4d7Smaybee 		mutex_exit(hash_lock);
289644cb6abcSbmc 		ARCSTAT_BUMP(arcstat_hits);
289744cb6abcSbmc 		ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
289844cb6abcSbmc 		    demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
289944cb6abcSbmc 		    data, metadata, hits);
290044cb6abcSbmc 
2901fa9e4066Sahrens 		if (done)
2902fa9e4066Sahrens 			done(NULL, buf, private);
2903fa9e4066Sahrens 	} else {
2904fa9e4066Sahrens 		uint64_t size = BP_GET_LSIZE(bp);
2905fa9e4066Sahrens 		arc_callback_t	*acb;
29063a737e0dSbrendan 		vdev_t *vd = NULL;
2907d5285caeSGeorge Wilson 		uint64_t addr = 0;
29085a98e54bSBrendan Gregg - Sun Microsystems 		boolean_t devw = B_FALSE;
2909fa9e4066Sahrens 
2910fa9e4066Sahrens 		if (hdr == NULL) {
2911fa9e4066Sahrens 			/* this block is not in the cache */
2912fa9e4066Sahrens 			arc_buf_hdr_t	*exists;
2913ad23a2dbSjohansen 			arc_buf_contents_t type = BP_GET_BUFC_TYPE(bp);
2914ad23a2dbSjohansen 			buf = arc_buf_alloc(spa, size, private, type);
2915fa9e4066Sahrens 			hdr = buf->b_hdr;
2916fa9e4066Sahrens 			hdr->b_dva = *BP_IDENTITY(bp);
2917b24ab676SJeff Bonwick 			hdr->b_birth = BP_PHYSICAL_BIRTH(bp);
2918fa9e4066Sahrens 			hdr->b_cksum0 = bp->blk_cksum.zc_word[0];
2919fa9e4066Sahrens 			exists = buf_hash_insert(hdr, &hash_lock);
2920fa9e4066Sahrens 			if (exists) {
2921fa9e4066Sahrens 				/* somebody beat us to the hash insert */
2922fa9e4066Sahrens 				mutex_exit(hash_lock);
29233f9d6ad7SLin Ling 				buf_discard_identity(hdr);
2924ea8dc4b6Seschrock 				(void) arc_buf_remove_ref(buf, private);
2925fa9e4066Sahrens 				goto top; /* restart the IO request */
2926fa9e4066Sahrens 			}
292713506d1eSmaybee 			/* if this is a prefetch, we don't have a reference */
292813506d1eSmaybee 			if (*arc_flags & ARC_PREFETCH) {
292913506d1eSmaybee 				(void) remove_reference(hdr, hash_lock,
293013506d1eSmaybee 				    private);
293113506d1eSmaybee 				hdr->b_flags |= ARC_PREFETCH;
293213506d1eSmaybee 			}
29333baa08fcSek 			if (*arc_flags & ARC_L2CACHE)
29343baa08fcSek 				hdr->b_flags |= ARC_L2CACHE;
2935*aad02571SSaso Kiselkov 			if (*arc_flags & ARC_L2COMPRESS)
2936*aad02571SSaso Kiselkov 				hdr->b_flags |= ARC_L2COMPRESS;
293713506d1eSmaybee 			if (BP_GET_LEVEL(bp) > 0)
293813506d1eSmaybee 				hdr->b_flags |= ARC_INDIRECT;
2939fa9e4066Sahrens 		} else {
2940fa9e4066Sahrens 			/* this block is in the ghost cache */
2941ea8dc4b6Seschrock 			ASSERT(GHOST_STATE(hdr->b_state));
2942ea8dc4b6Seschrock 			ASSERT(!HDR_IO_IN_PROGRESS(hdr));
2943fb09f5aaSMadhav Suresh 			ASSERT0(refcount_count(&hdr->b_refcnt));
2944ea8dc4b6Seschrock 			ASSERT(hdr->b_buf == NULL);
294513506d1eSmaybee 
294613506d1eSmaybee 			/* if this is a prefetch, we don't have a reference */
294713506d1eSmaybee 			if (*arc_flags & ARC_PREFETCH)
294813506d1eSmaybee 				hdr->b_flags |= ARC_PREFETCH;
294913506d1eSmaybee 			else
295013506d1eSmaybee 				add_reference(hdr, hash_lock, private);
29513baa08fcSek 			if (*arc_flags & ARC_L2CACHE)
29523baa08fcSek 				hdr->b_flags |= ARC_L2CACHE;
2953*aad02571SSaso Kiselkov 			if (*arc_flags & ARC_L2COMPRESS)
2954*aad02571SSaso Kiselkov 				hdr->b_flags |= ARC_L2COMPRESS;
29551ab7f2deSmaybee 			buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
2956fa9e4066Sahrens 			buf->b_hdr = hdr;
295744eda4d7Smaybee 			buf->b_data = NULL;
2958ea8dc4b6Seschrock 			buf->b_efunc = NULL;
2959ea8dc4b6Seschrock 			buf->b_private = NULL;
2960fa9e4066Sahrens 			buf->b_next = NULL;
2961fa9e4066Sahrens 			hdr->b_buf = buf;
2962ea8dc4b6Seschrock 			ASSERT(hdr->b_datacnt == 0);
2963ea8dc4b6Seschrock 			hdr->b_datacnt = 1;
29645614b00aSWilliam Gorrell 			arc_get_data_buf(buf);
29657e453561SWilliam Gorrell 			arc_access(hdr, hash_lock);
2966fa9e4066Sahrens 		}
2967fa9e4066Sahrens 
29685614b00aSWilliam Gorrell 		ASSERT(!GHOST_STATE(hdr->b_state));
29695614b00aSWilliam Gorrell 
2970fa9e4066Sahrens 		acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP);
2971fa9e4066Sahrens 		acb->acb_done = done;
2972fa9e4066Sahrens 		acb->acb_private = private;
2973fa9e4066Sahrens 
2974fa9e4066Sahrens 		ASSERT(hdr->b_acb == NULL);
2975fa9e4066Sahrens 		hdr->b_acb = acb;
2976fa9e4066Sahrens 		hdr->b_flags |= ARC_IO_IN_PROGRESS;
2977fa9e4066Sahrens 
2978e14bb325SJeff Bonwick 		if (HDR_L2CACHE(hdr) && hdr->b_l2hdr != NULL &&
2979e14bb325SJeff Bonwick 		    (vd = hdr->b_l2hdr->b_dev->l2ad_vdev) != NULL) {
29805a98e54bSBrendan Gregg - Sun Microsystems 			devw = hdr->b_l2hdr->b_dev->l2ad_writing;
29813a737e0dSbrendan 			addr = hdr->b_l2hdr->b_daddr;
2982e14bb325SJeff Bonwick 			/*
2983e14bb325SJeff Bonwick 			 * Lock out device removal.
2984e14bb325SJeff Bonwick 			 */
2985e14bb325SJeff Bonwick 			if (vdev_is_dead(vd) ||
2986e14bb325SJeff Bonwick 			    !spa_config_tryenter(spa, SCL_L2ARC, vd, RW_READER))
2987e14bb325SJeff Bonwick 				vd = NULL;
29883a737e0dSbrendan 		}
29893a737e0dSbrendan 
29903a737e0dSbrendan 		mutex_exit(hash_lock);
29913a737e0dSbrendan 
2992fa9e4066Sahrens 		ASSERT3U(hdr->b_size, ==, size);
29935c28183bSBrendan Gregg - Sun Microsystems 		DTRACE_PROBE4(arc__miss, arc_buf_hdr_t *, hdr, blkptr_t *, bp,
29945c28183bSBrendan Gregg - Sun Microsystems 		    uint64_t, size, zbookmark_t *, zb);
299544cb6abcSbmc 		ARCSTAT_BUMP(arcstat_misses);
299644cb6abcSbmc 		ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
299744cb6abcSbmc 		    demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
299844cb6abcSbmc 		    data, metadata, misses);
2999ea8dc4b6Seschrock 
30005a98e54bSBrendan Gregg - Sun Microsystems 		if (vd != NULL && l2arc_ndev != 0 && !(l2arc_norw && devw)) {
3001fa94a07fSbrendan 			/*
3002fa94a07fSbrendan 			 * Read from the L2ARC if the following are true:
30033a737e0dSbrendan 			 * 1. The L2ARC vdev was previously cached.
30043a737e0dSbrendan 			 * 2. This buffer still has L2ARC metadata.
30053a737e0dSbrendan 			 * 3. This buffer isn't currently writing to the L2ARC.
30063a737e0dSbrendan 			 * 4. The L2ARC entry wasn't evicted, which may
30073a737e0dSbrendan 			 *    also have invalidated the vdev.
30085a98e54bSBrendan Gregg - Sun Microsystems 			 * 5. This isn't a prefetch while l2arc_noprefetch is set.
3009fa94a07fSbrendan 			 */
3010e14bb325SJeff Bonwick 			if (hdr->b_l2hdr != NULL &&
30115a98e54bSBrendan Gregg - Sun Microsystems 			    !HDR_L2_WRITING(hdr) && !HDR_L2_EVICTED(hdr) &&
30125a98e54bSBrendan Gregg - Sun Microsystems 			    !(l2arc_noprefetch && HDR_PREFETCH(hdr))) {
3013fa94a07fSbrendan 				l2arc_read_callback_t *cb;
3014fa94a07fSbrendan 
3015c5904d13Seschrock 				DTRACE_PROBE1(l2arc__hit, arc_buf_hdr_t *, hdr);
3016c5904d13Seschrock 				ARCSTAT_BUMP(arcstat_l2_hits);
3017c5904d13Seschrock 
3018fa94a07fSbrendan 				cb = kmem_zalloc(sizeof (l2arc_read_callback_t),
3019fa94a07fSbrendan 				    KM_SLEEP);
3020fa94a07fSbrendan 				cb->l2rcb_buf = buf;
3021fa94a07fSbrendan 				cb->l2rcb_spa = spa;
3022fa94a07fSbrendan 				cb->l2rcb_bp = *bp;
3023fa94a07fSbrendan 				cb->l2rcb_zb = *zb;
30243baa08fcSek 				cb->l2rcb_flags = zio_flags;
3025*aad02571SSaso Kiselkov 				cb->l2rcb_compress = hdr->b_l2hdr->b_compress;
3026fa94a07fSbrendan 
3027d5285caeSGeorge Wilson 				ASSERT(addr >= VDEV_LABEL_START_SIZE &&
3028d5285caeSGeorge Wilson 				    addr + size < vd->vdev_psize -
3029d5285caeSGeorge Wilson 				    VDEV_LABEL_END_SIZE);
3030d5285caeSGeorge Wilson 
3031fa94a07fSbrendan 				/*
3032e14bb325SJeff Bonwick 				 * l2arc read.  The SCL_L2ARC lock will be
3033e14bb325SJeff Bonwick 				 * released by l2arc_read_done().
3034*aad02571SSaso Kiselkov 				 * Issue a null zio if the underlying buffer
3035*aad02571SSaso Kiselkov 				 * was squashed to zero size by compression.
3036fa94a07fSbrendan 				 */
3037*aad02571SSaso Kiselkov 				if (hdr->b_l2hdr->b_compress ==
3038*aad02571SSaso Kiselkov 				    ZIO_COMPRESS_EMPTY) {
3039*aad02571SSaso Kiselkov 					rzio = zio_null(pio, spa, vd,
3040*aad02571SSaso Kiselkov 					    l2arc_read_done, cb,
3041*aad02571SSaso Kiselkov 					    zio_flags | ZIO_FLAG_DONT_CACHE |
3042*aad02571SSaso Kiselkov 					    ZIO_FLAG_CANFAIL |
3043*aad02571SSaso Kiselkov 					    ZIO_FLAG_DONT_PROPAGATE |
3044*aad02571SSaso Kiselkov 					    ZIO_FLAG_DONT_RETRY);
3045*aad02571SSaso Kiselkov 				} else {
3046*aad02571SSaso Kiselkov 					rzio = zio_read_phys(pio, vd, addr,
3047*aad02571SSaso Kiselkov 					    hdr->b_l2hdr->b_asize,
3048*aad02571SSaso Kiselkov 					    buf->b_data, ZIO_CHECKSUM_OFF,
3049*aad02571SSaso Kiselkov 					    l2arc_read_done, cb, priority,
3050*aad02571SSaso Kiselkov 					    zio_flags | ZIO_FLAG_DONT_CACHE |
3051*aad02571SSaso Kiselkov 					    ZIO_FLAG_CANFAIL |
3052*aad02571SSaso Kiselkov 					    ZIO_FLAG_DONT_PROPAGATE |
3053*aad02571SSaso Kiselkov 					    ZIO_FLAG_DONT_RETRY, B_FALSE);
3054*aad02571SSaso Kiselkov 				}
3055fa94a07fSbrendan 				DTRACE_PROBE2(l2arc__read, vdev_t *, vd,
3056fa94a07fSbrendan 				    zio_t *, rzio);
3057*aad02571SSaso Kiselkov 				ARCSTAT_INCR(arcstat_l2_read_bytes,
3058*aad02571SSaso Kiselkov 				    hdr->b_l2hdr->b_asize);
3059fa94a07fSbrendan 
30603a737e0dSbrendan 				if (*arc_flags & ARC_NOWAIT) {
30613a737e0dSbrendan 					zio_nowait(rzio);
30623a737e0dSbrendan 					return (0);
30633a737e0dSbrendan 				}
3064fa94a07fSbrendan 
30653a737e0dSbrendan 				ASSERT(*arc_flags & ARC_WAIT);
30663a737e0dSbrendan 				if (zio_wait(rzio) == 0)
30673a737e0dSbrendan 					return (0);
30683a737e0dSbrendan 
30693a737e0dSbrendan 				/* l2arc read error; goto zio_read() */
3070fa94a07fSbrendan 			} else {
3071fa94a07fSbrendan 				DTRACE_PROBE1(l2arc__miss,
3072fa94a07fSbrendan 				    arc_buf_hdr_t *, hdr);
3073fa94a07fSbrendan 				ARCSTAT_BUMP(arcstat_l2_misses);
3074fa94a07fSbrendan 				if (HDR_L2_WRITING(hdr))
3075fa94a07fSbrendan 					ARCSTAT_BUMP(arcstat_l2_rw_clash);
3076e14bb325SJeff Bonwick 				spa_config_exit(spa, SCL_L2ARC, vd);
3077fa94a07fSbrendan 			}
30785a98e54bSBrendan Gregg - Sun Microsystems 		} else {
307976a25fafSBill Moore 			if (vd != NULL)
308076a25fafSBill Moore 				spa_config_exit(spa, SCL_L2ARC, vd);
30815a98e54bSBrendan Gregg - Sun Microsystems 			if (l2arc_ndev != 0) {
30825a98e54bSBrendan Gregg - Sun Microsystems 				DTRACE_PROBE1(l2arc__miss,
30835a98e54bSBrendan Gregg - Sun Microsystems 				    arc_buf_hdr_t *, hdr);
30845a98e54bSBrendan Gregg - Sun Microsystems 				ARCSTAT_BUMP(arcstat_l2_misses);
30855a98e54bSBrendan Gregg - Sun Microsystems 			}
3086fa94a07fSbrendan 		}
3087c5904d13Seschrock 
3088fa9e4066Sahrens 		rzio = zio_read(pio, spa, bp, buf->b_data, size,
30893baa08fcSek 		    arc_read_done, buf, priority, zio_flags, zb);
3090fa9e4066Sahrens 
309113506d1eSmaybee 		if (*arc_flags & ARC_WAIT)
3092fa9e4066Sahrens 			return (zio_wait(rzio));
3093fa9e4066Sahrens 
309413506d1eSmaybee 		ASSERT(*arc_flags & ARC_NOWAIT);
3095fa9e4066Sahrens 		zio_nowait(rzio);
3096fa9e4066Sahrens 	}
3097fa9e4066Sahrens 	return (0);
3098fa9e4066Sahrens }
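
/*
 * Editor's note -- hypothetical usage sketch, not from the original
 * file: a synchronous cached read through the interface above, using
 * the generic arc_getbuf_func callback defined earlier.  The function
 * name and error handling are invented for illustration.
 */
static int
example_read_block(spa_t *spa, const blkptr_t *bp, const zbookmark_t *zb)
{
	arc_buf_t *abuf = NULL;
	uint32_t aflags = ARC_WAIT;
	int error;

	error = arc_read(NULL, spa, bp, arc_getbuf_func, &abuf,
	    ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL, &aflags, zb);
	if (error == 0 && abuf != NULL) {
		/* consume abuf->b_data for abuf->b_hdr->b_size bytes... */
		VERIFY(arc_buf_remove_ref(abuf, &abuf));
	}
	return (error);
}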
3099fa9e4066Sahrens 
3100ea8dc4b6Seschrock void
3101ea8dc4b6Seschrock arc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *private)
3102ea8dc4b6Seschrock {
3103ea8dc4b6Seschrock 	ASSERT(buf->b_hdr != NULL);
310444cb6abcSbmc 	ASSERT(buf->b_hdr->b_state != arc_anon);
3105ea8dc4b6Seschrock 	ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt) || func == NULL);
3106b24ab676SJeff Bonwick 	ASSERT(buf->b_efunc == NULL);
3107b24ab676SJeff Bonwick 	ASSERT(!HDR_BUF_AVAILABLE(buf->b_hdr));
3108b24ab676SJeff Bonwick 
3109ea8dc4b6Seschrock 	buf->b_efunc = func;
3110ea8dc4b6Seschrock 	buf->b_private = private;
3111ea8dc4b6Seschrock }
3112ea8dc4b6Seschrock 
3113ea8dc4b6Seschrock /*
3114ea8dc4b6Seschrock  * This is used by the DMU to let the ARC know that a buffer is
3115ea8dc4b6Seschrock  * being evicted, so the ARC should clean up.  If this arc buf
3116ea8dc4b6Seschrock  * is not yet in the evicted state, it will be put there.
3117ea8dc4b6Seschrock  */
3118ea8dc4b6Seschrock int
3119ea8dc4b6Seschrock arc_buf_evict(arc_buf_t *buf)
3120ea8dc4b6Seschrock {
312140d7d650Smaybee 	arc_buf_hdr_t *hdr;
3122ea8dc4b6Seschrock 	kmutex_t *hash_lock;
3123ea8dc4b6Seschrock 	arc_buf_t **bufp;
3124ea8dc4b6Seschrock 
31253f9d6ad7SLin Ling 	mutex_enter(&buf->b_evict_lock);
312640d7d650Smaybee 	hdr = buf->b_hdr;
3127ea8dc4b6Seschrock 	if (hdr == NULL) {
3128ea8dc4b6Seschrock 		/*
3129ea8dc4b6Seschrock 		 * We are in arc_do_user_evicts().
3130ea8dc4b6Seschrock 		 */
3131ea8dc4b6Seschrock 		ASSERT(buf->b_data == NULL);
31323f9d6ad7SLin Ling 		mutex_exit(&buf->b_evict_lock);
3133ea8dc4b6Seschrock 		return (0);
31346f83844dSMark Maybee 	} else if (buf->b_data == NULL) {
31356f83844dSMark Maybee 		arc_buf_t copy = *buf; /* structure assignment */
31369b23f181Smaybee 		/*
31376f83844dSMark Maybee 		 * We are on the eviction list; process this buffer now
31386f83844dSMark Maybee 		 * but let arc_do_user_evicts() do the reaping.
31399b23f181Smaybee 		 */
31406f83844dSMark Maybee 		buf->b_efunc = NULL;
31413f9d6ad7SLin Ling 		mutex_exit(&buf->b_evict_lock);
31426f83844dSMark Maybee 		VERIFY(copy.b_efunc(&copy) == 0);
31436f83844dSMark Maybee 		return (1);
31449b23f181Smaybee 	}
31456f83844dSMark Maybee 	hash_lock = HDR_LOCK(hdr);
31466f83844dSMark Maybee 	mutex_enter(hash_lock);
31473f9d6ad7SLin Ling 	hdr = buf->b_hdr;
31483f9d6ad7SLin Ling 	ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
31499b23f181Smaybee 
31509b23f181Smaybee 	ASSERT3U(refcount_count(&hdr->b_refcnt), <, hdr->b_datacnt);
315144cb6abcSbmc 	ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
3152ea8dc4b6Seschrock 
3153ea8dc4b6Seschrock 	/*
3154ea8dc4b6Seschrock 	 * Pull this buffer off of the hdr
3155ea8dc4b6Seschrock 	 */
3156ea8dc4b6Seschrock 	bufp = &hdr->b_buf;
3157ea8dc4b6Seschrock 	while (*bufp != buf)
3158ea8dc4b6Seschrock 		bufp = &(*bufp)->b_next;
3159ea8dc4b6Seschrock 	*bufp = buf->b_next;
3160ea8dc4b6Seschrock 
3161ea8dc4b6Seschrock 	ASSERT(buf->b_data != NULL);
316244eda4d7Smaybee 	arc_buf_destroy(buf, FALSE, FALSE);
3163ea8dc4b6Seschrock 
3164ea8dc4b6Seschrock 	if (hdr->b_datacnt == 0) {
3165ea8dc4b6Seschrock 		arc_state_t *old_state = hdr->b_state;
3166ea8dc4b6Seschrock 		arc_state_t *evicted_state;
3167ea8dc4b6Seschrock 
31683f9d6ad7SLin Ling 		ASSERT(hdr->b_buf == NULL);
3169ea8dc4b6Seschrock 		ASSERT(refcount_is_zero(&hdr->b_refcnt));
3170ea8dc4b6Seschrock 
3171ea8dc4b6Seschrock 		evicted_state =
317244cb6abcSbmc 		    (old_state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost;
3173ea8dc4b6Seschrock 
317444cb6abcSbmc 		mutex_enter(&old_state->arcs_mtx);
317544cb6abcSbmc 		mutex_enter(&evicted_state->arcs_mtx);
3176ea8dc4b6Seschrock 
3177ea8dc4b6Seschrock 		arc_change_state(evicted_state, hdr, hash_lock);
3178ea8dc4b6Seschrock 		ASSERT(HDR_IN_HASH_TABLE(hdr));
3179fa94a07fSbrendan 		hdr->b_flags |= ARC_IN_HASH_TABLE;
3180fa94a07fSbrendan 		hdr->b_flags &= ~ARC_BUF_AVAILABLE;
3181ea8dc4b6Seschrock 
318244cb6abcSbmc 		mutex_exit(&evicted_state->arcs_mtx);
318344cb6abcSbmc 		mutex_exit(&old_state->arcs_mtx);
3184ea8dc4b6Seschrock 	}
3185ea8dc4b6Seschrock 	mutex_exit(hash_lock);
31863f9d6ad7SLin Ling 	mutex_exit(&buf->b_evict_lock);
3187dd6ef538Smaybee 
3188ea8dc4b6Seschrock 	VERIFY(buf->b_efunc(buf) == 0);
3189ea8dc4b6Seschrock 	buf->b_efunc = NULL;
3190ea8dc4b6Seschrock 	buf->b_private = NULL;
3191ea8dc4b6Seschrock 	buf->b_hdr = NULL;
31923f9d6ad7SLin Ling 	buf->b_next = NULL;
3193ea8dc4b6Seschrock 	kmem_cache_free(buf_cache, buf);
3194ea8dc4b6Seschrock 	return (1);
3195ea8dc4b6Seschrock }
3196ea8dc4b6Seschrock 
3197fa9e4066Sahrens /*
3198fa9e4066Sahrens  * Release this buffer from the cache.  This must be done
3199fa9e4066Sahrens  * after a read and prior to modifying the buffer contents.
3200fa9e4066Sahrens  * If the buffer has more than one reference, we must make
3201088f3894Sahrens  * a new hdr for the buffer.
3202fa9e4066Sahrens  */
3203fa9e4066Sahrens void
3204fa9e4066Sahrens arc_release(arc_buf_t *buf, void *tag)
3205fa9e4066Sahrens {
32066f83844dSMark Maybee 	arc_buf_hdr_t *hdr;
32073f9d6ad7SLin Ling 	kmutex_t *hash_lock = NULL;
32086f83844dSMark Maybee 	l2arc_buf_hdr_t *l2hdr;
3209fa94a07fSbrendan 	uint64_t buf_size;
3210fa9e4066Sahrens 
32113f9d6ad7SLin Ling 	/*
32123f9d6ad7SLin Ling 	 * It would be nice to assert that if it's DMU metadata (level >
32133f9d6ad7SLin Ling 	 * 0 || it's the dnode file), then it must be syncing context.
32143f9d6ad7SLin Ling 	 * But we don't know that information at this level.
32153f9d6ad7SLin Ling 	 */
32163f9d6ad7SLin Ling 
32173f9d6ad7SLin Ling 	mutex_enter(&buf->b_evict_lock);
32186f83844dSMark Maybee 	hdr = buf->b_hdr;
32196f83844dSMark Maybee 
3220fa9e4066Sahrens 	/* this buffer is not on any list */
3221fa9e4066Sahrens 	ASSERT(refcount_count(&hdr->b_refcnt) > 0);
3222fa9e4066Sahrens 
322344cb6abcSbmc 	if (hdr->b_state == arc_anon) {
3224fa9e4066Sahrens 		/* this buffer is already released */
3225ea8dc4b6Seschrock 		ASSERT(buf->b_efunc == NULL);
32260a95608cSBrendan Gregg - Sun Microsystems 	} else {
32270a95608cSBrendan Gregg - Sun Microsystems 		hash_lock = HDR_LOCK(hdr);
32280a95608cSBrendan Gregg - Sun Microsystems 		mutex_enter(hash_lock);
32293f9d6ad7SLin Ling 		hdr = buf->b_hdr;
32303f9d6ad7SLin Ling 		ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
3231fa9e4066Sahrens 	}
3232fa9e4066Sahrens 
32336f83844dSMark Maybee 	l2hdr = hdr->b_l2hdr;
32346f83844dSMark Maybee 	if (l2hdr) {
32356f83844dSMark Maybee 		mutex_enter(&l2arc_buflist_mtx);
32366f83844dSMark Maybee 		hdr->b_l2hdr = NULL;
32376f83844dSMark Maybee 	}
3238d5285caeSGeorge Wilson 	buf_size = hdr->b_size;
32396f83844dSMark Maybee 
3240ea8dc4b6Seschrock 	/*
3241ea8dc4b6Seschrock 	 * Do we have more than one buf?
3242ea8dc4b6Seschrock 	 */
32436f83844dSMark Maybee 	if (hdr->b_datacnt > 1) {
3244fa9e4066Sahrens 		arc_buf_hdr_t *nhdr;
3245fa9e4066Sahrens 		arc_buf_t **bufp;
3246fa9e4066Sahrens 		uint64_t blksz = hdr->b_size;
3247ac05c741SMark Maybee 		uint64_t spa = hdr->b_spa;
3248ad23a2dbSjohansen 		arc_buf_contents_t type = hdr->b_type;
3249fa94a07fSbrendan 		uint32_t flags = hdr->b_flags;
3250fa9e4066Sahrens 
32516f83844dSMark Maybee 		ASSERT(hdr->b_buf != buf || buf->b_next != NULL);
3252fa9e4066Sahrens 		/*
32533f9d6ad7SLin Ling 		 * Pull the data off of this hdr and attach it to
32543f9d6ad7SLin Ling 		 * a new anonymous hdr.
3255fa9e4066Sahrens 		 */
3256ea8dc4b6Seschrock 		(void) remove_reference(hdr, hash_lock, tag);
3257fa9e4066Sahrens 		bufp = &hdr->b_buf;
3258ea8dc4b6Seschrock 		while (*bufp != buf)
3259fa9e4066Sahrens 			bufp = &(*bufp)->b_next;
32603f9d6ad7SLin Ling 		*bufp = buf->b_next;
3261af2c4821Smaybee 		buf->b_next = NULL;
3262ea8dc4b6Seschrock 
326344cb6abcSbmc 		ASSERT3U(hdr->b_state->arcs_size, >=, hdr->b_size);
326444cb6abcSbmc 		atomic_add_64(&hdr->b_state->arcs_size, -hdr->b_size);
3265ea8dc4b6Seschrock 		if (refcount_is_zero(&hdr->b_refcnt)) {
32660e8c6158Smaybee 			uint64_t *size = &hdr->b_state->arcs_lsize[hdr->b_type];
32670e8c6158Smaybee 			ASSERT3U(*size, >=, hdr->b_size);
32680e8c6158Smaybee 			atomic_add_64(size, -hdr->b_size);
3269ea8dc4b6Seschrock 		}
32709253d63dSGeorge Wilson 
32719253d63dSGeorge Wilson 		/*
32729253d63dSGeorge Wilson 		 * We're releasing a duplicate user data buffer; update
32739253d63dSGeorge Wilson 		 * our statistics accordingly.
32749253d63dSGeorge Wilson 		 */
32759253d63dSGeorge Wilson 		if (hdr->b_type == ARC_BUFC_DATA) {
32769253d63dSGeorge Wilson 			ARCSTAT_BUMPDOWN(arcstat_duplicate_buffers);
32779253d63dSGeorge Wilson 			ARCSTAT_INCR(arcstat_duplicate_buffers_size,
32789253d63dSGeorge Wilson 			    -hdr->b_size);
32799253d63dSGeorge Wilson 		}
3280ea8dc4b6Seschrock 		hdr->b_datacnt -= 1;
3281c717a561Smaybee 		arc_cksum_verify(buf);
3282cd1c8b85SMatthew Ahrens 		arc_buf_unwatch(buf);
3283ea8dc4b6Seschrock 
3284fa9e4066Sahrens 		mutex_exit(hash_lock);
3285fa9e4066Sahrens 
32861ab7f2deSmaybee 		nhdr = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE);
3287fa9e4066Sahrens 		nhdr->b_size = blksz;
3288fa9e4066Sahrens 		nhdr->b_spa = spa;
3289ad23a2dbSjohansen 		nhdr->b_type = type;
3290fa9e4066Sahrens 		nhdr->b_buf = buf;
329144cb6abcSbmc 		nhdr->b_state = arc_anon;
3292fa9e4066Sahrens 		nhdr->b_arc_access = 0;
3293fa94a07fSbrendan 		nhdr->b_flags = flags & ARC_L2_WRITING;
3294fa94a07fSbrendan 		nhdr->b_l2hdr = NULL;
3295ea8dc4b6Seschrock 		nhdr->b_datacnt = 1;
3296c717a561Smaybee 		nhdr->b_freeze_cksum = NULL;
3297fa9e4066Sahrens 		(void) refcount_add(&nhdr->b_refcnt, tag);
3298af2c4821Smaybee 		buf->b_hdr = nhdr;
32993f9d6ad7SLin Ling 		mutex_exit(&buf->b_evict_lock);
330044cb6abcSbmc 		atomic_add_64(&arc_anon->arcs_size, blksz);
3301fa9e4066Sahrens 	} else {
33023f9d6ad7SLin Ling 		mutex_exit(&buf->b_evict_lock);
3303ea8dc4b6Seschrock 		ASSERT(refcount_count(&hdr->b_refcnt) == 1);
3304fa9e4066Sahrens 		ASSERT(!list_link_active(&hdr->b_arc_node));
3305fa9e4066Sahrens 		ASSERT(!HDR_IO_IN_PROGRESS(hdr));
33063f9d6ad7SLin Ling 		if (hdr->b_state != arc_anon)
33073f9d6ad7SLin Ling 			arc_change_state(arc_anon, hdr, hash_lock);
3308fa9e4066Sahrens 		hdr->b_arc_access = 0;
33093f9d6ad7SLin Ling 		if (hash_lock)
33103f9d6ad7SLin Ling 			mutex_exit(hash_lock);
3311fa94a07fSbrendan 
33123f9d6ad7SLin Ling 		buf_discard_identity(hdr);
3313c717a561Smaybee 		arc_buf_thaw(buf);
3314fa9e4066Sahrens 	}
3315ea8dc4b6Seschrock 	buf->b_efunc = NULL;
3316ea8dc4b6Seschrock 	buf->b_private = NULL;
3317fa94a07fSbrendan 
3318fa94a07fSbrendan 	if (l2hdr) {
3319*aad02571SSaso Kiselkov 		ARCSTAT_INCR(arcstat_l2_asize, -l2hdr->b_asize);
3320fa94a07fSbrendan 		list_remove(l2hdr->b_dev->l2ad_buflist, hdr);
3321fa94a07fSbrendan 		kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t));
3322fa94a07fSbrendan 		ARCSTAT_INCR(arcstat_l2_size, -buf_size);
3323fa94a07fSbrendan 		mutex_exit(&l2arc_buflist_mtx);
33246f83844dSMark Maybee 	}
3325fa9e4066Sahrens }
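
/*
 * Editor's note (conceptual sketch, with an invented tag): after
 * arc_release() the caller's buffer sits on a fresh anonymous header
 * and may be modified freely, while any other buffers that shared the
 * old header keep the cached identity.  Typical pattern:
 *
 *	arc_buf_t *buf = ...;		(obtained via arc_read())
 *	if (!arc_released(buf))
 *		arc_release(buf, my_tag);
 *	(buf->b_data is now safe to overwrite)
 */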
3326fa9e4066Sahrens 
3327fa9e4066Sahrens int
3328fa9e4066Sahrens arc_released(arc_buf_t *buf)
3329fa9e4066Sahrens {
33306f83844dSMark Maybee 	int released;
33316f83844dSMark Maybee 
33323f9d6ad7SLin Ling 	mutex_enter(&buf->b_evict_lock);
33336f83844dSMark Maybee 	released = (buf->b_data != NULL && buf->b_hdr->b_state == arc_anon);
33343f9d6ad7SLin Ling 	mutex_exit(&buf->b_evict_lock);
33356f83844dSMark Maybee 	return (released);
3336ea8dc4b6Seschrock }
3337ea8dc4b6Seschrock 
3338ea8dc4b6Seschrock int
3339ea8dc4b6Seschrock arc_has_callback(arc_buf_t *buf)
3340ea8dc4b6Seschrock {
33416f83844dSMark Maybee 	int callback;
33426f83844dSMark Maybee 
33433f9d6ad7SLin Ling 	mutex_enter(&buf->b_evict_lock);
33446f83844dSMark Maybee 	callback = (buf->b_efunc != NULL);
33453f9d6ad7SLin Ling 	mutex_exit(&buf->b_evict_lock);
33466f83844dSMark Maybee 	return (callback);
3347fa9e4066Sahrens }
3348fa9e4066Sahrens 
3349ea8dc4b6Seschrock #ifdef ZFS_DEBUG
3350ea8dc4b6Seschrock int
3351ea8dc4b6Seschrock arc_referenced(arc_buf_t *buf)
3352ea8dc4b6Seschrock {
33536f83844dSMark Maybee 	int referenced;
33546f83844dSMark Maybee 
33553f9d6ad7SLin Ling 	mutex_enter(&buf->b_evict_lock);
33566f83844dSMark Maybee 	referenced = (refcount_count(&buf->b_hdr->b_refcnt));
33573f9d6ad7SLin Ling 	mutex_exit(&buf->b_evict_lock);
33586f83844dSMark Maybee 	return (referenced);
3359ea8dc4b6Seschrock }
3360ea8dc4b6Seschrock #endif
3361ea8dc4b6Seschrock 
3362c717a561Smaybee static void
3363c717a561Smaybee arc_write_ready(zio_t *zio)
3364c717a561Smaybee {
3365c717a561Smaybee 	arc_write_callback_t *callback = zio->io_private;
3366c717a561Smaybee 	arc_buf_t *buf = callback->awcb_buf;
33670a4e9518Sgw 	arc_buf_hdr_t *hdr = buf->b_hdr;
3368c717a561Smaybee 
3369e14bb325SJeff Bonwick 	ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt));
3370e14bb325SJeff Bonwick 	callback->awcb_ready(zio, buf, callback->awcb_private);
3371e14bb325SJeff Bonwick 
33720a4e9518Sgw 	/*
33730a4e9518Sgw 	 * If the IO is already in progress, then this is a re-write
3374e14bb325SJeff Bonwick 	 * attempt, so we need to thaw and re-compute the cksum.
3375e14bb325SJeff Bonwick 	 * It is the responsibility of the callback to handle the
3376e14bb325SJeff Bonwick 	 * accounting for any re-write attempt.
33770a4e9518Sgw 	 */
33780a4e9518Sgw 	if (HDR_IO_IN_PROGRESS(hdr)) {
33790a4e9518Sgw 		mutex_enter(&hdr->b_freeze_lock);
33800a4e9518Sgw 		if (hdr->b_freeze_cksum != NULL) {
33810a4e9518Sgw 			kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t));
33820a4e9518Sgw 			hdr->b_freeze_cksum = NULL;
33830a4e9518Sgw 		}
33840a4e9518Sgw 		mutex_exit(&hdr->b_freeze_lock);
33850a4e9518Sgw 	}
3386fa94a07fSbrendan 	arc_cksum_compute(buf, B_FALSE);
33870a4e9518Sgw 	hdr->b_flags |= ARC_IO_IN_PROGRESS;
3388c717a561Smaybee }
3389c717a561Smaybee 
3390fa9e4066Sahrens static void
3391fa9e4066Sahrens arc_write_done(zio_t *zio)
3392fa9e4066Sahrens {
3393c717a561Smaybee 	arc_write_callback_t *callback = zio->io_private;
3394c717a561Smaybee 	arc_buf_t *buf = callback->awcb_buf;
3395c717a561Smaybee 	arc_buf_hdr_t *hdr = buf->b_hdr;
3396fa9e4066Sahrens 
3397b24ab676SJeff Bonwick 	ASSERT(hdr->b_acb == NULL);
3398b24ab676SJeff Bonwick 
3399b24ab676SJeff Bonwick 	if (zio->io_error == 0) {
3400b24ab676SJeff Bonwick 		hdr->b_dva = *BP_IDENTITY(zio->io_bp);
3401b24ab676SJeff Bonwick 		hdr->b_birth = BP_PHYSICAL_BIRTH(zio->io_bp);
3402b24ab676SJeff Bonwick 		hdr->b_cksum0 = zio->io_bp->blk_cksum.zc_word[0];
3403b24ab676SJeff Bonwick 	} else {
3404b24ab676SJeff Bonwick 		ASSERT(BUF_EMPTY(hdr));
3405b24ab676SJeff Bonwick 	}
3406fa9e4066Sahrens 
3407ea8dc4b6Seschrock 	/*
3408ea8dc4b6Seschrock 	 * If the block to be written was all-zero, we may have
3409ea8dc4b6Seschrock 	 * compressed it away.  In this case no write was performed
34103f9d6ad7SLin Ling 	 * so there will be no dva/birth/checksum.  The buffer must
34113f9d6ad7SLin Ling 	 * therefore remain anonymous (and uncached).
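	 * (For example, writing a block of zeros with compression enabled
	 * allocates nothing on disk; the block pointer becomes a hole.)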
3412ea8dc4b6Seschrock 	 */
3413fa9e4066Sahrens 	if (!BUF_EMPTY(hdr)) {
3414fa9e4066Sahrens 		arc_buf_hdr_t *exists;
3415fa9e4066Sahrens 		kmutex_t *hash_lock;
3416fa9e4066Sahrens 
3417b24ab676SJeff Bonwick 		ASSERT(zio->io_error == 0);
3418b24ab676SJeff Bonwick 
34196b4acc8bSahrens 		arc_cksum_verify(buf);
34206b4acc8bSahrens 
3421fa9e4066Sahrens 		exists = buf_hash_insert(hdr, &hash_lock);
3422fa9e4066Sahrens 		if (exists) {
3423fa9e4066Sahrens 			/*
3424fa9e4066Sahrens 			 * This can only happen if we overwrite for
3425fa9e4066Sahrens 			 * sync-to-convergence, because we remove
3426fa9e4066Sahrens 			 * buffers from the hash table when we arc_free().
3427fa9e4066Sahrens 			 */
3428b24ab676SJeff Bonwick 			if (zio->io_flags & ZIO_FLAG_IO_REWRITE) {
3429b24ab676SJeff Bonwick 				if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp))
3430b24ab676SJeff Bonwick 					panic("bad overwrite, hdr=%p exists=%p",
3431b24ab676SJeff Bonwick 					    (void *)hdr, (void *)exists);
3432b24ab676SJeff Bonwick 				ASSERT(refcount_is_zero(&exists->b_refcnt));
3433b24ab676SJeff Bonwick 				arc_change_state(arc_anon, exists, hash_lock);
3434b24ab676SJeff Bonwick 				mutex_exit(hash_lock);
3435b24ab676SJeff Bonwick 				arc_hdr_destroy(exists);
3436b24ab676SJeff Bonwick 				exists = buf_hash_insert(hdr, &hash_lock);
3437b24ab676SJeff Bonwick 				ASSERT3P(exists, ==, NULL);
343880901aeaSGeorge Wilson 			} else if (zio->io_flags & ZIO_FLAG_NOPWRITE) {
343980901aeaSGeorge Wilson 				/* nopwrite */
344080901aeaSGeorge Wilson 				ASSERT(zio->io_prop.zp_nopwrite);
344180901aeaSGeorge Wilson 				if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp))
344280901aeaSGeorge Wilson 					panic("bad nopwrite, hdr=%p exists=%p",
344380901aeaSGeorge Wilson 					    (void *)hdr, (void *)exists);
3444b24ab676SJeff Bonwick 			} else {
3445b24ab676SJeff Bonwick 				/* Dedup */
3446b24ab676SJeff Bonwick 				ASSERT(hdr->b_datacnt == 1);
3447b24ab676SJeff Bonwick 				ASSERT(hdr->b_state == arc_anon);
3448b24ab676SJeff Bonwick 				ASSERT(BP_GET_DEDUP(zio->io_bp));
3449b24ab676SJeff Bonwick 				ASSERT(BP_GET_LEVEL(zio->io_bp) == 0);
3450ae46e4c7SMatthew Ahrens 			}
3451fa9e4066Sahrens 		}
3452ea8dc4b6Seschrock 		hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
3453088f3894Sahrens 		/* if it's not anon, we are doing a scrub */
3454b24ab676SJeff Bonwick 		if (!exists && hdr->b_state == arc_anon)
3455088f3894Sahrens 			arc_access(hdr, hash_lock);
345644eda4d7Smaybee 		mutex_exit(hash_lock);
3457ea8dc4b6Seschrock 	} else {
3458ea8dc4b6Seschrock 		hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
3459fa9e4066Sahrens 	}
3460ea8dc4b6Seschrock 
3461b24ab676SJeff Bonwick 	ASSERT(!refcount_is_zero(&hdr->b_refcnt));
3462b24ab676SJeff Bonwick 	callback->awcb_done(zio, buf, callback->awcb_private);
3463fa9e4066Sahrens 
3464c717a561Smaybee 	kmem_free(callback, sizeof (arc_write_callback_t));
3465fa9e4066Sahrens }
3466fa9e4066Sahrens 
3467c717a561Smaybee zio_t *
3468b24ab676SJeff Bonwick arc_write(zio_t *pio, spa_t *spa, uint64_t txg,
3469*aad02571SSaso Kiselkov     blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc, boolean_t l2arc_compress,
3470*aad02571SSaso Kiselkov     const zio_prop_t *zp, arc_done_func_t *ready, arc_done_func_t *done,
3471*aad02571SSaso Kiselkov     void *private, int priority, int zio_flags, const zbookmark_t *zb)
3472fa9e4066Sahrens {
3473fa9e4066Sahrens 	arc_buf_hdr_t *hdr = buf->b_hdr;
3474c717a561Smaybee 	arc_write_callback_t *callback;
3475e14bb325SJeff Bonwick 	zio_t *zio;
3476fa9e4066Sahrens 
3477e14bb325SJeff Bonwick 	ASSERT(ready != NULL);
3478b24ab676SJeff Bonwick 	ASSERT(done != NULL);
3479fa9e4066Sahrens 	ASSERT(!HDR_IO_ERROR(hdr));
3480c5c6ffa0Smaybee 	ASSERT((hdr->b_flags & ARC_IO_IN_PROGRESS) == 0);
3481b24ab676SJeff Bonwick 	ASSERT(hdr->b_acb == NULL);
34823baa08fcSek 	if (l2arc)
34833baa08fcSek 		hdr->b_flags |= ARC_L2CACHE;
3484*aad02571SSaso Kiselkov 	if (l2arc_compress)
3485*aad02571SSaso Kiselkov 		hdr->b_flags |= ARC_L2COMPRESS;
3486c717a561Smaybee 	callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP);
3487c717a561Smaybee 	callback->awcb_ready = ready;
3488c717a561Smaybee 	callback->awcb_done = done;
3489c717a561Smaybee 	callback->awcb_private = private;
3490c717a561Smaybee 	callback->awcb_buf = buf;
3491088f3894Sahrens 
3492b24ab676SJeff Bonwick 	zio = zio_write(pio, spa, txg, bp, buf->b_data, hdr->b_size, zp,
3493e14bb325SJeff Bonwick 	    arc_write_ready, arc_write_done, callback, priority, zio_flags, zb);
3494fa9e4066Sahrens 
3495c717a561Smaybee 	return (zio);
3496fa9e4066Sahrens }
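
/*
 * A minimal caller sketch (illustrative only; the real arc_write() callers
 * live in dbuf.c).  The names l2arc_enabled, my_ready, my_done and
 * cb_private below are hypothetical:
 *
 *	zio_t *zio = arc_write(pio, spa, txg, bp, buf, l2arc_enabled,
 *	    l2arc_compress, &zp, my_ready, my_done, cb_private,
 *	    ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_CANFAIL, &zb);
 *	zio_nowait(zio);
 */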
3497fa9e4066Sahrens 
34981ab7f2deSmaybee static int
34992fdbea25SAleksandr Guzovskiy arc_memory_throttle(uint64_t reserve, uint64_t inflight_data, uint64_t txg)
35001ab7f2deSmaybee {
35011ab7f2deSmaybee #ifdef _KERNEL
35021ab7f2deSmaybee 	uint64_t available_memory = ptob(freemem);
35031ab7f2deSmaybee 	static uint64_t page_load = 0;
35041ab7f2deSmaybee 	static uint64_t last_txg = 0;
35051ab7f2deSmaybee 
35061ab7f2deSmaybee #if defined(__i386)
35071ab7f2deSmaybee 	available_memory =
35081ab7f2deSmaybee 	    MIN(available_memory, vmem_size(heap_arena, VMEM_FREE));
35091ab7f2deSmaybee #endif
35101ab7f2deSmaybee 	if (available_memory >= zfs_write_limit_max)
35111ab7f2deSmaybee 		return (0);
35121ab7f2deSmaybee 
35131ab7f2deSmaybee 	if (txg > last_txg) {
35141ab7f2deSmaybee 		last_txg = txg;
35151ab7f2deSmaybee 		page_load = 0;
35161ab7f2deSmaybee 	}
35171ab7f2deSmaybee 	/*
35181ab7f2deSmaybee 	 * If we are in pageout, we know that memory is already tight
35191ab7f2deSmaybee 	 * and the ARC is already going to be evicting, so we just want
35201ab7f2deSmaybee 	 * to continue to let page writes occur as quickly as possible.
35211ab7f2deSmaybee 	 */
35221ab7f2deSmaybee 	if (curproc == proc_pageout) {
35231ab7f2deSmaybee 		if (page_load > MAX(ptob(minfree), available_memory) / 4)
3524be6fd75aSMatthew Ahrens 			return (SET_ERROR(ERESTART));
35251ab7f2deSmaybee 		/* Note: reserve is inflated, so we deflate */
35261ab7f2deSmaybee 		page_load += reserve / 8;
35271ab7f2deSmaybee 		return (0);
35281ab7f2deSmaybee 	} else if (page_load > 0 && arc_reclaim_needed()) {
35291ab7f2deSmaybee 		/* memory is low, delay before restarting */
35301ab7f2deSmaybee 		ARCSTAT_INCR(arcstat_memory_throttle_count, 1);
3531be6fd75aSMatthew Ahrens 		return (SET_ERROR(EAGAIN));
35321ab7f2deSmaybee 	}
35331ab7f2deSmaybee 	page_load = 0;
35341ab7f2deSmaybee 
35351ab7f2deSmaybee 	if (arc_size > arc_c_min) {
35361ab7f2deSmaybee 		uint64_t evictable_memory =
35371ab7f2deSmaybee 		    arc_mru->arcs_lsize[ARC_BUFC_DATA] +
35381ab7f2deSmaybee 		    arc_mru->arcs_lsize[ARC_BUFC_METADATA] +
35391ab7f2deSmaybee 		    arc_mfu->arcs_lsize[ARC_BUFC_DATA] +
35401ab7f2deSmaybee 		    arc_mfu->arcs_lsize[ARC_BUFC_METADATA];
35411ab7f2deSmaybee 		available_memory += MIN(evictable_memory, arc_size - arc_c_min);
35421ab7f2deSmaybee 	}
35431ab7f2deSmaybee 
35441ab7f2deSmaybee 	if (inflight_data > available_memory / 4) {
35451ab7f2deSmaybee 		ARCSTAT_INCR(arcstat_memory_throttle_count, 1);
3546be6fd75aSMatthew Ahrens 		return (SET_ERROR(ERESTART));
35471ab7f2deSmaybee 	}
35481ab7f2deSmaybee #endif
35491ab7f2deSmaybee 	return (0);
35501ab7f2deSmaybee }
35511ab7f2deSmaybee 
3552fa9e4066Sahrens void
35531ab7f2deSmaybee arc_tempreserve_clear(uint64_t reserve)
3554fa9e4066Sahrens {
35551ab7f2deSmaybee 	atomic_add_64(&arc_tempreserve, -reserve);
3556fa9e4066Sahrens 	ASSERT((int64_t)arc_tempreserve >= 0);
3557fa9e4066Sahrens }
3558fa9e4066Sahrens 
3559fa9e4066Sahrens int
35601ab7f2deSmaybee arc_tempreserve_space(uint64_t reserve, uint64_t txg)
3561fa9e4066Sahrens {
35621ab7f2deSmaybee 	int error;
35632fdbea25SAleksandr Guzovskiy 	uint64_t anon_size;
35641ab7f2deSmaybee 
3565fa9e4066Sahrens #ifdef ZFS_DEBUG
3566fa9e4066Sahrens 	/*
3567fa9e4066Sahrens 	 * Once in a while, fail for no reason.  Everything should cope.
3568fa9e4066Sahrens 	 */
3569fa9e4066Sahrens 	if (spa_get_random(10000) == 0) {
3570fa9e4066Sahrens 		dprintf("forcing random failure\n");
3571be6fd75aSMatthew Ahrens 		return (SET_ERROR(ERESTART));
3572fa9e4066Sahrens 	}
3573fa9e4066Sahrens #endif
35741ab7f2deSmaybee 	if (reserve > arc_c/4 && !arc_no_grow)
35751ab7f2deSmaybee 		arc_c = MIN(arc_c_max, reserve * 4);
35761ab7f2deSmaybee 	if (reserve > arc_c)
3577be6fd75aSMatthew Ahrens 		return (SET_ERROR(ENOMEM));
3578112fe045Smaybee 
35792fdbea25SAleksandr Guzovskiy 	/*
35802fdbea25SAleksandr Guzovskiy 	 * Don't count loaned bufs as in-flight dirty data to prevent long
35812fdbea25SAleksandr Guzovskiy 	 * network delays from blocking transactions that are ready to be
35822fdbea25SAleksandr Guzovskiy 	 * assigned to a txg.
35832fdbea25SAleksandr Guzovskiy 	 */
35842fdbea25SAleksandr Guzovskiy 	anon_size = MAX((int64_t)(arc_anon->arcs_size - arc_loaned_bytes), 0);
35852fdbea25SAleksandr Guzovskiy 
35861ab7f2deSmaybee 	/*
35871ab7f2deSmaybee 	 * Writes will almost always require additional memory allocations
35881ab7f2deSmaybee 	 * in order to compress/encrypt/etc the data.  We therefore need to
35891ab7f2deSmaybee 	 * make sure that there is sufficient available memory for this.
35901ab7f2deSmaybee 	 */
35912fdbea25SAleksandr Guzovskiy 	if (error = arc_memory_throttle(reserve, anon_size, txg))
35921ab7f2deSmaybee 		return (error);
35931ab7f2deSmaybee 
3594fa9e4066Sahrens 	/*
3595112fe045Smaybee 	 * Throttle writes when the amount of dirty data in the cache
3596112fe045Smaybee 	 * gets too large.  We try to keep the cache less than half full
3597112fe045Smaybee 	 * of dirty blocks so that our sync times don't grow too large.
3598112fe045Smaybee 	 * Note: if two requests come in concurrently, we might let them
3599112fe045Smaybee 	 * both succeed, when one of them should fail.  Not a huge deal.
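	 *
	 * Worked example (illustrative): with arc_c = 4GB, the check below
	 * fails a reservation with ERESTART once anonymous data exceeds
	 * 1GB (arc_c / 4) and reserve + arc_tempreserve + anon_size
	 * exceeds 2GB (arc_c / 2).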
3600fa9e4066Sahrens 	 */
36012fdbea25SAleksandr Guzovskiy 
36022fdbea25SAleksandr Guzovskiy 	if (reserve + arc_tempreserve + anon_size > arc_c / 2 &&
36032fdbea25SAleksandr Guzovskiy 	    anon_size > arc_c / 4) {
36040e8c6158Smaybee 		dprintf("failing, arc_tempreserve=%lluK anon_meta=%lluK "
36050e8c6158Smaybee 		    "anon_data=%lluK tempreserve=%lluK arc_c=%lluK\n",
36060e8c6158Smaybee 		    arc_tempreserve>>10,
36070e8c6158Smaybee 		    arc_anon->arcs_lsize[ARC_BUFC_METADATA]>>10,
36080e8c6158Smaybee 		    arc_anon->arcs_lsize[ARC_BUFC_DATA]>>10,
36091ab7f2deSmaybee 		    reserve>>10, arc_c>>10);
3610be6fd75aSMatthew Ahrens 		return (SET_ERROR(ERESTART));
3611fa9e4066Sahrens 	}
36121ab7f2deSmaybee 	atomic_add_64(&arc_tempreserve, reserve);
3613fa9e4066Sahrens 	return (0);
3614fa9e4066Sahrens }
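
/*
 * A minimal usage sketch (illustrative; the real caller sits in the
 * DMU/DSL transaction path and retries through the txg machinery):
 *
 *	int err = arc_tempreserve_space(nbytes, txg);
 *	if (err == 0) {
 *		... dirty the data ...
 *		arc_tempreserve_clear(nbytes);
 *	} else if (err == ERESTART || err == EAGAIN) {
 *		... back off and retry in a later txg ...
 *	}
 */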
3615fa9e4066Sahrens 
3616fa9e4066Sahrens void
3617fa9e4066Sahrens arc_init(void)
3618fa9e4066Sahrens {
3619fa9e4066Sahrens 	mutex_init(&arc_reclaim_thr_lock, NULL, MUTEX_DEFAULT, NULL);
3620fa9e4066Sahrens 	cv_init(&arc_reclaim_thr_cv, NULL, CV_DEFAULT, NULL);
3621fa9e4066Sahrens 
362213506d1eSmaybee 	/* Convert seconds to clock ticks */
3623b19a79ecSperrin 	arc_min_prefetch_lifespan = 1 * hz;
362413506d1eSmaybee 
3625fa9e4066Sahrens 	/* Start out with 1/8 of all memory */
362644cb6abcSbmc 	arc_c = physmem * PAGESIZE / 8;
3627fa9e4066Sahrens 
3628fa9e4066Sahrens #ifdef _KERNEL
3629fa9e4066Sahrens 	/*
3630fa9e4066Sahrens 	 * On architectures where the physical memory can be larger
3631fa9e4066Sahrens 	 * than the addressable space (intel in 32-bit mode), we may
3632fa9e4066Sahrens 	 * need to limit the cache to 1/8 of VM size.
3633fa9e4066Sahrens 	 */
363444cb6abcSbmc 	arc_c = MIN(arc_c, vmem_size(heap_arena, VMEM_ALLOC | VMEM_FREE) / 8);
3635fa9e4066Sahrens #endif
3636fa9e4066Sahrens 
3637112fe045Smaybee 	/* set min cache to 1/32 of all memory, or 64MB, whichever is more */
363844cb6abcSbmc 	arc_c_min = MAX(arc_c / 4, 64<<20);
3639112fe045Smaybee 	/* set max to 3/4 of all memory, or all but 1GB, whichever is more */
364044cb6abcSbmc 	if (arc_c * 8 >= 1<<30)
364144cb6abcSbmc 		arc_c_max = (arc_c * 8) - (1<<30);
3642fa9e4066Sahrens 	else
364344cb6abcSbmc 		arc_c_max = arc_c_min;
364444cb6abcSbmc 	arc_c_max = MAX(arc_c * 6, arc_c_max);
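
	/*
	 * Worked example (illustrative): on a 16GB machine arc_c starts
	 * at 2GB, so arc_c_min = MAX(2GB / 4, 64MB) = 512MB and
	 * arc_c_max = MAX(2GB * 6, 16GB - 1GB) = 15GB.
	 */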
3645a2eea2e1Sahrens 
3646a2eea2e1Sahrens 	/*
3647a2eea2e1Sahrens 	 * Allow the tunables to override our calculations if they are
3648a2eea2e1Sahrens 	 * reasonable (ie. over 64MB)
3649a2eea2e1Sahrens 	 * reasonable (i.e. over 64MB)
3650a2eea2e1Sahrens 	if (zfs_arc_max > 64<<20 && zfs_arc_max < physmem * PAGESIZE)
365144cb6abcSbmc 		arc_c_max = zfs_arc_max;
365244cb6abcSbmc 	if (zfs_arc_min > 64<<20 && zfs_arc_min <= arc_c_max)
365344cb6abcSbmc 		arc_c_min = zfs_arc_min;
3654a2eea2e1Sahrens 
365544cb6abcSbmc 	arc_c = arc_c_max;
365644cb6abcSbmc 	arc_p = (arc_c >> 1);
3657fa9e4066Sahrens 
36580e8c6158Smaybee 	/* limit meta-data to 1/4 of the arc capacity */
36590e8c6158Smaybee 	arc_meta_limit = arc_c_max / 4;
36601116048bSek 
36611116048bSek 	/* Allow the tunable to override if it is reasonable */
36621116048bSek 	if (zfs_arc_meta_limit > 0 && zfs_arc_meta_limit <= arc_c_max)
36631116048bSek 		arc_meta_limit = zfs_arc_meta_limit;
36641116048bSek 
36650e8c6158Smaybee 	if (arc_c_min < arc_meta_limit / 2 && zfs_arc_min == 0)
36660e8c6158Smaybee 		arc_c_min = arc_meta_limit / 2;
36670e8c6158Smaybee 
36685a98e54bSBrendan Gregg - Sun Microsystems 	if (zfs_arc_grow_retry > 0)
36695a98e54bSBrendan Gregg - Sun Microsystems 		arc_grow_retry = zfs_arc_grow_retry;
36705a98e54bSBrendan Gregg - Sun Microsystems 
36715a98e54bSBrendan Gregg - Sun Microsystems 	if (zfs_arc_shrink_shift > 0)
36725a98e54bSBrendan Gregg - Sun Microsystems 		arc_shrink_shift = zfs_arc_shrink_shift;
36735a98e54bSBrendan Gregg - Sun Microsystems 
36745a98e54bSBrendan Gregg - Sun Microsystems 	if (zfs_arc_p_min_shift > 0)
36755a98e54bSBrendan Gregg - Sun Microsystems 		arc_p_min_shift = zfs_arc_p_min_shift;
36765a98e54bSBrendan Gregg - Sun Microsystems 
3677fa9e4066Sahrens 	/* if kmem_flags are set, let's try to use less memory */
3678fa9e4066Sahrens 	if (kmem_debugging())
367944cb6abcSbmc 		arc_c = arc_c / 2;
368044cb6abcSbmc 	if (arc_c < arc_c_min)
368144cb6abcSbmc 		arc_c = arc_c_min;
368244cb6abcSbmc 
368344cb6abcSbmc 	arc_anon = &ARC_anon;
368444cb6abcSbmc 	arc_mru = &ARC_mru;
368544cb6abcSbmc 	arc_mru_ghost = &ARC_mru_ghost;
368644cb6abcSbmc 	arc_mfu = &ARC_mfu;
368744cb6abcSbmc 	arc_mfu_ghost = &ARC_mfu_ghost;
3688fa94a07fSbrendan 	arc_l2c_only = &ARC_l2c_only;
368944cb6abcSbmc 	arc_size = 0;
369044cb6abcSbmc 
369144cb6abcSbmc 	mutex_init(&arc_anon->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
369244cb6abcSbmc 	mutex_init(&arc_mru->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
369344cb6abcSbmc 	mutex_init(&arc_mru_ghost->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
369444cb6abcSbmc 	mutex_init(&arc_mfu->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
369544cb6abcSbmc 	mutex_init(&arc_mfu_ghost->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
3696fa94a07fSbrendan 	mutex_init(&arc_l2c_only->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
369744cb6abcSbmc 
36980e8c6158Smaybee 	list_create(&arc_mru->arcs_list[ARC_BUFC_METADATA],
36990e8c6158Smaybee 	    sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
37000e8c6158Smaybee 	list_create(&arc_mru->arcs_list[ARC_BUFC_DATA],
37010e8c6158Smaybee 	    sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
37020e8c6158Smaybee 	list_create(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA],
37030e8c6158Smaybee 	    sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
37040e8c6158Smaybee 	list_create(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA],
37050e8c6158Smaybee 	    sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
37060e8c6158Smaybee 	list_create(&arc_mfu->arcs_list[ARC_BUFC_METADATA],
37070e8c6158Smaybee 	    sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
37080e8c6158Smaybee 	list_create(&arc_mfu->arcs_list[ARC_BUFC_DATA],
37090e8c6158Smaybee 	    sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
37100e8c6158Smaybee 	list_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA],
37110e8c6158Smaybee 	    sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
37120e8c6158Smaybee 	list_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA],
37130e8c6158Smaybee 	    sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3714fa94a07fSbrendan 	list_create(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA],
3715fa94a07fSbrendan 	    sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3716fa94a07fSbrendan 	list_create(&arc_l2c_only->arcs_list[ARC_BUFC_DATA],
3717fa94a07fSbrendan 	    sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3718fa9e4066Sahrens 
3719fa9e4066Sahrens 	buf_init();
3720fa9e4066Sahrens 
3721fa9e4066Sahrens 	arc_thread_exit = 0;
3722ea8dc4b6Seschrock 	arc_eviction_list = NULL;
3723ea8dc4b6Seschrock 	mutex_init(&arc_eviction_mtx, NULL, MUTEX_DEFAULT, NULL);
372440d7d650Smaybee 	bzero(&arc_eviction_hdr, sizeof (arc_buf_hdr_t));
3725fa9e4066Sahrens 
372644cb6abcSbmc 	arc_ksp = kstat_create("zfs", 0, "arcstats", "misc", KSTAT_TYPE_NAMED,
372744cb6abcSbmc 	    sizeof (arc_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
372844cb6abcSbmc 
372944cb6abcSbmc 	if (arc_ksp != NULL) {
373044cb6abcSbmc 		arc_ksp->ks_data = &arc_stats;
373144cb6abcSbmc 		kstat_install(arc_ksp);
373244cb6abcSbmc 	}
373344cb6abcSbmc 
3734fa9e4066Sahrens 	(void) thread_create(NULL, 0, arc_reclaim_thread, NULL, 0, &p0,
3735fa9e4066Sahrens 	    TS_RUN, minclsyspri);
373649e3519aSmaybee 
373749e3519aSmaybee 	arc_dead = FALSE;
37383a737e0dSbrendan 	arc_warm = B_FALSE;
37391ab7f2deSmaybee 
37401ab7f2deSmaybee 	if (zfs_write_limit_max == 0)
374105715f94SMark Maybee 		zfs_write_limit_max = ptob(physmem) >> zfs_write_limit_shift;
37421ab7f2deSmaybee 	else
37431ab7f2deSmaybee 		zfs_write_limit_shift = 0;
374405715f94SMark Maybee 	mutex_init(&zfs_write_limit_lock, NULL, MUTEX_DEFAULT, NULL);
3745fa9e4066Sahrens }
3746fa9e4066Sahrens 
3747fa9e4066Sahrens void
3748fa9e4066Sahrens arc_fini(void)
3749fa9e4066Sahrens {
3750fa9e4066Sahrens 	mutex_enter(&arc_reclaim_thr_lock);
3751fa9e4066Sahrens 	arc_thread_exit = 1;
3752fa9e4066Sahrens 	while (arc_thread_exit != 0)
3753fa9e4066Sahrens 		cv_wait(&arc_reclaim_thr_cv, &arc_reclaim_thr_lock);
3754fa9e4066Sahrens 	mutex_exit(&arc_reclaim_thr_lock);
3755fa9e4066Sahrens 
3756874395d5Smaybee 	arc_flush(NULL);
3757fa9e4066Sahrens 
3758fa9e4066Sahrens 	arc_dead = TRUE;
3759fa9e4066Sahrens 
376044cb6abcSbmc 	if (arc_ksp != NULL) {
376144cb6abcSbmc 		kstat_delete(arc_ksp);
376244cb6abcSbmc 		arc_ksp = NULL;
376344cb6abcSbmc 	}
376444cb6abcSbmc 
3765ea8dc4b6Seschrock 	mutex_destroy(&arc_eviction_mtx);
3766fa9e4066Sahrens 	mutex_destroy(&arc_reclaim_thr_lock);
3767fa9e4066Sahrens 	cv_destroy(&arc_reclaim_thr_cv);
3768fa9e4066Sahrens 
37690e8c6158Smaybee 	list_destroy(&arc_mru->arcs_list[ARC_BUFC_METADATA]);
37700e8c6158Smaybee 	list_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA]);
37710e8c6158Smaybee 	list_destroy(&arc_mfu->arcs_list[ARC_BUFC_METADATA]);
37720e8c6158Smaybee 	list_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA]);
37730e8c6158Smaybee 	list_destroy(&arc_mru->arcs_list[ARC_BUFC_DATA]);
37740e8c6158Smaybee 	list_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA]);
37750e8c6158Smaybee 	list_destroy(&arc_mfu->arcs_list[ARC_BUFC_DATA]);
37760e8c6158Smaybee 	list_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA]);
3777fa9e4066Sahrens 
377844cb6abcSbmc 	mutex_destroy(&arc_anon->arcs_mtx);
377944cb6abcSbmc 	mutex_destroy(&arc_mru->arcs_mtx);
378044cb6abcSbmc 	mutex_destroy(&arc_mru_ghost->arcs_mtx);
378144cb6abcSbmc 	mutex_destroy(&arc_mfu->arcs_mtx);
378244cb6abcSbmc 	mutex_destroy(&arc_mfu_ghost->arcs_mtx);
3783b5e70f97SRicardo M. Correia 	mutex_destroy(&arc_l2c_only->arcs_mtx);
37845ad82045Snd 
378505715f94SMark Maybee 	mutex_destroy(&zfs_write_limit_lock);
378605715f94SMark Maybee 
3787fa9e4066Sahrens 	buf_fini();
37882fdbea25SAleksandr Guzovskiy 
37892fdbea25SAleksandr Guzovskiy 	ASSERT(arc_loaned_bytes == 0);
3790fa9e4066Sahrens }
3791fa94a07fSbrendan 
3792fa94a07fSbrendan /*
3793fa94a07fSbrendan  * Level 2 ARC
3794fa94a07fSbrendan  *
3795fa94a07fSbrendan  * The level 2 ARC (L2ARC) is a cache layer in-between main memory and disk.
3796fa94a07fSbrendan  * It uses dedicated storage devices to hold cached data, which are populated
3797fa94a07fSbrendan  * using large infrequent writes.  The main role of this cache is to boost
3798fa94a07fSbrendan  * the performance of random read workloads.  The intended L2ARC devices
3799fa94a07fSbrendan  * include short-stroked disks, solid state disks, and other media with
3800fa94a07fSbrendan  * substantially faster read latency than disk.
3801fa94a07fSbrendan  *
3802fa94a07fSbrendan  *                 +-----------------------+
3803fa94a07fSbrendan  *                 |         ARC           |
3804fa94a07fSbrendan  *                 +-----------------------+
3805fa94a07fSbrendan  *                    |         ^     ^
3806fa94a07fSbrendan  *                    |         |     |
3807fa94a07fSbrendan  *      l2arc_feed_thread()    arc_read()
3808fa94a07fSbrendan  *                    |         |     |
3809fa94a07fSbrendan  *                    |  l2arc read   |
3810fa94a07fSbrendan  *                    V         |     |
3811fa94a07fSbrendan  *               +---------------+    |
3812fa94a07fSbrendan  *               |     L2ARC     |    |
3813fa94a07fSbrendan  *               +---------------+    |
3814fa94a07fSbrendan  *                   |    ^           |
3815fa94a07fSbrendan  *          l2arc_write() |           |
3816fa94a07fSbrendan  *                   |    |           |
3817fa94a07fSbrendan  *                   V    |           |
3818fa94a07fSbrendan  *                 +-------+      +-------+
3819fa94a07fSbrendan  *                 | vdev  |      | vdev  |
3820fa94a07fSbrendan  *                 | cache |      | cache |
3821fa94a07fSbrendan  *                 +-------+      +-------+
3822fa94a07fSbrendan  *                 +=========+     .-----.
3823fa94a07fSbrendan  *                 :  L2ARC  :    |-_____-|
3824fa94a07fSbrendan  *                 : devices :    | Disks |
3825fa94a07fSbrendan  *                 +=========+    `-_____-'
3826fa94a07fSbrendan  *
3827fa94a07fSbrendan  * Read requests are satisfied from the following sources, in order:
3828fa94a07fSbrendan  *
3829fa94a07fSbrendan  *	1) ARC
3830fa94a07fSbrendan  *	2) vdev cache of L2ARC devices
3831fa94a07fSbrendan  *	3) L2ARC devices
3832fa94a07fSbrendan  *	4) vdev cache of disks
3833fa94a07fSbrendan  *	5) disks
3834fa94a07fSbrendan  *
3835fa94a07fSbrendan  * Some L2ARC device types exhibit extremely slow write performance.
3836fa94a07fSbrendan  * To accommodate this, there are some significant differences between
3837fa94a07fSbrendan  * the L2ARC and traditional cache design:
3838fa94a07fSbrendan  *
3839fa94a07fSbrendan  * 1. There is no eviction path from the ARC to the L2ARC.  Evictions from
3840fa94a07fSbrendan  * the ARC behave as usual, freeing buffers and placing headers on ghost
3841fa94a07fSbrendan  * lists.  The ARC does not send buffers to the L2ARC during eviction as
3842fa94a07fSbrendan  * this would add inflated write latencies for all ARC memory pressure.
3843fa94a07fSbrendan  *
3844fa94a07fSbrendan  * 2. The L2ARC attempts to cache data from the ARC before it is evicted.
3845fa94a07fSbrendan  * It does this by periodically scanning buffers from the eviction-end of
3846fa94a07fSbrendan  * the MFU and MRU ARC lists, copying them to the L2ARC devices if they are
3847*aad02571SSaso Kiselkov  * not already there. It scans until a headroom of buffers is satisfied,
3848*aad02571SSaso Kiselkov  * which itself serves as a cushion against ARC eviction. If a
3849*aad02571SSaso Kiselkov  * compressible buffer is found during scanning and selected for
3849*aad02571SSaso Kiselkov  * writing to an L2ARC device, we
3850*aad02571SSaso Kiselkov  * temporarily boost scanning headroom during the next scan cycle to make
3851*aad02571SSaso Kiselkov  * sure we adapt to compression effects (which might significantly reduce
3852*aad02571SSaso Kiselkov  * the data volume we write to L2ARC). The thread that does this is
3853fa94a07fSbrendan  * l2arc_feed_thread(), illustrated below; example sizes are included to
3854fa94a07fSbrendan  * provide a better sense of ratio than this diagram:
3855fa94a07fSbrendan  *
3856fa94a07fSbrendan  *	       head -->                        tail
3857fa94a07fSbrendan  *	        +---------------------+----------+
3858fa94a07fSbrendan  *	ARC_mfu |:::::#:::::::::::::::|o#o###o###|-->.   # already on L2ARC
3859fa94a07fSbrendan  *	        +---------------------+----------+   |   o L2ARC eligible
3860fa94a07fSbrendan  *	ARC_mru |:#:::::::::::::::::::|#o#ooo####|-->|   : ARC buffer
3861fa94a07fSbrendan  *	        +---------------------+----------+   |
3862fa94a07fSbrendan  *	             15.9 Gbytes      ^ 32 Mbytes    |
3863fa94a07fSbrendan  *	                           headroom          |
3864fa94a07fSbrendan  *	                                      l2arc_feed_thread()
3865fa94a07fSbrendan  *	                                             |
3866fa94a07fSbrendan  *	                 l2arc write hand <--[oooo]--'
3867fa94a07fSbrendan  *	                         |           8 Mbyte
3868fa94a07fSbrendan  *	                         |          write max
3869fa94a07fSbrendan  *	                         V
3870fa94a07fSbrendan  *		  +==============================+
3871fa94a07fSbrendan  *	L2ARC dev |####|#|###|###|    |####| ... |
3872fa94a07fSbrendan  *	          +==============================+
3873fa94a07fSbrendan  *	                     32 Gbytes
3874fa94a07fSbrendan  *
3875fa94a07fSbrendan  * 3. If an ARC buffer is copied to the L2ARC but then hit instead of
3876fa94a07fSbrendan  * evicted, then the L2ARC has cached a buffer much sooner than it probably
3877fa94a07fSbrendan  * needed to, potentially wasting L2ARC device bandwidth and storage.  It is
3878fa94a07fSbrendan  * safe to say that this is an uncommon case, since buffers at the end of
3879fa94a07fSbrendan  * the ARC lists have moved there due to inactivity.
3880fa94a07fSbrendan  *
3881fa94a07fSbrendan  * 4. If the ARC evicts faster than the L2ARC can maintain a headroom,
3882fa94a07fSbrendan  * then the L2ARC simply misses copying some buffers.  This serves as a
3883fa94a07fSbrendan  * pressure valve to prevent heavy read workloads from both stalling the ARC
3884fa94a07fSbrendan  * with waits and clogging the L2ARC with writes.  This also helps prevent
3885fa94a07fSbrendan  * the potential for the L2ARC to churn if it attempts to cache content too
3886fa94a07fSbrendan  * quickly, such as during backups of the entire pool.
3887fa94a07fSbrendan  *
38883a737e0dSbrendan  * 5. After system boot and before the ARC has filled main memory, there are
38893a737e0dSbrendan  * no evictions from the ARC and so the tails of the ARC_mfu and ARC_mru
38903a737e0dSbrendan  * lists can remain mostly static.  Instead of searching from the tail of these
38913a737e0dSbrendan  * lists as pictured, the l2arc_feed_thread() will search from the list heads
38923a737e0dSbrendan  * for eligible buffers, greatly increasing its chance of finding them.
38933a737e0dSbrendan  *
38943a737e0dSbrendan  * The L2ARC device write speed is also boosted during this time so that
38953a737e0dSbrendan  * the L2ARC warms up faster.  Since there have been no ARC evictions yet,
38963a737e0dSbrendan  * there are no L2ARC reads, and no fear of degrading read performance
38973a737e0dSbrendan  * through increased writes.
38983a737e0dSbrendan  *
38993a737e0dSbrendan  * 6. Writes to the L2ARC devices are grouped and sent in-sequence, so that
3900fa94a07fSbrendan  * the vdev queue can aggregate them into larger and fewer writes.  Each
3901fa94a07fSbrendan  * device is written to in a rotor fashion, sweeping writes through
3902fa94a07fSbrendan  * available space then repeating.
3903fa94a07fSbrendan  *
39043a737e0dSbrendan  * 7. The L2ARC does not store dirty content.  It never needs to flush
3905fa94a07fSbrendan  * write buffers back to disk based storage.
3906fa94a07fSbrendan  *
39073a737e0dSbrendan  * 8. If an ARC buffer is written (and dirtied) which also exists in the
3908fa94a07fSbrendan  * L2ARC, the now stale L2ARC buffer is immediately dropped.
3909fa94a07fSbrendan  *
3910fa94a07fSbrendan  * The performance of the L2ARC can be tweaked by a number of tunables, which
3911fa94a07fSbrendan  * may be necessary for different workloads:
3912fa94a07fSbrendan  *
3913fa94a07fSbrendan  *	l2arc_write_max		max write bytes per interval
39143a737e0dSbrendan  *	l2arc_write_boost	extra write bytes during device warmup
3915fa94a07fSbrendan  *	l2arc_noprefetch	skip caching prefetched buffers
3916fa94a07fSbrendan  *	l2arc_headroom		number of max device writes to precache
3917*aad02571SSaso Kiselkov  *	l2arc_headroom_boost	when we find compressed buffers during ARC
3918*aad02571SSaso Kiselkov  *				scanning, we multiply headroom by this
3919*aad02571SSaso Kiselkov  *				percentage factor for the next scan cycle,
3920*aad02571SSaso Kiselkov  *				since more compressed buffers are likely to
3921*aad02571SSaso Kiselkov  *				be present
3922fa94a07fSbrendan  *	l2arc_feed_secs		seconds between L2ARC writing
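 *
 *	As an illustration (not taken from this file): these tunables are
 *	plain kernel globals, so on illumos they can be set at boot from
 *	/etc/system, e.g.:
 *
 *		set zfs:l2arc_write_max = 0x1000000
 *		set zfs:l2arc_noprefetch = 0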
3923fa94a07fSbrendan  *
3924fa94a07fSbrendan  * Tunables may be removed or added as future performance improvements are
3925fa94a07fSbrendan  * integrated, and also may become zpool properties.
39265a98e54bSBrendan Gregg - Sun Microsystems  *
39275a98e54bSBrendan Gregg - Sun Microsystems  * There are three key functions that control how the L2ARC warms up:
39285a98e54bSBrendan Gregg - Sun Microsystems  *
39295a98e54bSBrendan Gregg - Sun Microsystems  *	l2arc_write_eligible()	check if a buffer is eligible to cache
39305a98e54bSBrendan Gregg - Sun Microsystems  *	l2arc_write_size()	calculate how much to write
39315a98e54bSBrendan Gregg - Sun Microsystems  *	l2arc_write_interval()	calculate sleep delay between writes
39325a98e54bSBrendan Gregg - Sun Microsystems  *
39335a98e54bSBrendan Gregg - Sun Microsystems  * These three functions determine what to write, how much, and how quickly
39345a98e54bSBrendan Gregg - Sun Microsystems  * to send writes.
3935fa94a07fSbrendan  */
3936fa94a07fSbrendan 
39375a98e54bSBrendan Gregg - Sun Microsystems static boolean_t
3938ac05c741SMark Maybee l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *ab)
39395a98e54bSBrendan Gregg - Sun Microsystems {
39405a98e54bSBrendan Gregg - Sun Microsystems 	/*
39415a98e54bSBrendan Gregg - Sun Microsystems 	 * A buffer is *not* eligible for the L2ARC if it:
39425a98e54bSBrendan Gregg - Sun Microsystems 	 * 1. belongs to a different spa.
39435ea40c06SBrendan Gregg - Sun Microsystems 	 * 2. is already cached on the L2ARC.
39445ea40c06SBrendan Gregg - Sun Microsystems 	 * 3. has an I/O in progress (it may be an incomplete read).
39455ea40c06SBrendan Gregg - Sun Microsystems 	 * 4. is flagged not eligible (zfs property).
39465a98e54bSBrendan Gregg - Sun Microsystems 	 */
39475ea40c06SBrendan Gregg - Sun Microsystems 	if (ab->b_spa != spa_guid || ab->b_l2hdr != NULL ||
39485a98e54bSBrendan Gregg - Sun Microsystems 	    HDR_IO_IN_PROGRESS(ab) || !HDR_L2CACHE(ab))
39495a98e54bSBrendan Gregg - Sun Microsystems 		return (B_FALSE);
39505a98e54bSBrendan Gregg - Sun Microsystems 
39515a98e54bSBrendan Gregg - Sun Microsystems 	return (B_TRUE);
39525a98e54bSBrendan Gregg - Sun Microsystems }
39535a98e54bSBrendan Gregg - Sun Microsystems 
39545a98e54bSBrendan Gregg - Sun Microsystems static uint64_t
3955*aad02571SSaso Kiselkov l2arc_write_size(void)
39565a98e54bSBrendan Gregg - Sun Microsystems {
39575a98e54bSBrendan Gregg - Sun Microsystems 	uint64_t size;
39585a98e54bSBrendan Gregg - Sun Microsystems 
3959*aad02571SSaso Kiselkov 	/*
3960*aad02571SSaso Kiselkov 	 * Make sure our globals have meaningful values in case the user
3961*aad02571SSaso Kiselkov 	 * altered them.
3962*aad02571SSaso Kiselkov 	 */
3963*aad02571SSaso Kiselkov 	size = l2arc_write_max;
3964*aad02571SSaso Kiselkov 	if (size == 0) {
3965*aad02571SSaso Kiselkov 		cmn_err(CE_NOTE, "Bad value for l2arc_write_max, value must "
3966*aad02571SSaso Kiselkov 		    "be greater than zero, resetting it to the default (%d)",
3967*aad02571SSaso Kiselkov 		    L2ARC_WRITE_SIZE);
3968*aad02571SSaso Kiselkov 		size = l2arc_write_max = L2ARC_WRITE_SIZE;
3969*aad02571SSaso Kiselkov 	}
39705a98e54bSBrendan Gregg - Sun Microsystems 
39715a98e54bSBrendan Gregg - Sun Microsystems 	if (arc_warm == B_FALSE)
3972*aad02571SSaso Kiselkov 		size += l2arc_write_boost;
39735a98e54bSBrendan Gregg - Sun Microsystems 
39745a98e54bSBrendan Gregg - Sun Microsystems 	return (size);
39765a98e54bSBrendan Gregg - Sun Microsystems }
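
/*
 * For example (illustrative, assuming the defaults defined elsewhere in
 * this file: l2arc_write_max = 8MB and l2arc_write_boost = 8MB), a feed
 * cycle writes up to 8MB once the ARC is warm, and up to 16MB while
 * arc_warm is still B_FALSE.
 */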
39775a98e54bSBrendan Gregg - Sun Microsystems 
39785a98e54bSBrendan Gregg - Sun Microsystems static clock_t
39795a98e54bSBrendan Gregg - Sun Microsystems l2arc_write_interval(clock_t began, uint64_t wanted, uint64_t wrote)
39805a98e54bSBrendan Gregg - Sun Microsystems {
3981d3d50737SRafael Vanoni 	clock_t interval, next, now;
39825a98e54bSBrendan Gregg - Sun Microsystems 
39835a98e54bSBrendan Gregg - Sun Microsystems 	/*
39845a98e54bSBrendan Gregg - Sun Microsystems 	 * If the ARC lists are busy, increase our write rate; if the
39855a98e54bSBrendan Gregg - Sun Microsystems 	 * lists are stale, idle back.  This is achieved by checking
39865a98e54bSBrendan Gregg - Sun Microsystems 	 * how much we previously wrote - if it was more than half of
39875a98e54bSBrendan Gregg - Sun Microsystems 	 * what we wanted, schedule the next write much sooner.
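	 *
	 * For example, assuming the defaults defined earlier in this file
	 * (l2arc_feed_secs = 1, l2arc_feed_min_ms = 200), a busy interval
	 * reschedules the feed thread after ~200ms instead of a full second.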
39885a98e54bSBrendan Gregg - Sun Microsystems 	 */
39895a98e54bSBrendan Gregg - Sun Microsystems 	if (l2arc_feed_again && wrote > (wanted / 2))
39905a98e54bSBrendan Gregg - Sun Microsystems 		interval = (hz * l2arc_feed_min_ms) / 1000;
39915a98e54bSBrendan Gregg - Sun Microsystems 	else
39925a98e54bSBrendan Gregg - Sun Microsystems 		interval = hz * l2arc_feed_secs;
39935a98e54bSBrendan Gregg - Sun Microsystems 
3994d3d50737SRafael Vanoni 	now = ddi_get_lbolt();
3995d3d50737SRafael Vanoni 	next = MAX(now, MIN(now + interval, began + interval));
39965a98e54bSBrendan Gregg - Sun Microsystems 
39975a98e54bSBrendan Gregg - Sun Microsystems 	return (next);
39985a98e54bSBrendan Gregg - Sun Microsystems }
39995a98e54bSBrendan Gregg - Sun Microsystems 
4000fa94a07fSbrendan static void
4001fa94a07fSbrendan l2arc_hdr_stat_add(void)
4002fa94a07fSbrendan {
4003e6c728e1Sbrendan 	ARCSTAT_INCR(arcstat_l2_hdr_size, HDR_SIZE + L2HDR_SIZE);
4004e6c728e1Sbrendan 	ARCSTAT_INCR(arcstat_hdr_size, -HDR_SIZE);
4005fa94a07fSbrendan }
4006fa94a07fSbrendan 
4007fa94a07fSbrendan static void
4008fa94a07fSbrendan l2arc_hdr_stat_remove(void)
4009fa94a07fSbrendan {
4010e6c728e1Sbrendan 	ARCSTAT_INCR(arcstat_l2_hdr_size, -(HDR_SIZE + L2HDR_SIZE));
4011e6c728e1Sbrendan 	ARCSTAT_INCR(arcstat_hdr_size, HDR_SIZE);
4012fa94a07fSbrendan }
4013fa94a07fSbrendan 
4014fa94a07fSbrendan /*
4015fa94a07fSbrendan  * Cycle through L2ARC devices.  This is how L2ARC load balances.
40163a737e0dSbrendan  * If a device is returned, this also returns holding the spa config lock.
4017fa94a07fSbrendan  */
4018fa94a07fSbrendan static l2arc_dev_t *
4019fa94a07fSbrendan l2arc_dev_get_next(void)
4020fa94a07fSbrendan {
40213a737e0dSbrendan 	l2arc_dev_t *first, *next = NULL;
40223a737e0dSbrendan 
40233a737e0dSbrendan 	/*
40243a737e0dSbrendan 	 * Lock out the removal of spas (spa_namespace_lock), then removal
40253a737e0dSbrendan 	 * of cache devices (l2arc_dev_mtx).  Once a device has been selected,
40263a737e0dSbrendan 	 * both locks will be dropped and a spa config lock held instead.
40273a737e0dSbrendan 	 */
40283a737e0dSbrendan 	mutex_enter(&spa_namespace_lock);
40293a737e0dSbrendan 	mutex_enter(&l2arc_dev_mtx);
4030fa94a07fSbrendan 
4031c5904d13Seschrock 	/* if there are no vdevs, there is nothing to do */
4032c5904d13Seschrock 	if (l2arc_ndev == 0)
40333a737e0dSbrendan 		goto out;
4034c5904d13Seschrock 
4035c5904d13Seschrock 	first = NULL;
4036c5904d13Seschrock 	next = l2arc_dev_last;
4037c5904d13Seschrock 	do {
4038c5904d13Seschrock 		/* loop around the list looking for a non-faulted vdev */
4039c5904d13Seschrock 		if (next == NULL) {
4040fa94a07fSbrendan 			next = list_head(l2arc_dev_list);
4041c5904d13Seschrock 		} else {
4042c5904d13Seschrock 			next = list_next(l2arc_dev_list, next);
4043c5904d13Seschrock 			if (next == NULL)
4044c5904d13Seschrock 				next = list_head(l2arc_dev_list);
4045c5904d13Seschrock 		}
4046c5904d13Seschrock 
4047c5904d13Seschrock 		/* if we have come back to the start, bail out */
4048c5904d13Seschrock 		if (first == NULL)
4049c5904d13Seschrock 			first = next;
4050c5904d13Seschrock 		else if (next == first)
4051c5904d13Seschrock 			break;
4052c5904d13Seschrock 
4053c5904d13Seschrock 	} while (vdev_is_dead(next->l2ad_vdev));
4054c5904d13Seschrock 
4055c5904d13Seschrock 	/* if we were unable to find any usable vdevs, return NULL */
4056c5904d13Seschrock 	if (vdev_is_dead(next->l2ad_vdev))
40573a737e0dSbrendan 		next = NULL;
4058fa94a07fSbrendan 
4059fa94a07fSbrendan 	l2arc_dev_last = next;
4060fa94a07fSbrendan 
40613a737e0dSbrendan out:
40623a737e0dSbrendan 	mutex_exit(&l2arc_dev_mtx);
40633a737e0dSbrendan 
40643a737e0dSbrendan 	/*
40653a737e0dSbrendan 	 * Grab the config lock to prevent the 'next' device from being
40663a737e0dSbrendan 	 * removed while we are writing to it.
40673a737e0dSbrendan 	 */
40683a737e0dSbrendan 	if (next != NULL)
4069e14bb325SJeff Bonwick 		spa_config_enter(next->l2ad_spa, SCL_L2ARC, next, RW_READER);
40703a737e0dSbrendan 	mutex_exit(&spa_namespace_lock);
40713a737e0dSbrendan 
4072fa94a07fSbrendan 	return (next);
4073fa94a07fSbrendan }
4074fa94a07fSbrendan 
40753a737e0dSbrendan /*
40763a737e0dSbrendan  * Free buffers that were tagged for destruction.
40773a737e0dSbrendan  */
40783a737e0dSbrendan static void
40793a737e0dSbrendan l2arc_do_free_on_write(void)
40803a737e0dSbrendan {
40813a737e0dSbrendan 	list_t *buflist;
40823a737e0dSbrendan 	l2arc_data_free_t *df, *df_prev;
40833a737e0dSbrendan 
40843a737e0dSbrendan 	mutex_enter(&l2arc_free_on_write_mtx);
40853a737e0dSbrendan 	buflist = l2arc_free_on_write;
40863a737e0dSbrendan 
40873a737e0dSbrendan 	for (df = list_tail(buflist); df; df = df_prev) {
40883a737e0dSbrendan 		df_prev = list_prev(buflist, df);
40893a737e0dSbrendan 		ASSERT(df->l2df_data != NULL);
40903a737e0dSbrendan 		ASSERT(df->l2df_func != NULL);
40913a737e0dSbrendan 		df->l2df_func(df->l2df_data, df->l2df_size);
40923a737e0dSbrendan 		list_remove(buflist, df);
40933a737e0dSbrendan 		kmem_free(df, sizeof (l2arc_data_free_t));
40943a737e0dSbrendan 	}
40953a737e0dSbrendan 
40963a737e0dSbrendan 	mutex_exit(&l2arc_free_on_write_mtx);
40973a737e0dSbrendan }
40983a737e0dSbrendan 
4099fa94a07fSbrendan /*
4100fa94a07fSbrendan  * A write to a cache device has completed.  Update all headers to allow
4101fa94a07fSbrendan  * reads from these buffers to begin.
4102fa94a07fSbrendan  */
4103fa94a07fSbrendan static void
4104fa94a07fSbrendan l2arc_write_done(zio_t *zio)
4105fa94a07fSbrendan {
4106fa94a07fSbrendan 	l2arc_write_callback_t *cb;
4107fa94a07fSbrendan 	l2arc_dev_t *dev;
4108fa94a07fSbrendan 	list_t *buflist;
4109fa94a07fSbrendan 	arc_buf_hdr_t *head, *ab, *ab_prev;
41103a737e0dSbrendan 	l2arc_buf_hdr_t *abl2;
4111fa94a07fSbrendan 	kmutex_t *hash_lock;
4112fa94a07fSbrendan 
4113fa94a07fSbrendan 	cb = zio->io_private;
4114fa94a07fSbrendan 	ASSERT(cb != NULL);
4115fa94a07fSbrendan 	dev = cb->l2wcb_dev;
4116fa94a07fSbrendan 	ASSERT(dev != NULL);
4117fa94a07fSbrendan 	head = cb->l2wcb_head;
4118fa94a07fSbrendan 	ASSERT(head != NULL);
4119fa94a07fSbrendan 	buflist = dev->l2ad_buflist;
4120fa94a07fSbrendan 	ASSERT(buflist != NULL);
4121fa94a07fSbrendan 	DTRACE_PROBE2(l2arc__iodone, zio_t *, zio,
4122fa94a07fSbrendan 	    l2arc_write_callback_t *, cb);
4123fa94a07fSbrendan 
4124fa94a07fSbrendan 	if (zio->io_error != 0)
4125fa94a07fSbrendan 		ARCSTAT_BUMP(arcstat_l2_writes_error);
4126fa94a07fSbrendan 
4127fa94a07fSbrendan 	mutex_enter(&l2arc_buflist_mtx);
4128fa94a07fSbrendan 
4129fa94a07fSbrendan 	/*
4130fa94a07fSbrendan 	 * All writes completed, or an error was hit.
4131fa94a07fSbrendan 	 */
4132fa94a07fSbrendan 	for (ab = list_prev(buflist, head); ab; ab = ab_prev) {
4133fa94a07fSbrendan 		ab_prev = list_prev(buflist, ab);
4134fa94a07fSbrendan 
4135fa94a07fSbrendan 		hash_lock = HDR_LOCK(ab);
4136fa94a07fSbrendan 		if (!mutex_tryenter(hash_lock)) {
4137fa94a07fSbrendan 			/*
4138fa94a07fSbrendan 			 * This buffer misses out.  It may be in the middle
4139fa94a07fSbrendan 			 * of being evicted.  Its ARC_L2_WRITING flag will be
4140fa94a07fSbrendan 			 * left set, denying reads to this buffer.
4141fa94a07fSbrendan 			 */
4142fa94a07fSbrendan 			ARCSTAT_BUMP(arcstat_l2_writes_hdr_miss);
4143fa94a07fSbrendan 			continue;
4144fa94a07fSbrendan 		}
4145fa94a07fSbrendan 
4146*aad02571SSaso Kiselkov 		abl2 = ab->b_l2hdr;
4147*aad02571SSaso Kiselkov 
4148*aad02571SSaso Kiselkov 		/*
4149*aad02571SSaso Kiselkov 		 * Release the temporary compressed buffer as soon as possible.
4150*aad02571SSaso Kiselkov 		 */
4151*aad02571SSaso Kiselkov 		if (abl2->b_compress != ZIO_COMPRESS_OFF)
4152*aad02571SSaso Kiselkov 			l2arc_release_cdata_buf(ab);
4153*aad02571SSaso Kiselkov 
4154fa94a07fSbrendan 		if (zio->io_error != 0) {
4155fa94a07fSbrendan 			/*
41563a737e0dSbrendan 			 * Error - drop L2ARC entry.
4157fa94a07fSbrendan 			 */
41583a737e0dSbrendan 			list_remove(buflist, ab);
4159*aad02571SSaso Kiselkov 			ARCSTAT_INCR(arcstat_l2_asize, -abl2->b_asize);
4160fa94a07fSbrendan 			ab->b_l2hdr = NULL;
41613a737e0dSbrendan 			kmem_free(abl2, sizeof (l2arc_buf_hdr_t));
41623a737e0dSbrendan 			ARCSTAT_INCR(arcstat_l2_size, -ab->b_size);
4163fa94a07fSbrendan 		}
4164fa94a07fSbrendan 
4165fa94a07fSbrendan 		/*
4166fa94a07fSbrendan 		 * Allow ARC to begin reads to this L2ARC entry.
4167fa94a07fSbrendan 		 */
4168fa94a07fSbrendan 		ab->b_flags &= ~ARC_L2_WRITING;
4169fa94a07fSbrendan 
4170fa94a07fSbrendan 		mutex_exit(hash_lock);
4171fa94a07fSbrendan 	}
4172fa94a07fSbrendan 
4173fa94a07fSbrendan 	atomic_inc_64(&l2arc_writes_done);
4174fa94a07fSbrendan 	list_remove(buflist, head);
4175fa94a07fSbrendan 	kmem_cache_free(hdr_cache, head);
4176fa94a07fSbrendan 	mutex_exit(&l2arc_buflist_mtx);
4177fa94a07fSbrendan 
41783a737e0dSbrendan 	l2arc_do_free_on_write();
4179fa94a07fSbrendan 
4180fa94a07fSbrendan 	kmem_free(cb, sizeof (l2arc_write_callback_t));
4181fa94a07fSbrendan }
4182fa94a07fSbrendan 
4183fa94a07fSbrendan /*
4184fa94a07fSbrendan  * A read from a cache device has completed.  Validate buffer contents before
4185fa94a07fSbrendan  * handing over to the regular ARC routines.
4186fa94a07fSbrendan  */
4187fa94a07fSbrendan static void
4188fa94a07fSbrendan l2arc_read_done(zio_t *zio)
4189fa94a07fSbrendan {
4190fa94a07fSbrendan 	l2arc_read_callback_t *cb;
4191fa94a07fSbrendan 	arc_buf_hdr_t *hdr;
4192fa94a07fSbrendan 	arc_buf_t *buf;
4193fa94a07fSbrendan 	kmutex_t *hash_lock;
41943a737e0dSbrendan 	int equal;
4195fa94a07fSbrendan 
4196e14bb325SJeff Bonwick 	ASSERT(zio->io_vd != NULL);
4197e14bb325SJeff Bonwick 	ASSERT(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE);
4198e14bb325SJeff Bonwick 
4199e14bb325SJeff Bonwick 	spa_config_exit(zio->io_spa, SCL_L2ARC, zio->io_vd);
4200e14bb325SJeff Bonwick 
4201fa94a07fSbrendan 	cb = zio->io_private;
4202fa94a07fSbrendan 	ASSERT(cb != NULL);
4203fa94a07fSbrendan 	buf = cb->l2rcb_buf;
4204fa94a07fSbrendan 	ASSERT(buf != NULL);
4205fa94a07fSbrendan 
42063f9d6ad7SLin Ling 	hash_lock = HDR_LOCK(buf->b_hdr);
4207fa94a07fSbrendan 	mutex_enter(hash_lock);
42083f9d6ad7SLin Ling 	hdr = buf->b_hdr;
42093f9d6ad7SLin Ling 	ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
4210fa94a07fSbrendan 
4211*aad02571SSaso Kiselkov 	/*
4212*aad02571SSaso Kiselkov 	 * If the buffer was compressed, decompress it first.
4213*aad02571SSaso Kiselkov 	 */
4214*aad02571SSaso Kiselkov 	if (cb->l2rcb_compress != ZIO_COMPRESS_OFF)
4215*aad02571SSaso Kiselkov 		l2arc_decompress_zio(zio, hdr, cb->l2rcb_compress);
4216*aad02571SSaso Kiselkov 	ASSERT(zio->io_data != NULL);
4217*aad02571SSaso Kiselkov 
4218fa94a07fSbrendan 	/*
4219fa94a07fSbrendan 	 * Check this survived the L2ARC journey.
4220fa94a07fSbrendan 	 */
4221fa94a07fSbrendan 	equal = arc_cksum_equal(buf);
4222fa94a07fSbrendan 	if (equal && zio->io_error == 0 && !HDR_L2_EVICTED(hdr)) {
4223fa94a07fSbrendan 		mutex_exit(hash_lock);
4224fa94a07fSbrendan 		zio->io_private = buf;
4225e14bb325SJeff Bonwick 		zio->io_bp_copy = cb->l2rcb_bp;	/* XXX fix in L2ARC 2.0	*/
4226e14bb325SJeff Bonwick 		zio->io_bp = &zio->io_bp_copy;	/* XXX fix in L2ARC 2.0	*/
4227fa94a07fSbrendan 		arc_read_done(zio);
4228fa94a07fSbrendan 	} else {
4229fa94a07fSbrendan 		mutex_exit(hash_lock);
4230fa94a07fSbrendan 		/*
4231fa94a07fSbrendan 		 * Buffer didn't survive caching.  Increment stats and
4232fa94a07fSbrendan 		 * reissue to the original storage device.
4233fa94a07fSbrendan 		 */
42343a737e0dSbrendan 		if (zio->io_error != 0) {
4235fa94a07fSbrendan 			ARCSTAT_BUMP(arcstat_l2_io_error);
42363a737e0dSbrendan 		} else {
4237be6fd75aSMatthew Ahrens 			zio->io_error = SET_ERROR(EIO);
42383a737e0dSbrendan 		}
4239fa94a07fSbrendan 		if (!equal)
4240fa94a07fSbrendan 			ARCSTAT_BUMP(arcstat_l2_cksum_bad);
4241fa94a07fSbrendan 
4242e14bb325SJeff Bonwick 		/*
4243e14bb325SJeff Bonwick 		 * If there's no waiter, issue an async i/o to the primary
4244e14bb325SJeff Bonwick 		 * storage now.  If there *is* a waiter, the caller must
4245e14bb325SJeff Bonwick 		 * issue the i/o in a context where it's OK to block.
4246e14bb325SJeff Bonwick 		 */
4247a3f829aeSBill Moore 		if (zio->io_waiter == NULL) {
4248a3f829aeSBill Moore 			zio_t *pio = zio_unique_parent(zio);
4249a3f829aeSBill Moore 
4250a3f829aeSBill Moore 			ASSERT(!pio || pio->io_child_type == ZIO_CHILD_LOGICAL);
4251a3f829aeSBill Moore 
4252a3f829aeSBill Moore 			zio_nowait(zio_read(pio, cb->l2rcb_spa, &cb->l2rcb_bp,
4253e14bb325SJeff Bonwick 			    buf->b_data, zio->io_size, arc_read_done, buf,
4254e14bb325SJeff Bonwick 			    zio->io_priority, cb->l2rcb_flags, &cb->l2rcb_zb));
4255a3f829aeSBill Moore 		}
4256fa94a07fSbrendan 	}
4257fa94a07fSbrendan 
4258fa94a07fSbrendan 	kmem_free(cb, sizeof (l2arc_read_callback_t));
4259fa94a07fSbrendan }
4260fa94a07fSbrendan 
4261fa94a07fSbrendan /*
4262fa94a07fSbrendan  * This is the list priority from which the L2ARC will search for pages to
4263fa94a07fSbrendan  * cache.  This is used within loops (0..3) to cycle through lists in the
4264fa94a07fSbrendan  * desired order.  This order can have a significant effect on cache
4265fa94a07fSbrendan  * performance.
4266fa94a07fSbrendan  *
4267fa94a07fSbrendan  * Currently the metadata lists are hit first, MFU then MRU, followed by
4268fa94a07fSbrendan  * the data lists.  This function returns a locked list, and also returns
4269fa94a07fSbrendan  * the lock pointer.
4270fa94a07fSbrendan  */
4271fa94a07fSbrendan static list_t *
4272fa94a07fSbrendan l2arc_list_locked(int list_num, kmutex_t **lock)
4273fa94a07fSbrendan {
4274d5285caeSGeorge Wilson 	list_t *list = NULL;
4275fa94a07fSbrendan 
4276fa94a07fSbrendan 	ASSERT(list_num >= 0 && list_num <= 3);
4277fa94a07fSbrendan 
4278fa94a07fSbrendan 	switch (list_num) {
4279fa94a07fSbrendan 	case 0:
4280fa94a07fSbrendan 		list = &arc_mfu->arcs_list[ARC_BUFC_METADATA];
4281fa94a07fSbrendan 		*lock = &arc_mfu->arcs_mtx;
4282fa94a07fSbrendan 		break;
4283fa94a07fSbrendan 	case 1:
4284fa94a07fSbrendan 		list = &arc_mru->arcs_list[ARC_BUFC_METADATA];
4285fa94a07fSbrendan 		*lock = &arc_mru->arcs_mtx;
4286fa94a07fSbrendan 		break;
4287fa94a07fSbrendan 	case 2:
4288fa94a07fSbrendan 		list = &arc_mfu->arcs_list[ARC_BUFC_DATA];
4289fa94a07fSbrendan 		*lock = &arc_mfu->arcs_mtx;
4290fa94a07fSbrendan 		break;
4291fa94a07fSbrendan 	case 3:
4292fa94a07fSbrendan 		list = &arc_mru->arcs_list[ARC_BUFC_DATA];
4293fa94a07fSbrendan 		*lock = &arc_mru->arcs_mtx;
4294fa94a07fSbrendan 		break;
4295fa94a07fSbrendan 	}
4296fa94a07fSbrendan 
4297fa94a07fSbrendan 	ASSERT(!(MUTEX_HELD(*lock)));
4298fa94a07fSbrendan 	mutex_enter(*lock);
4299fa94a07fSbrendan 	return (list);
4300fa94a07fSbrendan }
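
/*
 * A minimal usage sketch (illustrative; this mirrors how
 * l2arc_write_buffers() below consumes the function):
 *
 *	kmutex_t *list_lock;
 *	list_t *list = l2arc_list_locked(try, &list_lock);
 *	... scan candidate buffers on the locked list ...
 *	mutex_exit(list_lock);
 */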
4301fa94a07fSbrendan 
4302fa94a07fSbrendan /*
4303fa94a07fSbrendan  * Evict buffers from the device write hand to the distance specified in
4304fa94a07fSbrendan  * bytes.  This distance may span populated buffers, or it may span nothing.
4305fa94a07fSbrendan  * This is clearing a region on the L2ARC device ready for writing.
4306fa94a07fSbrendan  * If the 'all' boolean is set, every buffer is evicted.
4307fa94a07fSbrendan  */
4308fa94a07fSbrendan static void
4309fa94a07fSbrendan l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all)
4310fa94a07fSbrendan {
4311fa94a07fSbrendan 	list_t *buflist;
4312fa94a07fSbrendan 	l2arc_buf_hdr_t *abl2;
4313fa94a07fSbrendan 	arc_buf_hdr_t *ab, *ab_prev;
4314fa94a07fSbrendan 	kmutex_t *hash_lock;
4315fa94a07fSbrendan 	uint64_t taddr;
4316fa94a07fSbrendan 
4317fa94a07fSbrendan 	buflist = dev->l2ad_buflist;
4318fa94a07fSbrendan 
4319fa94a07fSbrendan 	if (buflist == NULL)
4320fa94a07fSbrendan 		return;
4321fa94a07fSbrendan 
4322fa94a07fSbrendan 	if (!all && dev->l2ad_first) {
4323fa94a07fSbrendan 		/*
4324fa94a07fSbrendan 		 * This is the first sweep through the device.  There is
4325fa94a07fSbrendan 		 * nothing to evict.
4326fa94a07fSbrendan 		 */
4327fa94a07fSbrendan 		return;
4328fa94a07fSbrendan 	}
4329fa94a07fSbrendan 
43303a737e0dSbrendan 	if (dev->l2ad_hand >= (dev->l2ad_end - (2 * distance))) {
4331fa94a07fSbrendan 		/*
4332fa94a07fSbrendan 		 * When nearing the end of the device, evict to the end
4333fa94a07fSbrendan 		 * before the device write hand jumps to the start.
4334fa94a07fSbrendan 		 */
4335fa94a07fSbrendan 		taddr = dev->l2ad_end;
4336fa94a07fSbrendan 	} else {
4337fa94a07fSbrendan 		taddr = dev->l2ad_hand + distance;
4338fa94a07fSbrendan 	}
4339fa94a07fSbrendan 	DTRACE_PROBE4(l2arc__evict, l2arc_dev_t *, dev, list_t *, buflist,
4340fa94a07fSbrendan 	    uint64_t, taddr, boolean_t, all);
4341fa94a07fSbrendan 
4342fa94a07fSbrendan top:
4343fa94a07fSbrendan 	mutex_enter(&l2arc_buflist_mtx);
4344fa94a07fSbrendan 	for (ab = list_tail(buflist); ab; ab = ab_prev) {
4345fa94a07fSbrendan 		ab_prev = list_prev(buflist, ab);
4346fa94a07fSbrendan 
4347fa94a07fSbrendan 		hash_lock = HDR_LOCK(ab);
4348fa94a07fSbrendan 		if (!mutex_tryenter(hash_lock)) {
4349fa94a07fSbrendan 			/*
4350fa94a07fSbrendan 			 * Missed the hash lock.  Retry.
4351fa94a07fSbrendan 			 */
4352fa94a07fSbrendan 			ARCSTAT_BUMP(arcstat_l2_evict_lock_retry);
4353fa94a07fSbrendan 			mutex_exit(&l2arc_buflist_mtx);
4354fa94a07fSbrendan 			mutex_enter(hash_lock);
4355fa94a07fSbrendan 			mutex_exit(hash_lock);
4356fa94a07fSbrendan 			goto top;
4357fa94a07fSbrendan 		}
4358fa94a07fSbrendan 
4359fa94a07fSbrendan 		if (HDR_L2_WRITE_HEAD(ab)) {
4360fa94a07fSbrendan 			/*
4361fa94a07fSbrendan 			 * We hit a write head node.  Leave it for
4362fa94a07fSbrendan 			 * l2arc_write_done().
4363fa94a07fSbrendan 			 */
4364fa94a07fSbrendan 			list_remove(buflist, ab);
4365fa94a07fSbrendan 			mutex_exit(hash_lock);
4366fa94a07fSbrendan 			continue;
4367fa94a07fSbrendan 		}
4368fa94a07fSbrendan 
4369fa94a07fSbrendan 		if (!all && ab->b_l2hdr != NULL &&
4370fa94a07fSbrendan 		    (ab->b_l2hdr->b_daddr > taddr ||
4371fa94a07fSbrendan 		    ab->b_l2hdr->b_daddr < dev->l2ad_hand)) {
4372fa94a07fSbrendan 			/*
4373fa94a07fSbrendan 			 * We've evicted to the target address,
4374fa94a07fSbrendan 			 * or the end of the device.
4375fa94a07fSbrendan 			 */
4376fa94a07fSbrendan 			mutex_exit(hash_lock);
4377fa94a07fSbrendan 			break;
4378fa94a07fSbrendan 		}
4379fa94a07fSbrendan 
4380fa94a07fSbrendan 		if (HDR_FREE_IN_PROGRESS(ab)) {
4381fa94a07fSbrendan 			/*
4382fa94a07fSbrendan 			 * Already on the path to destruction.
4383fa94a07fSbrendan 			 */
4384fa94a07fSbrendan 			mutex_exit(hash_lock);
4385fa94a07fSbrendan 			continue;
4386fa94a07fSbrendan 		}
4387fa94a07fSbrendan 
4388fa94a07fSbrendan 		if (ab->b_state == arc_l2c_only) {
4389fa94a07fSbrendan 			ASSERT(!HDR_L2_READING(ab));
4390fa94a07fSbrendan 			/*
4391fa94a07fSbrendan 			 * This doesn't exist in the ARC.  Destroy.
4392fa94a07fSbrendan 			 * arc_hdr_destroy() will call list_remove()
4393fa94a07fSbrendan 			 * and decrement arcstat_l2_size.
4394fa94a07fSbrendan 			 */
4395fa94a07fSbrendan 			arc_change_state(arc_anon, ab, hash_lock);
4396fa94a07fSbrendan 			arc_hdr_destroy(ab);
4397fa94a07fSbrendan 		} else {
43983a737e0dSbrendan 			/*
43993a737e0dSbrendan 			 * Invalidate issued or about to be issued
44003a737e0dSbrendan 			 * reads, since we may be about to write
44013a737e0dSbrendan 			 * over this location.
44023a737e0dSbrendan 			 */
44033a737e0dSbrendan 			if (HDR_L2_READING(ab)) {
44043a737e0dSbrendan 				ARCSTAT_BUMP(arcstat_l2_evict_reading);
44053a737e0dSbrendan 				ab->b_flags |= ARC_L2_EVICTED;
44063a737e0dSbrendan 			}
44073a737e0dSbrendan 
4408fa94a07fSbrendan 			/*
4409fa94a07fSbrendan 			 * Tell ARC this no longer exists in L2ARC.
4410fa94a07fSbrendan 			 */
4411fa94a07fSbrendan 			if (ab->b_l2hdr != NULL) {
4412fa94a07fSbrendan 				abl2 = ab->b_l2hdr;
4413*aad02571SSaso Kiselkov 				ARCSTAT_INCR(arcstat_l2_asize, -abl2->b_asize);
4414fa94a07fSbrendan 				ab->b_l2hdr = NULL;
4415fa94a07fSbrendan 				kmem_free(abl2, sizeof (l2arc_buf_hdr_t));
4416fa94a07fSbrendan 				ARCSTAT_INCR(arcstat_l2_size, -ab->b_size);
4417fa94a07fSbrendan 			}
4418fa94a07fSbrendan 			list_remove(buflist, ab);
4419fa94a07fSbrendan 
4420fa94a07fSbrendan 			/*
4421fa94a07fSbrendan 			 * This may have been leftover after a
4422fa94a07fSbrendan 			 * failed write.
4423fa94a07fSbrendan 			 */
4424fa94a07fSbrendan 			ab->b_flags &= ~ARC_L2_WRITING;
4425fa94a07fSbrendan 		}
4426fa94a07fSbrendan 		mutex_exit(hash_lock);
4427fa94a07fSbrendan 	}
4428fa94a07fSbrendan 	mutex_exit(&l2arc_buflist_mtx);
4429fa94a07fSbrendan 
4430b24ab676SJeff Bonwick 	vdev_space_update(dev->l2ad_vdev, -(taddr - dev->l2ad_evict), 0, 0);
4431fa94a07fSbrendan 	dev->l2ad_evict = taddr;
4432fa94a07fSbrendan }
4433fa94a07fSbrendan 
4434fa94a07fSbrendan /*
4435fa94a07fSbrendan  * Find and write ARC buffers to the L2ARC device.
4436fa94a07fSbrendan  *
4437fa94a07fSbrendan  * An ARC_L2_WRITING flag is set so that the L2ARC buffers are not valid
4438fa94a07fSbrendan  * for reading until they have completed writing.
4439*aad02571SSaso Kiselkov  * The headroom_boost is an in-out parameter used to maintain headroom boost
4440*aad02571SSaso Kiselkov  * state between calls to this function.
4441*aad02571SSaso Kiselkov  *
4442*aad02571SSaso Kiselkov  * Returns the number of bytes actually written (which may be smaller than
4443*aad02571SSaso Kiselkov  * the delta by which the device hand has changed due to alignment).
4444fa94a07fSbrendan  */
44455a98e54bSBrendan Gregg - Sun Microsystems static uint64_t
4446*aad02571SSaso Kiselkov l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz,
4447*aad02571SSaso Kiselkov     boolean_t *headroom_boost)
4448fa94a07fSbrendan {
4449fa94a07fSbrendan 	arc_buf_hdr_t *ab, *ab_prev, *head;
4450fa94a07fSbrendan 	list_t *list;
4451*aad02571SSaso Kiselkov 	uint64_t write_asize, write_psize, write_sz, headroom,
4452*aad02571SSaso Kiselkov 	    buf_compress_minsz;
4453fa94a07fSbrendan 	void *buf_data;
4454*aad02571SSaso Kiselkov 	kmutex_t *list_lock;
4455*aad02571SSaso Kiselkov 	boolean_t full;
4456fa94a07fSbrendan 	l2arc_write_callback_t *cb;
4457fa94a07fSbrendan 	zio_t *pio, *wzio;
4458e9103aaeSGarrett D'Amore 	uint64_t guid = spa_load_guid(spa);
4459*aad02571SSaso Kiselkov 	const boolean_t do_headroom_boost = *headroom_boost;
4460fa94a07fSbrendan 
4461fa94a07fSbrendan 	ASSERT(dev->l2ad_vdev != NULL);
4462fa94a07fSbrendan 
4463*aad02571SSaso Kiselkov 	/* Lower the flag now; we might want to raise it again later. */
4464*aad02571SSaso Kiselkov 	*headroom_boost = B_FALSE;
4465*aad02571SSaso Kiselkov 
4466fa94a07fSbrendan 	pio = NULL;
4467*aad02571SSaso Kiselkov 	write_sz = write_asize = write_psize = 0;
4468fa94a07fSbrendan 	full = B_FALSE;
44691ab7f2deSmaybee 	head = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE);
4470fa94a07fSbrendan 	head->b_flags |= ARC_L2_WRITE_HEAD;
4471fa94a07fSbrendan 
4472*aad02571SSaso Kiselkov 	/*
4473*aad02571SSaso Kiselkov 	 * We will only try to compress buffers that are at least 2x the
4474*aad02571SSaso Kiselkov 	 * device sector size.
4475*aad02571SSaso Kiselkov 	 */
4476*aad02571SSaso Kiselkov 	buf_compress_minsz = 2 << dev->l2ad_vdev->vdev_ashift;
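	/*
	 * For example, with 512-byte sectors (vdev_ashift == 9) this works
	 * out to 2 << 9 == 1024 bytes, i.e. buffers spanning fewer than two
	 * device sectors are never considered for compression.
	 */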
4477*aad02571SSaso Kiselkov 
4478fa94a07fSbrendan 	/*
4479fa94a07fSbrendan 	 * Copy buffers for L2ARC writing.
4480fa94a07fSbrendan 	 */
4481fa94a07fSbrendan 	mutex_enter(&l2arc_buflist_mtx);
4482fa94a07fSbrendan 	for (int try = 0; try <= 3; try++) {
4483*aad02571SSaso Kiselkov 		uint64_t passed_sz = 0;
4484*aad02571SSaso Kiselkov 
4485fa94a07fSbrendan 		list = l2arc_list_locked(try, &list_lock);
4486fa94a07fSbrendan 
44873a737e0dSbrendan 		/*
44883a737e0dSbrendan 		 * L2ARC fast warmup.
44893a737e0dSbrendan 		 *
44903a737e0dSbrendan 		 * Until the ARC is warm and starts to evict, read from the
44913a737e0dSbrendan 		 * head of the ARC lists rather than the tail.
44923a737e0dSbrendan 		 */
44933a737e0dSbrendan 		if (arc_warm == B_FALSE)
44943a737e0dSbrendan 			ab = list_head(list);
44953a737e0dSbrendan 		else
44963a737e0dSbrendan 			ab = list_tail(list);
44973a737e0dSbrendan 
4498*aad02571SSaso Kiselkov 		headroom = target_sz * l2arc_headroom;
4499*aad02571SSaso Kiselkov 		if (do_headroom_boost)
4500*aad02571SSaso Kiselkov 			headroom = (headroom * l2arc_headroom_boost) / 100;
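		/*
		 * For example, assuming the default tunables
		 * (l2arc_headroom == 2, l2arc_headroom_boost == 200), a
		 * boosted pass scans up to target_sz * 2 * 200 / 100, i.e.
		 * 4x the write size: compressed buffers let more ARC data
		 * fit into the same target_sz of device space, so the scan
		 * must range further to fill it.
		 */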
4501*aad02571SSaso Kiselkov 
45023a737e0dSbrendan 		for (; ab; ab = ab_prev) {
4503*aad02571SSaso Kiselkov 			l2arc_buf_hdr_t *l2hdr;
4504*aad02571SSaso Kiselkov 			kmutex_t *hash_lock;
4505*aad02571SSaso Kiselkov 			uint64_t buf_sz;
4506*aad02571SSaso Kiselkov 
45073a737e0dSbrendan 			if (arc_warm == B_FALSE)
45083a737e0dSbrendan 				ab_prev = list_next(list, ab);
45093a737e0dSbrendan 			else
45103a737e0dSbrendan 				ab_prev = list_prev(list, ab);
4511fa94a07fSbrendan 
4512fa94a07fSbrendan 			hash_lock = HDR_LOCK(ab);
4513*aad02571SSaso Kiselkov 			if (!mutex_tryenter(hash_lock)) {
4514fa94a07fSbrendan 				/*
4515fa94a07fSbrendan 				 * Skip this buffer rather than waiting.
4516fa94a07fSbrendan 				 */
4517fa94a07fSbrendan 				continue;
4518fa94a07fSbrendan 			}
4519fa94a07fSbrendan 
4520fa94a07fSbrendan 			passed_sz += ab->b_size;
4521fa94a07fSbrendan 			if (passed_sz > headroom) {
4522fa94a07fSbrendan 				/*
4523fa94a07fSbrendan 				 * Searched too far.
4524fa94a07fSbrendan 				 */
4525fa94a07fSbrendan 				mutex_exit(hash_lock);
4526fa94a07fSbrendan 				break;
4527fa94a07fSbrendan 			}
4528fa94a07fSbrendan 
4529ac05c741SMark Maybee 			if (!l2arc_write_eligible(guid, ab)) {
4530fa94a07fSbrendan 				mutex_exit(hash_lock);
4531fa94a07fSbrendan 				continue;
4532fa94a07fSbrendan 			}
4533fa94a07fSbrendan 
4534fa94a07fSbrendan 			if ((write_sz + ab->b_size) > target_sz) {
4535fa94a07fSbrendan 				full = B_TRUE;
4536fa94a07fSbrendan 				mutex_exit(hash_lock);
4537fa94a07fSbrendan 				break;
4538fa94a07fSbrendan 			}
4539fa94a07fSbrendan 
4540fa94a07fSbrendan 			if (pio == NULL) {
4541fa94a07fSbrendan 				/*
4542fa94a07fSbrendan 				 * Insert a dummy header on the buflist so
4543fa94a07fSbrendan 				 * l2arc_write_done() can find where the
4544fa94a07fSbrendan 				 * write buffers begin without searching.
4545fa94a07fSbrendan 				 */
4546fa94a07fSbrendan 				list_insert_head(dev->l2ad_buflist, head);
4547fa94a07fSbrendan 
4548fa94a07fSbrendan 				cb = kmem_alloc(
4549fa94a07fSbrendan 				    sizeof (l2arc_write_callback_t), KM_SLEEP);
4550fa94a07fSbrendan 				cb->l2wcb_dev = dev;
4551fa94a07fSbrendan 				cb->l2wcb_head = head;
4552fa94a07fSbrendan 				pio = zio_root(spa, l2arc_write_done, cb,
4553fa94a07fSbrendan 				    ZIO_FLAG_CANFAIL);
4554fa94a07fSbrendan 			}
4555fa94a07fSbrendan 
4556fa94a07fSbrendan 			/*
4557fa94a07fSbrendan 			 * Create and add a new L2ARC header.
4558fa94a07fSbrendan 			 */
4559*aad02571SSaso Kiselkov 			l2hdr = kmem_zalloc(sizeof (l2arc_buf_hdr_t), KM_SLEEP);
4560*aad02571SSaso Kiselkov 			l2hdr->b_dev = dev;
4561fa94a07fSbrendan 			ab->b_flags |= ARC_L2_WRITING;
4562*aad02571SSaso Kiselkov 
4563*aad02571SSaso Kiselkov 			/*
4564*aad02571SSaso Kiselkov 			 * Temporarily stash the data buffer in b_tmp_cdata.
4565*aad02571SSaso Kiselkov 			 * The subsequent write step will pick it up from
4566*aad02571SSaso Kiselkov 			 * there. This is because we can't access ab->b_buf
4567*aad02571SSaso Kiselkov 			 * without holding the hash_lock, which in turn we
4568*aad02571SSaso Kiselkov 			 * can't take without holding the ARC list locks
4569*aad02571SSaso Kiselkov 			 * (which we want to avoid during compression/writing).
4570*aad02571SSaso Kiselkov 			 */
4571*aad02571SSaso Kiselkov 			l2hdr->b_compress = ZIO_COMPRESS_OFF;
4572*aad02571SSaso Kiselkov 			l2hdr->b_asize = ab->b_size;
4573*aad02571SSaso Kiselkov 			l2hdr->b_tmp_cdata = ab->b_buf->b_data;
4574*aad02571SSaso Kiselkov 
4575fa94a07fSbrendan 			buf_sz = ab->b_size;
4576*aad02571SSaso Kiselkov 			ab->b_l2hdr = l2hdr;
4577*aad02571SSaso Kiselkov 
4578*aad02571SSaso Kiselkov 			list_insert_head(dev->l2ad_buflist, ab);
4579fa94a07fSbrendan 
4580fa94a07fSbrendan 			/*
4581fa94a07fSbrendan 			 * Compute and store the buffer cksum before
4582fa94a07fSbrendan 			 * writing.  On debug builds the cksum is verified first.
4583fa94a07fSbrendan 			 */
4584fa94a07fSbrendan 			arc_cksum_verify(ab->b_buf);
4585fa94a07fSbrendan 			arc_cksum_compute(ab->b_buf, B_TRUE);
4586fa94a07fSbrendan 
4587fa94a07fSbrendan 			mutex_exit(hash_lock);
4588fa94a07fSbrendan 
4589*aad02571SSaso Kiselkov 			write_sz += buf_sz;
4590*aad02571SSaso Kiselkov 		}
4591*aad02571SSaso Kiselkov 
4592*aad02571SSaso Kiselkov 		mutex_exit(list_lock);
4593*aad02571SSaso Kiselkov 
4594*aad02571SSaso Kiselkov 		if (full == B_TRUE)
4595*aad02571SSaso Kiselkov 			break;
4596*aad02571SSaso Kiselkov 	}
4597*aad02571SSaso Kiselkov 
4598*aad02571SSaso Kiselkov 	/* No buffers selected for writing? */
4599*aad02571SSaso Kiselkov 	if (pio == NULL) {
4600*aad02571SSaso Kiselkov 		ASSERT0(write_sz);
4601*aad02571SSaso Kiselkov 		mutex_exit(&l2arc_buflist_mtx);
4602*aad02571SSaso Kiselkov 		kmem_cache_free(hdr_cache, head);
4603*aad02571SSaso Kiselkov 		return (0);
4604*aad02571SSaso Kiselkov 	}
4605*aad02571SSaso Kiselkov 
4606*aad02571SSaso Kiselkov 	/*
4607*aad02571SSaso Kiselkov 	 * Now start writing the buffers. We start at the write head and
4608*aad02571SSaso Kiselkov 	 * work backwards, retracing the course of the buffer selector
4609*aad02571SSaso Kiselkov 	 * loop above.
4610*aad02571SSaso Kiselkov 	 */
4611*aad02571SSaso Kiselkov 	for (ab = list_prev(dev->l2ad_buflist, head); ab;
4612*aad02571SSaso Kiselkov 	    ab = list_prev(dev->l2ad_buflist, ab)) {
4613*aad02571SSaso Kiselkov 		l2arc_buf_hdr_t *l2hdr;
4614*aad02571SSaso Kiselkov 		uint64_t buf_sz;
4615*aad02571SSaso Kiselkov 
4616*aad02571SSaso Kiselkov 		/*
4617*aad02571SSaso Kiselkov 		 * We shouldn't need to lock the buffer here, since we flagged
4618*aad02571SSaso Kiselkov 		 * it as ARC_L2_WRITING in the previous step, but we must take
4619*aad02571SSaso Kiselkov 		 * care to only access its L2 cache parameters. In particular,
4620*aad02571SSaso Kiselkov 		 * ab->b_buf may be invalid by now due to ARC eviction.
4621*aad02571SSaso Kiselkov 		 */
4622*aad02571SSaso Kiselkov 		l2hdr = ab->b_l2hdr;
4623*aad02571SSaso Kiselkov 		l2hdr->b_daddr = dev->l2ad_hand;
4624*aad02571SSaso Kiselkov 
4625*aad02571SSaso Kiselkov 		if ((ab->b_flags & ARC_L2COMPRESS) &&
4626*aad02571SSaso Kiselkov 		    l2hdr->b_asize >= buf_compress_minsz) {
4627*aad02571SSaso Kiselkov 			if (l2arc_compress_buf(l2hdr)) {
4628*aad02571SSaso Kiselkov 				/*
4629*aad02571SSaso Kiselkov 				 * If compression succeeded, enable headroom
4630*aad02571SSaso Kiselkov 				 * boost on the next scan cycle.
4631*aad02571SSaso Kiselkov 				 */
4632*aad02571SSaso Kiselkov 				*headroom_boost = B_TRUE;
4633*aad02571SSaso Kiselkov 			}
4634*aad02571SSaso Kiselkov 		}
4635*aad02571SSaso Kiselkov 
4636*aad02571SSaso Kiselkov 		/*
4637*aad02571SSaso Kiselkov 		 * Pick up the buffer data we had previously stashed away
4638*aad02571SSaso Kiselkov 		 * (and now potentially also compressed).
4639*aad02571SSaso Kiselkov 		 */
4640*aad02571SSaso Kiselkov 		buf_data = l2hdr->b_tmp_cdata;
4641*aad02571SSaso Kiselkov 		buf_sz = l2hdr->b_asize;
4642*aad02571SSaso Kiselkov 
4643*aad02571SSaso Kiselkov 		/* Compression may have squashed the buffer to zero length. */
4644*aad02571SSaso Kiselkov 		if (buf_sz != 0) {
4645*aad02571SSaso Kiselkov 			uint64_t buf_p_sz;
4646*aad02571SSaso Kiselkov 
4647fa94a07fSbrendan 			wzio = zio_write_phys(pio, dev->l2ad_vdev,
4648fa94a07fSbrendan 			    dev->l2ad_hand, buf_sz, buf_data, ZIO_CHECKSUM_OFF,
4649fa94a07fSbrendan 			    NULL, NULL, ZIO_PRIORITY_ASYNC_WRITE,
4650fa94a07fSbrendan 			    ZIO_FLAG_CANFAIL, B_FALSE);
4651fa94a07fSbrendan 
4652fa94a07fSbrendan 			DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev,
4653fa94a07fSbrendan 			    zio_t *, wzio);
4654fa94a07fSbrendan 			(void) zio_nowait(wzio);
4655fa94a07fSbrendan 
4656*aad02571SSaso Kiselkov 			write_asize += buf_sz;
4657e14bb325SJeff Bonwick 			/*
4658e14bb325SJeff Bonwick 			 * Keep the clock hand suitably device-aligned.
4659e14bb325SJeff Bonwick 			 */
4660*aad02571SSaso Kiselkov 			buf_p_sz = vdev_psize_to_asize(dev->l2ad_vdev, buf_sz);
4661*aad02571SSaso Kiselkov 			write_psize += buf_p_sz;
4662*aad02571SSaso Kiselkov 			dev->l2ad_hand += buf_p_sz;
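			/*
			 * For example, a 1536-byte compressed buffer on a
			 * 4KB-sector device (vdev_ashift == 12) is rounded
			 * up to a 4096-byte allocation, so write_asize grows
			 * by 1536 while write_psize and the device hand
			 * advance by 4096.
			 */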
4663fa94a07fSbrendan 		}
4664fa94a07fSbrendan 	}
4665fa94a07fSbrendan 
4666*aad02571SSaso Kiselkov 	mutex_exit(&l2arc_buflist_mtx);
4667fa94a07fSbrendan 
4668*aad02571SSaso Kiselkov 	ASSERT3U(write_asize, <=, target_sz);
4669fa94a07fSbrendan 	ARCSTAT_BUMP(arcstat_l2_writes_sent);
4670*aad02571SSaso Kiselkov 	ARCSTAT_INCR(arcstat_l2_write_bytes, write_asize);
4671fa94a07fSbrendan 	ARCSTAT_INCR(arcstat_l2_size, write_sz);
4672*aad02571SSaso Kiselkov 	ARCSTAT_INCR(arcstat_l2_asize, write_asize);
4673*aad02571SSaso Kiselkov 	vdev_space_update(dev->l2ad_vdev, write_psize, 0, 0);
4674fa94a07fSbrendan 
4675fa94a07fSbrendan 	/*
4676fa94a07fSbrendan 	 * Bump device hand to the device start if it is approaching the end.
4677fa94a07fSbrendan 	 * l2arc_evict() will already have evicted ahead for this case.
4678fa94a07fSbrendan 	 */
46793a737e0dSbrendan 	if (dev->l2ad_hand >= (dev->l2ad_end - target_sz)) {
4680b24ab676SJeff Bonwick 		vdev_space_update(dev->l2ad_vdev,
4681b24ab676SJeff Bonwick 		    dev->l2ad_end - dev->l2ad_hand, 0, 0);
4682fa94a07fSbrendan 		dev->l2ad_hand = dev->l2ad_start;
4683fa94a07fSbrendan 		dev->l2ad_evict = dev->l2ad_start;
4684fa94a07fSbrendan 		dev->l2ad_first = B_FALSE;
4685fa94a07fSbrendan 	}
4686fa94a07fSbrendan 
46875a98e54bSBrendan Gregg - Sun Microsystems 	dev->l2ad_writing = B_TRUE;
4688fa94a07fSbrendan 	(void) zio_wait(pio);
46895a98e54bSBrendan Gregg - Sun Microsystems 	dev->l2ad_writing = B_FALSE;
46905a98e54bSBrendan Gregg - Sun Microsystems 
4691*aad02571SSaso Kiselkov 	return (write_asize);
4692*aad02571SSaso Kiselkov }
4693*aad02571SSaso Kiselkov 
4694*aad02571SSaso Kiselkov /*
4695*aad02571SSaso Kiselkov  * Compresses an L2ARC buffer.
4696*aad02571SSaso Kiselkov  * The data to be compressed must be prefilled in l2hdr->b_tmp_cdata and its
4697*aad02571SSaso Kiselkov  * size in l2hdr->b_asize. This routine tries to compress the data and
4698*aad02571SSaso Kiselkov  * depending on the compression result there are three possible outcomes:
4699*aad02571SSaso Kiselkov  * *) The buffer was incompressible. The original l2hdr contents were left
4700*aad02571SSaso Kiselkov  *    untouched and are ready for writing to an L2 device.
4701*aad02571SSaso Kiselkov  * *) The buffer was all-zeros, so there is no need to write it to an L2
4702*aad02571SSaso Kiselkov  *    device. To indicate this situation, b_tmp_cdata is NULL'ed, b_asize is
4703*aad02571SSaso Kiselkov  *    set to zero and b_compress is set to ZIO_COMPRESS_EMPTY.
4704*aad02571SSaso Kiselkov  * *) Compression succeeded and b_tmp_cdata was replaced with a temporary
4705*aad02571SSaso Kiselkov  *    data buffer which holds the compressed data to be written, and b_asize
4706*aad02571SSaso Kiselkov  *    tells us how much data there is. b_compress is set to the appropriate
4707*aad02571SSaso Kiselkov  *    compression algorithm. Once writing is done, invoke
4708*aad02571SSaso Kiselkov  *    l2arc_release_cdata_buf on this l2hdr to free this temporary buffer.
4709*aad02571SSaso Kiselkov  *
4710*aad02571SSaso Kiselkov  * Returns B_TRUE if compression succeeded, or B_FALSE if it didn't (the
4711*aad02571SSaso Kiselkov  * buffer was incompressible).
4712*aad02571SSaso Kiselkov  */
4713*aad02571SSaso Kiselkov static boolean_t
4714*aad02571SSaso Kiselkov l2arc_compress_buf(l2arc_buf_hdr_t *l2hdr)
4715*aad02571SSaso Kiselkov {
4716*aad02571SSaso Kiselkov 	void *cdata;
4717*aad02571SSaso Kiselkov 	size_t csize, len;
4718*aad02571SSaso Kiselkov 
4719*aad02571SSaso Kiselkov 	ASSERT(l2hdr->b_compress == ZIO_COMPRESS_OFF);
4720*aad02571SSaso Kiselkov 	ASSERT(l2hdr->b_tmp_cdata != NULL);
4721*aad02571SSaso Kiselkov 
4722*aad02571SSaso Kiselkov 	len = l2hdr->b_asize;
4723*aad02571SSaso Kiselkov 	cdata = zio_data_buf_alloc(len);
4724*aad02571SSaso Kiselkov 	csize = zio_compress_data(ZIO_COMPRESS_LZ4, l2hdr->b_tmp_cdata,
4725*aad02571SSaso Kiselkov 	    cdata, l2hdr->b_asize);
4726*aad02571SSaso Kiselkov 
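	/*
	 * Three outcomes are distinguished below: csize == 0 (the input was
	 * all zeros), 0 < csize < len (compression saved space) and
	 * csize >= len (incompressible, no savings).
	 */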
4727*aad02571SSaso Kiselkov 	if (csize == 0) {
4728*aad02571SSaso Kiselkov 		/* zero block, indicate that there's nothing to write */
4729*aad02571SSaso Kiselkov 		zio_data_buf_free(cdata, len);
4730*aad02571SSaso Kiselkov 		l2hdr->b_compress = ZIO_COMPRESS_EMPTY;
4731*aad02571SSaso Kiselkov 		l2hdr->b_asize = 0;
4732*aad02571SSaso Kiselkov 		l2hdr->b_tmp_cdata = NULL;
4733*aad02571SSaso Kiselkov 		ARCSTAT_BUMP(arcstat_l2_compress_zeros);
4734*aad02571SSaso Kiselkov 		return (B_TRUE);
4735*aad02571SSaso Kiselkov 	} else if (csize > 0 && csize < len) {
4736*aad02571SSaso Kiselkov 		/*
4737*aad02571SSaso Kiselkov 		 * Compression succeeded, we'll keep the cdata around for
4738*aad02571SSaso Kiselkov 		 * writing and release it afterwards.
4739*aad02571SSaso Kiselkov 		 */
4740*aad02571SSaso Kiselkov 		l2hdr->b_compress = ZIO_COMPRESS_LZ4;
4741*aad02571SSaso Kiselkov 		l2hdr->b_asize = csize;
4742*aad02571SSaso Kiselkov 		l2hdr->b_tmp_cdata = cdata;
4743*aad02571SSaso Kiselkov 		ARCSTAT_BUMP(arcstat_l2_compress_successes);
4744*aad02571SSaso Kiselkov 		return (B_TRUE);
4745*aad02571SSaso Kiselkov 	} else {
4746*aad02571SSaso Kiselkov 		/*
4747*aad02571SSaso Kiselkov 		 * Compression failed, release the compressed buffer.
4748*aad02571SSaso Kiselkov 		 * l2hdr will be left unmodified.
4749*aad02571SSaso Kiselkov 		 */
4750*aad02571SSaso Kiselkov 		zio_data_buf_free(cdata, len);
4751*aad02571SSaso Kiselkov 		ARCSTAT_BUMP(arcstat_l2_compress_failures);
4752*aad02571SSaso Kiselkov 		return (B_FALSE);
4753*aad02571SSaso Kiselkov 	}
4754*aad02571SSaso Kiselkov }
4755*aad02571SSaso Kiselkov 
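/*
 * A minimal sketch of the lifecycle contract described above, as driven
 * by l2arc_write_buffers() (locking and error handling omitted):
 *
 *	l2hdr->b_compress = ZIO_COMPRESS_OFF;
 *	l2hdr->b_asize = ab->b_size;
 *	l2hdr->b_tmp_cdata = ab->b_buf->b_data;
 *	(void) l2arc_compress_buf(l2hdr);
 *	if (l2hdr->b_asize != 0)
 *		issue zio_write_phys() of b_asize bytes from b_tmp_cdata;
 *	l2arc_release_cdata_buf(ab);	(once the write has completed)
 */
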
4756*aad02571SSaso Kiselkov /*
4757*aad02571SSaso Kiselkov  * Decompresses a zio read back from an l2arc device. On success, the
4758*aad02571SSaso Kiselkov  * underlying zio's io_data buffer is overwritten by the uncompressed
4759*aad02571SSaso Kiselkov  * version. On decompression error (corrupt compressed stream), the
4760*aad02571SSaso Kiselkov  * zio->io_error value is set to signal an I/O error.
4761*aad02571SSaso Kiselkov  *
4762*aad02571SSaso Kiselkov  * Please note that the compressed data stream is not checksummed, so
4763*aad02571SSaso Kiselkov  * if the underlying device is experiencing data corruption, we may feed
4764*aad02571SSaso Kiselkov  * corrupt data to the decompressor; the decompressor therefore needs
4765*aad02571SSaso Kiselkov  * to be able to handle this situation (LZ4 does).
4766*aad02571SSaso Kiselkov  */
4767*aad02571SSaso Kiselkov static void
4768*aad02571SSaso Kiselkov l2arc_decompress_zio(zio_t *zio, arc_buf_hdr_t *hdr, enum zio_compress c)
4769*aad02571SSaso Kiselkov {
4770*aad02571SSaso Kiselkov 	ASSERT(L2ARC_IS_VALID_COMPRESS(c));
4771*aad02571SSaso Kiselkov 
4772*aad02571SSaso Kiselkov 	if (zio->io_error != 0) {
4773*aad02571SSaso Kiselkov 		/*
4774*aad02571SSaso Kiselkov 		 * An I/O error has occurred; just restore the original I/O
4775*aad02571SSaso Kiselkov 		 * size in preparation for a main pool read.
4776*aad02571SSaso Kiselkov 		 */
4777*aad02571SSaso Kiselkov 		zio->io_orig_size = zio->io_size = hdr->b_size;
4778*aad02571SSaso Kiselkov 		return;
4779*aad02571SSaso Kiselkov 	}
4780*aad02571SSaso Kiselkov 
4781*aad02571SSaso Kiselkov 	if (c == ZIO_COMPRESS_EMPTY) {
4782*aad02571SSaso Kiselkov 		/*
4783*aad02571SSaso Kiselkov 		 * An empty buffer results in a null zio, which means we
4784*aad02571SSaso Kiselkov 		 * need to fill its io_data after we're done restoring the
4785*aad02571SSaso Kiselkov 		 * buffer's contents.
4786*aad02571SSaso Kiselkov 		 */
4787*aad02571SSaso Kiselkov 		ASSERT(hdr->b_buf != NULL);
4788*aad02571SSaso Kiselkov 		bzero(hdr->b_buf->b_data, hdr->b_size);
4789*aad02571SSaso Kiselkov 		zio->io_data = zio->io_orig_data = hdr->b_buf->b_data;
4790*aad02571SSaso Kiselkov 	} else {
4791*aad02571SSaso Kiselkov 		ASSERT(zio->io_data != NULL);
4792*aad02571SSaso Kiselkov 		/*
4793*aad02571SSaso Kiselkov 		 * We copy the compressed data from the start of the arc buffer
4794*aad02571SSaso Kiselkov 		 * (the zio_read will have pulled in only what we need, the
4795*aad02571SSaso Kiselkov 		 * rest is garbage which we will overwrite at decompression)
4796*aad02571SSaso Kiselkov 		 * and then decompress back to the ARC data buffer. This way we
4797*aad02571SSaso Kiselkov 		 * can minimize copying by simply decompressing back over the
4798*aad02571SSaso Kiselkov 		 * original compressed data (rather than decompressing to an
4799*aad02571SSaso Kiselkov 		 * aux buffer and then copying back the uncompressed buffer,
4800*aad02571SSaso Kiselkov 		 * which is likely to be much larger).
4801*aad02571SSaso Kiselkov 		 */
4802*aad02571SSaso Kiselkov 		uint64_t csize;
4803*aad02571SSaso Kiselkov 		void *cdata;
4804*aad02571SSaso Kiselkov 
4805*aad02571SSaso Kiselkov 		csize = zio->io_size;
4806*aad02571SSaso Kiselkov 		cdata = zio_data_buf_alloc(csize);
4807*aad02571SSaso Kiselkov 		bcopy(zio->io_data, cdata, csize);
4808*aad02571SSaso Kiselkov 		if (zio_decompress_data(c, cdata, zio->io_data, csize,
4809*aad02571SSaso Kiselkov 		    hdr->b_size) != 0)
4810*aad02571SSaso Kiselkov 			zio->io_error = EIO;
4811*aad02571SSaso Kiselkov 		zio_data_buf_free(cdata, csize);
4812*aad02571SSaso Kiselkov 	}
4813*aad02571SSaso Kiselkov 
4814*aad02571SSaso Kiselkov 	/* Restore the expected uncompressed IO size. */
4815*aad02571SSaso Kiselkov 	zio->io_orig_size = zio->io_size = hdr->b_size;
4816*aad02571SSaso Kiselkov }
4817*aad02571SSaso Kiselkov 
4818*aad02571SSaso Kiselkov /*
4819*aad02571SSaso Kiselkov  * Releases the temporary b_tmp_cdata buffer in an l2arc header structure.
4820*aad02571SSaso Kiselkov  * This buffer holds the compressed data while
4821*aad02571SSaso Kiselkov  * the buffer entry is being written to an l2arc device. Once that is
4822*aad02571SSaso Kiselkov  * done, we can dispose of it.
4823*aad02571SSaso Kiselkov  */
4824*aad02571SSaso Kiselkov static void
4825*aad02571SSaso Kiselkov l2arc_release_cdata_buf(arc_buf_hdr_t *ab)
4826*aad02571SSaso Kiselkov {
4827*aad02571SSaso Kiselkov 	l2arc_buf_hdr_t *l2hdr = ab->b_l2hdr;
4828*aad02571SSaso Kiselkov 
4829*aad02571SSaso Kiselkov 	if (l2hdr->b_compress == ZIO_COMPRESS_LZ4) {
4830*aad02571SSaso Kiselkov 		/*
4831*aad02571SSaso Kiselkov 		 * If the data was compressed, then we've allocated a
4832*aad02571SSaso Kiselkov 		 * temporary buffer for it, so now we need to release it.
4833*aad02571SSaso Kiselkov 		 */
4834*aad02571SSaso Kiselkov 		ASSERT(l2hdr->b_tmp_cdata != NULL);
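		/*
		 * Free with the uncompressed size (ab->b_size): that is the
		 * size the buffer was allocated with in l2arc_compress_buf(),
		 * before b_asize was overwritten with the compressed size.
		 */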
4835*aad02571SSaso Kiselkov 		zio_data_buf_free(l2hdr->b_tmp_cdata, ab->b_size);
4836*aad02571SSaso Kiselkov 	}
4837*aad02571SSaso Kiselkov 	l2hdr->b_tmp_cdata = NULL;
4838fa94a07fSbrendan }
4839fa94a07fSbrendan 
4840fa94a07fSbrendan /*
4841fa94a07fSbrendan  * This thread feeds the L2ARC at regular intervals.  This is the beating
4842fa94a07fSbrendan  * heart of the L2ARC.
4843fa94a07fSbrendan  */
4844fa94a07fSbrendan static void
4845fa94a07fSbrendan l2arc_feed_thread(void)
4846fa94a07fSbrendan {
4847fa94a07fSbrendan 	callb_cpr_t cpr;
4848fa94a07fSbrendan 	l2arc_dev_t *dev;
4849fa94a07fSbrendan 	spa_t *spa;
48505a98e54bSBrendan Gregg - Sun Microsystems 	uint64_t size, wrote;
4851d3d50737SRafael Vanoni 	clock_t begin, next = ddi_get_lbolt();
4852*aad02571SSaso Kiselkov 	boolean_t headroom_boost = B_FALSE;
4853fa94a07fSbrendan 
4854fa94a07fSbrendan 	CALLB_CPR_INIT(&cpr, &l2arc_feed_thr_lock, callb_generic_cpr, FTAG);
4855fa94a07fSbrendan 
4856fa94a07fSbrendan 	mutex_enter(&l2arc_feed_thr_lock);
4857fa94a07fSbrendan 
4858fa94a07fSbrendan 	while (l2arc_thread_exit == 0) {
4859fa94a07fSbrendan 		CALLB_CPR_SAFE_BEGIN(&cpr);
4860fa94a07fSbrendan 		(void) cv_timedwait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock,
48615a98e54bSBrendan Gregg - Sun Microsystems 		    next);
4862fa94a07fSbrendan 		CALLB_CPR_SAFE_END(&cpr, &l2arc_feed_thr_lock);
4863d3d50737SRafael Vanoni 		next = ddi_get_lbolt() + hz;
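		/*
		 * Default to waking again in one second; a successful write
		 * pass recomputes the interval below via
		 * l2arc_write_interval().
		 */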
4864fa94a07fSbrendan 
48653a737e0dSbrendan 		/*
48663a737e0dSbrendan 		 * Quick check for L2ARC devices.
48673a737e0dSbrendan 		 */
4868c5904d13Seschrock 		mutex_enter(&l2arc_dev_mtx);
48693a737e0dSbrendan 		if (l2arc_ndev == 0) {
48703a737e0dSbrendan 			mutex_exit(&l2arc_dev_mtx);
48713a737e0dSbrendan 			continue;
48723a737e0dSbrendan 		}
48733a737e0dSbrendan 		mutex_exit(&l2arc_dev_mtx);
4874d3d50737SRafael Vanoni 		begin = ddi_get_lbolt();
4875c5904d13Seschrock 
4876fa94a07fSbrendan 		/*
4877c5904d13Seschrock 		 * This selects the next l2arc device to write to, and in
4878c5904d13Seschrock 		 * doing so the next spa to feed from: dev->l2ad_spa.  This
48793a737e0dSbrendan 		 * will return NULL if there are now no l2arc devices or if
48803a737e0dSbrendan 		 * they are all faulted.
48813a737e0dSbrendan 		 *
48823a737e0dSbrendan 		 * If a device is returned, its spa's config lock is also
48833a737e0dSbrendan 		 * held to prevent device removal.  l2arc_dev_get_next()
48843a737e0dSbrendan 		 * will grab and release l2arc_dev_mtx.
4885fa94a07fSbrendan 		 */
48863a737e0dSbrendan 		if ((dev = l2arc_dev_get_next()) == NULL)
4887fa94a07fSbrendan 			continue;
48883a737e0dSbrendan 
48893a737e0dSbrendan 		spa = dev->l2ad_spa;
48903a737e0dSbrendan 		ASSERT(spa != NULL);
4891fa94a07fSbrendan 
4892f9af39baSGeorge Wilson 		/*
4893f9af39baSGeorge Wilson 		 * If the pool is read-only then force the feed thread to
4894f9af39baSGeorge Wilson 		 * sleep a little longer.
4895f9af39baSGeorge Wilson 		 */
4896f9af39baSGeorge Wilson 		if (!spa_writeable(spa)) {
4897f9af39baSGeorge Wilson 			next = ddi_get_lbolt() + 5 * l2arc_feed_secs * hz;
4898f9af39baSGeorge Wilson 			spa_config_exit(spa, SCL_L2ARC, dev);
4899f9af39baSGeorge Wilson 			continue;
4900f9af39baSGeorge Wilson 		}
4901f9af39baSGeorge Wilson 
4902fa94a07fSbrendan 		/*
4903fa94a07fSbrendan 		 * Avoid contributing to memory pressure.
4904fa94a07fSbrendan 		 */
4905fa94a07fSbrendan 		if (arc_reclaim_needed()) {
4906fa94a07fSbrendan 			ARCSTAT_BUMP(arcstat_l2_abort_lowmem);
4907e14bb325SJeff Bonwick 			spa_config_exit(spa, SCL_L2ARC, dev);
4908fa94a07fSbrendan 			continue;
4909fa94a07fSbrendan 		}
4910fa94a07fSbrendan 
4911fa94a07fSbrendan 		ARCSTAT_BUMP(arcstat_l2_feeds);
4912fa94a07fSbrendan 
4913*aad02571SSaso Kiselkov 		size = l2arc_write_size();
49143a737e0dSbrendan 
4915fa94a07fSbrendan 		/*
4916fa94a07fSbrendan 		 * Evict L2ARC buffers that will be overwritten.
4917fa94a07fSbrendan 		 */
49183a737e0dSbrendan 		l2arc_evict(dev, size, B_FALSE);
4919fa94a07fSbrendan 
4920fa94a07fSbrendan 		/*
4921fa94a07fSbrendan 		 * Write ARC buffers.
4922fa94a07fSbrendan 		 */
4923*aad02571SSaso Kiselkov 		wrote = l2arc_write_buffers(spa, dev, size, &headroom_boost);
49245a98e54bSBrendan Gregg - Sun Microsystems 
49255a98e54bSBrendan Gregg - Sun Microsystems 		/*
49265a98e54bSBrendan Gregg - Sun Microsystems 		 * Calculate interval between writes.
49275a98e54bSBrendan Gregg - Sun Microsystems 		 */
49285a98e54bSBrendan Gregg - Sun Microsystems 		next = l2arc_write_interval(begin, size, wrote);
4929e14bb325SJeff Bonwick 		spa_config_exit(spa, SCL_L2ARC, dev);
4930fa94a07fSbrendan 	}
4931fa94a07fSbrendan 
4932fa94a07fSbrendan 	l2arc_thread_exit = 0;
4933fa94a07fSbrendan 	cv_broadcast(&l2arc_feed_thr_cv);
4934fa94a07fSbrendan 	CALLB_CPR_EXIT(&cpr);		/* drops l2arc_feed_thr_lock */
4935fa94a07fSbrendan 	thread_exit();
4936fa94a07fSbrendan }
4937fa94a07fSbrendan 
4938c5904d13Seschrock boolean_t
4939c5904d13Seschrock l2arc_vdev_present(vdev_t *vd)
4940c5904d13Seschrock {
4941c5904d13Seschrock 	l2arc_dev_t *dev;
4942c5904d13Seschrock 
4943c5904d13Seschrock 	mutex_enter(&l2arc_dev_mtx);
4944c5904d13Seschrock 	for (dev = list_head(l2arc_dev_list); dev != NULL;
4945c5904d13Seschrock 	    dev = list_next(l2arc_dev_list, dev)) {
4946c5904d13Seschrock 		if (dev->l2ad_vdev == vd)
4947c5904d13Seschrock 			break;
4948c5904d13Seschrock 	}
4949c5904d13Seschrock 	mutex_exit(&l2arc_dev_mtx);
4950c5904d13Seschrock 
4951c5904d13Seschrock 	return (dev != NULL);
4952c5904d13Seschrock }
4953c5904d13Seschrock 
4954fa94a07fSbrendan /*
4955fa94a07fSbrendan  * Add a vdev for use by the L2ARC.  By this point the spa has already
4956fa94a07fSbrendan  * validated the vdev and opened it.
4957fa94a07fSbrendan  */
4958fa94a07fSbrendan void
4959573ca77eSGeorge Wilson l2arc_add_vdev(spa_t *spa, vdev_t *vd)
4960fa94a07fSbrendan {
4961fa94a07fSbrendan 	l2arc_dev_t *adddev;
4962fa94a07fSbrendan 
4963c5904d13Seschrock 	ASSERT(!l2arc_vdev_present(vd));
4964c5904d13Seschrock 
4965fa94a07fSbrendan 	/*
4966fa94a07fSbrendan 	 * Create a new l2arc device entry.
4967fa94a07fSbrendan 	 */
4968fa94a07fSbrendan 	adddev = kmem_zalloc(sizeof (l2arc_dev_t), KM_SLEEP);
4969fa94a07fSbrendan 	adddev->l2ad_spa = spa;
4970fa94a07fSbrendan 	adddev->l2ad_vdev = vd;
4971573ca77eSGeorge Wilson 	adddev->l2ad_start = VDEV_LABEL_START_SIZE;
4972573ca77eSGeorge Wilson 	adddev->l2ad_end = VDEV_LABEL_START_SIZE + vdev_get_min_asize(vd);
4973fa94a07fSbrendan 	adddev->l2ad_hand = adddev->l2ad_start;
4974fa94a07fSbrendan 	adddev->l2ad_evict = adddev->l2ad_start;
4975fa94a07fSbrendan 	adddev->l2ad_first = B_TRUE;
49765a98e54bSBrendan Gregg - Sun Microsystems 	adddev->l2ad_writing = B_FALSE;
4977fa94a07fSbrendan 
4978fa94a07fSbrendan 	/*
4979fa94a07fSbrendan 	 * This is a list of all ARC buffers that are still valid on the
4980fa94a07fSbrendan 	 * device.
4981fa94a07fSbrendan 	 */
4982fa94a07fSbrendan 	adddev->l2ad_buflist = kmem_zalloc(sizeof (list_t), KM_SLEEP);
4983fa94a07fSbrendan 	list_create(adddev->l2ad_buflist, sizeof (arc_buf_hdr_t),
4984fa94a07fSbrendan 	    offsetof(arc_buf_hdr_t, b_l2node));
4985fa94a07fSbrendan 
4986b24ab676SJeff Bonwick 	vdev_space_update(vd, 0, 0, adddev->l2ad_end - adddev->l2ad_hand);
4987fa94a07fSbrendan 
4988fa94a07fSbrendan 	/*
4989fa94a07fSbrendan 	 * Add device to global list
4990fa94a07fSbrendan 	 */
4991fa94a07fSbrendan 	mutex_enter(&l2arc_dev_mtx);
4992fa94a07fSbrendan 	list_insert_head(l2arc_dev_list, adddev);
4993fa94a07fSbrendan 	atomic_inc_64(&l2arc_ndev);
4994fa94a07fSbrendan 	mutex_exit(&l2arc_dev_mtx);
4995fa94a07fSbrendan }
4996fa94a07fSbrendan 
4997fa94a07fSbrendan /*
4998fa94a07fSbrendan  * Remove a vdev from the L2ARC.
4999fa94a07fSbrendan  */
5000fa94a07fSbrendan void
5001fa94a07fSbrendan l2arc_remove_vdev(vdev_t *vd)
5002fa94a07fSbrendan {
5003fa94a07fSbrendan 	l2arc_dev_t *dev, *nextdev, *remdev = NULL;
5004fa94a07fSbrendan 
5005fa94a07fSbrendan 	/*
5006fa94a07fSbrendan 	 * Find the device by vdev
5007fa94a07fSbrendan 	 */
5008fa94a07fSbrendan 	mutex_enter(&l2arc_dev_mtx);
5009fa94a07fSbrendan 	for (dev = list_head(l2arc_dev_list); dev; dev = nextdev) {
5010fa94a07fSbrendan 		nextdev = list_next(l2arc_dev_list, dev);
5011fa94a07fSbrendan 		if (vd == dev->l2ad_vdev) {
5012fa94a07fSbrendan 			remdev = dev;
5013fa94a07fSbrendan 			break;
5014fa94a07fSbrendan 		}
5015fa94a07fSbrendan 	}
5016fa94a07fSbrendan 	ASSERT(remdev != NULL);
5017fa94a07fSbrendan 
5018fa94a07fSbrendan 	/*
5019fa94a07fSbrendan 	 * Remove device from global list
5020fa94a07fSbrendan 	 */
5021fa94a07fSbrendan 	list_remove(l2arc_dev_list, remdev);
5022fa94a07fSbrendan 	l2arc_dev_last = NULL;		/* may have been invalidated */
50233a737e0dSbrendan 	atomic_dec_64(&l2arc_ndev);
50243a737e0dSbrendan 	mutex_exit(&l2arc_dev_mtx);
5025fa94a07fSbrendan 
5026fa94a07fSbrendan 	/*
5027fa94a07fSbrendan 	 * Clear all buflists and ARC references.  L2ARC device flush.
5028fa94a07fSbrendan 	 */
5029fa94a07fSbrendan 	l2arc_evict(remdev, 0, B_TRUE);
5030fa94a07fSbrendan 	list_destroy(remdev->l2ad_buflist);
5031fa94a07fSbrendan 	kmem_free(remdev->l2ad_buflist, sizeof (list_t));
5032fa94a07fSbrendan 	kmem_free(remdev, sizeof (l2arc_dev_t));
5033fa94a07fSbrendan }
5034fa94a07fSbrendan 
5035fa94a07fSbrendan void
5036e14bb325SJeff Bonwick l2arc_init(void)
5037fa94a07fSbrendan {
5038fa94a07fSbrendan 	l2arc_thread_exit = 0;
5039fa94a07fSbrendan 	l2arc_ndev = 0;
5040fa94a07fSbrendan 	l2arc_writes_sent = 0;
5041fa94a07fSbrendan 	l2arc_writes_done = 0;
5042fa94a07fSbrendan 
5043fa94a07fSbrendan 	mutex_init(&l2arc_feed_thr_lock, NULL, MUTEX_DEFAULT, NULL);
5044fa94a07fSbrendan 	cv_init(&l2arc_feed_thr_cv, NULL, CV_DEFAULT, NULL);
5045fa94a07fSbrendan 	mutex_init(&l2arc_dev_mtx, NULL, MUTEX_DEFAULT, NULL);
5046fa94a07fSbrendan 	mutex_init(&l2arc_buflist_mtx, NULL, MUTEX_DEFAULT, NULL);
5047fa94a07fSbrendan 	mutex_init(&l2arc_free_on_write_mtx, NULL, MUTEX_DEFAULT, NULL);
5048fa94a07fSbrendan 
5049fa94a07fSbrendan 	l2arc_dev_list = &L2ARC_dev_list;
5050fa94a07fSbrendan 	l2arc_free_on_write = &L2ARC_free_on_write;
5051fa94a07fSbrendan 	list_create(l2arc_dev_list, sizeof (l2arc_dev_t),
5052fa94a07fSbrendan 	    offsetof(l2arc_dev_t, l2ad_node));
5053fa94a07fSbrendan 	list_create(l2arc_free_on_write, sizeof (l2arc_data_free_t),
5054fa94a07fSbrendan 	    offsetof(l2arc_data_free_t, l2df_list_node));
5055fa94a07fSbrendan }
5056fa94a07fSbrendan 
5057fa94a07fSbrendan void
5058e14bb325SJeff Bonwick l2arc_fini(void)
5059fa94a07fSbrendan {
50603a737e0dSbrendan 	/*
50613a737e0dSbrendan 	 * This is called from dmu_fini(), which is called from spa_fini().
50623a737e0dSbrendan 	 * Because of this, we can assume that all l2arc devices have
50633a737e0dSbrendan 	 * already been removed when the pools themselves were removed.
50643a737e0dSbrendan 	 */
50653a737e0dSbrendan 
50663a737e0dSbrendan 	l2arc_do_free_on_write();
50673a737e0dSbrendan 
5068fa94a07fSbrendan 	mutex_destroy(&l2arc_feed_thr_lock);
5069fa94a07fSbrendan 	cv_destroy(&l2arc_feed_thr_cv);
5070fa94a07fSbrendan 	mutex_destroy(&l2arc_dev_mtx);
5071fa94a07fSbrendan 	mutex_destroy(&l2arc_buflist_mtx);
5072fa94a07fSbrendan 	mutex_destroy(&l2arc_free_on_write_mtx);
5073fa94a07fSbrendan 
5074fa94a07fSbrendan 	list_destroy(l2arc_dev_list);
5075fa94a07fSbrendan 	list_destroy(l2arc_free_on_write);
5076fa94a07fSbrendan }
5077e14bb325SJeff Bonwick 
5078e14bb325SJeff Bonwick void
5079e14bb325SJeff Bonwick l2arc_start(void)
5080e14bb325SJeff Bonwick {
50818ad4d6ddSJeff Bonwick 	if (!(spa_mode_global & FWRITE))
5082e14bb325SJeff Bonwick 		return;
5083e14bb325SJeff Bonwick 
5084e14bb325SJeff Bonwick 	(void) thread_create(NULL, 0, l2arc_feed_thread, NULL, 0, &p0,
5085e14bb325SJeff Bonwick 	    TS_RUN, minclsyspri);
5086e14bb325SJeff Bonwick }
5087e14bb325SJeff Bonwick 
5088e14bb325SJeff Bonwick void
5089e14bb325SJeff Bonwick l2arc_stop(void)
5090e14bb325SJeff Bonwick {
50918ad4d6ddSJeff Bonwick 	if (!(spa_mode_global & FWRITE))
5092e14bb325SJeff Bonwick 		return;
5093e14bb325SJeff Bonwick 
5094e14bb325SJeff Bonwick 	mutex_enter(&l2arc_feed_thr_lock);
5095e14bb325SJeff Bonwick 	cv_signal(&l2arc_feed_thr_cv);	/* kick thread out of startup */
5096e14bb325SJeff Bonwick 	l2arc_thread_exit = 1;
5097e14bb325SJeff Bonwick 	while (l2arc_thread_exit != 0)
5098e14bb325SJeff Bonwick 		cv_wait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock);
5099e14bb325SJeff Bonwick 	mutex_exit(&l2arc_feed_thr_lock);
5100e14bb325SJeff Bonwick }
5101