1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23 * Copyright (c) 2018, Joyent, Inc.
24 * Copyright (c) 2011, 2017 by Delphix. All rights reserved.
25 * Copyright (c) 2014 by Saso Kiselkov. All rights reserved.
26 * Copyright 2017 Nexenta Systems, Inc.  All rights reserved.
27 */
28
29/*
30 * DVA-based Adjustable Replacement Cache
31 *
32 * While much of the theory of operation used here is
33 * based on the self-tuning, low overhead replacement cache
34 * presented by Megiddo and Modha at FAST 2003, there are some
35 * significant differences:
36 *
37 * 1. The Megiddo and Modha model assumes any page is evictable.
38 * Pages in its cache cannot be "locked" into memory.  This makes
39 * the eviction algorithm simple: evict the last page in the list.
 * This also makes the performance characteristics easy to reason
 * about.  Our cache is not so simple.  At any given moment, some
42 * subset of the blocks in the cache are un-evictable because we
43 * have handed out a reference to them.  Blocks are only evictable
44 * when there are no external references active.  This makes
45 * eviction far more problematic:  we choose to evict the evictable
46 * blocks that are the "lowest" in the list.
47 *
48 * There are times when it is not possible to evict the requested
49 * space.  In these circumstances we are unable to adjust the cache
50 * size.  To prevent the cache growing unbounded at these times we
51 * implement a "cache throttle" that slows the flow of new data
52 * into the cache until we can make space available.
53 *
54 * 2. The Megiddo and Modha model assumes a fixed cache size.
55 * Pages are evicted when the cache is full and there is a cache
56 * miss.  Our model has a variable sized cache.  It grows with
57 * high use, but also tries to react to memory pressure from the
58 * operating system: decreasing its size when system memory is
59 * tight.
60 *
 * 3. The Megiddo and Modha model assumes a fixed page size. All
 * elements of the cache are therefore exactly the same size.  So
 * when adjusting the cache size following a cache miss, it's simply
 * a matter of choosing a single page to evict.  In our model, we
 * have variable sized cache blocks (ranging from 512 bytes to
 * 128K bytes).  We therefore choose a set of blocks to evict to make
 * space for a cache miss that approximates as closely as possible
 * the space used by the new block.
69 *
70 * See also:  "ARC: A Self-Tuning, Low Overhead Replacement Cache"
71 * by N. Megiddo & D. Modha, FAST 2003
72 */
73
74/*
75 * The locking model:
76 *
77 * A new reference to a cache buffer can be obtained in two
78 * ways: 1) via a hash table lookup using the DVA as a key,
79 * or 2) via one of the ARC lists.  The arc_read() interface
80 * uses method 1, while the internal ARC algorithms for
81 * adjusting the cache use method 2.  We therefore provide two
82 * types of locks: 1) the hash table lock array, and 2) the
83 * ARC list locks.
84 *
85 * Buffers do not have their own mutexes, rather they rely on the
86 * hash table mutexes for the bulk of their protection (i.e. most
87 * fields in the arc_buf_hdr_t are protected by these mutexes).
88 *
89 * buf_hash_find() returns the appropriate mutex (held) when it
90 * locates the requested buffer in the hash table.  It returns
91 * NULL for the mutex if the buffer was not in the table.
92 *
93 * buf_hash_remove() expects the appropriate hash mutex to be
94 * already held before it is invoked.
95 *
96 * Each ARC state also has a mutex which is used to protect the
97 * buffer list associated with the state.  When attempting to
98 * obtain a hash table lock while holding an ARC list lock you
99 * must use: mutex_tryenter() to avoid deadlock.  Also note that
100 * the active state mutex must be held before the ghost state mutex.
101 *
102 * Note that the majority of the performance stats are manipulated
103 * with atomic operations.
104 *
105 * The L2ARC uses the l2ad_mtx on each vdev for the following:
106 *
107 *	- L2ARC buflist creation
108 *	- L2ARC buflist eviction
109 *	- L2ARC write completion, which walks L2ARC buflists
110 *	- ARC header destruction, as it removes from L2ARC buflists
111 *	- ARC header release, as it removes from L2ARC buflists
112 */
113
114/*
115 * ARC operation:
116 *
117 * Every block that is in the ARC is tracked by an arc_buf_hdr_t structure.
118 * This structure can point either to a block that is still in the cache or to
119 * one that is only accessible in an L2 ARC device, or it can provide
120 * information about a block that was recently evicted. If a block is
121 * only accessible in the L2ARC, then the arc_buf_hdr_t only has enough
122 * information to retrieve it from the L2ARC device. This information is
123 * stored in the l2arc_buf_hdr_t sub-structure of the arc_buf_hdr_t. A block
124 * that is in this state cannot access the data directly.
125 *
126 * Blocks that are actively being referenced or have not been evicted
127 * are cached in the L1ARC. The L1ARC (l1arc_buf_hdr_t) is a structure within
128 * the arc_buf_hdr_t that will point to the data block in memory. A block can
129 * only be read by a consumer if it has an l1arc_buf_hdr_t. The L1ARC
130 * caches data in two ways -- in a list of ARC buffers (arc_buf_t) and
131 * also in the arc_buf_hdr_t's private physical data block pointer (b_pabd).
132 *
133 * The L1ARC's data pointer may or may not be uncompressed. The ARC has the
134 * ability to store the physical data (b_pabd) associated with the DVA of the
135 * arc_buf_hdr_t. Since the b_pabd is a copy of the on-disk physical block,
136 * it will match its on-disk compression characteristics. This behavior can be
137 * disabled by setting 'zfs_compressed_arc_enabled' to B_FALSE. When the
138 * compressed ARC functionality is disabled, the b_pabd will point to an
139 * uncompressed version of the on-disk data.
140 *
141 * Data in the L1ARC is not accessed by consumers of the ARC directly. Each
142 * arc_buf_hdr_t can have multiple ARC buffers (arc_buf_t) which reference it.
143 * Each ARC buffer (arc_buf_t) is being actively accessed by a specific ARC
144 * consumer. The ARC will provide references to this data and will keep it
145 * cached until it is no longer in use. The ARC caches only the L1ARC's physical
146 * data block and will evict any arc_buf_t that is no longer referenced. The
147 * amount of memory consumed by the arc_buf_ts' data buffers can be seen via the
148 * "overhead_size" kstat.
149 *
150 * Depending on the consumer, an arc_buf_t can be requested in uncompressed or
151 * compressed form. The typical case is that consumers will want uncompressed
152 * data, and when that happens a new data buffer is allocated where the data is
153 * decompressed for them to use. Currently the only consumer who wants
154 * compressed arc_buf_t's is "zfs send", when it streams data exactly as it
155 * exists on disk. When this happens, the arc_buf_t's data buffer is shared
156 * with the arc_buf_hdr_t.
157 *
158 * Here is a diagram showing an arc_buf_hdr_t referenced by two arc_buf_t's. The
159 * first one is owned by a compressed send consumer (and therefore references
160 * the same compressed data buffer as the arc_buf_hdr_t) and the second could be
161 * used by any other consumer (and has its own uncompressed copy of the data
162 * buffer).
163 *
164 *   arc_buf_hdr_t
165 *   +-----------+
166 *   | fields    |
167 *   | common to |
168 *   | L1- and   |
169 *   | L2ARC     |
170 *   +-----------+
171 *   | l2arc_buf_hdr_t
172 *   |           |
173 *   +-----------+
174 *   | l1arc_buf_hdr_t
175 *   |           |              arc_buf_t
176 *   | b_buf     +------------>+-----------+      arc_buf_t
177 *   | b_pabd    +-+           |b_next     +---->+-----------+
178 *   +-----------+ |           |-----------|     |b_next     +-->NULL
179 *                 |           |b_comp = T |     +-----------+
180 *                 |           |b_data     +-+   |b_comp = F |
181 *                 |           +-----------+ |   |b_data     +-+
182 *                 +->+------+               |   +-----------+ |
183 *        compressed  |      |               |                 |
184 *           data     |      |<--------------+                 | uncompressed
185 *                    +------+          compressed,            |     data
186 *                                        shared               +-->+------+
187 *                                         data                    |      |
188 *                                                                 |      |
189 *                                                                 +------+
190 *
191 * When a consumer reads a block, the ARC must first look to see if the
192 * arc_buf_hdr_t is cached. If the hdr is cached then the ARC allocates a new
193 * arc_buf_t and either copies uncompressed data into a new data buffer from an
194 * existing uncompressed arc_buf_t, decompresses the hdr's b_pabd buffer into a
195 * new data buffer, or shares the hdr's b_pabd buffer, depending on whether the
196 * hdr is compressed and the desired compression characteristics of the
197 * arc_buf_t consumer. If the arc_buf_t ends up sharing data with the
198 * arc_buf_hdr_t and both of them are uncompressed then the arc_buf_t must be
199 * the last buffer in the hdr's b_buf list, however a shared compressed buf can
200 * be anywhere in the hdr's list.
201 *
202 * The diagram below shows an example of an uncompressed ARC hdr that is
203 * sharing its data with an arc_buf_t (note that the shared uncompressed buf is
204 * the last element in the buf list):
205 *
206 *                arc_buf_hdr_t
207 *                +-----------+
208 *                |           |
209 *                |           |
210 *                |           |
211 *                +-----------+
212 * l2arc_buf_hdr_t|           |
213 *                |           |
214 *                +-----------+
215 * l1arc_buf_hdr_t|           |
216 *                |           |                 arc_buf_t    (shared)
217 *                |    b_buf  +------------>+---------+      arc_buf_t
218 *                |           |             |b_next   +---->+---------+
219 *                |  b_pabd   +-+           |---------|     |b_next   +-->NULL
220 *                +-----------+ |           |         |     +---------+
221 *                              |           |b_data   +-+   |         |
222 *                              |           +---------+ |   |b_data   +-+
223 *                              +->+------+             |   +---------+ |
224 *                                 |      |             |               |
225 *                   uncompressed  |      |             |               |
226 *                        data     +------+             |               |
227 *                                    ^                 +->+------+     |
228 *                                    |       uncompressed |      |     |
229 *                                    |           data     |      |     |
230 *                                    |                    +------+     |
231 *                                    +---------------------------------+
232 *
233 * Writing to the ARC requires that the ARC first discard the hdr's b_pabd
234 * since the physical block is about to be rewritten. The new data contents
235 * will be contained in the arc_buf_t. As the I/O pipeline performs the write,
236 * it may compress the data before writing it to disk. The ARC will be called
237 * with the transformed data and will bcopy the transformed on-disk block into
238 * a newly allocated b_pabd. Writes are always done into buffers which have
239 * either been loaned (and hence are new and don't have other readers) or
240 * buffers which have been released (and hence have their own hdr, if there
241 * were originally other readers of the buf's original hdr). This ensures that
242 * the ARC only needs to update a single buf and its hdr after a write occurs.
243 *
244 * When the L2ARC is in use, it will also take advantage of the b_pabd. The
245 * L2ARC will always write the contents of b_pabd to the L2ARC. This means
246 * that when compressed ARC is enabled that the L2ARC blocks are identical
247 * to the on-disk block in the main data pool. This provides a significant
248 * advantage since the ARC can leverage the bp's checksum when reading from the
249 * L2ARC to determine if the contents are valid. However, if the compressed
250 * ARC is disabled, then the L2ARC's block must be transformed to look
251 * like the physical block in the main data pool before comparing the
252 * checksum and determining its validity.
253 */
254
255#include <sys/spa.h>
256#include <sys/zio.h>
257#include <sys/spa_impl.h>
258#include <sys/zio_compress.h>
259#include <sys/zio_checksum.h>
260#include <sys/zfs_context.h>
261#include <sys/arc.h>
262#include <sys/refcount.h>
263#include <sys/vdev.h>
264#include <sys/vdev_impl.h>
265#include <sys/dsl_pool.h>
266#include <sys/zio_checksum.h>
267#include <sys/multilist.h>
268#include <sys/abd.h>
269#ifdef _KERNEL
270#include <sys/dnlc.h>
271#include <sys/racct.h>
272#endif
273#include <sys/callb.h>
274#include <sys/kstat.h>
275#include <sys/trim_map.h>
276#include <zfs_fletcher.h>
277#include <sys/sdt.h>
278#include <sys/aggsum.h>
279#include <sys/cityhash.h>
280
281#include <machine/vmparam.h>
282
#if defined(illumos) && !defined(_KERNEL)
/*
 * Userland-only illumos debugging aid: ZFS_DEBUG=watch sets arc_watch,
 * which enables watchpoints on frozen buffers.
 */
boolean_t arc_watch = B_FALSE;
int arc_procfd;
#endif /* illumos && !_KERNEL */
290
/*
 * Lock, condition variables and exit flag whose names tie them to the ARC
 * reclaim thread; the thread itself is not defined in this part of the file.
 */
static kmutex_t		arc_reclaim_lock;
static kcondvar_t	arc_reclaim_thread_cv;
static boolean_t	arc_reclaim_thread_exit;
static kcondvar_t	arc_reclaim_waiters_cv;

/*
 * The equivalent lock / condition variable / exit flag for DNLC eviction.
 * NOTE(review): presumably serviced by its own worker thread -- confirm
 * against the code that uses these.
 */
static kmutex_t		arc_dnlc_evicts_lock;
static kcondvar_t	arc_dnlc_evicts_cv;
static boolean_t	arc_dnlc_evicts_thread_exit;

/* NOTE(review): presumably the percentage of the DNLC to trim; confirm. */
uint_t arc_reduce_dnlc_percent = 3;
301
/*
 * The number of headers to evict in arc_evict_state_impl() before
 * dropping the sublist lock and evicting from another sublist. A lower
 * value means we're more likely to evict the "correct" header (i.e. the
 * oldest header in the arc state), but comes with higher overhead
 * (i.e. more invocations of arc_evict_state_impl()).
 */
int zfs_arc_evict_batch_limit = 10;

/* Number of seconds to wait before growing the cache again. */
static int		arc_grow_retry = 60;

/* Number of milliseconds to wait before attempting a kmem-cache reap. */
static int		arc_kmem_cache_reap_retry_ms = 1000;

/*
 * Shift applied to arc_c when calculating the overflow limit in
 * arc_get_data_impl; the default of 8 corresponds to arc_c / 256.
 */
int		zfs_arc_overflow_shift = 8;

/*
 * Shift applied to arc_c when calculating both the minimum and the
 * maximum arc_p.
 */
static int		arc_p_min_shift = 4;

/* log2(fraction of arc to reclaim); the default of 7 means 1/128. */
static int		arc_shrink_shift = 7;

/*
 * log2(fraction of ARC which must be free to allow growing).
 * I.e. If there is less than arc_c >> arc_no_grow_shift free memory,
 * when reading a new block into the ARC, we will evict an equal-sized block
 * from the ARC.
 *
 * This must be less than arc_shrink_shift, so that when we shrink the ARC,
 * we will still not allow it to grow.  The defaults satisfy this: 5 here
 * versus 7 for arc_shrink_shift.
 */
int			arc_no_grow_shift = 5;
336
337
/*
 * Minimum lifespan of a prefetch block.
 *
 * NOTE(review): this comment used to say "in clock ticks (initialized in
 * arc_init())", but the variable names carry an _ms suffix, which suggests
 * milliseconds.  arc_init() is not visible here; confirm the unit and
 * whether arc_init() still touches these before relying on either reading.
 */
static int		zfs_arc_min_prefetch_ms = 1;
static int		zfs_arc_min_prescient_prefetch_ms = 6;

/*
 * If this percent of memory is free, don't throttle.
 */
int arc_lotsfree_percent = 10;

/* NOTE(review): presumably set once the ARC has been torn down; confirm. */
static int arc_dead;
/* Defined in another file; only referenced from here. */
extern boolean_t zfs_prefetch_disable;

/*
 * The arc has filled available memory and has now warmed up.
 */
static boolean_t arc_warm;

/*
 * log2 fraction of the zio arena to keep free; the default of 2 means 1/4.
 */
int arc_zio_arena_free_shift = 2;
362
/*
 * These tunables are for performance analysis.
 *
 * Several of them (zfs_arc_grow_retry, zfs_arc_shrink_shift,
 * zfs_arc_no_grow_shift, zfs_arc_p_min_shift) share a name with a working
 * variable declared earlier in this file (arc_grow_retry, ...) and default
 * to 0.  NOTE(review): presumably 0 means "keep the built-in default";
 * confirm against the code that consumes them.
 */
uint64_t zfs_arc_max;
uint64_t zfs_arc_min;
uint64_t zfs_arc_meta_limit = 0;
uint64_t zfs_arc_meta_min = 0;
int zfs_arc_grow_retry = 0;
int zfs_arc_shrink_shift = 0;
int zfs_arc_no_grow_shift = 0;
int zfs_arc_p_min_shift = 0;
uint64_t zfs_arc_average_blocksize = 8 * 1024; /* 8KB */
u_int zfs_arc_free_target = 0;

/* Absolute min for arc min / max is 16MB (16 << 20 bytes). */
static uint64_t arc_abs_min = 16 << 20;

/* Matches the 'zfs_compressed_arc_enabled' switch described at the top. */
boolean_t zfs_compressed_arc_enabled = B_TRUE;

/*
 * Handlers for the vfs.zfs sysctl nodes registered with SYSCTL_PROC further
 * down.  Of these, only sysctl_vfs_zfs_arc_free_target is defined in this
 * part of the file.
 */
static int sysctl_vfs_zfs_arc_free_target(SYSCTL_HANDLER_ARGS);
static int sysctl_vfs_zfs_arc_meta_limit(SYSCTL_HANDLER_ARGS);
static int sysctl_vfs_zfs_arc_max(SYSCTL_HANDLER_ARGS);
static int sysctl_vfs_zfs_arc_min(SYSCTL_HANDLER_ARGS);
static int sysctl_vfs_zfs_arc_no_grow_shift(SYSCTL_HANDLER_ARGS);
387
388#if defined(__FreeBSD__) && defined(_KERNEL)
389static void
390arc_free_target_init(void *unused __unused)
391{
392
393	zfs_arc_free_target = vm_cnt.v_free_target;
394}
395SYSINIT(arc_free_target_init, SI_SUB_KTHREAD_PAGE, SI_ORDER_ANY,
396    arc_free_target_init, NULL);
397
/*
 * Tunables: these bind the vfs.zfs.* names to the zfs_arc_* request
 * variables declared above.
 */
TUNABLE_QUAD("vfs.zfs.arc_meta_limit", &zfs_arc_meta_limit);
TUNABLE_QUAD("vfs.zfs.arc_meta_min", &zfs_arc_meta_min);
TUNABLE_INT("vfs.zfs.arc_shrink_shift", &zfs_arc_shrink_shift);
TUNABLE_INT("vfs.zfs.arc_grow_retry", &zfs_arc_grow_retry);
TUNABLE_INT("vfs.zfs.arc_no_grow_shift", &zfs_arc_no_grow_shift);
SYSCTL_DECL(_vfs_zfs);
/* Handler-backed nodes; the handlers are prototyped earlier in this file. */
SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_max, CTLTYPE_U64 | CTLFLAG_RWTUN,
    0, sizeof(uint64_t), sysctl_vfs_zfs_arc_max, "QU", "Maximum ARC size");
SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_min, CTLTYPE_U64 | CTLFLAG_RWTUN,
    0, sizeof(uint64_t), sysctl_vfs_zfs_arc_min, "QU", "Minimum ARC size");
SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_no_grow_shift, CTLTYPE_U32 | CTLFLAG_RWTUN,
    0, sizeof(uint32_t), sysctl_vfs_zfs_arc_no_grow_shift, "U",
    "log2(fraction of ARC which must be free to allow growing)");
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, arc_average_blocksize, CTLFLAG_RDTUN,
    &zfs_arc_average_blocksize, 0,
    "ARC average blocksize");
/*
 * Note that the next two nodes expose the working variables
 * (arc_shrink_shift, arc_grow_retry) directly, whereas the tunables of the
 * same name above set the zfs_arc_* request variables instead.
 */
SYSCTL_INT(_vfs_zfs, OID_AUTO, arc_shrink_shift, CTLFLAG_RW,
    &arc_shrink_shift, 0,
    "log2(fraction of arc to reclaim)");
SYSCTL_INT(_vfs_zfs, OID_AUTO, arc_grow_retry, CTLFLAG_RW,
    &arc_grow_retry, 0,
    "Wait in seconds before considering growing ARC");
SYSCTL_INT(_vfs_zfs, OID_AUTO, compressed_arc_enabled, CTLFLAG_RDTUN,
    &zfs_compressed_arc_enabled, 0, "Enable compressed ARC");

/*
 * We don't have a tunable for arc_free_target due to the dependency on
 * pagedaemon initialisation.  Its initial value is instead set by
 * arc_free_target_init(), and later changes go through the handler below.
 */
SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_free_target,
    CTLTYPE_UINT | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof(u_int),
    sysctl_vfs_zfs_arc_free_target, "IU",
    "Desired number of free pages below which ARC triggers reclaim");
431
432static int
433sysctl_vfs_zfs_arc_free_target(SYSCTL_HANDLER_ARGS)
434{
435	u_int val;
436	int err;
437
438	val = zfs_arc_free_target;
439	err = sysctl_handle_int(oidp, &val, 0, req);
440	if (err != 0 || req->newptr == NULL)
441		return (err);
442
443	if (val < minfree)
444		return (EINVAL);
445	if (val > vm_cnt.v_page_count)
446		return (EINVAL);
447
448	zfs_arc_free_target = val;
449
450	return (0);
451}
452
/*
 * This node must be declared here, before the corresponding kstat macro is
 * defined: that macro uses the same name and would otherwise confuse the
 * compiler.
 */
SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_meta_limit,
    CTLTYPE_U64 | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof(uint64_t),
    sysctl_vfs_zfs_arc_meta_limit, "QU",
    "ARC metadata limit");
461#endif
462
463/*
 * Note that buffers can be in one of 6 states:
 *	ARC_anon	- anonymous (discussed below)
 *	ARC_mru		- recently used, currently cached
 *	ARC_mru_ghost	- recently used, no longer in cache
 *	ARC_mfu		- frequently used, currently cached
 *	ARC_mfu_ghost	- frequently used, no longer in cache
 *	ARC_l2c_only	- exists in L2ARC but not other states
 * When there are no active references to a buffer, it is
 * linked onto a list in one of these arc states.  These are
 * the only buffers that can be evicted or deleted.  Within each
 * state there are multiple lists, one for meta-data and one for
 * non-meta-data.  Meta-data (indirect blocks, blocks of dnodes,
 * etc.) is tracked separately so that it can be managed more
 * explicitly: favored over data, limited explicitly.
478 *
 * Anonymous buffers are buffers that are not associated with
 * a DVA.  These are buffers that hold dirty block copies
 * before they are written to stable storage.  By definition,
 * they are "ref'd" and are considered part of arc_mru
 * that cannot be freed.  Generally, they will acquire a DVA
 * as they are written and migrate onto the arc_mru list.
485 *
486 * The ARC_l2c_only state is for buffers that are in the second
487 * level ARC but no longer in any of the ARC_m* lists.  The second
488 * level ARC itself may also contain buffers that are in any of
489 * the ARC_m* states - meaning that a buffer can exist in two
490 * places.  The reason for the ARC_l2c_only state is to keep the
491 * buffer header in the hash table, so that reads that hit the
492 * second level ARC benefit from these fast lookups.
493 */
494
/*
 * Bookkeeping for a single ARC state.  The two arrays have
 * ARC_BUFC_NUMTYPES entries, i.e. one slot per buffer content type
 * (the size comment below names ARC_BUFC_DATA and ARC_BUFC_METADATA).
 */
typedef struct arc_state {
	/*
	 * Lists of evictable buffers, one multilist per buffer type.
	 */
	multilist_t *arcs_list[ARC_BUFC_NUMTYPES];
	/*
	 * Total amount of evictable data in this state, per buffer type.
	 */
	refcount_t arcs_esize[ARC_BUFC_NUMTYPES];
	/*
	 * Total amount of data in this state; this includes: evictable,
	 * non-evictable, ARC_BUFC_DATA, and ARC_BUFC_METADATA.
	 */
	refcount_t arcs_size;
} arc_state_t;
510
/*
 * The 6 states: one arc_state_t object for each state a buffer can be in
 * (anonymous, MRU, MRU ghost, MFU, MFU ghost, L2ARC-only).
 */
static arc_state_t ARC_anon;
static arc_state_t ARC_mru;
static arc_state_t ARC_mru_ghost;
static arc_state_t ARC_mfu;
static arc_state_t ARC_mfu_ghost;
static arc_state_t ARC_l2c_only;
518
519typedef struct arc_stats {
520	kstat_named_t arcstat_hits;
521	kstat_named_t arcstat_misses;
522	kstat_named_t arcstat_demand_data_hits;
523	kstat_named_t arcstat_demand_data_misses;
524	kstat_named_t arcstat_demand_metadata_hits;
525	kstat_named_t arcstat_demand_metadata_misses;
526	kstat_named_t arcstat_prefetch_data_hits;
527	kstat_named_t arcstat_prefetch_data_misses;
528	kstat_named_t arcstat_prefetch_metadata_hits;
529	kstat_named_t arcstat_prefetch_metadata_misses;
530	kstat_named_t arcstat_mru_hits;
531	kstat_named_t arcstat_mru_ghost_hits;
532	kstat_named_t arcstat_mfu_hits;
533	kstat_named_t arcstat_mfu_ghost_hits;
534	kstat_named_t arcstat_allocated;
535	kstat_named_t arcstat_deleted;
536	/*
537	 * Number of buffers that could not be evicted because the hash lock
538	 * was held by another thread.  The lock may not necessarily be held
539	 * by something using the same buffer, since hash locks are shared
540	 * by multiple buffers.
541	 */
542	kstat_named_t arcstat_mutex_miss;
543	/*
544	 * Number of buffers skipped when updating the access state due to the
545	 * header having already been released after acquiring the hash lock.
546	 */
547	kstat_named_t arcstat_access_skip;
548	/*
549	 * Number of buffers skipped because they have I/O in progress, are
550	 * indirect prefetch buffers that have not lived long enough, or are
551	 * not from the spa we're trying to evict from.
552	 */
553	kstat_named_t arcstat_evict_skip;
554	/*
	 * Number of times arc_evict_state() was unable to evict enough
	 * buffers to reach its target amount.
557	 */
558	kstat_named_t arcstat_evict_not_enough;
559	kstat_named_t arcstat_evict_l2_cached;
560	kstat_named_t arcstat_evict_l2_eligible;
561	kstat_named_t arcstat_evict_l2_ineligible;
562	kstat_named_t arcstat_evict_l2_skip;
563	kstat_named_t arcstat_hash_elements;
564	kstat_named_t arcstat_hash_elements_max;
565	kstat_named_t arcstat_hash_collisions;
566	kstat_named_t arcstat_hash_chains;
567	kstat_named_t arcstat_hash_chain_max;
568	kstat_named_t arcstat_p;
569	kstat_named_t arcstat_c;
570	kstat_named_t arcstat_c_min;
571	kstat_named_t arcstat_c_max;
572	/* Not updated directly; only synced in arc_kstat_update. */
573	kstat_named_t arcstat_size;
574	/*
575	 * Number of compressed bytes stored in the arc_buf_hdr_t's b_pabd.
576	 * Note that the compressed bytes may match the uncompressed bytes
577	 * if the block is either not compressed or compressed arc is disabled.
578	 */
579	kstat_named_t arcstat_compressed_size;
580	/*
581	 * Uncompressed size of the data stored in b_pabd. If compressed
582	 * arc is disabled then this value will be identical to the stat
583	 * above.
584	 */
585	kstat_named_t arcstat_uncompressed_size;
586	/*
587	 * Number of bytes stored in all the arc_buf_t's. This is classified
588	 * as "overhead" since this data is typically short-lived and will
589	 * be evicted from the arc when it becomes unreferenced unless the
590	 * zfs_keep_uncompressed_metadata or zfs_keep_uncompressed_level
591	 * values have been set (see comment in dbuf.c for more information).
592	 */
593	kstat_named_t arcstat_overhead_size;
594	/*
595	 * Number of bytes consumed by internal ARC structures necessary
596	 * for tracking purposes; these structures are not actually
597	 * backed by ARC buffers. This includes arc_buf_hdr_t structures
598	 * (allocated via arc_buf_hdr_t_full and arc_buf_hdr_t_l2only
599	 * caches), and arc_buf_t structures (allocated via arc_buf_t
600	 * cache).
601	 * Not updated directly; only synced in arc_kstat_update.
602	 */
603	kstat_named_t arcstat_hdr_size;
604	/*
605	 * Number of bytes consumed by ARC buffers of type equal to
606	 * ARC_BUFC_DATA. This is generally consumed by buffers backing
607	 * on disk user data (e.g. plain file contents).
608	 * Not updated directly; only synced in arc_kstat_update.
609	 */
610	kstat_named_t arcstat_data_size;
611	/*
612	 * Number of bytes consumed by ARC buffers of type equal to
613	 * ARC_BUFC_METADATA. This is generally consumed by buffers
614	 * backing on disk data that is used for internal ZFS
615	 * structures (e.g. ZAP, dnode, indirect blocks, etc).
616	 * Not updated directly; only synced in arc_kstat_update.
617	 */
618	kstat_named_t arcstat_metadata_size;
619	/*
620	 * Number of bytes consumed by various buffers and structures
621	 * not actually backed with ARC buffers. This includes bonus
622	 * buffers (allocated directly via zio_buf_* functions),
623	 * dmu_buf_impl_t structures (allocated via dmu_buf_impl_t
624	 * cache), and dnode_t structures (allocated via dnode_t cache).
625	 * Not updated directly; only synced in arc_kstat_update.
626	 */
627	kstat_named_t arcstat_other_size;
628	/*
629	 * Total number of bytes consumed by ARC buffers residing in the
630	 * arc_anon state. This includes *all* buffers in the arc_anon
631	 * state; e.g. data, metadata, evictable, and unevictable buffers
632	 * are all included in this value.
633	 * Not updated directly; only synced in arc_kstat_update.
634	 */
635	kstat_named_t arcstat_anon_size;
636	/*
637	 * Number of bytes consumed by ARC buffers that meet the
638	 * following criteria: backing buffers of type ARC_BUFC_DATA,
639	 * residing in the arc_anon state, and are eligible for eviction
640	 * (e.g. have no outstanding holds on the buffer).
641	 * Not updated directly; only synced in arc_kstat_update.
642	 */
643	kstat_named_t arcstat_anon_evictable_data;
644	/*
645	 * Number of bytes consumed by ARC buffers that meet the
646	 * following criteria: backing buffers of type ARC_BUFC_METADATA,
647	 * residing in the arc_anon state, and are eligible for eviction
648	 * (e.g. have no outstanding holds on the buffer).
649	 * Not updated directly; only synced in arc_kstat_update.
650	 */
651	kstat_named_t arcstat_anon_evictable_metadata;
652	/*
653	 * Total number of bytes consumed by ARC buffers residing in the
654	 * arc_mru state. This includes *all* buffers in the arc_mru
655	 * state; e.g. data, metadata, evictable, and unevictable buffers
656	 * are all included in this value.
657	 * Not updated directly; only synced in arc_kstat_update.
658	 */
659	kstat_named_t arcstat_mru_size;
660	/*
661	 * Number of bytes consumed by ARC buffers that meet the
662	 * following criteria: backing buffers of type ARC_BUFC_DATA,
663	 * residing in the arc_mru state, and are eligible for eviction
664	 * (e.g. have no outstanding holds on the buffer).
665	 * Not updated directly; only synced in arc_kstat_update.
666	 */
667	kstat_named_t arcstat_mru_evictable_data;
668	/*
669	 * Number of bytes consumed by ARC buffers that meet the
670	 * following criteria: backing buffers of type ARC_BUFC_METADATA,
671	 * residing in the arc_mru state, and are eligible for eviction
672	 * (e.g. have no outstanding holds on the buffer).
673	 * Not updated directly; only synced in arc_kstat_update.
674	 */
675	kstat_named_t arcstat_mru_evictable_metadata;
676	/*
677	 * Total number of bytes that *would have been* consumed by ARC
678	 * buffers in the arc_mru_ghost state. The key thing to note
679	 * here, is the fact that this size doesn't actually indicate
680	 * RAM consumption. The ghost lists only consist of headers and
681	 * don't actually have ARC buffers linked off of these headers.
682	 * Thus, *if* the headers had associated ARC buffers, these
683	 * buffers *would have* consumed this number of bytes.
684	 * Not updated directly; only synced in arc_kstat_update.
685	 */
686	kstat_named_t arcstat_mru_ghost_size;
687	/*
688	 * Number of bytes that *would have been* consumed by ARC
689	 * buffers that are eligible for eviction, of type
690	 * ARC_BUFC_DATA, and linked off the arc_mru_ghost state.
691	 * Not updated directly; only synced in arc_kstat_update.
692	 */
693	kstat_named_t arcstat_mru_ghost_evictable_data;
694	/*
695	 * Number of bytes that *would have been* consumed by ARC
696	 * buffers that are eligible for eviction, of type
697	 * ARC_BUFC_METADATA, and linked off the arc_mru_ghost state.
698	 * Not updated directly; only synced in arc_kstat_update.
699	 */
700	kstat_named_t arcstat_mru_ghost_evictable_metadata;
701	/*
702	 * Total number of bytes consumed by ARC buffers residing in the
703	 * arc_mfu state. This includes *all* buffers in the arc_mfu
704	 * state; e.g. data, metadata, evictable, and unevictable buffers
705	 * are all included in this value.
706	 * Not updated directly; only synced in arc_kstat_update.
707	 */
708	kstat_named_t arcstat_mfu_size;
709	/*
710	 * Number of bytes consumed by ARC buffers that are eligible for
711	 * eviction, of type ARC_BUFC_DATA, and reside in the arc_mfu
712	 * state.
713	 * Not updated directly; only synced in arc_kstat_update.
714	 */
715	kstat_named_t arcstat_mfu_evictable_data;
716	/*
717	 * Number of bytes consumed by ARC buffers that are eligible for
718	 * eviction, of type ARC_BUFC_METADATA, and reside in the
719	 * arc_mfu state.
720	 * Not updated directly; only synced in arc_kstat_update.
721	 */
722	kstat_named_t arcstat_mfu_evictable_metadata;
723	/*
724	 * Total number of bytes that *would have been* consumed by ARC
725	 * buffers in the arc_mfu_ghost state. See the comment above
726	 * arcstat_mru_ghost_size for more details.
727	 * Not updated directly; only synced in arc_kstat_update.
728	 */
729	kstat_named_t arcstat_mfu_ghost_size;
730	/*
731	 * Number of bytes that *would have been* consumed by ARC
732	 * buffers that are eligible for eviction, of type
733	 * ARC_BUFC_DATA, and linked off the arc_mfu_ghost state.
734	 * Not updated directly; only synced in arc_kstat_update.
735	 */
736	kstat_named_t arcstat_mfu_ghost_evictable_data;
737	/*
738	 * Number of bytes that *would have been* consumed by ARC
739	 * buffers that are eligible for eviction, of type
	 * ARC_BUFC_METADATA, and linked off the arc_mfu_ghost state.
741	 * Not updated directly; only synced in arc_kstat_update.
742	 */
743	kstat_named_t arcstat_mfu_ghost_evictable_metadata;
744	kstat_named_t arcstat_l2_hits;
745	kstat_named_t arcstat_l2_misses;
746	kstat_named_t arcstat_l2_feeds;
747	kstat_named_t arcstat_l2_rw_clash;
748	kstat_named_t arcstat_l2_read_bytes;
749	kstat_named_t arcstat_l2_write_bytes;
750	kstat_named_t arcstat_l2_writes_sent;
751	kstat_named_t arcstat_l2_writes_done;
752	kstat_named_t arcstat_l2_writes_error;
753	kstat_named_t arcstat_l2_writes_lock_retry;
754	kstat_named_t arcstat_l2_evict_lock_retry;
755	kstat_named_t arcstat_l2_evict_reading;
756	kstat_named_t arcstat_l2_evict_l1cached;
757	kstat_named_t arcstat_l2_free_on_write;
758	kstat_named_t arcstat_l2_abort_lowmem;
759	kstat_named_t arcstat_l2_cksum_bad;
760	kstat_named_t arcstat_l2_io_error;
761	kstat_named_t arcstat_l2_lsize;
762	kstat_named_t arcstat_l2_psize;
763	/* Not updated directly; only synced in arc_kstat_update. */
764	kstat_named_t arcstat_l2_hdr_size;
765	kstat_named_t arcstat_l2_write_trylock_fail;
766	kstat_named_t arcstat_l2_write_passed_headroom;
767	kstat_named_t arcstat_l2_write_spa_mismatch;
768	kstat_named_t arcstat_l2_write_in_l2;
769	kstat_named_t arcstat_l2_write_hdr_io_in_progress;
770	kstat_named_t arcstat_l2_write_not_cacheable;
771	kstat_named_t arcstat_l2_write_full;
772	kstat_named_t arcstat_l2_write_buffer_iter;
773	kstat_named_t arcstat_l2_write_pios;
774	kstat_named_t arcstat_l2_write_buffer_bytes_scanned;
775	kstat_named_t arcstat_l2_write_buffer_list_iter;
776	kstat_named_t arcstat_l2_write_buffer_list_null_iter;
777	kstat_named_t arcstat_memory_throttle_count;
778	/* Not updated directly; only synced in arc_kstat_update. */
779	kstat_named_t arcstat_meta_used;
780	kstat_named_t arcstat_meta_limit;
781	kstat_named_t arcstat_meta_max;
782	kstat_named_t arcstat_meta_min;
783	kstat_named_t arcstat_async_upgrade_sync;
784	kstat_named_t arcstat_demand_hit_predictive_prefetch;
785	kstat_named_t arcstat_demand_hit_prescient_prefetch;
786} arc_stats_t;
787
/*
 * Static initializer for the global ARC kstat table.  Each entry supplies
 * the exported statistic name and its data type (KSTAT_DATA_UINT64 for
 * every entry here).  The initializers are positional, so their order must
 * match the member order of arc_stats_t; adding, removing or reordering a
 * member requires the same change in this list.
 *
 * Some exported names differ from the member they initialize.  By
 * position, "l2_size" and "l2_asize" initialize arcstat_l2_lsize and
 * arcstat_l2_psize, and "l2_write_io_in_progress" initializes
 * arcstat_l2_write_hdr_io_in_progress.
 */
static arc_stats_t arc_stats = {
	{ "hits",			KSTAT_DATA_UINT64 },
	{ "misses",			KSTAT_DATA_UINT64 },
	{ "demand_data_hits",		KSTAT_DATA_UINT64 },
	{ "demand_data_misses",		KSTAT_DATA_UINT64 },
	{ "demand_metadata_hits",	KSTAT_DATA_UINT64 },
	{ "demand_metadata_misses",	KSTAT_DATA_UINT64 },
	{ "prefetch_data_hits",		KSTAT_DATA_UINT64 },
	{ "prefetch_data_misses",	KSTAT_DATA_UINT64 },
	{ "prefetch_metadata_hits",	KSTAT_DATA_UINT64 },
	{ "prefetch_metadata_misses",	KSTAT_DATA_UINT64 },
	{ "mru_hits",			KSTAT_DATA_UINT64 },
	{ "mru_ghost_hits",		KSTAT_DATA_UINT64 },
	{ "mfu_hits",			KSTAT_DATA_UINT64 },
	{ "mfu_ghost_hits",		KSTAT_DATA_UINT64 },
	{ "allocated",			KSTAT_DATA_UINT64 },
	{ "deleted",			KSTAT_DATA_UINT64 },
	{ "mutex_miss",			KSTAT_DATA_UINT64 },
	{ "access_skip",		KSTAT_DATA_UINT64 },
	{ "evict_skip",			KSTAT_DATA_UINT64 },
	{ "evict_not_enough",		KSTAT_DATA_UINT64 },
	{ "evict_l2_cached",		KSTAT_DATA_UINT64 },
	{ "evict_l2_eligible",		KSTAT_DATA_UINT64 },
	{ "evict_l2_ineligible",	KSTAT_DATA_UINT64 },
	{ "evict_l2_skip",		KSTAT_DATA_UINT64 },
	{ "hash_elements",		KSTAT_DATA_UINT64 },
	{ "hash_elements_max",		KSTAT_DATA_UINT64 },
	{ "hash_collisions",		KSTAT_DATA_UINT64 },
	{ "hash_chains",		KSTAT_DATA_UINT64 },
	{ "hash_chain_max",		KSTAT_DATA_UINT64 },
	{ "p",				KSTAT_DATA_UINT64 },
	{ "c",				KSTAT_DATA_UINT64 },
	{ "c_min",			KSTAT_DATA_UINT64 },
	{ "c_max",			KSTAT_DATA_UINT64 },
	{ "size",			KSTAT_DATA_UINT64 },
	{ "compressed_size",		KSTAT_DATA_UINT64 },
	{ "uncompressed_size",		KSTAT_DATA_UINT64 },
	{ "overhead_size",		KSTAT_DATA_UINT64 },
	{ "hdr_size",			KSTAT_DATA_UINT64 },
	{ "data_size",			KSTAT_DATA_UINT64 },
	{ "metadata_size",		KSTAT_DATA_UINT64 },
	{ "other_size",			KSTAT_DATA_UINT64 },
	{ "anon_size",			KSTAT_DATA_UINT64 },
	{ "anon_evictable_data",	KSTAT_DATA_UINT64 },
	{ "anon_evictable_metadata",	KSTAT_DATA_UINT64 },
	{ "mru_size",			KSTAT_DATA_UINT64 },
	{ "mru_evictable_data",		KSTAT_DATA_UINT64 },
	{ "mru_evictable_metadata",	KSTAT_DATA_UINT64 },
	{ "mru_ghost_size",		KSTAT_DATA_UINT64 },
	{ "mru_ghost_evictable_data",	KSTAT_DATA_UINT64 },
	{ "mru_ghost_evictable_metadata", KSTAT_DATA_UINT64 },
	{ "mfu_size",			KSTAT_DATA_UINT64 },
	{ "mfu_evictable_data",		KSTAT_DATA_UINT64 },
	{ "mfu_evictable_metadata",	KSTAT_DATA_UINT64 },
	{ "mfu_ghost_size",		KSTAT_DATA_UINT64 },
	{ "mfu_ghost_evictable_data",	KSTAT_DATA_UINT64 },
	{ "mfu_ghost_evictable_metadata", KSTAT_DATA_UINT64 },
	{ "l2_hits",			KSTAT_DATA_UINT64 },
	{ "l2_misses",			KSTAT_DATA_UINT64 },
	{ "l2_feeds",			KSTAT_DATA_UINT64 },
	{ "l2_rw_clash",		KSTAT_DATA_UINT64 },
	{ "l2_read_bytes",		KSTAT_DATA_UINT64 },
	{ "l2_write_bytes",		KSTAT_DATA_UINT64 },
	{ "l2_writes_sent",		KSTAT_DATA_UINT64 },
	{ "l2_writes_done",		KSTAT_DATA_UINT64 },
	{ "l2_writes_error",		KSTAT_DATA_UINT64 },
	{ "l2_writes_lock_retry",	KSTAT_DATA_UINT64 },
	{ "l2_evict_lock_retry",	KSTAT_DATA_UINT64 },
	{ "l2_evict_reading",		KSTAT_DATA_UINT64 },
	{ "l2_evict_l1cached",		KSTAT_DATA_UINT64 },
	{ "l2_free_on_write",		KSTAT_DATA_UINT64 },
	{ "l2_abort_lowmem",		KSTAT_DATA_UINT64 },
	{ "l2_cksum_bad",		KSTAT_DATA_UINT64 },
	{ "l2_io_error",		KSTAT_DATA_UINT64 },
	{ "l2_size",			KSTAT_DATA_UINT64 },
	{ "l2_asize",			KSTAT_DATA_UINT64 },
	{ "l2_hdr_size",		KSTAT_DATA_UINT64 },
	{ "l2_write_trylock_fail",	KSTAT_DATA_UINT64 },
	{ "l2_write_passed_headroom",	KSTAT_DATA_UINT64 },
	{ "l2_write_spa_mismatch",	KSTAT_DATA_UINT64 },
	{ "l2_write_in_l2",		KSTAT_DATA_UINT64 },
	{ "l2_write_io_in_progress",	KSTAT_DATA_UINT64 },
	{ "l2_write_not_cacheable",	KSTAT_DATA_UINT64 },
	{ "l2_write_full",		KSTAT_DATA_UINT64 },
	{ "l2_write_buffer_iter",	KSTAT_DATA_UINT64 },
	{ "l2_write_pios",		KSTAT_DATA_UINT64 },
	{ "l2_write_buffer_bytes_scanned", KSTAT_DATA_UINT64 },
	{ "l2_write_buffer_list_iter",	KSTAT_DATA_UINT64 },
	{ "l2_write_buffer_list_null_iter", KSTAT_DATA_UINT64 },
	{ "memory_throttle_count",	KSTAT_DATA_UINT64 },
	{ "arc_meta_used",		KSTAT_DATA_UINT64 },
	{ "arc_meta_limit",		KSTAT_DATA_UINT64 },
	{ "arc_meta_max",		KSTAT_DATA_UINT64 },
	{ "arc_meta_min",		KSTAT_DATA_UINT64 },
	{ "async_upgrade_sync",		KSTAT_DATA_UINT64 },
	{ "demand_hit_predictive_prefetch", KSTAT_DATA_UINT64 },
	{ "demand_hit_prescient_prefetch", KSTAT_DATA_UINT64 },
};
886
/*
 * Accessor for the 64-bit value of one member of arc_stats.  Expands to an
 * lvalue, so it can be both read and assigned.
 */
#define	ARCSTAT(stat)	(arc_stats.stat.value.ui64)

/* Atomically add val to a statistic; val may be negative (see BUMPDOWN). */
#define	ARCSTAT_INCR(stat, val) \
	atomic_add_64(&arc_stats.stat.value.ui64, (val))

/* Atomically increment / decrement a statistic by one. */
#define	ARCSTAT_BUMP(stat)	ARCSTAT_INCR(stat, 1)
#define	ARCSTAT_BUMPDOWN(stat)	ARCSTAT_INCR(stat, -1)

/*
 * Raise a statistic to val if val is larger than the stored value.  The
 * loop re-reads the statistic and retries the compare-and-swap until either
 * the stored value is no longer smaller than val or the swap succeeds.
 *
 * This expands to a bare brace block (not do/while), and val is evaluated
 * more than once, so arguments with side effects must be avoided.
 */
#define	ARCSTAT_MAX(stat, val) {					\
	uint64_t m;							\
	while ((val) > (m = arc_stats.stat.value.ui64) &&		\
	    (m != atomic_cas_64(&arc_stats.stat.value.ui64, m, (val))))	\
		continue;						\
}

/* Fold the current value of stat into its companion stat##_max member. */
#define	ARCSTAT_MAXSTAT(stat) \
	ARCSTAT_MAX(stat##_max, arc_stats.stat.value.ui64)
904
/*
 * We define a macro to allow ARC hits/misses to be easily broken down by
 * two separate conditions, giving a total of four different subtypes for
 * each of hits and misses (so eight statistics total).
 *
 * The statistic that gets bumped is arcstat_<a>_<b>_<stat>, where <a> is
 * stat1 when cond1 is true and notstat1 otherwise, and <b> is stat2 when
 * cond2 is true and notstat2 otherwise.  The name is built by token
 * pasting, so all four resulting members must exist in arc_stats_t.
 *
 * NOTE(review): the expansion is a bare if/else statement rather than a
 * do { } while (0) block, so using it as the unbraced body of another
 * if/else would change which "else" binds where -- keep call sites braced.
 */
#define	ARCSTAT_CONDSTAT(cond1, stat1, notstat1, cond2, stat2, notstat2, stat) \
	if (cond1) {							\
		if (cond2) {						\
			ARCSTAT_BUMP(arcstat_##stat1##_##stat2##_##stat); \
		} else {						\
			ARCSTAT_BUMP(arcstat_##stat1##_##notstat2##_##stat); \
		}							\
	} else {							\
		if (cond2) {						\
			ARCSTAT_BUMP(arcstat_##notstat1##_##stat2##_##stat); \
		} else {						\
			ARCSTAT_BUMP(arcstat_##notstat1##_##notstat2##_##stat);\
		}							\
	}
924
/*
 * Handle for the exported ARC kstat, followed by pointers to the ARC state
 * objects.  Only arc_ksp has external linkage; the state pointers are
 * private to this file.  GHOST_STATE() below treats arc_mru_ghost,
 * arc_mfu_ghost and arc_l2c_only as ghost states.
 */
kstat_t			*arc_ksp;
static arc_state_t	*arc_anon;
static arc_state_t	*arc_mru;
static arc_state_t	*arc_mru_ghost;
static arc_state_t	*arc_mfu;
static arc_state_t	*arc_mfu_ghost;
static arc_state_t	*arc_l2c_only;
932
/*
 * There are several ARC variables that are critical to export as kstats --
 * but we don't want to have to grovel around in the kstat whenever we wish to
 * manipulate them.  For these variables, we therefore define them to be in
 * terms of the statistic variable.  This assures that we are not introducing
 * the possibility of inconsistency by having shadow copies of the variables,
 * while still allowing the code to be readable.
 *
 * Each name below expands (through ARCSTAT) to an lvalue inside arc_stats,
 * so both reads and plain assignments operate directly on the exported
 * kstat value.  Plain assignments through these names are not atomic
 * read-modify-write operations.
 */
#define	arc_p		ARCSTAT(arcstat_p)	/* target size of MRU */
#define	arc_c		ARCSTAT(arcstat_c)	/* target size of cache */
#define	arc_c_min	ARCSTAT(arcstat_c_min)	/* min target cache size */
#define	arc_c_max	ARCSTAT(arcstat_c_max)	/* max target cache size */
#define	arc_meta_limit	ARCSTAT(arcstat_meta_limit) /* max size for metadata */
#define	arc_meta_min	ARCSTAT(arcstat_meta_min) /* min size for metadata */
#define	arc_meta_max	ARCSTAT(arcstat_meta_max) /* max size of metadata */

/* compressed size of entire arc */
#define	arc_compressed_size	ARCSTAT(arcstat_compressed_size)
/* uncompressed size of entire arc */
#define	arc_uncompressed_size	ARCSTAT(arcstat_uncompressed_size)
/* number of bytes in the arc from arc_buf_t's */
#define	arc_overhead_size	ARCSTAT(arcstat_overhead_size)
955
/*
 * There are also some ARC variables that we want to export, but that are
 * updated so often that having the canonical representation be the statistic
 * variable causes a performance bottleneck. We want to use aggsum_t's for these
 * instead, but still be able to export the kstat in the same way as before.
 * The solution is to always use the aggsum version, except in the kstat update
 * callback.
 *
 * The matching kstat members (for example arcstat_meta_used and
 * arcstat_l2_hdr_size) are annotated in arc_stats_t as "Not updated
 * directly; only synced in arc_kstat_update".
 */
aggsum_t arc_size;
aggsum_t arc_meta_used;
aggsum_t astat_data_size;
aggsum_t astat_metadata_size;
aggsum_t astat_hdr_size;
aggsum_t astat_other_size;
aggsum_t astat_l2_hdr_size;

/*
 * File-private ARC bookkeeping.
 * NOTE(review): arc_tempreserve and arc_loaned_bytes look like byte
 * counters, but their exact meaning is not shown in this part of the
 * file -- confirm against their users.
 */
static int		arc_no_grow;	/* Don't try to grow cache size */
static uint64_t		arc_tempreserve;
static uint64_t		arc_loaned_bytes;
975
/*
 * Per-request read callback record.  l1arc_buf_hdr_t keeps a pointer to
 * one of these in b_acb, and records refer to one another through
 * acb_next.
 * NOTE(review): the field descriptions below are inferred from names and
 * types only -- confirm against the read path.
 */
typedef struct arc_callback arc_callback_t;

struct arc_callback {
	void			*acb_private;	/* presumably passed to acb_done */
	arc_read_done_func_t	*acb_done;	/* completion function */
	arc_buf_t		*acb_buf;
	boolean_t		acb_compressed;
	zio_t			*acb_zio_dummy;
	zio_t			*acb_zio_head;
	arc_callback_t		*acb_next;	/* next record in the chain */
};
987
/*
 * Callback record for an ARC write.  It carries four function pointers of
 * the same type (arc_write_done_func_t), one private pointer and the
 * buffer being written.
 * NOTE(review): when each of the four functions is invoked is not visible
 * in this part of the file -- confirm against the write path.
 */
typedef struct arc_write_callback arc_write_callback_t;

struct arc_write_callback {
	void			*awcb_private;
	arc_write_done_func_t	*awcb_ready;
	arc_write_done_func_t	*awcb_children_ready;
	arc_write_done_func_t	*awcb_physdone;
	arc_write_done_func_t	*awcb_done;
	arc_buf_t		*awcb_buf;
};
998
/*
 * ARC buffers are separated into multiple structs as a memory saving measure:
 *   - Common fields struct, always defined, and embedded within it:
 *       - L2-only fields, always allocated but undefined when not in L2ARC
 *       - L1-only fields, only allocated when in L1ARC
 *
 *           Buffer in L1                     Buffer only in L2
 *    +------------------------+          +------------------------+
 *    | arc_buf_hdr_t          |          | arc_buf_hdr_t          |
 *    |                        |          |                        |
 *    |                        |          |                        |
 *    |                        |          |                        |
 *    +------------------------+          +------------------------+
 *    | l2arc_buf_hdr_t        |          | l2arc_buf_hdr_t        |
 *    | (undefined if L1-only) |          |                        |
 *    +------------------------+          +------------------------+
 *    | l1arc_buf_hdr_t        |
 *    |                        |
 *    |                        |
 *    |                        |
 *    |                        |
 *    +------------------------+
 *
 * Because it's possible for the L2ARC to become extremely large, we can wind
 * up eating a lot of memory in L2ARC buffer headers, so the size of a header
 * is minimized by only allocating the fields necessary for an L1-cached buffer
 * when a header is actually in the L1 cache. The sub-headers (l1arc_buf_hdr and
 * l2arc_buf_hdr) are embedded rather than allocated separately to save a couple
 * words in pointers. arc_hdr_realloc() is used to switch a header between
 * these two allocation states.
 */
typedef struct l1arc_buf_hdr {
	/* NOTE(review): presumably b_freeze_lock guards b_freeze_cksum. */
	kmutex_t		b_freeze_lock;
	zio_cksum_t		*b_freeze_cksum;
#ifdef ZFS_DEBUG
	/*
	 * Used for debugging with kmem_flags - by allocating and freeing
	 * b_thawed when the buffer is thawed, we get a record of the stack
	 * trace that thawed it.
	 */
	void			*b_thawed;
#endif

	/*
	 * arc_buf_t's are chained through their b_next member (see
	 * ARC_BUF_LAST).  NOTE(review): b_bufcnt is presumably the number
	 * of buffers reachable from b_buf -- confirm.
	 */
	arc_buf_t		*b_buf;
	uint32_t		b_bufcnt;
	/* for waiting on writes to complete */
	kcondvar_t		b_cv;
	uint8_t			b_byteswap;

	/* protected by arc state mutex */
	arc_state_t		*b_state;
	multilist_node_t	b_arc_node;

	/* updated atomically */
	clock_t			b_arc_access;

	/* self protecting */
	refcount_t		b_refcnt;

	/* chain of pending read callbacks, linked through acb_next */
	arc_callback_t		*b_acb;
	/* NOTE(review): presumably managed by arc_hdr_{alloc,free}_pabd() */
	abd_t			*b_pabd;
} l1arc_buf_hdr_t;
1061
typedef struct l2arc_dev l2arc_dev_t;

/*
 * L2ARC portion of a buffer header; embedded in arc_buf_hdr_t as b_l2hdr
 * and only meaningful while the header is in the L2ARC.  l2arc_trim()
 * passes b_daddr to trim_map_free() as the offset on b_dev's vdev.
 */
typedef struct l2arc_buf_hdr {
	/* protected by arc_buf_hdr mutex */
	l2arc_dev_t		*b_dev;		/* L2ARC device */
	uint64_t		b_daddr;	/* disk address, offset byte */

	/* NOTE(review): presumably links the header on b_dev's l2ad_buflist */
	list_node_t		b_l2node;
} l2arc_buf_hdr_t;
1071
/*
 * Common ARC buffer header.  The identity of a header is the tuple
 * (b_spa, b_dva, b_birth); that tuple is what buf_hash() hashes and what
 * HDR_EQUAL() compares.
 *
 * HDR_L2ONLY_SIZE is defined as offsetof(arc_buf_hdr_t, b_l1hdr), so
 * b_l1hdr must stay the last member: an L2-only header is everything that
 * precedes it.
 */
struct arc_buf_hdr {
	/* protected by hash lock */
	dva_t			b_dva;
	uint64_t		b_birth;

	arc_buf_contents_t	b_type;
	arc_buf_hdr_t		*b_hash_next;	/* next header in hash chain */
	arc_flags_t		b_flags;	/* tested by the HDR_* macros */

	/*
	 * This field stores the size of the data buffer after
	 * compression, and is set in the arc's zio completion handlers.
	 * It is in units of SPA_MINBLOCKSIZE (e.g. 1 == 512 bytes).
	 *
	 * While the block pointers can store up to 32MB in their psize
	 * field, we can only store up to 32MB minus 512B. This is due
	 * to the bp using a bias of 1, whereas we use a bias of 0 (i.e.
	 * a field of zeros represents 512B in the bp). We can't use a
	 * bias of 1 since we need to reserve a psize of zero, here, to
	 * represent holes and embedded blocks.
	 *
	 * This isn't a problem in practice, since the maximum size of a
	 * buffer is limited to 16MB, so we never need to store 32MB in
	 * this field. Even in the upstream illumos code base, the
	 * maximum size of a buffer is limited to 16MB.
	 */
	uint16_t		b_psize;

	/*
	 * This field stores the size of the data buffer before
	 * compression, and cannot change once set. It is in units
	 * of SPA_MINBLOCKSIZE (e.g. 2 == 1024 bytes)
	 */
	uint16_t		b_lsize;	/* immutable */
	uint64_t		b_spa;		/* immutable */

	/* L2ARC fields. Undefined when not in L2ARC. */
	l2arc_buf_hdr_t		b_l2hdr;
	/* L1ARC fields. Undefined when in l2arc_only state */
	l1arc_buf_hdr_t		b_l1hdr;
};
1113
1114#if defined(__FreeBSD__) && defined(_KERNEL)
1115static int
1116sysctl_vfs_zfs_arc_meta_limit(SYSCTL_HANDLER_ARGS)
1117{
1118	uint64_t val;
1119	int err;
1120
1121	val = arc_meta_limit;
1122	err = sysctl_handle_64(oidp, &val, 0, req);
1123	if (err != 0 || req->newptr == NULL)
1124		return (err);
1125
1126        if (val <= 0 || val > arc_c_max)
1127		return (EINVAL);
1128
1129	arc_meta_limit = val;
1130	return (0);
1131}
1132
1133static int
1134sysctl_vfs_zfs_arc_no_grow_shift(SYSCTL_HANDLER_ARGS)
1135{
1136	uint32_t val;
1137	int err;
1138
1139	val = arc_no_grow_shift;
1140	err = sysctl_handle_32(oidp, &val, 0, req);
1141	if (err != 0 || req->newptr == NULL)
1142		return (err);
1143
1144        if (val >= arc_shrink_shift)
1145		return (EINVAL);
1146
1147	arc_no_grow_shift = val;
1148	return (0);
1149}
1150
1151static int
1152sysctl_vfs_zfs_arc_max(SYSCTL_HANDLER_ARGS)
1153{
1154	uint64_t val;
1155	int err;
1156
1157	val = zfs_arc_max;
1158	err = sysctl_handle_64(oidp, &val, 0, req);
1159	if (err != 0 || req->newptr == NULL)
1160		return (err);
1161
1162	if (zfs_arc_max == 0) {
1163		/* Loader tunable so blindly set */
1164		zfs_arc_max = val;
1165		return (0);
1166	}
1167
1168	if (val < arc_abs_min || val > kmem_size())
1169		return (EINVAL);
1170	if (val < arc_c_min)
1171		return (EINVAL);
1172	if (zfs_arc_meta_limit > 0 && val < zfs_arc_meta_limit)
1173		return (EINVAL);
1174
1175	arc_c_max = val;
1176
1177	arc_c = arc_c_max;
1178        arc_p = (arc_c >> 1);
1179
1180	if (zfs_arc_meta_limit == 0) {
1181		/* limit meta-data to 1/4 of the arc capacity */
1182		arc_meta_limit = arc_c_max / 4;
1183	}
1184
1185	/* if kmem_flags are set, lets try to use less memory */
1186	if (kmem_debugging())
1187		arc_c = arc_c / 2;
1188
1189	zfs_arc_max = arc_c;
1190
1191	return (0);
1192}
1193
1194static int
1195sysctl_vfs_zfs_arc_min(SYSCTL_HANDLER_ARGS)
1196{
1197	uint64_t val;
1198	int err;
1199
1200	val = zfs_arc_min;
1201	err = sysctl_handle_64(oidp, &val, 0, req);
1202	if (err != 0 || req->newptr == NULL)
1203		return (err);
1204
1205	if (zfs_arc_min == 0) {
1206		/* Loader tunable so blindly set */
1207		zfs_arc_min = val;
1208		return (0);
1209	}
1210
1211	if (val < arc_abs_min || val > arc_c_max)
1212		return (EINVAL);
1213
1214	arc_c_min = val;
1215
1216	if (zfs_arc_meta_min == 0)
1217                arc_meta_min = arc_c_min / 2;
1218
1219	if (arc_c < arc_c_min)
1220                arc_c = arc_c_min;
1221
1222	zfs_arc_min = arc_c_min;
1223
1224	return (0);
1225}
1226#endif
1227
/* True when state is one of the header-only states listed here. */
#define	GHOST_STATE(state)	\
	((state) == arc_mru_ghost || (state) == arc_mfu_ghost ||	\
	(state) == arc_l2c_only)

/*
 * Flag tests on an arc_buf_hdr_t.  Each macro yields the masked b_flags
 * value (non-zero when the flag is set), not a normalized 0/1, so use the
 * result only in a boolean context.
 */
#define	HDR_IN_HASH_TABLE(hdr)	((hdr)->b_flags & ARC_FLAG_IN_HASH_TABLE)
#define	HDR_IO_IN_PROGRESS(hdr)	((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS)
#define	HDR_IO_ERROR(hdr)	((hdr)->b_flags & ARC_FLAG_IO_ERROR)
#define	HDR_PREFETCH(hdr)	((hdr)->b_flags & ARC_FLAG_PREFETCH)
#define	HDR_PRESCIENT_PREFETCH(hdr)	\
	((hdr)->b_flags & ARC_FLAG_PRESCIENT_PREFETCH)
#define	HDR_COMPRESSION_ENABLED(hdr)	\
	((hdr)->b_flags & ARC_FLAG_COMPRESSED_ARC)

#define	HDR_L2CACHE(hdr)	((hdr)->b_flags & ARC_FLAG_L2CACHE)
/* I/O is in progress on a header that also has an L2 header. */
#define	HDR_L2_READING(hdr)	\
	(((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS) &&	\
	((hdr)->b_flags & ARC_FLAG_HAS_L2HDR))
#define	HDR_L2_WRITING(hdr)	((hdr)->b_flags & ARC_FLAG_L2_WRITING)
#define	HDR_L2_EVICTED(hdr)	((hdr)->b_flags & ARC_FLAG_L2_EVICTED)
#define	HDR_L2_WRITE_HEAD(hdr)	((hdr)->b_flags & ARC_FLAG_L2_WRITE_HEAD)
#define	HDR_SHARED_DATA(hdr)	((hdr)->b_flags & ARC_FLAG_SHARED_DATA)

/* A header is "data" exactly when the metadata flag is clear. */
#define	HDR_ISTYPE_METADATA(hdr)	\
	((hdr)->b_flags & ARC_FLAG_BUFC_METADATA)
#define	HDR_ISTYPE_DATA(hdr)	(!HDR_ISTYPE_METADATA(hdr))

#define	HDR_HAS_L1HDR(hdr)	((hdr)->b_flags & ARC_FLAG_HAS_L1HDR)
#define	HDR_HAS_L2HDR(hdr)	((hdr)->b_flags & ARC_FLAG_HAS_L2HDR)

/* For storing compression mode in b_flags */
#define	HDR_COMPRESS_OFFSET	(highbit64(ARC_FLAG_COMPRESS_0) - 1)

/*
 * Read / write the SPA_COMPRESSBITS-wide compression field that starts at
 * bit HDR_COMPRESS_OFFSET of b_flags.
 *
 * NOTE(review): HDR_SET_COMPRESS carries its own trailing ';', so a call
 * written as "HDR_SET_COMPRESS(hdr, cmp);" expands to an extra empty
 * statement.  That is harmless in a braced block but breaks an unbraced
 * "if (...) HDR_SET_COMPRESS(...); else ...".  Left as-is because the call
 * sites are not visible here.
 */
#define	HDR_GET_COMPRESS(hdr)	((enum zio_compress)BF32_GET((hdr)->b_flags, \
	HDR_COMPRESS_OFFSET, SPA_COMPRESSBITS))
#define	HDR_SET_COMPRESS(hdr, cmp) BF32_SET((hdr)->b_flags, \
	HDR_COMPRESS_OFFSET, SPA_COMPRESSBITS, (cmp));

/* Tests on an arc_buf_t: last buffer in its b_next chain, and flag bits. */
#define	ARC_BUF_LAST(buf)	((buf)->b_next == NULL)
#define	ARC_BUF_SHARED(buf)	((buf)->b_flags & ARC_BUF_FLAG_SHARED)
#define	ARC_BUF_COMPRESSED(buf)	((buf)->b_flags & ARC_BUF_FLAG_COMPRESSED)
1268
/*
 * Other sizes
 *
 * HDR_FULL_SIZE is the size of a complete header including the L1 fields.
 * HDR_L2ONLY_SIZE is the size of everything that precedes b_l1hdr, i.e. a
 * header without the L1 fields.  Both are cast to int64_t.
 */

#define	HDR_FULL_SIZE ((int64_t)sizeof (arc_buf_hdr_t))
#define	HDR_L2ONLY_SIZE ((int64_t)offsetof(arc_buf_hdr_t, b_l1hdr))
1275
/*
 * Hash table routines
 */

/* Size, in bytes, that each hash lock entry is padded out to in the kernel. */
#define	HT_LOCK_PAD	CACHE_LINE_SIZE

/*
 * One hash-chain lock.  In kernel builds the mutex is followed by padding
 * of HT_LOCK_PAD - sizeof (kmutex_t) bytes, which assumes kmutex_t is
 * smaller than HT_LOCK_PAD.
 */
struct ht_lock {
	kmutex_t	ht_lock;
#ifdef _KERNEL
	unsigned char	pad[(HT_LOCK_PAD - sizeof (kmutex_t))];
#endif
};

/*
 * Number of hash locks.  BUF_HASH_LOCK_NTRY() selects a lock with
 * "idx & (BUF_LOCKS-1)", so this must remain a power of two.
 */
#define	BUF_LOCKS 256
/*
 * The header hash table: ht_table is an array of chain heads (chains are
 * linked through b_hash_next), ht_mask turns a hash value into an index
 * into ht_table, and ht_locks holds the locks shared among the chains.
 */
typedef struct buf_hash_table {
	uint64_t ht_mask;
	arc_buf_hdr_t **ht_table;
	struct ht_lock ht_locks[BUF_LOCKS] __aligned(CACHE_LINE_SIZE);
} buf_hash_table_t;

static buf_hash_table_t buf_hash_table;
1297
/*
 * Hash table index and lock lookup helpers.
 *
 * BUF_HASH_INDEX	hash of (spa, dva, birth) reduced to a table index
 *			with buf_hash_table.ht_mask.
 * BUF_HASH_LOCK_NTRY	the ht_lock entry that covers table index idx; the
 *			index is reduced modulo BUF_LOCKS with a mask.
 * BUF_HASH_LOCK	pointer to the kmutex_t inside that entry.
 * HDR_LOCK		BUF_HASH_LOCK for the chain that hdr hashes to.
 *
 * Every macro argument is parenthesized in the expansion so that an
 * argument which is itself an expression (e.g. "a | b" for idx, or
 * "&array[i]" for hdr) selects the same lock as the equivalent plain
 * variable would.
 */
#define	BUF_HASH_INDEX(spa, dva, birth) \
	(buf_hash(spa, dva, birth) & buf_hash_table.ht_mask)
#define	BUF_HASH_LOCK_NTRY(idx) \
	(buf_hash_table.ht_locks[(idx) & (BUF_LOCKS - 1)])
#define	BUF_HASH_LOCK(idx)	(&(BUF_HASH_LOCK_NTRY(idx).ht_lock))
#define	HDR_LOCK(hdr) \
	(BUF_HASH_LOCK(BUF_HASH_INDEX((hdr)->b_spa, &(hdr)->b_dva, \
	(hdr)->b_birth)))
1304
/*
 * 256-entry table with external linkage.
 * NOTE(review): presumably the CRC-64 lookup table used elsewhere in ZFS;
 * buf_hash() in this file uses cityhash4() and does not reference it.
 */
uint64_t zfs_crc64_table[256];
1306
/*
 * Level 2 ARC
 *
 * Compile-time defaults for the L2ARC, followed by the run-time tunables
 * that are initialized from them.
 */

#define	L2ARC_WRITE_SIZE	(8 * 1024 * 1024)	/* initial write max */
#define	L2ARC_HEADROOM		2			/* num of writes */
/*
 * If we discover during ARC scan any buffers to be compressed, we boost
 * our headroom for the next scanning cycle by this percentage multiple.
 */
#define	L2ARC_HEADROOM_BOOST	200
#define	L2ARC_FEED_SECS		1		/* caching interval secs */
#define	L2ARC_FEED_MIN_MS	200		/* min caching interval ms */

/* Shorthand for the two L2ARC write counters kept in arc_stats. */
#define	l2arc_writes_sent	ARCSTAT(arcstat_l2_writes_sent)
#define	l2arc_writes_done	ARCSTAT(arcstat_l2_writes_done)

/* L2ARC Performance Tunables */
uint64_t l2arc_write_max = L2ARC_WRITE_SIZE;	/* default max write size */
uint64_t l2arc_write_boost = L2ARC_WRITE_SIZE;	/* extra write during warmup */
uint64_t l2arc_headroom = L2ARC_HEADROOM;	/* number of dev writes */
uint64_t l2arc_headroom_boost = L2ARC_HEADROOM_BOOST;
uint64_t l2arc_feed_secs = L2ARC_FEED_SECS;	/* interval seconds */
uint64_t l2arc_feed_min_ms = L2ARC_FEED_MIN_MS;	/* min interval milliseconds */
boolean_t l2arc_noprefetch = B_TRUE;		/* don't cache prefetch bufs */
boolean_t l2arc_feed_again = B_TRUE;		/* turbo warmup */
boolean_t l2arc_norw = B_TRUE;			/* no reads during writes */
1334
/*
 * Read-write sysctls under vfs.zfs for the L2ARC tunables.  The 64-bit
 * tunables are exported with SYSCTL_UQUAD and the boolean_t ones with
 * SYSCTL_INT.
 * NOTE(review): SYSCTL_INT expects an int-sized variable; confirm that
 * boolean_t is int-sized on this platform.  l2arc_headroom_boost has no
 * sysctl in this group.
 */
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_write_max, CTLFLAG_RW,
    &l2arc_write_max, 0, "max write size");
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_write_boost, CTLFLAG_RW,
    &l2arc_write_boost, 0, "extra write during warmup");
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_headroom, CTLFLAG_RW,
    &l2arc_headroom, 0, "number of dev writes");
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_feed_secs, CTLFLAG_RW,
    &l2arc_feed_secs, 0, "interval seconds");
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_feed_min_ms, CTLFLAG_RW,
    &l2arc_feed_min_ms, 0, "min interval milliseconds");

SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_noprefetch, CTLFLAG_RW,
    &l2arc_noprefetch, 0, "don't cache prefetch bufs");
SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_feed_again, CTLFLAG_RW,
    &l2arc_feed_again, 0, "turbo warmup");
SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_norw, CTLFLAG_RW,
    &l2arc_norw, 0, "no reads during writes");
1352
1353SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_size, CTLFLAG_RD,
1354    &ARC_anon.arcs_size.rc_count, 0, "size of anonymous state");
1355SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_metadata_esize, CTLFLAG_RD,
1356    &ARC_anon.arcs_esize[ARC_BUFC_METADATA].rc_count, 0,
1357    "size of anonymous state");
1358SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_data_esize, CTLFLAG_RD,
1359    &ARC_anon.arcs_esize[ARC_BUFC_DATA].rc_count, 0,
1360    "size of anonymous state");
1361
1362SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_size, CTLFLAG_RD,
1363    &ARC_mru.arcs_size.rc_count, 0, "size of mru state");
1364SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_metadata_esize, CTLFLAG_RD,
1365    &ARC_mru.arcs_esize[ARC_BUFC_METADATA].rc_count, 0,
1366    "size of metadata in mru state");
1367SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_data_esize, CTLFLAG_RD,
1368    &ARC_mru.arcs_esize[ARC_BUFC_DATA].rc_count, 0,
1369    "size of data in mru state");
1370
1371SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_size, CTLFLAG_RD,
1372    &ARC_mru_ghost.arcs_size.rc_count, 0, "size of mru ghost state");
1373SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_metadata_esize, CTLFLAG_RD,
1374    &ARC_mru_ghost.arcs_esize[ARC_BUFC_METADATA].rc_count, 0,
1375    "size of metadata in mru ghost state");
1376SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_data_esize, CTLFLAG_RD,
1377    &ARC_mru_ghost.arcs_esize[ARC_BUFC_DATA].rc_count, 0,
1378    "size of data in mru ghost state");
1379
1380SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_size, CTLFLAG_RD,
1381    &ARC_mfu.arcs_size.rc_count, 0, "size of mfu state");
1382SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_metadata_esize, CTLFLAG_RD,
1383    &ARC_mfu.arcs_esize[ARC_BUFC_METADATA].rc_count, 0,
1384    "size of metadata in mfu state");
1385SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_data_esize, CTLFLAG_RD,
1386    &ARC_mfu.arcs_esize[ARC_BUFC_DATA].rc_count, 0,
1387    "size of data in mfu state");
1388
1389SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_size, CTLFLAG_RD,
1390    &ARC_mfu_ghost.arcs_size.rc_count, 0, "size of mfu ghost state");
1391SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_metadata_esize, CTLFLAG_RD,
1392    &ARC_mfu_ghost.arcs_esize[ARC_BUFC_METADATA].rc_count, 0,
1393    "size of metadata in mfu ghost state");
1394SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_data_esize, CTLFLAG_RD,
1395    &ARC_mfu_ghost.arcs_esize[ARC_BUFC_DATA].rc_count, 0,
1396    "size of data in mfu ghost state");
1397
1398SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2c_only_size, CTLFLAG_RD,
1399    &ARC_l2c_only.arcs_size.rc_count, 0, "size of mru state");
1400
1401SYSCTL_UINT(_vfs_zfs, OID_AUTO, arc_min_prefetch_ms, CTLFLAG_RW,
1402    &zfs_arc_min_prefetch_ms, 0, "Min life of prefetch block in ms");
1403SYSCTL_UINT(_vfs_zfs, OID_AUTO, arc_min_prescient_prefetch_ms, CTLFLAG_RW,
1404    &zfs_arc_min_prescient_prefetch_ms, 0, "Min life of prescient prefetched block in ms");
1405
/*
 * L2ARC Internals
 *
 * Per-device L2ARC state.  l2arc_buf_hdr_t.b_dev points at one of these,
 * and l2arc_trim() asserts that l2ad_mtx is held before trimming a
 * header's range on l2ad_vdev.
 */
struct l2arc_dev {
	vdev_t			*l2ad_vdev;	/* vdev */
	spa_t			*l2ad_spa;	/* spa */
	uint64_t		l2ad_hand;	/* next write location */
	uint64_t		l2ad_start;	/* first addr on device */
	uint64_t		l2ad_end;	/* last addr on device */
	boolean_t		l2ad_first;	/* first sweep through */
	boolean_t		l2ad_writing;	/* currently writing */
	kmutex_t		l2ad_mtx;	/* lock for buffer list */
	list_t			l2ad_buflist;	/* buffer list */
	list_node_t		l2ad_node;	/* device list node */
	refcount_t		l2ad_alloc;	/* allocated bytes */
};
1422
/*
 * File-private L2ARC bookkeeping.  Each list exists as a storage object
 * (capitalized name) plus a pointer of the same lower-case name.
 * NOTE(review): presumably the pointers are set to the storage objects
 * during initialization, which is not visible in this part of the file.
 */
static list_t L2ARC_dev_list;			/* device list */
static list_t *l2arc_dev_list;			/* device list pointer */
static kmutex_t l2arc_dev_mtx;			/* device list mutex */
static l2arc_dev_t *l2arc_dev_last;		/* last device used */
static list_t L2ARC_free_on_write;		/* free after write buf list */
static list_t *l2arc_free_on_write;		/* free after write list ptr */
static kmutex_t l2arc_free_on_write_mtx;	/* mutex for list */
static uint64_t l2arc_ndev;			/* number of devices */
1431
/*
 * State carried by an L2ARC read: the header being read, copies of the
 * original block pointer, bookmark and flags, and a temporary buffer.
 * NOTE(review): presumably consumed by l2arc_read_done() -- confirm.
 */
typedef struct l2arc_read_callback {
	arc_buf_hdr_t		*l2rcb_hdr;		/* read header */
	blkptr_t		l2rcb_bp;		/* original blkptr */
	zbookmark_phys_t	l2rcb_zb;		/* original bookmark */
	int			l2rcb_flags;		/* original flags */
	abd_t			*l2rcb_abd;		/* temporary buffer */
} l2arc_read_callback_t;
1439
/*
 * State carried by an L2ARC write: the target device and the header that
 * marks the head of the list of buffers being written.
 */
typedef struct l2arc_write_callback {
	l2arc_dev_t	*l2wcb_dev;		/* device info */
	arc_buf_hdr_t	*l2wcb_head;		/* head of write buflist */
} l2arc_write_callback_t;
1444
/*
 * One deferred-free record: an abd, its size and its buffer contents type,
 * plus the list linkage.
 * NOTE(review): presumably queued on l2arc_free_on_write, given the shared
 * mutex name -- confirm.
 */
typedef struct l2arc_data_free {
	/* protected by l2arc_free_on_write_mtx */
	abd_t		*l2df_abd;
	size_t		l2df_size;
	arc_buf_contents_t l2df_type;
	list_node_t	l2df_list_node;
} l2arc_data_free_t;
1452
/*
 * Synchronization objects for the L2ARC feed thread.
 * NOTE(review): presumably l2arc_thread_exit is a request/acknowledge flag
 * protected by l2arc_feed_thr_lock and signalled via l2arc_feed_thr_cv;
 * the thread itself is not visible in this part of the file.
 */
static kmutex_t l2arc_feed_thr_lock;
static kcondvar_t l2arc_feed_thr_cv;
static uint8_t l2arc_thread_exit;
1456
1457static abd_t *arc_get_data_abd(arc_buf_hdr_t *, uint64_t, void *);
1458static void *arc_get_data_buf(arc_buf_hdr_t *, uint64_t, void *);
1459static void arc_get_data_impl(arc_buf_hdr_t *, uint64_t, void *);
1460static void arc_free_data_abd(arc_buf_hdr_t *, abd_t *, uint64_t, void *);
1461static void arc_free_data_buf(arc_buf_hdr_t *, void *, uint64_t, void *);
1462static void arc_free_data_impl(arc_buf_hdr_t *hdr, uint64_t size, void *tag);
1463static void arc_hdr_free_pabd(arc_buf_hdr_t *);
1464static void arc_hdr_alloc_pabd(arc_buf_hdr_t *);
1465static void arc_access(arc_buf_hdr_t *, kmutex_t *);
1466static boolean_t arc_is_overflowing();
1467static void arc_buf_watch(arc_buf_t *);
1468
1469static arc_buf_contents_t arc_buf_type(arc_buf_hdr_t *);
1470static uint32_t arc_bufc_to_flags(arc_buf_contents_t);
1471static inline void arc_hdr_set_flags(arc_buf_hdr_t *hdr, arc_flags_t flags);
1472static inline void arc_hdr_clear_flags(arc_buf_hdr_t *hdr, arc_flags_t flags);
1473
1474static boolean_t l2arc_write_eligible(uint64_t, arc_buf_hdr_t *);
1475static void l2arc_read_done(zio_t *);
1476
1477static void
1478l2arc_trim(const arc_buf_hdr_t *hdr)
1479{
1480	l2arc_dev_t *dev = hdr->b_l2hdr.b_dev;
1481
1482	ASSERT(HDR_HAS_L2HDR(hdr));
1483	ASSERT(MUTEX_HELD(&dev->l2ad_mtx));
1484
1485	if (HDR_GET_PSIZE(hdr) != 0) {
1486		trim_map_free(dev->l2ad_vdev, hdr->b_l2hdr.b_daddr,
1487		    HDR_GET_PSIZE(hdr), 0);
1488	}
1489}
1490
1491/*
1492 * We use Cityhash for this. It's fast, and has good hash properties without
1493 * requiring any large static buffers.
1494 */
1495static uint64_t
1496buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth)
1497{
1498	return (cityhash4(spa, dva->dva_word[0], dva->dva_word[1], birth));
1499}
1500
/*
 * HDR_EMPTY: true when both DVA words of hdr are zero, i.e. the header has
 * no identity (see buf_discard_identity()).
 */
#define	HDR_EMPTY(hdr)						\
	((hdr)->b_dva.dva_word[0] == 0 &&			\
	(hdr)->b_dva.dva_word[1] == 0)

/*
 * HDR_EQUAL: true when hdr's identity matches (spa, dva, birth), comparing
 * both DVA words, the birth value and the spa.
 *
 * The whole expansion and every argument are parenthesized so the macro
 * behaves like a single boolean expression: "!HDR_EQUAL(...)" negates the
 * full comparison, and expression arguments such as "c ? a : b" are
 * compared as a unit.
 */
#define	HDR_EQUAL(spa, dva, birth, hdr)				\
	(((hdr)->b_dva.dva_word[0] == (dva)->dva_word[0]) &&	\
	((hdr)->b_dva.dva_word[1] == (dva)->dva_word[1]) &&	\
	((hdr)->b_birth == (birth)) && ((hdr)->b_spa == (spa)))
1509
1510static void
1511buf_discard_identity(arc_buf_hdr_t *hdr)
1512{
1513	hdr->b_dva.dva_word[0] = 0;
1514	hdr->b_dva.dva_word[1] = 0;
1515	hdr->b_birth = 0;
1516}
1517
1518static arc_buf_hdr_t *
1519buf_hash_find(uint64_t spa, const blkptr_t *bp, kmutex_t **lockp)
1520{
1521	const dva_t *dva = BP_IDENTITY(bp);
1522	uint64_t birth = BP_PHYSICAL_BIRTH(bp);
1523	uint64_t idx = BUF_HASH_INDEX(spa, dva, birth);
1524	kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
1525	arc_buf_hdr_t *hdr;
1526
1527	mutex_enter(hash_lock);
1528	for (hdr = buf_hash_table.ht_table[idx]; hdr != NULL;
1529	    hdr = hdr->b_hash_next) {
1530		if (HDR_EQUAL(spa, dva, birth, hdr)) {
1531			*lockp = hash_lock;
1532			return (hdr);
1533		}
1534	}
1535	mutex_exit(hash_lock);
1536	*lockp = NULL;
1537	return (NULL);
1538}
1539
1540/*
1541 * Insert an entry into the hash table.  If there is already an element
1542 * equal to elem in the hash table, then the already existing element
1543 * will be returned and the new element will not be inserted.
1544 * Otherwise returns NULL.
1545 * If lockp == NULL, the caller is assumed to already hold the hash lock.
1546 */
1547static arc_buf_hdr_t *
1548buf_hash_insert(arc_buf_hdr_t *hdr, kmutex_t **lockp)
1549{
1550	uint64_t idx = BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth);
1551	kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
1552	arc_buf_hdr_t *fhdr;
1553	uint32_t i;
1554
1555	ASSERT(!DVA_IS_EMPTY(&hdr->b_dva));
1556	ASSERT(hdr->b_birth != 0);
1557	ASSERT(!HDR_IN_HASH_TABLE(hdr));
1558
1559	if (lockp != NULL) {
1560		*lockp = hash_lock;
1561		mutex_enter(hash_lock);
1562	} else {
1563		ASSERT(MUTEX_HELD(hash_lock));
1564	}
1565
1566	for (fhdr = buf_hash_table.ht_table[idx], i = 0; fhdr != NULL;
1567	    fhdr = fhdr->b_hash_next, i++) {
1568		if (HDR_EQUAL(hdr->b_spa, &hdr->b_dva, hdr->b_birth, fhdr))
1569			return (fhdr);
1570	}
1571
1572	hdr->b_hash_next = buf_hash_table.ht_table[idx];
1573	buf_hash_table.ht_table[idx] = hdr;
1574	arc_hdr_set_flags(hdr, ARC_FLAG_IN_HASH_TABLE);
1575
1576	/* collect some hash table performance data */
1577	if (i > 0) {
1578		ARCSTAT_BUMP(arcstat_hash_collisions);
1579		if (i == 1)
1580			ARCSTAT_BUMP(arcstat_hash_chains);
1581
1582		ARCSTAT_MAX(arcstat_hash_chain_max, i);
1583	}
1584
1585	ARCSTAT_BUMP(arcstat_hash_elements);
1586	ARCSTAT_MAXSTAT(arcstat_hash_elements);
1587
1588	return (NULL);
1589}
1590
1591static void
1592buf_hash_remove(arc_buf_hdr_t *hdr)
1593{
1594	arc_buf_hdr_t *fhdr, **hdrp;
1595	uint64_t idx = BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth);
1596
1597	ASSERT(MUTEX_HELD(BUF_HASH_LOCK(idx)));
1598	ASSERT(HDR_IN_HASH_TABLE(hdr));
1599
1600	hdrp = &buf_hash_table.ht_table[idx];
1601	while ((fhdr = *hdrp) != hdr) {
1602		ASSERT3P(fhdr, !=, NULL);
1603		hdrp = &fhdr->b_hash_next;
1604	}
1605	*hdrp = hdr->b_hash_next;
1606	hdr->b_hash_next = NULL;
1607	arc_hdr_clear_flags(hdr, ARC_FLAG_IN_HASH_TABLE);
1608
1609	/* collect some hash table performance data */
1610	ARCSTAT_BUMPDOWN(arcstat_hash_elements);
1611
1612	if (buf_hash_table.ht_table[idx] &&
1613	    buf_hash_table.ht_table[idx]->b_hash_next == NULL)
1614		ARCSTAT_BUMPDOWN(arcstat_hash_chains);
1615}
1616
1617/*
1618 * Global data structures and functions for the buf kmem cache.
1619 */
1620static kmem_cache_t *hdr_full_cache;
1621static kmem_cache_t *hdr_l2only_cache;
1622static kmem_cache_t *buf_cache;
1623
1624static void
1625buf_fini(void)
1626{
1627	int i;
1628
1629	kmem_free(buf_hash_table.ht_table,
1630	    (buf_hash_table.ht_mask + 1) * sizeof (void *));
1631	for (i = 0; i < BUF_LOCKS; i++)
1632		mutex_destroy(&buf_hash_table.ht_locks[i].ht_lock);
1633	kmem_cache_destroy(hdr_full_cache);
1634	kmem_cache_destroy(hdr_l2only_cache);
1635	kmem_cache_destroy(buf_cache);
1636}
1637
1638/*
1639 * Constructor callback - called when the cache is empty
1640 * and a new buf is requested.
1641 */
1642/* ARGSUSED */
1643static int
1644hdr_full_cons(void *vbuf, void *unused, int kmflag)
1645{
1646	arc_buf_hdr_t *hdr = vbuf;
1647
1648	bzero(hdr, HDR_FULL_SIZE);
1649	cv_init(&hdr->b_l1hdr.b_cv, NULL, CV_DEFAULT, NULL);
1650	refcount_create(&hdr->b_l1hdr.b_refcnt);
1651	mutex_init(&hdr->b_l1hdr.b_freeze_lock, NULL, MUTEX_DEFAULT, NULL);
1652	multilist_link_init(&hdr->b_l1hdr.b_arc_node);
1653	arc_space_consume(HDR_FULL_SIZE, ARC_SPACE_HDRS);
1654
1655	return (0);
1656}
1657
1658/* ARGSUSED */
1659static int
1660hdr_l2only_cons(void *vbuf, void *unused, int kmflag)
1661{
1662	arc_buf_hdr_t *hdr = vbuf;
1663
1664	bzero(hdr, HDR_L2ONLY_SIZE);
1665	arc_space_consume(HDR_L2ONLY_SIZE, ARC_SPACE_L2HDRS);
1666
1667	return (0);
1668}
1669
1670/* ARGSUSED */
1671static int
1672buf_cons(void *vbuf, void *unused, int kmflag)
1673{
1674	arc_buf_t *buf = vbuf;
1675
1676	bzero(buf, sizeof (arc_buf_t));
1677	mutex_init(&buf->b_evict_lock, NULL, MUTEX_DEFAULT, NULL);
1678	arc_space_consume(sizeof (arc_buf_t), ARC_SPACE_HDRS);
1679
1680	return (0);
1681}
1682
1683/*
1684 * Destructor callback - called when a cached buf is
1685 * no longer required.
1686 */
1687/* ARGSUSED */
1688static void
1689hdr_full_dest(void *vbuf, void *unused)
1690{
1691	arc_buf_hdr_t *hdr = vbuf;
1692
1693	ASSERT(HDR_EMPTY(hdr));
1694	cv_destroy(&hdr->b_l1hdr.b_cv);
1695	refcount_destroy(&hdr->b_l1hdr.b_refcnt);
1696	mutex_destroy(&hdr->b_l1hdr.b_freeze_lock);
1697	ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node));
1698	arc_space_return(HDR_FULL_SIZE, ARC_SPACE_HDRS);
1699}
1700
1701/* ARGSUSED */
1702static void
1703hdr_l2only_dest(void *vbuf, void *unused)
1704{
1705	arc_buf_hdr_t *hdr = vbuf;
1706
1707	ASSERT(HDR_EMPTY(hdr));
1708	arc_space_return(HDR_L2ONLY_SIZE, ARC_SPACE_L2HDRS);
1709}
1710
1711/* ARGSUSED */
1712static void
1713buf_dest(void *vbuf, void *unused)
1714{
1715	arc_buf_t *buf = vbuf;
1716
1717	mutex_destroy(&buf->b_evict_lock);
1718	arc_space_return(sizeof (arc_buf_t), ARC_SPACE_HDRS);
1719}
1720
1721/*
1722 * Reclaim callback -- invoked when memory is low.
1723 */
1724/* ARGSUSED */
1725static void
1726hdr_recl(void *unused)
1727{
1728	dprintf("hdr_recl called\n");
1729	/*
1730	 * umem calls the reclaim func when we destroy the buf cache,
1731	 * which is after we do arc_fini().
1732	 */
1733	if (!arc_dead)
1734		cv_signal(&arc_reclaim_thread_cv);
1735}
1736
1737static void
1738buf_init(void)
1739{
1740	uint64_t *ct;
1741	uint64_t hsize = 1ULL << 12;
1742	int i, j;
1743
1744	/*
1745	 * The hash table is big enough to fill all of physical memory
1746	 * with an average block size of zfs_arc_average_blocksize (default 8K).
1747	 * By default, the table will take up
1748	 * totalmem * sizeof(void*) / 8K (1MB per GB with 8-byte pointers).
1749	 */
1750	while (hsize * zfs_arc_average_blocksize < (uint64_t)physmem * PAGESIZE)
1751		hsize <<= 1;
1752retry:
1753	buf_hash_table.ht_mask = hsize - 1;
1754	buf_hash_table.ht_table =
1755	    kmem_zalloc(hsize * sizeof (void*), KM_NOSLEEP);
1756	if (buf_hash_table.ht_table == NULL) {
1757		ASSERT(hsize > (1ULL << 8));
1758		hsize >>= 1;
1759		goto retry;
1760	}
1761
1762	hdr_full_cache = kmem_cache_create("arc_buf_hdr_t_full", HDR_FULL_SIZE,
1763	    0, hdr_full_cons, hdr_full_dest, hdr_recl, NULL, NULL, 0);
1764	hdr_l2only_cache = kmem_cache_create("arc_buf_hdr_t_l2only",
1765	    HDR_L2ONLY_SIZE, 0, hdr_l2only_cons, hdr_l2only_dest, hdr_recl,
1766	    NULL, NULL, 0);
1767	buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t),
1768	    0, buf_cons, buf_dest, NULL, NULL, NULL, 0);
1769
1770	for (i = 0; i < 256; i++)
1771		for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--)
1772			*ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY);
1773
1774	for (i = 0; i < BUF_LOCKS; i++) {
1775		mutex_init(&buf_hash_table.ht_locks[i].ht_lock,
1776		    NULL, MUTEX_DEFAULT, NULL);
1777	}
1778}
1779
1780/*
1781 * This is the size that the buf occupies in memory. If the buf is compressed,
1782 * it will correspond to the compressed size. You should use this method of
1783 * getting the buf size unless you explicitly need the logical size.
1784 */
1785int32_t
1786arc_buf_size(arc_buf_t *buf)
1787{
1788	return (ARC_BUF_COMPRESSED(buf) ?
1789	    HDR_GET_PSIZE(buf->b_hdr) : HDR_GET_LSIZE(buf->b_hdr));
1790}
1791
1792int32_t
1793arc_buf_lsize(arc_buf_t *buf)
1794{
1795	return (HDR_GET_LSIZE(buf->b_hdr));
1796}
1797
1798enum zio_compress
1799arc_get_compression(arc_buf_t *buf)
1800{
1801	return (ARC_BUF_COMPRESSED(buf) ?
1802	    HDR_GET_COMPRESS(buf->b_hdr) : ZIO_COMPRESS_OFF);
1803}
1804
/*
 * hz / 16.  NOTE(review): presumably hz is the number of clock ticks per
 * second, making this 1/16 of a second (62.5 ms) expressed in ticks.
 */
#define	ARC_MINTIME	(hz>>4) /* 62 ms */
1806
1807static inline boolean_t
1808arc_buf_is_shared(arc_buf_t *buf)
1809{
1810	boolean_t shared = (buf->b_data != NULL &&
1811	    buf->b_hdr->b_l1hdr.b_pabd != NULL &&
1812	    abd_is_linear(buf->b_hdr->b_l1hdr.b_pabd) &&
1813	    buf->b_data == abd_to_buf(buf->b_hdr->b_l1hdr.b_pabd));
1814	IMPLY(shared, HDR_SHARED_DATA(buf->b_hdr));
1815	IMPLY(shared, ARC_BUF_SHARED(buf));
1816	IMPLY(shared, ARC_BUF_COMPRESSED(buf) || ARC_BUF_LAST(buf));
1817
1818	/*
1819	 * It would be nice to assert arc_can_share() too, but the "hdr isn't
1820	 * already being shared" requirement prevents us from doing that.
1821	 */
1822
1823	return (shared);
1824}
1825
1826/*
1827 * Free the checksum associated with this header. If there is no checksum, this
1828 * is a no-op.
1829 */
1830static inline void
1831arc_cksum_free(arc_buf_hdr_t *hdr)
1832{
1833	ASSERT(HDR_HAS_L1HDR(hdr));
1834	mutex_enter(&hdr->b_l1hdr.b_freeze_lock);
1835	if (hdr->b_l1hdr.b_freeze_cksum != NULL) {
1836		kmem_free(hdr->b_l1hdr.b_freeze_cksum, sizeof (zio_cksum_t));
1837		hdr->b_l1hdr.b_freeze_cksum = NULL;
1838	}
1839	mutex_exit(&hdr->b_l1hdr.b_freeze_lock);
1840}
1841
1842/*
1843 * Return true iff at least one of the bufs on hdr is not compressed.
1844 */
1845static boolean_t
1846arc_hdr_has_uncompressed_buf(arc_buf_hdr_t *hdr)
1847{
1848	for (arc_buf_t *b = hdr->b_l1hdr.b_buf; b != NULL; b = b->b_next) {
1849		if (!ARC_BUF_COMPRESSED(b)) {
1850			return (B_TRUE);
1851		}
1852	}
1853	return (B_FALSE);
1854}
1855
1856/*
1857 * If we've turned on the ZFS_DEBUG_MODIFY flag, verify that the buf's data
1858 * matches the checksum that is stored in the hdr. If there is no checksum,
1859 * or if the buf is compressed, this is a no-op.
1860 */
1861static void
1862arc_cksum_verify(arc_buf_t *buf)
1863{
1864	arc_buf_hdr_t *hdr = buf->b_hdr;
1865	zio_cksum_t zc;
1866
1867	if (!(zfs_flags & ZFS_DEBUG_MODIFY))
1868		return;
1869
1870	if (ARC_BUF_COMPRESSED(buf)) {
1871		ASSERT(hdr->b_l1hdr.b_freeze_cksum == NULL ||
1872		    arc_hdr_has_uncompressed_buf(hdr));
1873		return;
1874	}
1875
1876	ASSERT(HDR_HAS_L1HDR(hdr));
1877
1878	mutex_enter(&hdr->b_l1hdr.b_freeze_lock);
1879	if (hdr->b_l1hdr.b_freeze_cksum == NULL || HDR_IO_ERROR(hdr)) {
1880		mutex_exit(&hdr->b_l1hdr.b_freeze_lock);
1881		return;
1882	}
1883
1884	fletcher_2_native(buf->b_data, arc_buf_size(buf), NULL, &zc);
1885	if (!ZIO_CHECKSUM_EQUAL(*hdr->b_l1hdr.b_freeze_cksum, zc))
1886		panic("buffer modified while frozen!");
1887	mutex_exit(&hdr->b_l1hdr.b_freeze_lock);
1888}
1889
/*
 * Check the data carried by zio against the checksum in zio->io_bp.
 *
 * Returns B_TRUE when zio_checksum_error_impl() reports no error and
 * B_FALSE otherwise.  If the hdr does not have compression enabled but the
 * block pointer is compressed, the zio's data is first compressed into a
 * temporary buffer that is pushed onto the zio as a transform, so that the
 * checksum is computed over the compressed form.  All transforms are popped
 * from the zio before returning.
 */
static boolean_t
arc_cksum_is_equal(arc_buf_hdr_t *hdr, zio_t *zio)
{
	enum zio_compress compress = BP_GET_COMPRESS(zio->io_bp);
	boolean_t valid_cksum;

	ASSERT(!BP_IS_EMBEDDED(zio->io_bp));
	VERIFY3U(BP_GET_PSIZE(zio->io_bp), ==, HDR_GET_PSIZE(hdr));

	/*
	 * We rely on the blkptr's checksum to determine if the block
	 * is valid or not. When compressed arc is enabled, the l2arc
	 * writes the block to the l2arc just as it appears in the pool.
	 * This allows us to use the blkptr's checksum to validate the
	 * data that we just read off of the l2arc without having to store
	 * a separate checksum in the arc_buf_hdr_t. However, if compressed
	 * arc is disabled, then the data written to the l2arc is always
	 * uncompressed and won't match the block as it exists in the main
	 * pool. When this is the case, we must first compress it if it is
	 * compressed on the main pool before we can validate the checksum.
	 */
	if (!HDR_COMPRESSION_ENABLED(hdr) && compress != ZIO_COMPRESS_OFF) {
		ASSERT3U(HDR_GET_COMPRESS(hdr), ==, ZIO_COMPRESS_OFF);
		uint64_t lsize = HDR_GET_LSIZE(hdr);
		uint64_t csize;

		/* Scratch buffer sized to the block's physical size */
		abd_t *cdata = abd_alloc_linear(HDR_GET_PSIZE(hdr), B_TRUE);
		csize = zio_compress_data(compress, zio->io_abd,
		    abd_to_buf(cdata), lsize);

		ASSERT3U(csize, <=, HDR_GET_PSIZE(hdr));
		if (csize < HDR_GET_PSIZE(hdr)) {
			/*
			 * Compressed blocks are always a multiple of the
			 * smallest ashift in the pool. Ideally, we would
			 * like to round up the csize to the next
			 * spa_min_ashift but that value may have changed
			 * since the block was last written. Instead,
			 * we rely on the fact that the hdr's psize
			 * was set to the psize of the block when it was
			 * last written. We set the csize to that value
			 * and zero out any part that should not contain
			 * data.
			 */
			abd_zero_off(cdata, csize, HDR_GET_PSIZE(hdr) - csize);
			csize = HDR_GET_PSIZE(hdr);
		}
		/* The checksum below now sees the compressed data */
		zio_push_transform(zio, cdata, csize, HDR_GET_PSIZE(hdr), NULL);
	}

	/*
	 * Block pointers always store the checksum for the logical data.
	 * If the block pointer has the gang bit set, then the checksum
	 * it represents is for the reconstituted data and not for an
	 * individual gang member. The zio pipeline, however, must be able to
	 * determine the checksum of each of the gang constituents so it
	 * treats the checksum comparison differently than what we need
	 * for l2arc blocks. This prevents us from using the
	 * zio_checksum_error() interface directly. Instead we must call the
	 * zio_checksum_error_impl() so that we can ensure the checksum is
	 * generated using the correct checksum algorithm and accounts for the
	 * logical I/O size and not just a gang fragment.
	 */
	valid_cksum = (zio_checksum_error_impl(zio->io_spa, zio->io_bp,
	    BP_GET_CHECKSUM(zio->io_bp), zio->io_abd, zio->io_size,
	    zio->io_offset, NULL) == 0);
	/* Called unconditionally, whether or not a transform was pushed */
	zio_pop_transforms(zio);
	return (valid_cksum);
}
1959
1960/*
1961 * Given a buf full of data, if ZFS_DEBUG_MODIFY is enabled this computes a
1962 * checksum and attaches it to the buf's hdr so that we can ensure that the buf
1963 * isn't modified later on. If buf is compressed or there is already a checksum
1964 * on the hdr, this is a no-op (we only checksum uncompressed bufs).
1965 */
1966static void
1967arc_cksum_compute(arc_buf_t *buf)
1968{
1969	arc_buf_hdr_t *hdr = buf->b_hdr;
1970
1971	if (!(zfs_flags & ZFS_DEBUG_MODIFY))
1972		return;
1973
1974	ASSERT(HDR_HAS_L1HDR(hdr));
1975
1976	mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock);
1977	if (hdr->b_l1hdr.b_freeze_cksum != NULL) {
1978		ASSERT(arc_hdr_has_uncompressed_buf(hdr));
1979		mutex_exit(&hdr->b_l1hdr.b_freeze_lock);
1980		return;
1981	} else if (ARC_BUF_COMPRESSED(buf)) {
1982		mutex_exit(&hdr->b_l1hdr.b_freeze_lock);
1983		return;
1984	}
1985
1986	ASSERT(!ARC_BUF_COMPRESSED(buf));
1987	hdr->b_l1hdr.b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t),
1988	    KM_SLEEP);
1989	fletcher_2_native(buf->b_data, arc_buf_size(buf), NULL,
1990	    hdr->b_l1hdr.b_freeze_cksum);
1991	mutex_exit(&hdr->b_l1hdr.b_freeze_lock);
1992#ifdef illumos
1993	arc_buf_watch(buf);
1994#endif
1995}
1996
1997#ifdef illumos
#ifndef _KERNEL
/*
 * Control message written to arc_procfd by arc_buf_watch() and
 * arc_buf_unwatch(): a command code followed by its prwatch_t operand.
 */
typedef struct procctl {
	long cmd;		/* command code; PCWATCH in this file */
	prwatch_t prwatch;	/* watched address, size and flags */
} procctl_t;
#endif
2004
2005/* ARGSUSED */
2006static void
2007arc_buf_unwatch(arc_buf_t *buf)
2008{
2009#ifndef _KERNEL
2010	if (arc_watch) {
2011		int result;
2012		procctl_t ctl;
2013		ctl.cmd = PCWATCH;
2014		ctl.prwatch.pr_vaddr = (uintptr_t)buf->b_data;
2015		ctl.prwatch.pr_size = 0;
2016		ctl.prwatch.pr_wflags = 0;
2017		result = write(arc_procfd, &ctl, sizeof (ctl));
2018		ASSERT3U(result, ==, sizeof (ctl));
2019	}
2020#endif
2021}
2022
2023/* ARGSUSED */
2024static void
2025arc_buf_watch(arc_buf_t *buf)
2026{
2027#ifndef _KERNEL
2028	if (arc_watch) {
2029		int result;
2030		procctl_t ctl;
2031		ctl.cmd = PCWATCH;
2032		ctl.prwatch.pr_vaddr = (uintptr_t)buf->b_data;
2033		ctl.prwatch.pr_size = arc_buf_size(buf);
2034		ctl.prwatch.pr_wflags = WA_WRITE;
2035		result = write(arc_procfd, &ctl, sizeof (ctl));
2036		ASSERT3U(result, ==, sizeof (ctl));
2037	}
2038#endif
2039}
2040#endif /* illumos */
2041
2042static arc_buf_contents_t
2043arc_buf_type(arc_buf_hdr_t *hdr)
2044{
2045	arc_buf_contents_t type;
2046	if (HDR_ISTYPE_METADATA(hdr)) {
2047		type = ARC_BUFC_METADATA;
2048	} else {
2049		type = ARC_BUFC_DATA;
2050	}
2051	VERIFY3U(hdr->b_type, ==, type);
2052	return (type);
2053}
2054
2055boolean_t
2056arc_is_metadata(arc_buf_t *buf)
2057{
2058	return (HDR_ISTYPE_METADATA(buf->b_hdr) != 0);
2059}
2060
2061static uint32_t
2062arc_bufc_to_flags(arc_buf_contents_t type)
2063{
2064	switch (type) {
2065	case ARC_BUFC_DATA:
2066		/* metadata field is 0 if buffer contains normal data */
2067		return (0);
2068	case ARC_BUFC_METADATA:
2069		return (ARC_FLAG_BUFC_METADATA);
2070	default:
2071		break;
2072	}
2073	panic("undefined ARC buffer type!");
2074	return ((uint32_t)-1);
2075}
2076
2077void
2078arc_buf_thaw(arc_buf_t *buf)
2079{
2080	arc_buf_hdr_t *hdr = buf->b_hdr;
2081
2082	ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon);
2083	ASSERT(!HDR_IO_IN_PROGRESS(hdr));
2084
2085	arc_cksum_verify(buf);
2086
2087	/*
2088	 * Compressed buffers do not manipulate the b_freeze_cksum or
2089	 * allocate b_thawed.
2090	 */
2091	if (ARC_BUF_COMPRESSED(buf)) {
2092		ASSERT(hdr->b_l1hdr.b_freeze_cksum == NULL ||
2093		    arc_hdr_has_uncompressed_buf(hdr));
2094		return;
2095	}
2096
2097	ASSERT(HDR_HAS_L1HDR(hdr));
2098	arc_cksum_free(hdr);
2099
2100	mutex_enter(&hdr->b_l1hdr.b_freeze_lock);
2101#ifdef ZFS_DEBUG
2102	if (zfs_flags & ZFS_DEBUG_MODIFY) {
2103		if (hdr->b_l1hdr.b_thawed != NULL)
2104			kmem_free(hdr->b_l1hdr.b_thawed, 1);
2105		hdr->b_l1hdr.b_thawed = kmem_alloc(1, KM_SLEEP);
2106	}
2107#endif
2108
2109	mutex_exit(&hdr->b_l1hdr.b_freeze_lock);
2110
2111#ifdef illumos
2112	arc_buf_unwatch(buf);
2113#endif
2114}
2115
2116void
2117arc_buf_freeze(arc_buf_t *buf)
2118{
2119	arc_buf_hdr_t *hdr = buf->b_hdr;
2120	kmutex_t *hash_lock;
2121
2122	if (!(zfs_flags & ZFS_DEBUG_MODIFY))
2123		return;
2124
2125	if (ARC_BUF_COMPRESSED(buf)) {
2126		ASSERT(hdr->b_l1hdr.b_freeze_cksum == NULL ||
2127		    arc_hdr_has_uncompressed_buf(hdr));
2128		return;
2129	}
2130
2131	hash_lock = HDR_LOCK(hdr);
2132	mutex_enter(hash_lock);
2133
2134	ASSERT(HDR_HAS_L1HDR(hdr));
2135	ASSERT(hdr->b_l1hdr.b_freeze_cksum != NULL ||
2136	    hdr->b_l1hdr.b_state == arc_anon);
2137	arc_cksum_compute(buf);
2138	mutex_exit(hash_lock);
2139}
2140
2141/*
2142 * The arc_buf_hdr_t's b_flags should never be modified directly. Instead,
2143 * the following functions should be used to ensure that the flags are
2144 * updated in a thread-safe way. When manipulating the flags either
2145 * the hash_lock must be held or the hdr must be undiscoverable. This
2146 * ensures that we're not racing with any other threads when updating
2147 * the flags.
2148 */
2149static inline void
2150arc_hdr_set_flags(arc_buf_hdr_t *hdr, arc_flags_t flags)
2151{
2152	ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr));
2153	hdr->b_flags |= flags;
2154}
2155
2156static inline void
2157arc_hdr_clear_flags(arc_buf_hdr_t *hdr, arc_flags_t flags)
2158{
2159	ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr));
2160	hdr->b_flags &= ~flags;
2161}
2162
2163/*
2164 * Setting the compression bits in the arc_buf_hdr_t's b_flags is
2165 * done in a special way since we have to clear and set bits
2166 * at the same time. Consumers that wish to set the compression bits
2167 * must use this function to ensure that the flags are updated in
2168 * thread-safe manner.
2169 */
2170static void
2171arc_hdr_set_compress(arc_buf_hdr_t *hdr, enum zio_compress cmp)
2172{
2173	ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr));
2174
2175	/*
2176	 * Holes and embedded blocks will always have a psize = 0 so
2177	 * we ignore the compression of the blkptr and set the
2178	 * arc_buf_hdr_t's compression to ZIO_COMPRESS_OFF.
2179	 * Holes and embedded blocks remain anonymous so we don't
2180	 * want to uncompress them. Mark them as uncompressed.
2181	 */
2182	if (!zfs_compressed_arc_enabled || HDR_GET_PSIZE(hdr) == 0) {
2183		arc_hdr_clear_flags(hdr, ARC_FLAG_COMPRESSED_ARC);
2184		HDR_SET_COMPRESS(hdr, ZIO_COMPRESS_OFF);
2185		ASSERT(!HDR_COMPRESSION_ENABLED(hdr));
2186		ASSERT3U(HDR_GET_COMPRESS(hdr), ==, ZIO_COMPRESS_OFF);
2187	} else {
2188		arc_hdr_set_flags(hdr, ARC_FLAG_COMPRESSED_ARC);
2189		HDR_SET_COMPRESS(hdr, cmp);
2190		ASSERT3U(HDR_GET_COMPRESS(hdr), ==, cmp);
2191		ASSERT(HDR_COMPRESSION_ENABLED(hdr));
2192	}
2193}
2194
2195/*
2196 * Looks for another buf on the same hdr which has the data decompressed, copies
2197 * from it, and returns true. If no such buf exists, returns false.
2198 */
2199static boolean_t
2200arc_buf_try_copy_decompressed_data(arc_buf_t *buf)
2201{
2202	arc_buf_hdr_t *hdr = buf->b_hdr;
2203	boolean_t copied = B_FALSE;
2204
2205	ASSERT(HDR_HAS_L1HDR(hdr));
2206	ASSERT3P(buf->b_data, !=, NULL);
2207	ASSERT(!ARC_BUF_COMPRESSED(buf));
2208
2209	for (arc_buf_t *from = hdr->b_l1hdr.b_buf; from != NULL;
2210	    from = from->b_next) {
2211		/* can't use our own data buffer */
2212		if (from == buf) {
2213			continue;
2214		}
2215
2216		if (!ARC_BUF_COMPRESSED(from)) {
2217			bcopy(from->b_data, buf->b_data, arc_buf_size(buf));
2218			copied = B_TRUE;
2219			break;
2220		}
2221	}
2222
2223	/*
2224	 * There were no decompressed bufs, so there should not be a
2225	 * checksum on the hdr either.
2226	 */
2227	EQUIV(!copied, hdr->b_l1hdr.b_freeze_cksum == NULL);
2228
2229	return (copied);
2230}
2231
2232/*
2233 * Given a buf that has a data buffer attached to it, this function will
2234 * efficiently fill the buf with data of the specified compression setting from
2235 * the hdr and update the hdr's b_freeze_cksum if necessary. If the buf and hdr
2236 * are already sharing a data buf, no copy is performed.
2237 *
2238 * If the buf is marked as compressed but uncompressed data was requested, this
2239 * will allocate a new data buffer for the buf, remove that flag, and fill the
2240 * buf with uncompressed data. You can't request a compressed buf on a hdr with
2241 * uncompressed data, and (since we haven't added support for it yet) if you
2242 * want compressed data your buf must already be marked as compressed and have
2243 * the correct-sized data buffer.
2244 */
2245static int
2246arc_buf_fill(arc_buf_t *buf, boolean_t compressed)
2247{
2248	arc_buf_hdr_t *hdr = buf->b_hdr;
2249	boolean_t hdr_compressed = (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF);
2250	dmu_object_byteswap_t bswap = hdr->b_l1hdr.b_byteswap;
2251
2252	ASSERT3P(buf->b_data, !=, NULL);
2253	IMPLY(compressed, hdr_compressed);
2254	IMPLY(compressed, ARC_BUF_COMPRESSED(buf));
2255
2256	if (hdr_compressed == compressed) {
2257		if (!arc_buf_is_shared(buf)) {
2258			abd_copy_to_buf(buf->b_data, hdr->b_l1hdr.b_pabd,
2259			    arc_buf_size(buf));
2260		}
2261	} else {
2262		ASSERT(hdr_compressed);
2263		ASSERT(!compressed);
2264		ASSERT3U(HDR_GET_LSIZE(hdr), !=, HDR_GET_PSIZE(hdr));
2265
2266		/*
2267		 * If the buf is sharing its data with the hdr, unlink it and
2268		 * allocate a new data buffer for the buf.
2269		 */
2270		if (arc_buf_is_shared(buf)) {
2271			ASSERT(ARC_BUF_COMPRESSED(buf));
2272
2273			/* We need to give the buf it's own b_data */
2274			buf->b_flags &= ~ARC_BUF_FLAG_SHARED;
2275			buf->b_data =
2276			    arc_get_data_buf(hdr, HDR_GET_LSIZE(hdr), buf);
2277			arc_hdr_clear_flags(hdr, ARC_FLAG_SHARED_DATA);
2278
2279			/* Previously overhead was 0; just add new overhead */
2280			ARCSTAT_INCR(arcstat_overhead_size, HDR_GET_LSIZE(hdr));
2281		} else if (ARC_BUF_COMPRESSED(buf)) {
2282			/* We need to reallocate the buf's b_data */
2283			arc_free_data_buf(hdr, buf->b_data, HDR_GET_PSIZE(hdr),
2284			    buf);
2285			buf->b_data =
2286			    arc_get_data_buf(hdr, HDR_GET_LSIZE(hdr), buf);
2287
2288			/* We increased the size of b_data; update overhead */
2289			ARCSTAT_INCR(arcstat_overhead_size,
2290			    HDR_GET_LSIZE(hdr) - HDR_GET_PSIZE(hdr));
2291		}
2292
2293		/*
2294		 * Regardless of the buf's previous compression settings, it
2295		 * should not be compressed at the end of this function.
2296		 */
2297		buf->b_flags &= ~ARC_BUF_FLAG_COMPRESSED;
2298
2299		/*
2300		 * Try copying the data from another buf which already has a
2301		 * decompressed version. If that's not possible, it's time to
2302		 * bite the bullet and decompress the data from the hdr.
2303		 */
2304		if (arc_buf_try_copy_decompressed_data(buf)) {
2305			/* Skip byteswapping and checksumming (already done) */
2306			ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, !=, NULL);
2307			return (0);
2308		} else {
2309			int error = zio_decompress_data(HDR_GET_COMPRESS(hdr),
2310			    hdr->b_l1hdr.b_pabd, buf->b_data,
2311			    HDR_GET_PSIZE(hdr), HDR_GET_LSIZE(hdr));
2312
2313			/*
2314			 * Absent hardware errors or software bugs, this should
2315			 * be impossible, but log it anyway so we can debug it.
2316			 */
2317			if (error != 0) {
2318				zfs_dbgmsg(
2319				    "hdr %p, compress %d, psize %d, lsize %d",
2320				    hdr, HDR_GET_COMPRESS(hdr),
2321				    HDR_GET_PSIZE(hdr), HDR_GET_LSIZE(hdr));
2322				return (SET_ERROR(EIO));
2323			}
2324		}
2325	}
2326
2327	/* Byteswap the buf's data if necessary */
2328	if (bswap != DMU_BSWAP_NUMFUNCS) {
2329		ASSERT(!HDR_SHARED_DATA(hdr));
2330		ASSERT3U(bswap, <, DMU_BSWAP_NUMFUNCS);
2331		dmu_ot_byteswap[bswap].ob_func(buf->b_data, HDR_GET_LSIZE(hdr));
2332	}
2333
2334	/* Compute the hdr's checksum if necessary */
2335	arc_cksum_compute(buf);
2336
2337	return (0);
2338}
2339
2340int
2341arc_decompress(arc_buf_t *buf)
2342{
2343	return (arc_buf_fill(buf, B_FALSE));
2344}
2345
2346/*
2347 * Return the size of the block, b_pabd, that is stored in the arc_buf_hdr_t.
2348 */
2349static uint64_t
2350arc_hdr_size(arc_buf_hdr_t *hdr)
2351{
2352	uint64_t size;
2353
2354	if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF &&
2355	    HDR_GET_PSIZE(hdr) > 0) {
2356		size = HDR_GET_PSIZE(hdr);
2357	} else {
2358		ASSERT3U(HDR_GET_LSIZE(hdr), !=, 0);
2359		size = HDR_GET_LSIZE(hdr);
2360	}
2361	return (size);
2362}
2363
2364/*
2365 * Increment the amount of evictable space in the arc_state_t's refcount.
2366 * We account for the space used by the hdr and the arc buf individually
2367 * so that we can add and remove them from the refcount individually.
2368 */
2369static void
2370arc_evictable_space_increment(arc_buf_hdr_t *hdr, arc_state_t *state)
2371{
2372	arc_buf_contents_t type = arc_buf_type(hdr);
2373
2374	ASSERT(HDR_HAS_L1HDR(hdr));
2375
2376	if (GHOST_STATE(state)) {
2377		ASSERT0(hdr->b_l1hdr.b_bufcnt);
2378		ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
2379		ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
2380		(void) refcount_add_many(&state->arcs_esize[type],
2381		    HDR_GET_LSIZE(hdr), hdr);
2382		return;
2383	}
2384
2385	ASSERT(!GHOST_STATE(state));
2386	if (hdr->b_l1hdr.b_pabd != NULL) {
2387		(void) refcount_add_many(&state->arcs_esize[type],
2388		    arc_hdr_size(hdr), hdr);
2389	}
2390	for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL;
2391	    buf = buf->b_next) {
2392		if (arc_buf_is_shared(buf))
2393			continue;
2394		(void) refcount_add_many(&state->arcs_esize[type],
2395		    arc_buf_size(buf), buf);
2396	}
2397}
2398
2399/*
2400 * Decrement the amount of evictable space in the arc_state_t's refcount.
2401 * We account for the space used by the hdr and the arc buf individually
2402 * so that we can add and remove them from the refcount individually.
2403 */
2404static void
2405arc_evictable_space_decrement(arc_buf_hdr_t *hdr, arc_state_t *state)
2406{
2407	arc_buf_contents_t type = arc_buf_type(hdr);
2408
2409	ASSERT(HDR_HAS_L1HDR(hdr));
2410
2411	if (GHOST_STATE(state)) {
2412		ASSERT0(hdr->b_l1hdr.b_bufcnt);
2413		ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
2414		ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
2415		(void) refcount_remove_many(&state->arcs_esize[type],
2416		    HDR_GET_LSIZE(hdr), hdr);
2417		return;
2418	}
2419
2420	ASSERT(!GHOST_STATE(state));
2421	if (hdr->b_l1hdr.b_pabd != NULL) {
2422		(void) refcount_remove_many(&state->arcs_esize[type],
2423		    arc_hdr_size(hdr), hdr);
2424	}
2425	for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL;
2426	    buf = buf->b_next) {
2427		if (arc_buf_is_shared(buf))
2428			continue;
2429		(void) refcount_remove_many(&state->arcs_esize[type],
2430		    arc_buf_size(buf), buf);
2431	}
2432}
2433
2434/*
2435 * Add a reference to this hdr indicating that someone is actively
2436 * referencing that memory. When the refcount transitions from 0 to 1,
2437 * we remove it from the respective arc_state_t list to indicate that
2438 * it is not evictable.
2439 */
2440static void
2441add_reference(arc_buf_hdr_t *hdr, void *tag)
2442{
2443	ASSERT(HDR_HAS_L1HDR(hdr));
2444	if (!MUTEX_HELD(HDR_LOCK(hdr))) {
2445		ASSERT(hdr->b_l1hdr.b_state == arc_anon);
2446		ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
2447		ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
2448	}
2449
2450	arc_state_t *state = hdr->b_l1hdr.b_state;
2451
2452	if ((refcount_add(&hdr->b_l1hdr.b_refcnt, tag) == 1) &&
2453	    (state != arc_anon)) {
2454		/* We don't use the L2-only state list. */
2455		if (state != arc_l2c_only) {
2456			multilist_remove(state->arcs_list[arc_buf_type(hdr)],
2457			    hdr);
2458			arc_evictable_space_decrement(hdr, state);
2459		}
2460		/* remove the prefetch flag if we get a reference */
2461		arc_hdr_clear_flags(hdr, ARC_FLAG_PREFETCH);
2462	}
2463}
2464
2465/*
2466 * Remove a reference from this hdr. When the reference transitions from
2467 * 1 to 0 and we're not anonymous, then we add this hdr to the arc_state_t's
2468 * list making it eligible for eviction.
2469 */
2470static int
2471remove_reference(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, void *tag)
2472{
2473	int cnt;
2474	arc_state_t *state = hdr->b_l1hdr.b_state;
2475
2476	ASSERT(HDR_HAS_L1HDR(hdr));
2477	ASSERT(state == arc_anon || MUTEX_HELD(hash_lock));
2478	ASSERT(!GHOST_STATE(state));
2479
2480	/*
2481	 * arc_l2c_only counts as a ghost state so we don't need to explicitly
2482	 * check to prevent usage of the arc_l2c_only list.
2483	 */
2484	if (((cnt = refcount_remove(&hdr->b_l1hdr.b_refcnt, tag)) == 0) &&
2485	    (state != arc_anon)) {
2486		multilist_insert(state->arcs_list[arc_buf_type(hdr)], hdr);
2487		ASSERT3U(hdr->b_l1hdr.b_bufcnt, >, 0);
2488		arc_evictable_space_increment(hdr, state);
2489	}
2490	return (cnt);
2491}
2492
2493/*
2494 * Move the supplied buffer to the indicated state. The hash lock
2495 * for the buffer must be held by the caller.
2496 */
2497static void
2498arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr,
2499    kmutex_t *hash_lock)
2500{
2501	arc_state_t *old_state;
2502	int64_t refcnt;
2503	uint32_t bufcnt;
2504	boolean_t update_old, update_new;
2505	arc_buf_contents_t buftype = arc_buf_type(hdr);
2506
2507	/*
2508	 * We almost always have an L1 hdr here, since we call arc_hdr_realloc()
2509	 * in arc_read() when bringing a buffer out of the L2ARC.  However, the
2510	 * L1 hdr doesn't always exist when we change state to arc_anon before
2511	 * destroying a header, in which case reallocating to add the L1 hdr is
2512	 * pointless.
2513	 */
2514	if (HDR_HAS_L1HDR(hdr)) {
2515		old_state = hdr->b_l1hdr.b_state;
2516		refcnt = refcount_count(&hdr->b_l1hdr.b_refcnt);
2517		bufcnt = hdr->b_l1hdr.b_bufcnt;
2518		update_old = (bufcnt > 0 || hdr->b_l1hdr.b_pabd != NULL);
2519	} else {
2520		old_state = arc_l2c_only;
2521		refcnt = 0;
2522		bufcnt = 0;
2523		update_old = B_FALSE;
2524	}
2525	update_new = update_old;
2526
2527	ASSERT(MUTEX_HELD(hash_lock));
2528	ASSERT3P(new_state, !=, old_state);
2529	ASSERT(!GHOST_STATE(new_state) || bufcnt == 0);
2530	ASSERT(old_state != arc_anon || bufcnt <= 1);
2531
2532	/*
2533	 * If this buffer is evictable, transfer it from the
2534	 * old state list to the new state list.
2535	 */
2536	if (refcnt == 0) {
2537		if (old_state != arc_anon && old_state != arc_l2c_only) {
2538			ASSERT(HDR_HAS_L1HDR(hdr));
2539			multilist_remove(old_state->arcs_list[buftype], hdr);
2540
2541			if (GHOST_STATE(old_state)) {
2542				ASSERT0(bufcnt);
2543				ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
2544				update_old = B_TRUE;
2545			}
2546			arc_evictable_space_decrement(hdr, old_state);
2547		}
2548		if (new_state != arc_anon && new_state != arc_l2c_only) {
2549
2550			/*
2551			 * An L1 header always exists here, since if we're
2552			 * moving to some L1-cached state (i.e. not l2c_only or
2553			 * anonymous), we realloc the header to add an L1hdr
2554			 * beforehand.
2555			 */
2556			ASSERT(HDR_HAS_L1HDR(hdr));
2557			multilist_insert(new_state->arcs_list[buftype], hdr);
2558
2559			if (GHOST_STATE(new_state)) {
2560				ASSERT0(bufcnt);
2561				ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
2562				update_new = B_TRUE;
2563			}
2564			arc_evictable_space_increment(hdr, new_state);
2565		}
2566	}
2567
2568	ASSERT(!HDR_EMPTY(hdr));
2569	if (new_state == arc_anon && HDR_IN_HASH_TABLE(hdr))
2570		buf_hash_remove(hdr);
2571
2572	/* adjust state sizes (ignore arc_l2c_only) */
2573
2574	if (update_new && new_state != arc_l2c_only) {
2575		ASSERT(HDR_HAS_L1HDR(hdr));
2576		if (GHOST_STATE(new_state)) {
2577			ASSERT0(bufcnt);
2578
2579			/*
2580			 * When moving a header to a ghost state, we first
2581			 * remove all arc buffers. Thus, we'll have a
2582			 * bufcnt of zero, and no arc buffer to use for
2583			 * the reference. As a result, we use the arc
2584			 * header pointer for the reference.
2585			 */
2586			(void) refcount_add_many(&new_state->arcs_size,
2587			    HDR_GET_LSIZE(hdr), hdr);
2588			ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
2589		} else {
2590			uint32_t buffers = 0;
2591
			/*
			 * Each individual buffer holds a unique reference,
			 * thus we must add each of these references one
			 * at a time.
			 */
2597			for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL;
2598			    buf = buf->b_next) {
2599				ASSERT3U(bufcnt, !=, 0);
2600				buffers++;
2601
2602				/*
2603				 * When the arc_buf_t is sharing the data
2604				 * block with the hdr, the owner of the
2605				 * reference belongs to the hdr. Only
2606				 * add to the refcount if the arc_buf_t is
2607				 * not shared.
2608				 */
2609				if (arc_buf_is_shared(buf))
2610					continue;
2611
2612				(void) refcount_add_many(&new_state->arcs_size,
2613				    arc_buf_size(buf), buf);
2614			}
2615			ASSERT3U(bufcnt, ==, buffers);
2616
2617			if (hdr->b_l1hdr.b_pabd != NULL) {
2618				(void) refcount_add_many(&new_state->arcs_size,
2619				    arc_hdr_size(hdr), hdr);
2620			} else {
2621				ASSERT(GHOST_STATE(old_state));
2622			}
2623		}
2624	}
2625
2626	if (update_old && old_state != arc_l2c_only) {
2627		ASSERT(HDR_HAS_L1HDR(hdr));
2628		if (GHOST_STATE(old_state)) {
2629			ASSERT0(bufcnt);
2630			ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
2631
2632			/*
2633			 * When moving a header off of a ghost state,
2634			 * the header will not contain any arc buffers.
2635			 * We use the arc header pointer for the reference
2636			 * which is exactly what we did when we put the
2637			 * header on the ghost state.
2638			 */
2639
2640			(void) refcount_remove_many(&old_state->arcs_size,
2641			    HDR_GET_LSIZE(hdr), hdr);
2642		} else {
2643			uint32_t buffers = 0;
2644
2645			/*
2646			 * Each individual buffer holds a unique reference,
2647			 * thus we must remove each of these references one
2648			 * at a time.
2649			 */
2650			for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL;
2651			    buf = buf->b_next) {
2652				ASSERT3U(bufcnt, !=, 0);
2653				buffers++;
2654
				/*
				 * When the arc_buf_t is sharing the data
				 * block with the hdr, the owner of the
				 * reference belongs to the hdr. Only
				 * remove from the refcount if the arc_buf_t
				 * is not shared.
				 */
2662				if (arc_buf_is_shared(buf))
2663					continue;
2664
2665				(void) refcount_remove_many(
2666				    &old_state->arcs_size, arc_buf_size(buf),
2667				    buf);
2668			}
2669			ASSERT3U(bufcnt, ==, buffers);
2670			ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
2671			(void) refcount_remove_many(
2672			    &old_state->arcs_size, arc_hdr_size(hdr), hdr);
2673		}
2674	}
2675
2676	if (HDR_HAS_L1HDR(hdr))
2677		hdr->b_l1hdr.b_state = new_state;
2678
2679	/*
2680	 * L2 headers should never be on the L2 state list since they don't
2681	 * have L1 headers allocated.
2682	 */
2683	ASSERT(multilist_is_empty(arc_l2c_only->arcs_list[ARC_BUFC_DATA]) &&
2684	    multilist_is_empty(arc_l2c_only->arcs_list[ARC_BUFC_METADATA]));
2685}
2686
2687void
2688arc_space_consume(uint64_t space, arc_space_type_t type)
2689{
2690	ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES);
2691
2692	switch (type) {
2693	case ARC_SPACE_DATA:
2694		aggsum_add(&astat_data_size, space);
2695		break;
2696	case ARC_SPACE_META:
2697		aggsum_add(&astat_metadata_size, space);
2698		break;
2699	case ARC_SPACE_OTHER:
2700		aggsum_add(&astat_other_size, space);
2701		break;
2702	case ARC_SPACE_HDRS:
2703		aggsum_add(&astat_hdr_size, space);
2704		break;
2705	case ARC_SPACE_L2HDRS:
2706		aggsum_add(&astat_l2_hdr_size, space);
2707		break;
2708	}
2709
2710	if (type != ARC_SPACE_DATA)
2711		aggsum_add(&arc_meta_used, space);
2712
2713	aggsum_add(&arc_size, space);
2714}
2715
2716void
2717arc_space_return(uint64_t space, arc_space_type_t type)
2718{
2719	ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES);
2720
2721	switch (type) {
2722	case ARC_SPACE_DATA:
2723		aggsum_add(&astat_data_size, -space);
2724		break;
2725	case ARC_SPACE_META:
2726		aggsum_add(&astat_metadata_size, -space);
2727		break;
2728	case ARC_SPACE_OTHER:
2729		aggsum_add(&astat_other_size, -space);
2730		break;
2731	case ARC_SPACE_HDRS:
2732		aggsum_add(&astat_hdr_size, -space);
2733		break;
2734	case ARC_SPACE_L2HDRS:
2735		aggsum_add(&astat_l2_hdr_size, -space);
2736		break;
2737	}
2738
2739	if (type != ARC_SPACE_DATA) {
2740		ASSERT(aggsum_compare(&arc_meta_used, space) >= 0);
2741		/*
2742		 * We use the upper bound here rather than the precise value
2743		 * because the arc_meta_max value doesn't need to be
2744		 * precise. It's only consumed by humans via arcstats.
2745		 */
2746		if (arc_meta_max < aggsum_upper_bound(&arc_meta_used))
2747			arc_meta_max = aggsum_upper_bound(&arc_meta_used);
2748		aggsum_add(&arc_meta_used, -space);
2749	}
2750
2751	ASSERT(aggsum_compare(&arc_size, space) >= 0);
2752	aggsum_add(&arc_size, -space);
2753}
2754
2755/*
2756 * Given a hdr and a buf, returns whether that buf can share its b_data buffer
2757 * with the hdr's b_pabd.
2758 */
2759static boolean_t
2760arc_can_share(arc_buf_hdr_t *hdr, arc_buf_t *buf)
2761{
2762	/*
2763	 * The criteria for sharing a hdr's data are:
2764	 * 1. the hdr's compression matches the buf's compression
2765	 * 2. the hdr doesn't need to be byteswapped
2766	 * 3. the hdr isn't already being shared
2767	 * 4. the buf is either compressed or it is the last buf in the hdr list
2768	 *
2769	 * Criterion #4 maintains the invariant that shared uncompressed
2770	 * bufs must be the final buf in the hdr's b_buf list. Reading this, you
2771	 * might ask, "if a compressed buf is allocated first, won't that be the
2772	 * last thing in the list?", but in that case it's impossible to create
2773	 * a shared uncompressed buf anyway (because the hdr must be compressed
2774	 * to have the compressed buf). You might also think that #3 is
2775	 * sufficient to make this guarantee, however it's possible
2776	 * (specifically in the rare L2ARC write race mentioned in
2777	 * arc_buf_alloc_impl()) there will be an existing uncompressed buf that
2778	 * is sharable, but wasn't at the time of its allocation. Rather than
2779	 * allow a new shared uncompressed buf to be created and then shuffle
2780	 * the list around to make it the last element, this simply disallows
2781	 * sharing if the new buf isn't the first to be added.
2782	 */
2783	ASSERT3P(buf->b_hdr, ==, hdr);
2784	boolean_t hdr_compressed = HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF;
2785	boolean_t buf_compressed = ARC_BUF_COMPRESSED(buf) != 0;
2786	return (buf_compressed == hdr_compressed &&
2787	    hdr->b_l1hdr.b_byteswap == DMU_BSWAP_NUMFUNCS &&
2788	    !HDR_SHARED_DATA(hdr) &&
2789	    (ARC_BUF_LAST(buf) || ARC_BUF_COMPRESSED(buf)));
2790}
2791
2792/*
2793 * Allocate a buf for this hdr. If you care about the data that's in the hdr,
2794 * or if you want a compressed buffer, pass those flags in. Returns 0 if the
2795 * copy was made successfully, or an error code otherwise.
2796 */
2797static int
2798arc_buf_alloc_impl(arc_buf_hdr_t *hdr, void *tag, boolean_t compressed,
2799    boolean_t fill, arc_buf_t **ret)
2800{
2801	arc_buf_t *buf;
2802
2803	ASSERT(HDR_HAS_L1HDR(hdr));
2804	ASSERT3U(HDR_GET_LSIZE(hdr), >, 0);
2805	VERIFY(hdr->b_type == ARC_BUFC_DATA ||
2806	    hdr->b_type == ARC_BUFC_METADATA);
2807	ASSERT3P(ret, !=, NULL);
2808	ASSERT3P(*ret, ==, NULL);
2809
2810	buf = *ret = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
2811	buf->b_hdr = hdr;
2812	buf->b_data = NULL;
2813	buf->b_next = hdr->b_l1hdr.b_buf;
2814	buf->b_flags = 0;
2815
2816	add_reference(hdr, tag);
2817
2818	/*
2819	 * We're about to change the hdr's b_flags. We must either
2820	 * hold the hash_lock or be undiscoverable.
2821	 */
2822	ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr));
2823
2824	/*
2825	 * Only honor requests for compressed bufs if the hdr is actually
2826	 * compressed.
2827	 */
2828	if (compressed && HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF)
2829		buf->b_flags |= ARC_BUF_FLAG_COMPRESSED;
2830
2831	/*
2832	 * If the hdr's data can be shared then we share the data buffer and
2833	 * set the appropriate bit in the hdr's b_flags to indicate the hdr is
2834	 * sharing it's b_pabd with the arc_buf_t. Otherwise, we allocate a new
2835	 * buffer to store the buf's data.
2836	 *
2837	 * There are two additional restrictions here because we're sharing
2838	 * hdr -> buf instead of the usual buf -> hdr. First, the hdr can't be
2839	 * actively involved in an L2ARC write, because if this buf is used by
2840	 * an arc_write() then the hdr's data buffer will be released when the
2841	 * write completes, even though the L2ARC write might still be using it.
2842	 * Second, the hdr's ABD must be linear so that the buf's user doesn't
2843	 * need to be ABD-aware.
2844	 */
2845	boolean_t can_share = arc_can_share(hdr, buf) && !HDR_L2_WRITING(hdr) &&
2846	    abd_is_linear(hdr->b_l1hdr.b_pabd);
2847
2848	/* Set up b_data and sharing */
2849	if (can_share) {
2850		buf->b_data = abd_to_buf(hdr->b_l1hdr.b_pabd);
2851		buf->b_flags |= ARC_BUF_FLAG_SHARED;
2852		arc_hdr_set_flags(hdr, ARC_FLAG_SHARED_DATA);
2853	} else {
2854		buf->b_data =
2855		    arc_get_data_buf(hdr, arc_buf_size(buf), buf);
2856		ARCSTAT_INCR(arcstat_overhead_size, arc_buf_size(buf));
2857	}
2858	VERIFY3P(buf->b_data, !=, NULL);
2859
2860	hdr->b_l1hdr.b_buf = buf;
2861	hdr->b_l1hdr.b_bufcnt += 1;
2862
2863	/*
2864	 * If the user wants the data from the hdr, we need to either copy or
2865	 * decompress the data.
2866	 */
2867	if (fill) {
2868		return (arc_buf_fill(buf, ARC_BUF_COMPRESSED(buf) != 0));
2869	}
2870
2871	return (0);
2872}
2873
2874static char *arc_onloan_tag = "onloan";
2875
2876static inline void
2877arc_loaned_bytes_update(int64_t delta)
2878{
2879	atomic_add_64(&arc_loaned_bytes, delta);
2880
2881	/* assert that it did not wrap around */
2882	ASSERT3S(atomic_add_64_nv(&arc_loaned_bytes, 0), >=, 0);
2883}
2884
2885/*
2886 * Loan out an anonymous arc buffer. Loaned buffers are not counted as in
2887 * flight data by arc_tempreserve_space() until they are "returned". Loaned
2888 * buffers must be returned to the arc before they can be used by the DMU or
2889 * freed.
2890 */
2891arc_buf_t *
2892arc_loan_buf(spa_t *spa, boolean_t is_metadata, int size)
2893{
2894	arc_buf_t *buf = arc_alloc_buf(spa, arc_onloan_tag,
2895	    is_metadata ? ARC_BUFC_METADATA : ARC_BUFC_DATA, size);
2896
2897	arc_loaned_bytes_update(arc_buf_size(buf));
2898
2899	return (buf);
2900}
2901
2902arc_buf_t *
2903arc_loan_compressed_buf(spa_t *spa, uint64_t psize, uint64_t lsize,
2904    enum zio_compress compression_type)
2905{
2906	arc_buf_t *buf = arc_alloc_compressed_buf(spa, arc_onloan_tag,
2907	    psize, lsize, compression_type);
2908
2909	arc_loaned_bytes_update(arc_buf_size(buf));
2910
2911	return (buf);
2912}
2913
2914
2915/*
2916 * Return a loaned arc buffer to the arc.
2917 */
2918void
2919arc_return_buf(arc_buf_t *buf, void *tag)
2920{
2921	arc_buf_hdr_t *hdr = buf->b_hdr;
2922
2923	ASSERT3P(buf->b_data, !=, NULL);
2924	ASSERT(HDR_HAS_L1HDR(hdr));
2925	(void) refcount_add(&hdr->b_l1hdr.b_refcnt, tag);
2926	(void) refcount_remove(&hdr->b_l1hdr.b_refcnt, arc_onloan_tag);
2927
2928	arc_loaned_bytes_update(-arc_buf_size(buf));
2929}
2930
2931/* Detach an arc_buf from a dbuf (tag) */
2932void
2933arc_loan_inuse_buf(arc_buf_t *buf, void *tag)
2934{
2935	arc_buf_hdr_t *hdr = buf->b_hdr;
2936
2937	ASSERT3P(buf->b_data, !=, NULL);
2938	ASSERT(HDR_HAS_L1HDR(hdr));
2939	(void) refcount_add(&hdr->b_l1hdr.b_refcnt, arc_onloan_tag);
2940	(void) refcount_remove(&hdr->b_l1hdr.b_refcnt, tag);
2941
2942	arc_loaned_bytes_update(arc_buf_size(buf));
2943}
2944
2945static void
2946l2arc_free_abd_on_write(abd_t *abd, size_t size, arc_buf_contents_t type)
2947{
2948	l2arc_data_free_t *df = kmem_alloc(sizeof (*df), KM_SLEEP);
2949
2950	df->l2df_abd = abd;
2951	df->l2df_size = size;
2952	df->l2df_type = type;
2953	mutex_enter(&l2arc_free_on_write_mtx);
2954	list_insert_head(l2arc_free_on_write, df);
2955	mutex_exit(&l2arc_free_on_write_mtx);
2956}
2957
2958static void
2959arc_hdr_free_on_write(arc_buf_hdr_t *hdr)
2960{
2961	arc_state_t *state = hdr->b_l1hdr.b_state;
2962	arc_buf_contents_t type = arc_buf_type(hdr);
2963	uint64_t size = arc_hdr_size(hdr);
2964
2965	/* protected by hash lock, if in the hash table */
2966	if (multilist_link_active(&hdr->b_l1hdr.b_arc_node)) {
2967		ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
2968		ASSERT(state != arc_anon && state != arc_l2c_only);
2969
2970		(void) refcount_remove_many(&state->arcs_esize[type],
2971		    size, hdr);
2972	}
2973	(void) refcount_remove_many(&state->arcs_size, size, hdr);
2974	if (type == ARC_BUFC_METADATA) {
2975		arc_space_return(size, ARC_SPACE_META);
2976	} else {
2977		ASSERT(type == ARC_BUFC_DATA);
2978		arc_space_return(size, ARC_SPACE_DATA);
2979	}
2980
2981	l2arc_free_abd_on_write(hdr->b_l1hdr.b_pabd, size, type);
2982}
2983
2984/*
2985 * Share the arc_buf_t's data with the hdr. Whenever we are sharing the
2986 * data buffer, we transfer the refcount ownership to the hdr and update
2987 * the appropriate kstats.
2988 */
2989static void
2990arc_share_buf(arc_buf_hdr_t *hdr, arc_buf_t *buf)
2991{
2992	arc_state_t *state = hdr->b_l1hdr.b_state;
2993
2994	ASSERT(arc_can_share(hdr, buf));
2995	ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
2996	ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr));
2997
2998	/*
2999	 * Start sharing the data buffer. We transfer the
3000	 * refcount ownership to the hdr since it always owns
3001	 * the refcount whenever an arc_buf_t is shared.
3002	 */
3003	refcount_transfer_ownership(&state->arcs_size, buf, hdr);
3004	hdr->b_l1hdr.b_pabd = abd_get_from_buf(buf->b_data, arc_buf_size(buf));
3005	abd_take_ownership_of_buf(hdr->b_l1hdr.b_pabd,
3006	    HDR_ISTYPE_METADATA(hdr));
3007	arc_hdr_set_flags(hdr, ARC_FLAG_SHARED_DATA);
3008	buf->b_flags |= ARC_BUF_FLAG_SHARED;
3009
3010	/*
3011	 * Since we've transferred ownership to the hdr we need
3012	 * to increment its compressed and uncompressed kstats and
3013	 * decrement the overhead size.
3014	 */
3015	ARCSTAT_INCR(arcstat_compressed_size, arc_hdr_size(hdr));
3016	ARCSTAT_INCR(arcstat_uncompressed_size, HDR_GET_LSIZE(hdr));
3017	ARCSTAT_INCR(arcstat_overhead_size, -arc_buf_size(buf));
3018}
3019
3020static void
3021arc_unshare_buf(arc_buf_hdr_t *hdr, arc_buf_t *buf)
3022{
3023	arc_state_t *state = hdr->b_l1hdr.b_state;
3024
3025	ASSERT(arc_buf_is_shared(buf));
3026	ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
3027	ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr));
3028
3029	/*
3030	 * We are no longer sharing this buffer so we need
3031	 * to transfer its ownership to the rightful owner.
3032	 */
3033	refcount_transfer_ownership(&state->arcs_size, hdr, buf);
3034	arc_hdr_clear_flags(hdr, ARC_FLAG_SHARED_DATA);
3035	abd_release_ownership_of_buf(hdr->b_l1hdr.b_pabd);
3036	abd_put(hdr->b_l1hdr.b_pabd);
3037	hdr->b_l1hdr.b_pabd = NULL;
3038	buf->b_flags &= ~ARC_BUF_FLAG_SHARED;
3039
3040	/*
3041	 * Since the buffer is no longer shared between
3042	 * the arc buf and the hdr, count it as overhead.
3043	 */
3044	ARCSTAT_INCR(arcstat_compressed_size, -arc_hdr_size(hdr));
3045	ARCSTAT_INCR(arcstat_uncompressed_size, -HDR_GET_LSIZE(hdr));
3046	ARCSTAT_INCR(arcstat_overhead_size, arc_buf_size(buf));
3047}
3048
3049/*
3050 * Remove an arc_buf_t from the hdr's buf list and return the last
3051 * arc_buf_t on the list. If no buffers remain on the list then return
3052 * NULL.
3053 */
3054static arc_buf_t *
3055arc_buf_remove(arc_buf_hdr_t *hdr, arc_buf_t *buf)
3056{
3057	ASSERT(HDR_HAS_L1HDR(hdr));
3058	ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr));
3059
3060	arc_buf_t **bufp = &hdr->b_l1hdr.b_buf;
3061	arc_buf_t *lastbuf = NULL;
3062
3063	/*
3064	 * Remove the buf from the hdr list and locate the last
3065	 * remaining buffer on the list.
3066	 */
3067	while (*bufp != NULL) {
3068		if (*bufp == buf)
3069			*bufp = buf->b_next;
3070
3071		/*
3072		 * If we've removed a buffer in the middle of
3073		 * the list then update the lastbuf and update
3074		 * bufp.
3075		 */
3076		if (*bufp != NULL) {
3077			lastbuf = *bufp;
3078			bufp = &(*bufp)->b_next;
3079		}
3080	}
3081	buf->b_next = NULL;
3082	ASSERT3P(lastbuf, !=, buf);
3083	IMPLY(hdr->b_l1hdr.b_bufcnt > 0, lastbuf != NULL);
3084	IMPLY(hdr->b_l1hdr.b_bufcnt > 0, hdr->b_l1hdr.b_buf != NULL);
3085	IMPLY(lastbuf != NULL, ARC_BUF_LAST(lastbuf));
3086
3087	return (lastbuf);
3088}
3089
3090/*
3091 * Free up buf->b_data and pull the arc_buf_t off of the the arc_buf_hdr_t's
3092 * list and free it.
3093 */
3094static void
3095arc_buf_destroy_impl(arc_buf_t *buf)
3096{
3097	arc_buf_hdr_t *hdr = buf->b_hdr;
3098
3099	/*
3100	 * Free up the data associated with the buf but only if we're not
3101	 * sharing this with the hdr. If we are sharing it with the hdr, the
3102	 * hdr is responsible for doing the free.
3103	 */
3104	if (buf->b_data != NULL) {
3105		/*
3106		 * We're about to change the hdr's b_flags. We must either
3107		 * hold the hash_lock or be undiscoverable.
3108		 */
3109		ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr));
3110
3111		arc_cksum_verify(buf);
3112#ifdef illumos
3113		arc_buf_unwatch(buf);
3114#endif
3115
3116		if (arc_buf_is_shared(buf)) {
3117			arc_hdr_clear_flags(hdr, ARC_FLAG_SHARED_DATA);
3118		} else {
3119			uint64_t size = arc_buf_size(buf);
3120			arc_free_data_buf(hdr, buf->b_data, size, buf);
3121			ARCSTAT_INCR(arcstat_overhead_size, -size);
3122		}
3123		buf->b_data = NULL;
3124
3125		ASSERT(hdr->b_l1hdr.b_bufcnt > 0);
3126		hdr->b_l1hdr.b_bufcnt -= 1;
3127	}
3128
3129	arc_buf_t *lastbuf = arc_buf_remove(hdr, buf);
3130
3131	if (ARC_BUF_SHARED(buf) && !ARC_BUF_COMPRESSED(buf)) {
3132		/*
3133		 * If the current arc_buf_t is sharing its data buffer with the
3134		 * hdr, then reassign the hdr's b_pabd to share it with the new
3135		 * buffer at the end of the list. The shared buffer is always
3136		 * the last one on the hdr's buffer list.
3137		 *
3138		 * There is an equivalent case for compressed bufs, but since
3139		 * they aren't guaranteed to be the last buf in the list and
3140		 * that is an exceedingly rare case, we just allow that space be
3141		 * wasted temporarily.
3142		 */
3143		if (lastbuf != NULL) {
3144			/* Only one buf can be shared at once */
3145			VERIFY(!arc_buf_is_shared(lastbuf));
3146			/* hdr is uncompressed so can't have compressed buf */
3147			VERIFY(!ARC_BUF_COMPRESSED(lastbuf));
3148
3149			ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
3150			arc_hdr_free_pabd(hdr);
3151
3152			/*
3153			 * We must setup a new shared block between the
3154			 * last buffer and the hdr. The data would have
3155			 * been allocated by the arc buf so we need to transfer
3156			 * ownership to the hdr since it's now being shared.
3157			 */
3158			arc_share_buf(hdr, lastbuf);
3159		}
3160	} else if (HDR_SHARED_DATA(hdr)) {
3161		/*
3162		 * Uncompressed shared buffers are always at the end
3163		 * of the list. Compressed buffers don't have the
3164		 * same requirements. This makes it hard to
3165		 * simply assert that the lastbuf is shared so
3166		 * we rely on the hdr's compression flags to determine
3167		 * if we have a compressed, shared buffer.
3168		 */
3169		ASSERT3P(lastbuf, !=, NULL);
3170		ASSERT(arc_buf_is_shared(lastbuf) ||
3171		    HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF);
3172	}
3173
3174	/*
3175	 * Free the checksum if we're removing the last uncompressed buf from
3176	 * this hdr.
3177	 */
3178	if (!arc_hdr_has_uncompressed_buf(hdr)) {
3179		arc_cksum_free(hdr);
3180	}
3181
3182	/* clean up the buf */
3183	buf->b_hdr = NULL;
3184	kmem_cache_free(buf_cache, buf);
3185}
3186
3187static void
3188arc_hdr_alloc_pabd(arc_buf_hdr_t *hdr)
3189{
3190	ASSERT3U(HDR_GET_LSIZE(hdr), >, 0);
3191	ASSERT(HDR_HAS_L1HDR(hdr));
3192	ASSERT(!HDR_SHARED_DATA(hdr));
3193
3194	ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
3195	hdr->b_l1hdr.b_pabd = arc_get_data_abd(hdr, arc_hdr_size(hdr), hdr);
3196	hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS;
3197	ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
3198
3199	ARCSTAT_INCR(arcstat_compressed_size, arc_hdr_size(hdr));
3200	ARCSTAT_INCR(arcstat_uncompressed_size, HDR_GET_LSIZE(hdr));
3201}
3202
3203static void
3204arc_hdr_free_pabd(arc_buf_hdr_t *hdr)
3205{
3206	ASSERT(HDR_HAS_L1HDR(hdr));
3207	ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
3208
3209	/*
3210	 * If the hdr is currently being written to the l2arc then
3211	 * we defer freeing the data by adding it to the l2arc_free_on_write
3212	 * list. The l2arc will free the data once it's finished
3213	 * writing it to the l2arc device.
3214	 */
3215	if (HDR_L2_WRITING(hdr)) {
3216		arc_hdr_free_on_write(hdr);
3217		ARCSTAT_BUMP(arcstat_l2_free_on_write);
3218	} else {
3219		arc_free_data_abd(hdr, hdr->b_l1hdr.b_pabd,
3220		    arc_hdr_size(hdr), hdr);
3221	}
3222	hdr->b_l1hdr.b_pabd = NULL;
3223	hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS;
3224
3225	ARCSTAT_INCR(arcstat_compressed_size, -arc_hdr_size(hdr));
3226	ARCSTAT_INCR(arcstat_uncompressed_size, -HDR_GET_LSIZE(hdr));
3227}
3228
3229static arc_buf_hdr_t *
3230arc_hdr_alloc(uint64_t spa, int32_t psize, int32_t lsize,
3231    enum zio_compress compression_type, arc_buf_contents_t type)
3232{
3233	arc_buf_hdr_t *hdr;
3234
3235	VERIFY(type == ARC_BUFC_DATA || type == ARC_BUFC_METADATA);
3236
3237	hdr = kmem_cache_alloc(hdr_full_cache, KM_PUSHPAGE);
3238	ASSERT(HDR_EMPTY(hdr));
3239	ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL);
3240	ASSERT3P(hdr->b_l1hdr.b_thawed, ==, NULL);
3241	HDR_SET_PSIZE(hdr, psize);
3242	HDR_SET_LSIZE(hdr, lsize);
3243	hdr->b_spa = spa;
3244	hdr->b_type = type;
3245	hdr->b_flags = 0;
3246	arc_hdr_set_flags(hdr, arc_bufc_to_flags(type) | ARC_FLAG_HAS_L1HDR);
3247	arc_hdr_set_compress(hdr, compression_type);
3248
3249	hdr->b_l1hdr.b_state = arc_anon;
3250	hdr->b_l1hdr.b_arc_access = 0;
3251	hdr->b_l1hdr.b_bufcnt = 0;
3252	hdr->b_l1hdr.b_buf = NULL;
3253
3254	/*
3255	 * Allocate the hdr's buffer. This will contain either
3256	 * the compressed or uncompressed data depending on the block
3257	 * it references and compressed arc enablement.
3258	 */
3259	arc_hdr_alloc_pabd(hdr);
3260	ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
3261
3262	return (hdr);
3263}
3264
3265/*
3266 * Transition between the two allocation states for the arc_buf_hdr struct.
3267 * The arc_buf_hdr struct can be allocated with (hdr_full_cache) or without
3268 * (hdr_l2only_cache) the fields necessary for the L1 cache - the smaller
3269 * version is used when a cache buffer is only in the L2ARC in order to reduce
3270 * memory usage.
3271 */
3272static arc_buf_hdr_t *
3273arc_hdr_realloc(arc_buf_hdr_t *hdr, kmem_cache_t *old, kmem_cache_t *new)
3274{
3275	ASSERT(HDR_HAS_L2HDR(hdr));
3276
3277	arc_buf_hdr_t *nhdr;
3278	l2arc_dev_t *dev = hdr->b_l2hdr.b_dev;
3279
3280	ASSERT((old == hdr_full_cache && new == hdr_l2only_cache) ||
3281	    (old == hdr_l2only_cache && new == hdr_full_cache));
3282
3283	nhdr = kmem_cache_alloc(new, KM_PUSHPAGE);
3284
3285	ASSERT(MUTEX_HELD(HDR_LOCK(hdr)));
3286	buf_hash_remove(hdr);
3287
3288	bcopy(hdr, nhdr, HDR_L2ONLY_SIZE);
3289
3290	if (new == hdr_full_cache) {
3291		arc_hdr_set_flags(nhdr, ARC_FLAG_HAS_L1HDR);
3292		/*
3293		 * arc_access and arc_change_state need to be aware that a
3294		 * header has just come out of L2ARC, so we set its state to
3295		 * l2c_only even though it's about to change.
3296		 */
3297		nhdr->b_l1hdr.b_state = arc_l2c_only;
3298
3299		/* Verify previous threads set to NULL before freeing */
3300		ASSERT3P(nhdr->b_l1hdr.b_pabd, ==, NULL);
3301	} else {
3302		ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
3303		ASSERT0(hdr->b_l1hdr.b_bufcnt);
3304		ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL);
3305
3306		/*
3307		 * If we've reached here, We must have been called from
3308		 * arc_evict_hdr(), as such we should have already been
3309		 * removed from any ghost list we were previously on
3310		 * (which protects us from racing with arc_evict_state),
3311		 * thus no locking is needed during this check.
3312		 */
3313		ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node));
3314
3315		/*
3316		 * A buffer must not be moved into the arc_l2c_only
3317		 * state if it's not finished being written out to the
3318		 * l2arc device. Otherwise, the b_l1hdr.b_pabd field
3319		 * might try to be accessed, even though it was removed.
3320		 */
3321		VERIFY(!HDR_L2_WRITING(hdr));
3322		VERIFY3P(hdr->b_l1hdr.b_pabd, ==, NULL);
3323
3324#ifdef ZFS_DEBUG
3325		if (hdr->b_l1hdr.b_thawed != NULL) {
3326			kmem_free(hdr->b_l1hdr.b_thawed, 1);
3327			hdr->b_l1hdr.b_thawed = NULL;
3328		}
3329#endif
3330
3331		arc_hdr_clear_flags(nhdr, ARC_FLAG_HAS_L1HDR);
3332	}
3333	/*
3334	 * The header has been reallocated so we need to re-insert it into any
3335	 * lists it was on.
3336	 */
3337	(void) buf_hash_insert(nhdr, NULL);
3338
3339	ASSERT(list_link_active(&hdr->b_l2hdr.b_l2node));
3340
3341	mutex_enter(&dev->l2ad_mtx);
3342
3343	/*
3344	 * We must place the realloc'ed header back into the list at
3345	 * the same spot. Otherwise, if it's placed earlier in the list,
3346	 * l2arc_write_buffers() could find it during the function's
3347	 * write phase, and try to write it out to the l2arc.
3348	 */
3349	list_insert_after(&dev->l2ad_buflist, hdr, nhdr);
3350	list_remove(&dev->l2ad_buflist, hdr);
3351
3352	mutex_exit(&dev->l2ad_mtx);
3353
3354	/*
3355	 * Since we're using the pointer address as the tag when
3356	 * incrementing and decrementing the l2ad_alloc refcount, we
3357	 * must remove the old pointer (that we're about to destroy) and
3358	 * add the new pointer to the refcount. Otherwise we'd remove
3359	 * the wrong pointer address when calling arc_hdr_destroy() later.
3360	 */
3361
3362	(void) refcount_remove_many(&dev->l2ad_alloc, arc_hdr_size(hdr), hdr);
3363	(void) refcount_add_many(&dev->l2ad_alloc, arc_hdr_size(nhdr), nhdr);
3364
3365	buf_discard_identity(hdr);
3366	kmem_cache_free(old, hdr);
3367
3368	return (nhdr);
3369}
3370
3371/*
3372 * Allocate a new arc_buf_hdr_t and arc_buf_t and return the buf to the caller.
3373 * The buf is returned thawed since we expect the consumer to modify it.
3374 */
3375arc_buf_t *
3376arc_alloc_buf(spa_t *spa, void *tag, arc_buf_contents_t type, int32_t size)
3377{
3378	arc_buf_hdr_t *hdr = arc_hdr_alloc(spa_load_guid(spa), size, size,
3379	    ZIO_COMPRESS_OFF, type);
3380	ASSERT(!MUTEX_HELD(HDR_LOCK(hdr)));
3381
3382	arc_buf_t *buf = NULL;
3383	VERIFY0(arc_buf_alloc_impl(hdr, tag, B_FALSE, B_FALSE, &buf));
3384	arc_buf_thaw(buf);
3385
3386	return (buf);
3387}
3388
3389/*
3390 * Allocate a compressed buf in the same manner as arc_alloc_buf. Don't use this
3391 * for bufs containing metadata.
3392 */
3393arc_buf_t *
3394arc_alloc_compressed_buf(spa_t *spa, void *tag, uint64_t psize, uint64_t lsize,
3395    enum zio_compress compression_type)
3396{
3397	ASSERT3U(lsize, >, 0);
3398	ASSERT3U(lsize, >=, psize);
3399	ASSERT(compression_type > ZIO_COMPRESS_OFF);
3400	ASSERT(compression_type < ZIO_COMPRESS_FUNCTIONS);
3401
3402	arc_buf_hdr_t *hdr = arc_hdr_alloc(spa_load_guid(spa), psize, lsize,
3403	    compression_type, ARC_BUFC_DATA);
3404	ASSERT(!MUTEX_HELD(HDR_LOCK(hdr)));
3405
3406	arc_buf_t *buf = NULL;
3407	VERIFY0(arc_buf_alloc_impl(hdr, tag, B_TRUE, B_FALSE, &buf));
3408	arc_buf_thaw(buf);
3409	ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL);
3410
3411	if (!arc_buf_is_shared(buf)) {
3412		/*
3413		 * To ensure that the hdr has the correct data in it if we call
3414		 * arc_decompress() on this buf before it's been written to
3415		 * disk, it's easiest if we just set up sharing between the
3416		 * buf and the hdr.
3417		 */
3418		ASSERT(!abd_is_linear(hdr->b_l1hdr.b_pabd));
3419		arc_hdr_free_pabd(hdr);
3420		arc_share_buf(hdr, buf);
3421	}
3422
3423	return (buf);
3424}
3425
3426static void
3427arc_hdr_l2hdr_destroy(arc_buf_hdr_t *hdr)
3428{
3429	l2arc_buf_hdr_t *l2hdr = &hdr->b_l2hdr;
3430	l2arc_dev_t *dev = l2hdr->b_dev;
3431	uint64_t psize = arc_hdr_size(hdr);
3432
3433	ASSERT(MUTEX_HELD(&dev->l2ad_mtx));
3434	ASSERT(HDR_HAS_L2HDR(hdr));
3435
3436	list_remove(&dev->l2ad_buflist, hdr);
3437
3438	ARCSTAT_INCR(arcstat_l2_psize, -psize);
3439	ARCSTAT_INCR(arcstat_l2_lsize, -HDR_GET_LSIZE(hdr));
3440
3441	vdev_space_update(dev->l2ad_vdev, -psize, 0, 0);
3442
3443	(void) refcount_remove_many(&dev->l2ad_alloc, psize, hdr);
3444	arc_hdr_clear_flags(hdr, ARC_FLAG_HAS_L2HDR);
3445}
3446
3447static void
3448arc_hdr_destroy(arc_buf_hdr_t *hdr)
3449{
3450	if (HDR_HAS_L1HDR(hdr)) {
3451		ASSERT(hdr->b_l1hdr.b_buf == NULL ||
3452		    hdr->b_l1hdr.b_bufcnt > 0);
3453		ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
3454		ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon);
3455	}
3456	ASSERT(!HDR_IO_IN_PROGRESS(hdr));
3457	ASSERT(!HDR_IN_HASH_TABLE(hdr));
3458
3459	if (!HDR_EMPTY(hdr))
3460		buf_discard_identity(hdr);
3461
3462	if (HDR_HAS_L2HDR(hdr)) {
3463		l2arc_dev_t *dev = hdr->b_l2hdr.b_dev;
3464		boolean_t buflist_held = MUTEX_HELD(&dev->l2ad_mtx);
3465
3466		if (!buflist_held)
3467			mutex_enter(&dev->l2ad_mtx);
3468
3469		/*
3470		 * Even though we checked this conditional above, we
3471		 * need to check this again now that we have the
3472		 * l2ad_mtx. This is because we could be racing with
3473		 * another thread calling l2arc_evict() which might have
3474		 * destroyed this header's L2 portion as we were waiting
3475		 * to acquire the l2ad_mtx. If that happens, we don't
3476		 * want to re-destroy the header's L2 portion.
3477		 */
3478		if (HDR_HAS_L2HDR(hdr)) {
3479			l2arc_trim(hdr);
3480			arc_hdr_l2hdr_destroy(hdr);
3481		}
3482
3483		if (!buflist_held)
3484			mutex_exit(&dev->l2ad_mtx);
3485	}
3486
3487	if (HDR_HAS_L1HDR(hdr)) {
3488		arc_cksum_free(hdr);
3489
3490		while (hdr->b_l1hdr.b_buf != NULL)
3491			arc_buf_destroy_impl(hdr->b_l1hdr.b_buf);
3492
3493#ifdef ZFS_DEBUG
3494		if (hdr->b_l1hdr.b_thawed != NULL) {
3495			kmem_free(hdr->b_l1hdr.b_thawed, 1);
3496			hdr->b_l1hdr.b_thawed = NULL;
3497		}
3498#endif
3499
3500		if (hdr->b_l1hdr.b_pabd != NULL) {
3501			arc_hdr_free_pabd(hdr);
3502		}
3503	}
3504
3505	ASSERT3P(hdr->b_hash_next, ==, NULL);
3506	if (HDR_HAS_L1HDR(hdr)) {
3507		ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node));
3508		ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL);
3509		kmem_cache_free(hdr_full_cache, hdr);
3510	} else {
3511		kmem_cache_free(hdr_l2only_cache, hdr);
3512	}
3513}
3514
3515void
3516arc_buf_destroy(arc_buf_t *buf, void* tag)
3517{
3518	arc_buf_hdr_t *hdr = buf->b_hdr;
3519	kmutex_t *hash_lock = HDR_LOCK(hdr);
3520
3521	if (hdr->b_l1hdr.b_state == arc_anon) {
3522		ASSERT3U(hdr->b_l1hdr.b_bufcnt, ==, 1);
3523		ASSERT(!HDR_IO_IN_PROGRESS(hdr));
3524		VERIFY0(remove_reference(hdr, NULL, tag));
3525		arc_hdr_destroy(hdr);
3526		return;
3527	}
3528
3529	mutex_enter(hash_lock);
3530	ASSERT3P(hdr, ==, buf->b_hdr);
3531	ASSERT(hdr->b_l1hdr.b_bufcnt > 0);
3532	ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
3533	ASSERT3P(hdr->b_l1hdr.b_state, !=, arc_anon);
3534	ASSERT3P(buf->b_data, !=, NULL);
3535
3536	(void) remove_reference(hdr, hash_lock, tag);
3537	arc_buf_destroy_impl(buf);
3538	mutex_exit(hash_lock);
3539}
3540
3541/*
3542 * Evict the arc_buf_hdr that is provided as a parameter. The resultant
3543 * state of the header is dependent on its state prior to entering this
3544 * function. The following transitions are possible:
3545 *
3546 *    - arc_mru -> arc_mru_ghost
3547 *    - arc_mfu -> arc_mfu_ghost
3548 *    - arc_mru_ghost -> arc_l2c_only
3549 *    - arc_mru_ghost -> deleted
3550 *    - arc_mfu_ghost -> arc_l2c_only
3551 *    - arc_mfu_ghost -> deleted
3552 */
3553static int64_t
3554arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock)
3555{
3556	arc_state_t *evicted_state, *state;
3557	int64_t bytes_evicted = 0;
3558	int min_lifetime = HDR_PRESCIENT_PREFETCH(hdr) ?
3559	    zfs_arc_min_prescient_prefetch_ms : zfs_arc_min_prefetch_ms;
3560
3561	ASSERT(MUTEX_HELD(hash_lock));
3562	ASSERT(HDR_HAS_L1HDR(hdr));
3563
3564	state = hdr->b_l1hdr.b_state;
3565	if (GHOST_STATE(state)) {
3566		ASSERT(!HDR_IO_IN_PROGRESS(hdr));
3567		ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
3568
3569		/*
3570		 * l2arc_write_buffers() relies on a header's L1 portion
3571		 * (i.e. its b_pabd field) during it's write phase.
3572		 * Thus, we cannot push a header onto the arc_l2c_only
3573		 * state (removing it's L1 piece) until the header is
3574		 * done being written to the l2arc.
3575		 */
3576		if (HDR_HAS_L2HDR(hdr) && HDR_L2_WRITING(hdr)) {
3577			ARCSTAT_BUMP(arcstat_evict_l2_skip);
3578			return (bytes_evicted);
3579		}
3580
3581		ARCSTAT_BUMP(arcstat_deleted);
3582		bytes_evicted += HDR_GET_LSIZE(hdr);
3583
3584		DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, hdr);
3585
3586		ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
3587		if (HDR_HAS_L2HDR(hdr)) {
3588			/*
3589			 * This buffer is cached on the 2nd Level ARC;
3590			 * don't destroy the header.
3591			 */
3592			arc_change_state(arc_l2c_only, hdr, hash_lock);
3593			/*
3594			 * dropping from L1+L2 cached to L2-only,
3595			 * realloc to remove the L1 header.
3596			 */
3597			hdr = arc_hdr_realloc(hdr, hdr_full_cache,
3598			    hdr_l2only_cache);
3599		} else {
3600			arc_change_state(arc_anon, hdr, hash_lock);
3601			arc_hdr_destroy(hdr);
3602		}
3603		return (bytes_evicted);
3604	}
3605
3606	ASSERT(state == arc_mru || state == arc_mfu);
3607	evicted_state = (state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost;
3608
3609	/* prefetch buffers have a minimum lifespan */
3610	if (HDR_IO_IN_PROGRESS(hdr) ||
3611	    ((hdr->b_flags & (ARC_FLAG_PREFETCH | ARC_FLAG_INDIRECT)) &&
3612	    ddi_get_lbolt() - hdr->b_l1hdr.b_arc_access < min_lifetime * hz)) {
3613		ARCSTAT_BUMP(arcstat_evict_skip);
3614		return (bytes_evicted);
3615	}
3616
3617	ASSERT0(refcount_count(&hdr->b_l1hdr.b_refcnt));
3618	while (hdr->b_l1hdr.b_buf) {
3619		arc_buf_t *buf = hdr->b_l1hdr.b_buf;
3620		if (!mutex_tryenter(&buf->b_evict_lock)) {
3621			ARCSTAT_BUMP(arcstat_mutex_miss);
3622			break;
3623		}
3624		if (buf->b_data != NULL)
3625			bytes_evicted += HDR_GET_LSIZE(hdr);
3626		mutex_exit(&buf->b_evict_lock);
3627		arc_buf_destroy_impl(buf);
3628	}
3629
3630	if (HDR_HAS_L2HDR(hdr)) {
3631		ARCSTAT_INCR(arcstat_evict_l2_cached, HDR_GET_LSIZE(hdr));
3632	} else {
3633		if (l2arc_write_eligible(hdr->b_spa, hdr)) {
3634			ARCSTAT_INCR(arcstat_evict_l2_eligible,
3635			    HDR_GET_LSIZE(hdr));
3636		} else {
3637			ARCSTAT_INCR(arcstat_evict_l2_ineligible,
3638			    HDR_GET_LSIZE(hdr));
3639		}
3640	}
3641
3642	if (hdr->b_l1hdr.b_bufcnt == 0) {
3643		arc_cksum_free(hdr);
3644
3645		bytes_evicted += arc_hdr_size(hdr);
3646
3647		/*
3648		 * If this hdr is being evicted and has a compressed
3649		 * buffer then we discard it here before we change states.
3650		 * This ensures that the accounting is updated correctly
3651		 * in arc_free_data_impl().
3652		 */
3653		arc_hdr_free_pabd(hdr);
3654
3655		arc_change_state(evicted_state, hdr, hash_lock);
3656		ASSERT(HDR_IN_HASH_TABLE(hdr));
3657		arc_hdr_set_flags(hdr, ARC_FLAG_IN_HASH_TABLE);
3658		DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, hdr);
3659	}
3660
3661	return (bytes_evicted);
3662}
3663
3664static uint64_t
3665arc_evict_state_impl(multilist_t *ml, int idx, arc_buf_hdr_t *marker,
3666    uint64_t spa, int64_t bytes)
3667{
3668	multilist_sublist_t *mls;
3669	uint64_t bytes_evicted = 0;
3670	arc_buf_hdr_t *hdr;
3671	kmutex_t *hash_lock;
3672	int evict_count = 0;
3673
3674	ASSERT3P(marker, !=, NULL);
3675	IMPLY(bytes < 0, bytes == ARC_EVICT_ALL);
3676
3677	mls = multilist_sublist_lock(ml, idx);
3678
3679	for (hdr = multilist_sublist_prev(mls, marker); hdr != NULL;
3680	    hdr = multilist_sublist_prev(mls, marker)) {
3681		if ((bytes != ARC_EVICT_ALL && bytes_evicted >= bytes) ||
3682		    (evict_count >= zfs_arc_evict_batch_limit))
3683			break;
3684
3685		/*
3686		 * To keep our iteration location, move the marker
3687		 * forward. Since we're not holding hdr's hash lock, we
3688		 * must be very careful and not remove 'hdr' from the
3689		 * sublist. Otherwise, other consumers might mistake the
3690		 * 'hdr' as not being on a sublist when they call the
3691		 * multilist_link_active() function (they all rely on
3692		 * the hash lock protecting concurrent insertions and
3693		 * removals). multilist_sublist_move_forward() was
3694		 * specifically implemented to ensure this is the case
3695		 * (only 'marker' will be removed and re-inserted).
3696		 */
3697		multilist_sublist_move_forward(mls, marker);
3698
3699		/*
3700		 * The only case where the b_spa field should ever be
3701		 * zero, is the marker headers inserted by
3702		 * arc_evict_state(). It's possible for multiple threads
3703		 * to be calling arc_evict_state() concurrently (e.g.
3704		 * dsl_pool_close() and zio_inject_fault()), so we must
3705		 * skip any markers we see from these other threads.
3706		 */
3707		if (hdr->b_spa == 0)
3708			continue;
3709
3710		/* we're only interested in evicting buffers of a certain spa */
3711		if (spa != 0 && hdr->b_spa != spa) {
3712			ARCSTAT_BUMP(arcstat_evict_skip);
3713			continue;
3714		}
3715
3716		hash_lock = HDR_LOCK(hdr);
3717
3718		/*
3719		 * We aren't calling this function from any code path
3720		 * that would already be holding a hash lock, so we're
3721		 * asserting on this assumption to be defensive in case
3722		 * this ever changes. Without this check, it would be
3723		 * possible to incorrectly increment arcstat_mutex_miss
3724		 * below (e.g. if the code changed such that we called
3725		 * this function with a hash lock held).
3726		 */
3727		ASSERT(!MUTEX_HELD(hash_lock));
3728
3729		if (mutex_tryenter(hash_lock)) {
3730			uint64_t evicted = arc_evict_hdr(hdr, hash_lock);
3731			mutex_exit(hash_lock);
3732
3733			bytes_evicted += evicted;
3734
3735			/*
3736			 * If evicted is zero, arc_evict_hdr() must have
3737			 * decided to skip this header, don't increment
3738			 * evict_count in this case.
3739			 */
3740			if (evicted != 0)
3741				evict_count++;
3742
3743			/*
3744			 * If arc_size isn't overflowing, signal any
3745			 * threads that might happen to be waiting.
3746			 *
3747			 * For each header evicted, we wake up a single
3748			 * thread. If we used cv_broadcast, we could
3749			 * wake up "too many" threads causing arc_size
3750			 * to significantly overflow arc_c; since
3751			 * arc_get_data_impl() doesn't check for overflow
3752			 * when it's woken up (it doesn't because it's
3753			 * possible for the ARC to be overflowing while
3754			 * full of un-evictable buffers, and the
3755			 * function should proceed in this case).
3756			 *
3757			 * If threads are left sleeping, due to not
3758			 * using cv_broadcast, they will be woken up
3759			 * just before arc_reclaim_thread() sleeps.
3760			 */
3761			mutex_enter(&arc_reclaim_lock);
3762			if (!arc_is_overflowing())
3763				cv_signal(&arc_reclaim_waiters_cv);
3764			mutex_exit(&arc_reclaim_lock);
3765		} else {
3766			ARCSTAT_BUMP(arcstat_mutex_miss);
3767		}
3768	}
3769
3770	multilist_sublist_unlock(mls);
3771
3772	return (bytes_evicted);
3773}
3774
3775/*
3776 * Evict buffers from the given arc state, until we've removed the
3777 * specified number of bytes. Move the removed buffers to the
3778 * appropriate evict state.
3779 *
3780 * This function makes a "best effort". It skips over any buffers
3781 * it can't get a hash_lock on, and so, may not catch all candidates.
3782 * It may also return without evicting as much space as requested.
3783 *
3784 * If bytes is specified using the special value ARC_EVICT_ALL, this
3785 * will evict all available (i.e. unlocked and evictable) buffers from
3786 * the given arc state; which is used by arc_flush().
3787 */
3788static uint64_t
3789arc_evict_state(arc_state_t *state, uint64_t spa, int64_t bytes,
3790    arc_buf_contents_t type)
3791{
3792	uint64_t total_evicted = 0;
3793	multilist_t *ml = state->arcs_list[type];
3794	int num_sublists;
3795	arc_buf_hdr_t **markers;
3796
3797	IMPLY(bytes < 0, bytes == ARC_EVICT_ALL);
3798
3799	num_sublists = multilist_get_num_sublists(ml);
3800
3801	/*
3802	 * If we've tried to evict from each sublist, made some
3803	 * progress, but still have not hit the target number of bytes
3804	 * to evict, we want to keep trying. The markers allow us to
3805	 * pick up where we left off for each individual sublist, rather
3806	 * than starting from the tail each time.
3807	 */
3808	markers = kmem_zalloc(sizeof (*markers) * num_sublists, KM_SLEEP);
3809	for (int i = 0; i < num_sublists; i++) {
3810		markers[i] = kmem_cache_alloc(hdr_full_cache, KM_SLEEP);
3811
3812		/*
3813		 * A b_spa of 0 is used to indicate that this header is
3814		 * a marker. This fact is used in arc_adjust_type() and
3815		 * arc_evict_state_impl().
3816		 */
3817		markers[i]->b_spa = 0;
3818
3819		multilist_sublist_t *mls = multilist_sublist_lock(ml, i);
3820		multilist_sublist_insert_tail(mls, markers[i]);
3821		multilist_sublist_unlock(mls);
3822	}
3823
3824	/*
3825	 * While we haven't hit our target number of bytes to evict, or
3826	 * we're evicting all available buffers.
3827	 */
3828	while (total_evicted < bytes || bytes == ARC_EVICT_ALL) {
3829		/*
3830		 * Start eviction using a randomly selected sublist,
3831		 * this is to try and evenly balance eviction across all
3832		 * sublists. Always starting at the same sublist
3833		 * (e.g. index 0) would cause evictions to favor certain
3834		 * sublists over others.
3835		 */
3836		int sublist_idx = multilist_get_random_index(ml);
3837		uint64_t scan_evicted = 0;
3838
3839		for (int i = 0; i < num_sublists; i++) {
3840			uint64_t bytes_remaining;
3841			uint64_t bytes_evicted;
3842
3843			if (bytes == ARC_EVICT_ALL)
3844				bytes_remaining = ARC_EVICT_ALL;
3845			else if (total_evicted < bytes)
3846				bytes_remaining = bytes - total_evicted;
3847			else
3848				break;
3849
3850			bytes_evicted = arc_evict_state_impl(ml, sublist_idx,
3851			    markers[sublist_idx], spa, bytes_remaining);
3852
3853			scan_evicted += bytes_evicted;
3854			total_evicted += bytes_evicted;
3855
3856			/* we've reached the end, wrap to the beginning */
3857			if (++sublist_idx >= num_sublists)
3858				sublist_idx = 0;
3859		}
3860
3861		/*
3862		 * If we didn't evict anything during this scan, we have
3863		 * no reason to believe we'll evict more during another
3864		 * scan, so break the loop.
3865		 */
3866		if (scan_evicted == 0) {
3867			/* This isn't possible, let's make that obvious */
3868			ASSERT3S(bytes, !=, 0);
3869
3870			/*
3871			 * When bytes is ARC_EVICT_ALL, the only way to
3872			 * break the loop is when scan_evicted is zero.
3873			 * In that case, we actually have evicted enough,
3874			 * so we don't want to increment the kstat.
3875			 */
3876			if (bytes != ARC_EVICT_ALL) {
3877				ASSERT3S(total_evicted, <, bytes);
3878				ARCSTAT_BUMP(arcstat_evict_not_enough);
3879			}
3880
3881			break;
3882		}
3883	}
3884
3885	for (int i = 0; i < num_sublists; i++) {
3886		multilist_sublist_t *mls = multilist_sublist_lock(ml, i);
3887		multilist_sublist_remove(mls, markers[i]);
3888		multilist_sublist_unlock(mls);
3889
3890		kmem_cache_free(hdr_full_cache, markers[i]);
3891	}
3892	kmem_free(markers, sizeof (*markers) * num_sublists);
3893
3894	return (total_evicted);
3895}
3896
3897/*
3898 * Flush all "evictable" data of the given type from the arc state
3899 * specified. This will not evict any "active" buffers (i.e. referenced).
3900 *
3901 * When 'retry' is set to B_FALSE, the function will make a single pass
3902 * over the state and evict any buffers that it can. Since it doesn't
3903 * continually retry the eviction, it might end up leaving some buffers
3904 * in the ARC due to lock misses.
3905 *
3906 * When 'retry' is set to B_TRUE, the function will continually retry the
3907 * eviction until *all* evictable buffers have been removed from the
3908 * state. As a result, if concurrent insertions into the state are
3909 * allowed (e.g. if the ARC isn't shutting down), this function might
3910 * wind up in an infinite loop, continually trying to evict buffers.
3911 */
3912static uint64_t
3913arc_flush_state(arc_state_t *state, uint64_t spa, arc_buf_contents_t type,
3914    boolean_t retry)
3915{
3916	uint64_t evicted = 0;
3917
3918	while (refcount_count(&state->arcs_esize[type]) != 0) {
3919		evicted += arc_evict_state(state, spa, ARC_EVICT_ALL, type);
3920
3921		if (!retry)
3922			break;
3923	}
3924
3925	return (evicted);
3926}
3927
3928/*
3929 * Evict the specified number of bytes from the state specified,
3930 * restricting eviction to the spa and type given. This function
3931 * prevents us from trying to evict more from a state's list than
3932 * is "evictable", and to skip evicting altogether when passed a
3933 * negative value for "bytes". In contrast, arc_evict_state() will
3934 * evict everything it can, when passed a negative value for "bytes".
3935 */
3936static uint64_t
3937arc_adjust_impl(arc_state_t *state, uint64_t spa, int64_t bytes,
3938    arc_buf_contents_t type)
3939{
3940	int64_t delta;
3941
3942	if (bytes > 0 && refcount_count(&state->arcs_esize[type]) > 0) {
3943		delta = MIN(refcount_count(&state->arcs_esize[type]), bytes);
3944		return (arc_evict_state(state, spa, delta, type));
3945	}
3946
3947	return (0);
3948}
3949
3950/*
3951 * Evict metadata buffers from the cache, such that arc_meta_used is
3952 * capped by the arc_meta_limit tunable.
3953 */
3954static uint64_t
3955arc_adjust_meta(uint64_t meta_used)
3956{
3957	uint64_t total_evicted = 0;
3958	int64_t target;
3959
3960	/*
3961	 * If we're over the meta limit, we want to evict enough
3962	 * metadata to get back under the meta limit. We don't want to
3963	 * evict so much that we drop the MRU below arc_p, though. If
3964	 * we're over the meta limit more than we're over arc_p, we
3965	 * evict some from the MRU here, and some from the MFU below.
3966	 */
3967	target = MIN((int64_t)(meta_used - arc_meta_limit),
3968	    (int64_t)(refcount_count(&arc_anon->arcs_size) +
3969	    refcount_count(&arc_mru->arcs_size) - arc_p));
3970
3971	total_evicted += arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_METADATA);
3972
3973	/*
3974	 * Similar to the above, we want to evict enough bytes to get us
3975	 * below the meta limit, but not so much as to drop us below the
3976	 * space allotted to the MFU (which is defined as arc_c - arc_p).
3977	 */
3978	target = MIN((int64_t)(meta_used - arc_meta_limit),
3979	    (int64_t)(refcount_count(&arc_mfu->arcs_size) -
3980	    (arc_c - arc_p)));
3981
3982	total_evicted += arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_METADATA);
3983
3984	return (total_evicted);
3985}
3986
3987/*
3988 * Return the type of the oldest buffer in the given arc state
3989 *
3990 * This function will select a random sublist of type ARC_BUFC_DATA and
3991 * a random sublist of type ARC_BUFC_METADATA. The tail of each sublist
3992 * is compared, and the type which contains the "older" buffer will be
3993 * returned.
3994 */
3995static arc_buf_contents_t
3996arc_adjust_type(arc_state_t *state)
3997{
3998	multilist_t *data_ml = state->arcs_list[ARC_BUFC_DATA];
3999	multilist_t *meta_ml = state->arcs_list[ARC_BUFC_METADATA];
4000	int data_idx = multilist_get_random_index(data_ml);
4001	int meta_idx = multilist_get_random_index(meta_ml);
4002	multilist_sublist_t *data_mls;
4003	multilist_sublist_t *meta_mls;
4004	arc_buf_contents_t type;
4005	arc_buf_hdr_t *data_hdr;
4006	arc_buf_hdr_t *meta_hdr;
4007
4008	/*
4009	 * We keep the sublist lock until we're finished, to prevent
4010	 * the headers from being destroyed via arc_evict_state().
4011	 */
4012	data_mls = multilist_sublist_lock(data_ml, data_idx);
4013	meta_mls = multilist_sublist_lock(meta_ml, meta_idx);
4014
4015	/*
4016	 * These two loops are to ensure we skip any markers that
4017	 * might be at the tail of the lists due to arc_evict_state().
4018	 */
4019
4020	for (data_hdr = multilist_sublist_tail(data_mls); data_hdr != NULL;
4021	    data_hdr = multilist_sublist_prev(data_mls, data_hdr)) {
4022		if (data_hdr->b_spa != 0)
4023			break;
4024	}
4025
4026	for (meta_hdr = multilist_sublist_tail(meta_mls); meta_hdr != NULL;
4027	    meta_hdr = multilist_sublist_prev(meta_mls, meta_hdr)) {
4028		if (meta_hdr->b_spa != 0)
4029			break;
4030	}
4031
4032	if (data_hdr == NULL && meta_hdr == NULL) {
4033		type = ARC_BUFC_DATA;
4034	} else if (data_hdr == NULL) {
4035		ASSERT3P(meta_hdr, !=, NULL);
4036		type = ARC_BUFC_METADATA;
4037	} else if (meta_hdr == NULL) {
4038		ASSERT3P(data_hdr, !=, NULL);
4039		type = ARC_BUFC_DATA;
4040	} else {
4041		ASSERT3P(data_hdr, !=, NULL);
4042		ASSERT3P(meta_hdr, !=, NULL);
4043
4044		/* The headers can't be on the sublist without an L1 header */
4045		ASSERT(HDR_HAS_L1HDR(data_hdr));
4046		ASSERT(HDR_HAS_L1HDR(meta_hdr));
4047
4048		if (data_hdr->b_l1hdr.b_arc_access <
4049		    meta_hdr->b_l1hdr.b_arc_access) {
4050			type = ARC_BUFC_DATA;
4051		} else {
4052			type = ARC_BUFC_METADATA;
4053		}
4054	}
4055
4056	multilist_sublist_unlock(meta_mls);
4057	multilist_sublist_unlock(data_mls);
4058
4059	return (type);
4060}
4061
4062/*
4063 * Evict buffers from the cache, such that arc_size is capped by arc_c.
4064 */
4065static uint64_t
4066arc_adjust(void)
4067{
4068	uint64_t total_evicted = 0;
4069	uint64_t bytes;
4070	int64_t target;
4071	uint64_t asize = aggsum_value(&arc_size);
4072	uint64_t ameta = aggsum_value(&arc_meta_used);
4073
4074	/*
4075	 * If we're over arc_meta_limit, we want to correct that before
4076	 * potentially evicting data buffers below.
4077	 */
4078	total_evicted += arc_adjust_meta(ameta);
4079
4080	/*
4081	 * Adjust MRU size
4082	 *
4083	 * If we're over the target cache size, we want to evict enough
4084	 * from the list to get back to our target size. We don't want
4085	 * to evict too much from the MRU, such that it drops below
4086	 * arc_p. So, if we're over our target cache size more than
4087	 * the MRU is over arc_p, we'll evict enough to get back to
4088	 * arc_p here, and then evict more from the MFU below.
4089	 */
4090	target = MIN((int64_t)(asize - arc_c),
4091	    (int64_t)(refcount_count(&arc_anon->arcs_size) +
4092	    refcount_count(&arc_mru->arcs_size) + ameta - arc_p));
4093
4094	/*
4095	 * If we're below arc_meta_min, always prefer to evict data.
4096	 * Otherwise, try to satisfy the requested number of bytes to
4097	 * evict from the type which contains older buffers; in an
4098	 * effort to keep newer buffers in the cache regardless of their
4099	 * type. If we cannot satisfy the number of bytes from this
4100	 * type, spill over into the next type.
4101	 */
4102	if (arc_adjust_type(arc_mru) == ARC_BUFC_METADATA &&
4103	    ameta > arc_meta_min) {
4104		bytes = arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_METADATA);
4105		total_evicted += bytes;
4106
4107		/*
4108		 * If we couldn't evict our target number of bytes from
4109		 * metadata, we try to get the rest from data.
4110		 */
4111		target -= bytes;
4112
4113		total_evicted +=
4114		    arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_DATA);
4115	} else {
4116		bytes = arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_DATA);
4117		total_evicted += bytes;
4118
4119		/*
4120		 * If we couldn't evict our target number of bytes from
4121		 * data, we try to get the rest from metadata.
4122		 */
4123		target -= bytes;
4124
4125		total_evicted +=
4126		    arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_METADATA);
4127	}
4128
4129	/*
4130	 * Adjust MFU size
4131	 *
4132	 * Now that we've tried to evict enough from the MRU to get its
4133	 * size back to arc_p, if we're still above the target cache
4134	 * size, we evict the rest from the MFU.
4135	 */
4136	target = asize - arc_c;
4137
4138	if (arc_adjust_type(arc_mfu) == ARC_BUFC_METADATA &&
4139	    ameta > arc_meta_min) {
4140		bytes = arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_METADATA);
4141		total_evicted += bytes;
4142
4143		/*
4144		 * If we couldn't evict our target number of bytes from
4145		 * metadata, we try to get the rest from data.
4146		 */
4147		target -= bytes;
4148
4149		total_evicted +=
4150		    arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_DATA);
4151	} else {
4152		bytes = arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_DATA);
4153		total_evicted += bytes;
4154
4155		/*
4156		 * If we couldn't evict our target number of bytes from
4157		 * data, we try to get the rest from data.
4158		 */
4159		target -= bytes;
4160
4161		total_evicted +=
4162		    arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_METADATA);
4163	}
4164
4165	/*
4166	 * Adjust ghost lists
4167	 *
4168	 * In addition to the above, the ARC also defines target values
4169	 * for the ghost lists. The sum of the mru list and mru ghost
4170	 * list should never exceed the target size of the cache, and
4171	 * the sum of the mru list, mfu list, mru ghost list, and mfu
4172	 * ghost list should never exceed twice the target size of the
4173	 * cache. The following logic enforces these limits on the ghost
4174	 * caches, and evicts from them as needed.
4175	 */
4176	target = refcount_count(&arc_mru->arcs_size) +
4177	    refcount_count(&arc_mru_ghost->arcs_size) - arc_c;
4178
4179	bytes = arc_adjust_impl(arc_mru_ghost, 0, target, ARC_BUFC_DATA);
4180	total_evicted += bytes;
4181
4182	target -= bytes;
4183
4184	total_evicted +=
4185	    arc_adjust_impl(arc_mru_ghost, 0, target, ARC_BUFC_METADATA);
4186
4187	/*
4188	 * We assume the sum of the mru list and mfu list is less than
4189	 * or equal to arc_c (we enforced this above), which means we
4190	 * can use the simpler of the two equations below:
4191	 *
4192	 *	mru + mfu + mru ghost + mfu ghost <= 2 * arc_c
4193	 *		    mru ghost + mfu ghost <= arc_c
4194	 */
4195	target = refcount_count(&arc_mru_ghost->arcs_size) +
4196	    refcount_count(&arc_mfu_ghost->arcs_size) - arc_c;
4197
4198	bytes = arc_adjust_impl(arc_mfu_ghost, 0, target, ARC_BUFC_DATA);
4199	total_evicted += bytes;
4200
4201	target -= bytes;
4202
4203	total_evicted +=
4204	    arc_adjust_impl(arc_mfu_ghost, 0, target, ARC_BUFC_METADATA);
4205
4206	return (total_evicted);
4207}
4208
4209void
4210arc_flush(spa_t *spa, boolean_t retry)
4211{
4212	uint64_t guid = 0;
4213
4214	/*
4215	 * If retry is B_TRUE, a spa must not be specified since we have
4216	 * no good way to determine if all of a spa's buffers have been
4217	 * evicted from an arc state.
4218	 */
4219	ASSERT(!retry || spa == 0);
4220
4221	if (spa != NULL)
4222		guid = spa_load_guid(spa);
4223
4224	(void) arc_flush_state(arc_mru, guid, ARC_BUFC_DATA, retry);
4225	(void) arc_flush_state(arc_mru, guid, ARC_BUFC_METADATA, retry);
4226
4227	(void) arc_flush_state(arc_mfu, guid, ARC_BUFC_DATA, retry);
4228	(void) arc_flush_state(arc_mfu, guid, ARC_BUFC_METADATA, retry);
4229
4230	(void) arc_flush_state(arc_mru_ghost, guid, ARC_BUFC_DATA, retry);
4231	(void) arc_flush_state(arc_mru_ghost, guid, ARC_BUFC_METADATA, retry);
4232
4233	(void) arc_flush_state(arc_mfu_ghost, guid, ARC_BUFC_DATA, retry);
4234	(void) arc_flush_state(arc_mfu_ghost, guid, ARC_BUFC_METADATA, retry);
4235}
4236
4237void
4238arc_shrink(int64_t to_free)
4239{
4240	uint64_t asize = aggsum_value(&arc_size);
4241	if (arc_c > arc_c_min) {
4242		DTRACE_PROBE4(arc__shrink, uint64_t, arc_c, uint64_t,
4243			arc_c_min, uint64_t, arc_p, uint64_t, to_free);
4244		if (arc_c > arc_c_min + to_free)
4245			atomic_add_64(&arc_c, -to_free);
4246		else
4247			arc_c = arc_c_min;
4248
4249		atomic_add_64(&arc_p, -(arc_p >> arc_shrink_shift));
4250		if (asize < arc_c)
4251			arc_c = MAX(asize, arc_c_min);
4252		if (arc_p > arc_c)
4253			arc_p = (arc_c >> 1);
4254
4255		DTRACE_PROBE2(arc__shrunk, uint64_t, arc_c, uint64_t,
4256			arc_p);
4257
4258		ASSERT(arc_c >= arc_c_min);
4259		ASSERT((int64_t)arc_p >= 0);
4260	}
4261
4262	if (asize > arc_c) {
4263		DTRACE_PROBE2(arc__shrink_adjust, uint64_t, asize,
4264			uint64_t, arc_c);
4265		(void) arc_adjust();
4266	}
4267}
4268
/*
 * Identifies which check in arc_available_memory() produced the lowest
 * (most constraining) free-memory estimate.
 */
typedef enum free_memory_reason_t {
	FMR_UNKNOWN,		/* no check lowered the estimate */
	FMR_NEEDFREE,		/* the needfree check */
	FMR_LOTSFREE,		/* the freemem / pageout headroom check */
	FMR_SWAPFS_MINFREE,	/* the swapfs reservation check */
	FMR_PAGES_PP_MAXIMUM,	/* the pages_pp_maximum (page locking) check */
	FMR_HEAP_ARENA,		/* the kernel heap (uma) check */
	FMR_ZIO_ARENA,		/* the zio_arena vmem check */
} free_memory_reason_t;

/*
 * Result and reason recorded by the most recent call to
 * arc_available_memory().
 */
int64_t last_free_memory;
free_memory_reason_t last_free_reason;

/*
 * Additional reserve of pages for pp_reserve; subtracted in the
 * pages_pp_maximum check of arc_available_memory().
 */
int64_t arc_pages_pp_reserve = 64;

/*
 * Additional reserve of pages for swapfs; subtracted in the swapfs
 * check of arc_available_memory().
 */
int64_t arc_swapfs_reserve = 64;
4291
/*
 * Return the amount of memory that can be consumed before reclaim will be
 * needed.  Positive if there is sufficient free memory, negative indicates
 * the amount of memory that needs to be freed up.
 *
 * Several independent headroom estimates are computed (which ones depends
 * on the build configuration) and the smallest one is returned.  The
 * result and the reason that produced it are also stored in
 * last_free_memory and last_free_reason, and reported through the
 * arc__available_memory probe.
 */
static int64_t
arc_available_memory(void)
{
	/* Smallest estimate seen so far, and the check that produced it. */
	int64_t lowest = INT64_MAX;
	int64_t n;
	free_memory_reason_t r = FMR_UNKNOWN;

#ifdef _KERNEL
#ifdef __FreeBSD__
	/*
	 * Cooperate with pagedaemon when it's time for it to scan
	 * and reclaim some pages.
	 */
	n = PAGESIZE * ((int64_t)freemem - zfs_arc_free_target);
	if (n < lowest) {
		lowest = n;
		r = FMR_LOTSFREE;
	}

#else
	/* Pages are explicitly being asked for: report them as a deficit. */
	if (needfree > 0) {
		n = PAGESIZE * (-needfree);
		if (n < lowest) {
			lowest = n;
			r = FMR_NEEDFREE;
		}
	}

	/*
	 * check that we're out of range of the pageout scanner.  It starts to
	 * schedule paging if freemem is less than lotsfree and needfree.
	 * lotsfree is the high-water mark for pageout, and needfree is the
	 * number of needed free pages.  We add extra pages here to make sure
	 * the scanner doesn't start up while we're freeing memory.
	 */
	n = PAGESIZE * (freemem - lotsfree - needfree - desfree);
	if (n < lowest) {
		lowest = n;
		r = FMR_LOTSFREE;
	}

	/*
	 * check to make sure that swapfs has enough space so that anon
	 * reservations can still succeed. anon_resvmem() checks that the
	 * availrmem is greater than swapfs_minfree, and the number of reserved
	 * swap pages.  We also add a bit of extra here just to prevent
	 * circumstances from getting really dire.
	 */
	n = PAGESIZE * (availrmem - swapfs_minfree - swapfs_reserve -
	    desfree - arc_swapfs_reserve);
	if (n < lowest) {
		lowest = n;
		r = FMR_SWAPFS_MINFREE;
	}


	/*
	 * Check that we have enough availrmem that memory locking (e.g., via
	 * mlock(3C) or memcntl(2)) can still succeed.  (pages_pp_maximum
	 * stores the number of pages that cannot be locked; when availrmem
	 * drops below pages_pp_maximum, page locking mechanisms such as
	 * page_pp_lock() will fail.)
	 */
	n = PAGESIZE * (availrmem - pages_pp_maximum -
	    arc_pages_pp_reserve);
	if (n < lowest) {
		lowest = n;
		r = FMR_PAGES_PP_MAXIMUM;
	}

#endif	/* __FreeBSD__ */
#if defined(__i386) || !defined(UMA_MD_SMALL_ALLOC)
	/*
	 * If we're on an i386 platform, it's possible that we'll exhaust the
	 * kernel heap space before we ever run out of available physical
	 * memory.  Most checks of the size of the heap_area compare against
	 * tune.t_minarmem, which is the minimum available real memory that we
	 * can have in the system.  However, this is generally fixed at 25 pages
	 * which is so low that it's useless.  In this comparison, we seek to
	 * calculate the total heap-size, and reclaim if more than 3/4ths of the
	 * heap is allocated.  (Or, in the calculation, if less than 1/4th is
	 * free)
	 */
	n = uma_avail() - (long)(uma_limit() / 4);
	if (n < lowest) {
		lowest = n;
		r = FMR_HEAP_ARENA;
	}
#endif

	/*
	 * If zio data pages are being allocated out of a separate heap segment,
	 * then enforce that the size of available vmem for this arena remains
	 * above about 1/4th (1/(2^arc_zio_arena_free_shift)) free.
	 *
	 * Note that reducing the arc_zio_arena_free_shift keeps more virtual
	 * memory (in the zio_arena) free, which can avoid memory
	 * fragmentation issues.
	 */
	if (zio_arena != NULL) {
		n = (int64_t)vmem_size(zio_arena, VMEM_FREE) -
		    (vmem_size(zio_arena, VMEM_ALLOC) >>
		    arc_zio_arena_free_shift);
		if (n < lowest) {
			lowest = n;
			r = FMR_ZIO_ARENA;
		}
	}

#else	/* _KERNEL */
	/*
	 * Outside the kernel no real measurement is made: report a small
	 * deficit whenever spa_get_random(100) returns 0, and leave
	 * 'lowest' at INT64_MAX otherwise.
	 */
	if (spa_get_random(100) == 0)
		lowest = -1024;
#endif	/* _KERNEL */

	/* Record the outcome so it can be inspected after the fact. */
	last_free_memory = lowest;
	last_free_reason = r;
	DTRACE_PROBE2(arc__available_memory, int64_t, lowest, int, r);
	return (lowest);
}
4417
4418
4419/*
4420 * Determine if the system is under memory pressure and is asking
4421 * to reclaim memory. A return value of B_TRUE indicates that the system
4422 * is under memory pressure and that the arc should adjust accordingly.
4423 */
4424static boolean_t
4425arc_reclaim_needed(void)
4426{
4427	return (arc_available_memory() < 0);
4428}
4429
/*
 * kmem caches defined outside this file; arc_kmem_reap_now() asks for
 * each of them to be reaped.
 */
extern kmem_cache_t	*zio_buf_cache[];
extern kmem_cache_t	*zio_data_buf_cache[];
extern kmem_cache_t	*range_seg_cache;
extern kmem_cache_t	*abd_chunk_cache;
4434
4435static __noinline void
4436arc_kmem_reap_now(void)
4437{
4438	size_t			i;
4439	kmem_cache_t		*prev_cache = NULL;
4440	kmem_cache_t		*prev_data_cache = NULL;
4441
4442	DTRACE_PROBE(arc__kmem_reap_start);
4443#ifdef _KERNEL
4444	if (aggsum_compare(&arc_meta_used, arc_meta_limit) >= 0) {
4445		/*
4446		 * We are exceeding our meta-data cache limit.
4447		 * Purge some DNLC entries to release holds on meta-data.
4448		 */
4449		dnlc_reduce_cache((void *)(uintptr_t)arc_reduce_dnlc_percent);
4450	}
4451#if defined(__i386)
4452	/*
4453	 * Reclaim unused memory from all kmem caches.
4454	 */
4455	kmem_reap();
4456#endif
4457#endif
4458
4459	/*
4460	 * If a kmem reap is already active, don't schedule more.  We must
4461	 * check for this because kmem_cache_reap_soon() won't actually
4462	 * block on the cache being reaped (this is to prevent callers from
4463	 * becoming implicitly blocked by a system-wide kmem reap -- which,
4464	 * on a system with many, many full magazines, can take minutes).
4465	 */
4466	if (kmem_cache_reap_active())
4467		return;
4468
4469	for (i = 0; i < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; i++) {
4470		if (zio_buf_cache[i] != prev_cache) {
4471			prev_cache = zio_buf_cache[i];
4472			kmem_cache_reap_soon(zio_buf_cache[i]);
4473		}
4474		if (zio_data_buf_cache[i] != prev_data_cache) {
4475			prev_data_cache = zio_data_buf_cache[i];
4476			kmem_cache_reap_soon(zio_data_buf_cache[i]);
4477		}
4478	}
4479	kmem_cache_reap_soon(abd_chunk_cache);
4480	kmem_cache_reap_soon(buf_cache);
4481	kmem_cache_reap_soon(hdr_full_cache);
4482	kmem_cache_reap_soon(hdr_l2only_cache);
4483	kmem_cache_reap_soon(range_seg_cache);
4484
4485#ifdef illumos
4486	if (zio_arena != NULL) {
4487		/*
4488		 * Ask the vmem arena to reclaim unused memory from its
4489		 * quantum caches.
4490		 */
4491		vmem_qcache_reap(zio_arena);
4492	}
4493#endif
4494	DTRACE_PROBE(arc__kmem_reap_end);
4495}
4496
/*
 * Main loop of the ARC reclaim thread.
 *
 * Each pass refreshes the ARC kstats, drops arc_reclaim_lock, evicts via
 * arc_adjust(), reacts to memory pressure reported by
 * arc_available_memory() (reaping kmem caches and shrinking the ARC when
 * it is negative), then retakes arc_reclaim_lock and, if there is nothing
 * more to evict, wakes any waiters and sleeps on arc_reclaim_thread_cv.
 * The loop runs until arc_reclaim_thread_exit is set.
 *
 * Threads can block in arc_get_data_impl() waiting for this thread to evict
 * enough data and signal them to proceed. When this happens, the threads in
 * arc_get_data_impl() are sleeping while holding the hash lock for their
 * particular arc header. Thus, we must be careful to never sleep on a
 * hash lock in this thread. This is to prevent the following deadlock:
 *
 *  - Thread A sleeps on CV in arc_get_data_impl() holding hash lock "L",
 *    waiting for the reclaim thread to signal it.
 *
 *  - arc_reclaim_thread() tries to acquire hash lock "L" using mutex_enter,
 *    fails, and goes to sleep forever.
 *
 * This possible deadlock is avoided by always acquiring a hash lock
 * using mutex_tryenter() from arc_reclaim_thread().
 */
/* ARGSUSED */
static void
arc_reclaim_thread(void *unused __unused)
{
	/* Earliest time at which arc_no_grow may be cleared again. */
	hrtime_t		growtime = 0;
	/* Earliest time at which arc_kmem_reap_now() may be called again. */
	hrtime_t		kmem_reap_time = 0;
	callb_cpr_t		cpr;

	CALLB_CPR_INIT(&cpr, &arc_reclaim_lock, callb_generic_cpr, FTAG);

	mutex_enter(&arc_reclaim_lock);
	while (!arc_reclaim_thread_exit) {
		uint64_t evicted = 0;

		/*
		 * This is necessary in order for the mdb ::arc dcmd to
		 * show up to date information. Since the ::arc command
		 * does not call the kstat's update function, without
		 * this call, the command may show stale stats for the
		 * anon, mru, mru_ghost, mfu, and mfu_ghost lists. Even
		 * with this change, the data might be up to 1 second
		 * out of date; but that should suffice. The arc_state_t
		 * structures can be queried directly if more accurate
		 * information is needed.
		 */
		if (arc_ksp != NULL)
			arc_ksp->ks_update(arc_ksp, KSTAT_READ);

		/* Eviction and reaping below run without arc_reclaim_lock. */
		mutex_exit(&arc_reclaim_lock);

		/*
		 * We call arc_adjust() before (possibly) calling
		 * arc_kmem_reap_now(), so that we can wake up
		 * arc_get_data_impl() sooner.
		 */
		evicted = arc_adjust();

		int64_t free_memory = arc_available_memory();
		if (free_memory < 0) {
			hrtime_t curtime = gethrtime();
			arc_no_grow = B_TRUE;
			arc_warm = B_TRUE;

			/*
			 * Wait at least arc_grow_retry (default 60) seconds
			 * before considering growing.
			 */
			growtime = curtime + SEC2NSEC(arc_grow_retry);

			/*
			 * Wait at least arc_kmem_cache_reap_retry_ms
			 * between arc_kmem_reap_now() calls. Without
			 * this check it is possible to end up in a
			 * situation where we spend lots of time
			 * reaping caches, while we're near arc_c_min.
			 */
			if (curtime >= kmem_reap_time) {
				arc_kmem_reap_now();
				kmem_reap_time = gethrtime() +
				    MSEC2NSEC(arc_kmem_cache_reap_retry_ms);
			}

			/*
			 * If we are still low on memory, shrink the ARC
			 * so that (arc_c >> arc_shrink_shift) bytes are
			 * free; free_memory is re-sampled after the reap.
			 */
			free_memory = arc_available_memory();

			int64_t to_free =
			    (arc_c >> arc_shrink_shift) - free_memory;
			if (to_free > 0) {
#ifdef _KERNEL
#ifdef illumos
				/* Never free less than the pager is asking for. */
				to_free = MAX(to_free, ptob(needfree));
#endif
#endif
				arc_shrink(to_free);
			}
		} else if (free_memory < arc_c >> arc_no_grow_shift) {
			/* Not short yet, but too close to let the ARC grow. */
			arc_no_grow = B_TRUE;
		} else if (gethrtime() >= growtime) {
			/* Plenty of memory and the retry interval has passed. */
			arc_no_grow = B_FALSE;
		}

		mutex_enter(&arc_reclaim_lock);

		/*
		 * If evicted is zero, we couldn't evict anything via
		 * arc_adjust(). This could be due to hash lock
		 * collisions, but more likely due to the majority of
		 * arc buffers being unevictable. Therefore, even if
		 * arc_size is above arc_c, another pass is unlikely to
		 * be helpful and could potentially cause us to enter an
		 * infinite loop.
		 */
		if (aggsum_compare(&arc_size, arc_c) <= 0|| evicted == 0) {
			/*
			 * We're either no longer overflowing, or we
			 * can't evict anything more, so we should wake
			 * up any threads before we go to sleep.
			 */
			cv_broadcast(&arc_reclaim_waiters_cv);

			/*
			 * Block until signaled, or after one second (we
			 * might need to perform arc_kmem_reap_now()
			 * even if we aren't being signalled)
			 */
			CALLB_CPR_SAFE_BEGIN(&cpr);
			(void) cv_timedwait_hires(&arc_reclaim_thread_cv,
			    &arc_reclaim_lock, SEC2NSEC(1), MSEC2NSEC(1), 0);
			CALLB_CPR_SAFE_END(&cpr, &arc_reclaim_lock);
		}
		/* Otherwise loop immediately for another eviction pass. */
	}

	/*
	 * Acknowledge the exit request: clear the flag and broadcast on
	 * arc_reclaim_thread_cv (presumably for the thread that requested
	 * the exit and is waiting on it -- confirm against the caller).
	 */
	arc_reclaim_thread_exit = B_FALSE;
	cv_broadcast(&arc_reclaim_thread_cv);
	CALLB_CPR_EXIT(&cpr);		/* drops arc_reclaim_lock */
	thread_exit();
}
4633
/*
 * Pending DNLC eviction request, expressed as a percentage.  It is set by
 * dnlc_reduce_cache() and cleared by arc_dnlc_evicts_thread(), both while
 * holding arc_dnlc_evicts_lock.  Zero means no request is outstanding.
 */
static u_int arc_dnlc_evicts_arg;
/* Handed to vnlru_free() by arc_dnlc_evicts_thread(). */
extern struct vfsops zfs_vfsops;
4636
4637static void
4638arc_dnlc_evicts_thread(void *dummy __unused)
4639{
4640	callb_cpr_t cpr;
4641	u_int percent;
4642
4643	CALLB_CPR_INIT(&cpr, &arc_dnlc_evicts_lock, callb_generic_cpr, FTAG);
4644
4645	mutex_enter(&arc_dnlc_evicts_lock);
4646	while (!arc_dnlc_evicts_thread_exit) {
4647		CALLB_CPR_SAFE_BEGIN(&cpr);
4648		(void) cv_wait(&arc_dnlc_evicts_cv, &arc_dnlc_evicts_lock);
4649		CALLB_CPR_SAFE_END(&cpr, &arc_dnlc_evicts_lock);
4650		if (arc_dnlc_evicts_arg != 0) {
4651			percent = arc_dnlc_evicts_arg;
4652			mutex_exit(&arc_dnlc_evicts_lock);
4653#ifdef _KERNEL
4654			vnlru_free(desiredvnodes * percent / 100, &zfs_vfsops);
4655#endif
4656			mutex_enter(&arc_dnlc_evicts_lock);
4657			/*
4658			 * Clear our token only after vnlru_free()
4659			 * pass is done, to avoid false queueing of
4660			 * the requests.
4661			 */
4662			arc_dnlc_evicts_arg = 0;
4663		}
4664	}
4665	arc_dnlc_evicts_thread_exit = FALSE;
4666	cv_broadcast(&arc_dnlc_evicts_cv);
4667	CALLB_CPR_EXIT(&cpr);
4668	thread_exit();
4669}
4670
4671void
4672dnlc_reduce_cache(void *arg)
4673{
4674	u_int percent;
4675
4676	percent = (u_int)(uintptr_t)arg;
4677	mutex_enter(&arc_dnlc_evicts_lock);
4678	if (arc_dnlc_evicts_arg == 0) {
4679		arc_dnlc_evicts_arg = percent;
4680		cv_broadcast(&arc_dnlc_evicts_cv);
4681	}
4682	mutex_exit(&arc_dnlc_evicts_lock);
4683}
4684
4685/*
4686 * Adapt arc info given the number of bytes we are trying to add and
4687 * the state that we are comming from.  This function is only called
4688 * when we are adding new content to the cache.
4689 */
4690static void
4691arc_adapt(int bytes, arc_state_t *state)
4692{
4693	int mult;
4694	uint64_t arc_p_min = (arc_c >> arc_p_min_shift);
4695	int64_t mrug_size = refcount_count(&arc_mru_ghost->arcs_size);
4696	int64_t mfug_size = refcount_count(&arc_mfu_ghost->arcs_size);
4697
4698	if (state == arc_l2c_only)
4699		return;
4700
4701	ASSERT(bytes > 0);
4702	/*
4703	 * Adapt the target size of the MRU list:
4704	 *	- if we just hit in the MRU ghost list, then increase
4705	 *	  the target size of the MRU list.
4706	 *	- if we just hit in the MFU ghost list, then increase
4707	 *	  the target size of the MFU list by decreasing the
4708	 *	  target size of the MRU list.
4709	 */
4710	if (state == arc_mru_ghost) {
4711		mult = (mrug_size >= mfug_size) ? 1 : (mfug_size / mrug_size);
4712		mult = MIN(mult, 10); /* avoid wild arc_p adjustment */
4713
4714		arc_p = MIN(arc_c - arc_p_min, arc_p + bytes * mult);
4715	} else if (state == arc_mfu_ghost) {
4716		uint64_t delta;
4717
4718		mult = (mfug_size >= mrug_size) ? 1 : (mrug_size / mfug_size);
4719		mult = MIN(mult, 10);
4720
4721		delta = MIN(bytes * mult, arc_p);
4722		arc_p = MAX(arc_p_min, arc_p - delta);
4723	}
4724	ASSERT((int64_t)arc_p >= 0);
4725
4726	if (arc_reclaim_needed()) {
4727		cv_signal(&arc_reclaim_thread_cv);
4728		return;
4729	}
4730
4731	if (arc_no_grow)
4732		return;
4733
4734	if (arc_c >= arc_c_max)
4735		return;
4736
4737	/*
4738	 * If we're within (2 * maxblocksize) bytes of the target
4739	 * cache size, increment the target cache size
4740	 */
4741	if (aggsum_compare(&arc_size, arc_c - (2ULL << SPA_MAXBLOCKSHIFT)) >
4742	    0) {
4743		DTRACE_PROBE1(arc__inc_adapt, int, bytes);
4744		atomic_add_64(&arc_c, (int64_t)bytes);
4745		if (arc_c > arc_c_max)
4746			arc_c = arc_c_max;
4747		else if (state == arc_anon)
4748			atomic_add_64(&arc_p, (int64_t)bytes);
4749		if (arc_p > arc_c)
4750			arc_p = arc_c;
4751	}
4752	ASSERT((int64_t)arc_p >= 0);
4753}
4754
4755/*
4756 * Check if arc_size has grown past our upper threshold, determined by
4757 * zfs_arc_overflow_shift.
4758 */
4759static boolean_t
4760arc_is_overflowing(void)
4761{
4762	/* Always allow at least one block of overflow */
4763	uint64_t overflow = MAX(SPA_MAXBLOCKSIZE,
4764	    arc_c >> zfs_arc_overflow_shift);
4765
4766	/*
4767	 * We just compare the lower bound here for performance reasons. Our
4768	 * primary goals are to make sure that the arc never grows without
4769	 * bound, and that it can reach its maximum size. This check
4770	 * accomplishes both goals. The maximum amount we could run over by is
4771	 * 2 * aggsum_borrow_multiplier * NUM_CPUS * the average size of a block
4772	 * in the ARC. In practice, that's in the tens of MB, which is low
4773	 * enough to be safe.
4774	 */
4775	return (aggsum_lower_bound(&arc_size) >= arc_c + overflow);
4776}
4777
4778static abd_t *
4779arc_get_data_abd(arc_buf_hdr_t *hdr, uint64_t size, void *tag)
4780{
4781	arc_buf_contents_t type = arc_buf_type(hdr);
4782
4783	arc_get_data_impl(hdr, size, tag);
4784	if (type == ARC_BUFC_METADATA) {
4785		return (abd_alloc(size, B_TRUE));
4786	} else {
4787		ASSERT(type == ARC_BUFC_DATA);
4788		return (abd_alloc(size, B_FALSE));
4789	}
4790}
4791
4792static void *
4793arc_get_data_buf(arc_buf_hdr_t *hdr, uint64_t size, void *tag)
4794{
4795	arc_buf_contents_t type = arc_buf_type(hdr);
4796
4797	arc_get_data_impl(hdr, size, tag);
4798	if (type == ARC_BUFC_METADATA) {
4799		return (zio_buf_alloc(size));
4800	} else {
4801		ASSERT(type == ARC_BUFC_DATA);
4802		return (zio_data_buf_alloc(size));
4803	}
4804}
4805
4806/*
4807 * Allocate a block and return it to the caller. If we are hitting the
4808 * hard limit for the cache size, we must sleep, waiting for the eviction
4809 * thread to catch up. If we're past the target size but below the hard
4810 * limit, we'll only signal the reclaim thread and continue on.
4811 */
4812static void
4813arc_get_data_impl(arc_buf_hdr_t *hdr, uint64_t size, void *tag)
4814{
4815	arc_state_t *state = hdr->b_l1hdr.b_state;
4816	arc_buf_contents_t type = arc_buf_type(hdr);
4817
4818	arc_adapt(size, state);
4819
4820	/*
4821	 * If arc_size is currently overflowing, and has grown past our
4822	 * upper limit, we must be adding data faster than the evict
4823	 * thread can evict. Thus, to ensure we don't compound the
4824	 * problem by adding more data and forcing arc_size to grow even
4825	 * further past it's target size, we halt and wait for the
4826	 * eviction thread to catch up.
4827	 *
4828	 * It's also possible that the reclaim thread is unable to evict
4829	 * enough buffers to get arc_size below the overflow limit (e.g.
4830	 * due to buffers being un-evictable, or hash lock collisions).
4831	 * In this case, we want to proceed regardless if we're
4832	 * overflowing; thus we don't use a while loop here.
4833	 */
4834	if (arc_is_overflowing()) {
4835		mutex_enter(&arc_reclaim_lock);
4836
4837		/*
4838		 * Now that we've acquired the lock, we may no longer be
4839		 * over the overflow limit, lets check.
4840		 *
4841		 * We're ignoring the case of spurious wake ups. If that
4842		 * were to happen, it'd let this thread consume an ARC
4843		 * buffer before it should have (i.e. before we're under
4844		 * the overflow limit and were signalled by the reclaim
4845		 * thread). As long as that is a rare occurrence, it
4846		 * shouldn't cause any harm.
4847		 */
4848		if (arc_is_overflowing()) {
4849			cv_signal(&arc_reclaim_thread_cv);
4850			cv_wait(&arc_reclaim_waiters_cv, &arc_reclaim_lock);
4851		}
4852
4853		mutex_exit(&arc_reclaim_lock);
4854	}
4855
4856	VERIFY3U(hdr->b_type, ==, type);
4857	if (type == ARC_BUFC_METADATA) {
4858		arc_space_consume(size, ARC_SPACE_META);
4859	} else {
4860		arc_space_consume(size, ARC_SPACE_DATA);
4861	}
4862
4863	/*
4864	 * Update the state size.  Note that ghost states have a
4865	 * "ghost size" and so don't need to be updated.
4866	 */
4867	if (!GHOST_STATE(state)) {
4868
4869		(void) refcount_add_many(&state->arcs_size, size, tag);
4870
4871		/*
4872		 * If this is reached via arc_read, the link is
4873		 * protected by the hash lock. If reached via
4874		 * arc_buf_alloc, the header should not be accessed by
4875		 * any other thread. And, if reached via arc_read_done,
4876		 * the hash lock will protect it if it's found in the
4877		 * hash table; otherwise no other thread should be
4878		 * trying to [add|remove]_reference it.
4879		 */
4880		if (multilist_link_active(&hdr->b_l1hdr.b_arc_node)) {
4881			ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
4882			(void) refcount_add_many(&state->arcs_esize[type],
4883			    size, tag);
4884		}
4885
4886		/*
4887		 * If we are growing the cache, and we are adding anonymous
4888		 * data, and we have outgrown arc_p, update arc_p
4889		 */
4890		if (aggsum_compare(&arc_size, arc_c) < 0 &&
4891		    hdr->b_l1hdr.b_state == arc_anon &&
4892		    (refcount_count(&arc_anon->arcs_size) +
4893		    refcount_count(&arc_mru->arcs_size) > arc_p))
4894			arc_p = MIN(arc_c, arc_p + size);
4895	}
4896	ARCSTAT_BUMP(arcstat_allocated);
4897}
4898
/*
 * Release an abd previously attached to hdr: first undo the ARC accounting
 * for `size' bytes via arc_free_data_impl(), then free the abd itself.
 */
static void
arc_free_data_abd(arc_buf_hdr_t *hdr, abd_t *abd, uint64_t size, void *tag)
{
	arc_free_data_impl(hdr, size, tag);
	abd_free(abd);
}
4905
4906static void
4907arc_free_data_buf(arc_buf_hdr_t *hdr, void *buf, uint64_t size, void *tag)
4908{
4909	arc_buf_contents_t type = arc_buf_type(hdr);
4910
4911	arc_free_data_impl(hdr, size, tag);
4912	if (type == ARC_BUFC_METADATA) {
4913		zio_buf_free(buf, size);
4914	} else {
4915		ASSERT(type == ARC_BUFC_DATA);
4916		zio_data_buf_free(buf, size);
4917	}
4918}
4919
4920/*
4921 * Free the arc data buffer.
4922 */
4923static void
4924arc_free_data_impl(arc_buf_hdr_t *hdr, uint64_t size, void *tag)
4925{
4926	arc_state_t *state = hdr->b_l1hdr.b_state;
4927	arc_buf_contents_t type = arc_buf_type(hdr);
4928
4929	/* protected by hash lock, if in the hash table */
4930	if (multilist_link_active(&hdr->b_l1hdr.b_arc_node)) {
4931		ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
4932		ASSERT(state != arc_anon && state != arc_l2c_only);
4933
4934		(void) refcount_remove_many(&state->arcs_esize[type],
4935		    size, tag);
4936	}
4937	(void) refcount_remove_many(&state->arcs_size, size, tag);
4938
4939	VERIFY3U(hdr->b_type, ==, type);
4940	if (type == ARC_BUFC_METADATA) {
4941		arc_space_return(size, ARC_SPACE_META);
4942	} else {
4943		ASSERT(type == ARC_BUFC_DATA);
4944		arc_space_return(size, ARC_SPACE_DATA);
4945	}
4946}
4947
/*
 * This routine is called whenever a buffer is accessed.  It stamps the
 * header's b_arc_access time and, depending on the header's current state,
 * moves it to a new state through arc_change_state() and bumps the
 * matching hit statistic.
 *
 * The caller must hold hash_lock (asserted below) and hdr must have an L1
 * header.  NOTE: this function does not release hash_lock itself; the
 * callers in this file call mutex_exit(hash_lock) after it returns.
 */
static void
arc_access(arc_buf_hdr_t *hdr, kmutex_t *hash_lock)
{
	clock_t now;

	ASSERT(MUTEX_HELD(hash_lock));
	ASSERT(HDR_HAS_L1HDR(hdr));

	if (hdr->b_l1hdr.b_state == arc_anon) {
		/*
		 * This buffer is not in the cache, and does not
		 * appear in our "ghost" list.  Add the new buffer
		 * to the MRU state.
		 */

		ASSERT0(hdr->b_l1hdr.b_arc_access);
		hdr->b_l1hdr.b_arc_access = ddi_get_lbolt();
		DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr);
		arc_change_state(arc_mru, hdr, hash_lock);

	} else if (hdr->b_l1hdr.b_state == arc_mru) {
		now = ddi_get_lbolt();

		/*
		 * If this buffer is here because of a prefetch, then either:
		 * - clear the flag if this is a "referencing" read
		 *   (any subsequent access will bump this into the MFU state).
		 * or
		 * - move the buffer to the head of the list if this is
		 *   another prefetch (to make it less likely to be evicted).
		 *
		 * Either way the buffer stays in the MRU state for now; only
		 * the referencing read counts as an MRU hit.
		 */
		if (HDR_PREFETCH(hdr) || HDR_PRESCIENT_PREFETCH(hdr)) {
			if (refcount_count(&hdr->b_l1hdr.b_refcnt) == 0) {
				/* link protected by hash lock */
				ASSERT(multilist_link_active(
				    &hdr->b_l1hdr.b_arc_node));
			} else {
				arc_hdr_clear_flags(hdr,
				    ARC_FLAG_PREFETCH |
				    ARC_FLAG_PRESCIENT_PREFETCH);
				ARCSTAT_BUMP(arcstat_mru_hits);
			}
			hdr->b_l1hdr.b_arc_access = now;
			return;
		}

		/*
		 * This buffer has been "accessed" only once so far,
		 * but it is still in the cache. Move it to the MFU
		 * state.
		 */
		if (now > hdr->b_l1hdr.b_arc_access + ARC_MINTIME) {
			/*
			 * More than 125ms have passed since we
			 * instantiated this buffer.  Move it to the
			 * most frequently used state.
			 */
			hdr->b_l1hdr.b_arc_access = now;
			DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr);
			arc_change_state(arc_mfu, hdr, hash_lock);
		}
		/* Counted as an MRU hit whether or not it was promoted. */
		ARCSTAT_BUMP(arcstat_mru_hits);
	} else if (hdr->b_l1hdr.b_state == arc_mru_ghost) {
		arc_state_t	*new_state;
		/*
		 * This buffer has been "accessed" recently, but
		 * was evicted from the cache.  Move it to the
		 * MFU state, unless this is a prefetch, in which
		 * case it goes back to the MRU state.
		 */

		if (HDR_PREFETCH(hdr) || HDR_PRESCIENT_PREFETCH(hdr)) {
			new_state = arc_mru;
			/* A held header means a real reader: drop the flags. */
			if (refcount_count(&hdr->b_l1hdr.b_refcnt) > 0) {
				arc_hdr_clear_flags(hdr,
				    ARC_FLAG_PREFETCH |
				    ARC_FLAG_PRESCIENT_PREFETCH);
			}
			DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr);
		} else {
			new_state = arc_mfu;
			DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr);
		}

		hdr->b_l1hdr.b_arc_access = ddi_get_lbolt();
		arc_change_state(new_state, hdr, hash_lock);

		ARCSTAT_BUMP(arcstat_mru_ghost_hits);
	} else if (hdr->b_l1hdr.b_state == arc_mfu) {
		/*
		 * This buffer has been accessed more than once and is
		 * still in the cache.  Keep it in the MFU state.
		 *
		 * NOTE: an add_reference() that occurred when we did
		 * the arc_read() will have kicked this off the list.
		 * If it was a prefetch, we will explicitly move it to
		 * the head of the list now.
		 */

		ARCSTAT_BUMP(arcstat_mfu_hits);
		hdr->b_l1hdr.b_arc_access = ddi_get_lbolt();
	} else if (hdr->b_l1hdr.b_state == arc_mfu_ghost) {
		arc_state_t	*new_state = arc_mfu;
		/*
		 * This buffer has been accessed more than once but has
		 * been evicted from the cache.  Move it back to the
		 * MFU state.
		 */

		if (HDR_PREFETCH(hdr) || HDR_PRESCIENT_PREFETCH(hdr)) {
			/*
			 * This is a prefetch access...
			 * move this block back to the MRU state.
			 */
			new_state = arc_mru;
		}

		hdr->b_l1hdr.b_arc_access = ddi_get_lbolt();
		/*
		 * NOTE(review): the probe below is named new_state__mfu
		 * even when new_state was switched to arc_mru above.
		 */
		DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr);
		arc_change_state(new_state, hdr, hash_lock);

		ARCSTAT_BUMP(arcstat_mfu_ghost_hits);
	} else if (hdr->b_l1hdr.b_state == arc_l2c_only) {
		/*
		 * This buffer is on the 2nd Level ARC.
		 * It is moved straight to the MFU state; no hit
		 * statistic is bumped on this path.
		 */

		hdr->b_l1hdr.b_arc_access = ddi_get_lbolt();
		DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr);
		arc_change_state(arc_mfu, hdr, hash_lock);
	} else {
		ASSERT(!"invalid arc state");
	}
}
5085
5086/*
5087 * This routine is called by dbuf_hold() to update the arc_access() state
5088 * which otherwise would be skipped for entries in the dbuf cache.
5089 */
5090void
5091arc_buf_access(arc_buf_t *buf)
5092{
5093	mutex_enter(&buf->b_evict_lock);
5094	arc_buf_hdr_t *hdr = buf->b_hdr;
5095
5096	/*
5097	 * Avoid taking the hash_lock when possible as an optimization.
5098	 * The header must be checked again under the hash_lock in order
5099	 * to handle the case where it is concurrently being released.
5100	 */
5101	if (hdr->b_l1hdr.b_state == arc_anon || HDR_EMPTY(hdr)) {
5102		mutex_exit(&buf->b_evict_lock);
5103		ARCSTAT_BUMP(arcstat_access_skip);
5104		return;
5105	}
5106
5107	kmutex_t *hash_lock = HDR_LOCK(hdr);
5108	mutex_enter(hash_lock);
5109
5110	if (hdr->b_l1hdr.b_state == arc_anon || HDR_EMPTY(hdr)) {
5111		mutex_exit(hash_lock);
5112		mutex_exit(&buf->b_evict_lock);
5113		ARCSTAT_BUMP(arcstat_access_skip);
5114		return;
5115	}
5116
5117	mutex_exit(&buf->b_evict_lock);
5118
5119	ASSERT(hdr->b_l1hdr.b_state == arc_mru ||
5120	    hdr->b_l1hdr.b_state == arc_mfu);
5121
5122	DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
5123	arc_access(hdr, hash_lock);
5124	mutex_exit(hash_lock);
5125
5126	ARCSTAT_BUMP(arcstat_hits);
5127	ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr),
5128	    demand, prefetch, !HDR_ISTYPE_METADATA(hdr), data, metadata, hits);
5129}
5130
5131/* a generic arc_read_done_func_t which you can use */
5132/* ARGSUSED */
5133void
5134arc_bcopy_func(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp,
5135    arc_buf_t *buf, void *arg)
5136{
5137	if (buf == NULL)
5138		return;
5139
5140	bcopy(buf->b_data, arg, arc_buf_size(buf));
5141	arc_buf_destroy(buf, arg);
5142}
5143
5144/* a generic arc_read_done_func_t */
5145/* ARGSUSED */
5146void
5147arc_getbuf_func(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp,
5148    arc_buf_t *buf, void *arg)
5149{
5150	arc_buf_t **bufp = arg;
5151
5152	if (buf == NULL) {
5153		*bufp = NULL;
5154	} else {
5155		*bufp = buf;
5156		ASSERT(buf->b_data);
5157	}
5158}
5159
5160static void
5161arc_hdr_verify(arc_buf_hdr_t *hdr, blkptr_t *bp)
5162{
5163	if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp)) {
5164		ASSERT3U(HDR_GET_PSIZE(hdr), ==, 0);
5165		ASSERT3U(HDR_GET_COMPRESS(hdr), ==, ZIO_COMPRESS_OFF);
5166	} else {
5167		if (HDR_COMPRESSION_ENABLED(hdr)) {
5168			ASSERT3U(HDR_GET_COMPRESS(hdr), ==,
5169			    BP_GET_COMPRESS(bp));
5170		}
5171		ASSERT3U(HDR_GET_LSIZE(hdr), ==, BP_GET_LSIZE(bp));
5172		ASSERT3U(HDR_GET_PSIZE(hdr), ==, BP_GET_PSIZE(bp));
5173	}
5174}
5175
/*
 * Completion handler for a read zio issued on behalf of the ARC; the
 * header being filled is carried in zio->io_private.
 *
 * In order, this function:
 *  - looks the header up again in the hash table (when it is still there)
 *    to obtain its hash lock;
 *  - on success records the byteswap function to apply;
 *  - creates a buf for every registered callback that has a done func;
 *  - clears the in-progress state, or on error moves the header to
 *    arc_anon and removes it from the hash table;
 *  - wakes threads waiting on the header's cv and drops the hash lock;
 *  - runs every callback, frees the callback records and, if nothing
 *    references the header any more, destroys it.
 */
static void
arc_read_done(zio_t *zio)
{
	arc_buf_hdr_t	*hdr = zio->io_private;
	kmutex_t	*hash_lock = NULL;
	arc_callback_t	*callback_list;
	arc_callback_t	*acb;
	/* Set when the header has no references left and must be destroyed. */
	boolean_t	freeable = B_FALSE;

	/*
	 * The hdr was inserted into hash-table and removed from lists
	 * prior to starting I/O.  We should find this header, since
	 * it's in the hash table, and it should be legit since it's
	 * not possible to evict it during the I/O.  The only possible
	 * reason for it not to be found is if we were freed during the
	 * read.
	 */
	if (HDR_IN_HASH_TABLE(hdr)) {
		ASSERT3U(hdr->b_birth, ==, BP_PHYSICAL_BIRTH(zio->io_bp));
		ASSERT3U(hdr->b_dva.dva_word[0], ==,
		    BP_IDENTITY(zio->io_bp)->dva_word[0]);
		ASSERT3U(hdr->b_dva.dva_word[1], ==,
		    BP_IDENTITY(zio->io_bp)->dva_word[1]);

		/* hash_lock is filled in here and released further down. */
		arc_buf_hdr_t *found = buf_hash_find(hdr->b_spa, zio->io_bp,
		    &hash_lock);

		ASSERT((found == hdr &&
		    DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp))) ||
		    (found == hdr && HDR_L2_READING(hdr)));
		ASSERT3P(hash_lock, !=, NULL);
	}

	if (zio->io_error == 0) {
		/* byteswap if necessary */
		if (BP_SHOULD_BYTESWAP(zio->io_bp)) {
			if (BP_GET_LEVEL(zio->io_bp) > 0) {
				hdr->b_l1hdr.b_byteswap = DMU_BSWAP_UINT64;
			} else {
				hdr->b_l1hdr.b_byteswap =
				    DMU_OT_BYTESWAP(BP_GET_TYPE(zio->io_bp));
			}
		} else {
			hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS;
		}
	}

	arc_hdr_clear_flags(hdr, ARC_FLAG_L2_EVICTED);
	/* With l2arc_noprefetch set, prefetched headers lose L2CACHE. */
	if (l2arc_noprefetch && HDR_PREFETCH(hdr))
		arc_hdr_clear_flags(hdr, ARC_FLAG_L2CACHE);

	callback_list = hdr->b_l1hdr.b_acb;
	ASSERT3P(callback_list, !=, NULL);

	if (hash_lock && zio->io_error == 0 &&
	    hdr->b_l1hdr.b_state == arc_anon) {
		/*
		 * Only call arc_access on anonymous buffers.  This is because
		 * if we've issued an I/O for an evicted buffer, we've already
		 * called arc_access (to prevent any simultaneous readers from
		 * getting confused).
		 */
		arc_access(hdr, hash_lock);
	}

	/*
	 * If a read request has a callback (i.e. acb_done is not NULL), then we
	 * make a buf containing the data according to the parameters which were
	 * passed in. The implementation of arc_buf_alloc_impl() ensures that we
	 * aren't needlessly decompressing the data multiple times.
	 */
	int callback_cnt = 0;
	for (acb = callback_list; acb != NULL; acb = acb->acb_next) {
		if (!acb->acb_done)
			continue;

		callback_cnt++;

		/*
		 * No buf is created once the I/O has an error.  This also
		 * covers the callbacks that follow one whose
		 * arc_buf_alloc_impl() failed, because that failure is
		 * stored in zio->io_error below.
		 */
		if (zio->io_error != 0)
			continue;

		int error = arc_buf_alloc_impl(hdr, acb->acb_private,
		    acb->acb_compressed,
		    B_TRUE, &acb->acb_buf);
		if (error != 0) {
			arc_buf_destroy(acb->acb_buf, acb->acb_private);
			acb->acb_buf = NULL;
		}

		if (zio->io_error == 0)
			zio->io_error = error;
	}
	/* Detach the callback list; it is walked and freed below. */
	hdr->b_l1hdr.b_acb = NULL;
	arc_hdr_clear_flags(hdr, ARC_FLAG_IO_IN_PROGRESS);
	if (callback_cnt == 0) {
		/* No done func was registered, so this must be a prefetch. */
		ASSERT(HDR_PREFETCH(hdr));
		ASSERT0(hdr->b_l1hdr.b_bufcnt);
		ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
	}

	ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt) ||
	    callback_list != NULL);

	if (zio->io_error == 0) {
		arc_hdr_verify(hdr, zio->io_bp);
	} else {
		/*
		 * Failed read: flag the header, make it anonymous and take
		 * it out of the hash table so that it cannot be found again.
		 */
		arc_hdr_set_flags(hdr, ARC_FLAG_IO_ERROR);
		if (hdr->b_l1hdr.b_state != arc_anon)
			arc_change_state(arc_anon, hdr, hash_lock);
		if (HDR_IN_HASH_TABLE(hdr))
			buf_hash_remove(hdr);
		freeable = refcount_is_zero(&hdr->b_l1hdr.b_refcnt);
	}

	/*
	 * Broadcast before we drop the hash_lock to avoid the possibility
	 * that the hdr (and hence the cv) might be freed before we get to
	 * the cv_broadcast().
	 */
	cv_broadcast(&hdr->b_l1hdr.b_cv);

	if (hash_lock != NULL) {
		mutex_exit(hash_lock);
	} else {
		/*
		 * This block was freed while we waited for the read to
		 * complete.  It has been removed from the hash table and
		 * moved to the anonymous state (so that it won't show up
		 * in the cache).
		 */
		ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon);
		freeable = refcount_is_zero(&hdr->b_l1hdr.b_refcnt);
	}

	/* execute each callback and free its structure */
	while ((acb = callback_list) != NULL) {
		if (acb->acb_done) {
			acb->acb_done(zio, &zio->io_bookmark, zio->io_bp,
			    acb->acb_buf, acb->acb_private);
		}

		/* Propagate the result to the dummy zio and dispatch it. */
		if (acb->acb_zio_dummy != NULL) {
			acb->acb_zio_dummy->io_error = zio->io_error;
			zio_nowait(acb->acb_zio_dummy);
		}

		callback_list = acb->acb_next;
		kmem_free(acb, sizeof (arc_callback_t));
	}

	if (freeable)
		arc_hdr_destroy(hdr);
}
5329
5330/*
5331 * "Read" the block at the specified DVA (in bp) via the
5332 * cache.  If the block is found in the cache, invoke the provided
5333 * callback immediately and return.  Note that the `zio' parameter
5334 * in the callback will be NULL in this case, since no IO was
5335 * required.  If the block is not in the cache pass the read request
5336 * on to the spa with a substitute callback function, so that the
5337 * requested block will be added to the cache.
5338 *
5339 * If a read request arrives for a block that has a read in-progress,
5340 * either wait for the in-progress read to complete (and return the
5341 * results); or, if this is a read with a "done" func, add a record
5342 * to the read to invoke the "done" func when the read completes,
5343 * and return; or just return.
5344 *
5345 * arc_read_done() will invoke all the requested "done" functions
5346 * for readers of this block.
5347 */
5348int
5349arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_read_done_func_t *done,
5350    void *private, zio_priority_t priority, int zio_flags,
5351    arc_flags_t *arc_flags, const zbookmark_phys_t *zb)
5352{
5353	arc_buf_hdr_t *hdr = NULL;
5354	kmutex_t *hash_lock = NULL;
5355	zio_t *rzio;
5356	uint64_t guid = spa_load_guid(spa);
5357	boolean_t compressed_read = (zio_flags & ZIO_FLAG_RAW) != 0;
5358	int rc = 0;
5359
5360	ASSERT(!BP_IS_EMBEDDED(bp) ||
5361	    BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA);
5362
5363top:
5364	if (!BP_IS_EMBEDDED(bp)) {
5365		/*
5366		 * Embedded BP's have no DVA and require no I/O to "read".
5367		 * Create an anonymous arc buf to back it.
5368		 */
5369		hdr = buf_hash_find(guid, bp, &hash_lock);
5370	}
5371
5372	if (hdr != NULL && HDR_HAS_L1HDR(hdr) && hdr->b_l1hdr.b_pabd != NULL) {
5373		arc_buf_t *buf = NULL;
5374		*arc_flags |= ARC_FLAG_CACHED;
5375
5376		if (HDR_IO_IN_PROGRESS(hdr)) {
5377			zio_t *head_zio = hdr->b_l1hdr.b_acb->acb_zio_head;
5378
5379			ASSERT3P(head_zio, !=, NULL);
5380			if ((hdr->b_flags & ARC_FLAG_PRIO_ASYNC_READ) &&
5381			    priority == ZIO_PRIORITY_SYNC_READ) {
5382				/*
5383				 * This is a sync read that needs to wait for
5384				 * an in-flight async read. Request that the
5385				 * zio have its priority upgraded.
5386				 */
5387				zio_change_priority(head_zio, priority);
5388				DTRACE_PROBE1(arc__async__upgrade__sync,
5389				    arc_buf_hdr_t *, hdr);
5390				ARCSTAT_BUMP(arcstat_async_upgrade_sync);
5391			}
5392			if (hdr->b_flags & ARC_FLAG_PREDICTIVE_PREFETCH) {
5393				arc_hdr_clear_flags(hdr,
5394				    ARC_FLAG_PREDICTIVE_PREFETCH);
5395			}
5396
5397			if (*arc_flags & ARC_FLAG_WAIT) {
5398				cv_wait(&hdr->b_l1hdr.b_cv, hash_lock);
5399				mutex_exit(hash_lock);
5400				goto top;
5401			}
5402			ASSERT(*arc_flags & ARC_FLAG_NOWAIT);
5403
5404			if (done) {
5405				arc_callback_t *acb = NULL;
5406
5407				acb = kmem_zalloc(sizeof (arc_callback_t),
5408				    KM_SLEEP);
5409				acb->acb_done = done;
5410				acb->acb_private = private;
5411				acb->acb_compressed = compressed_read;
5412				if (pio != NULL)
5413					acb->acb_zio_dummy = zio_null(pio,
5414					    spa, NULL, NULL, NULL, zio_flags);
5415
5416				ASSERT3P(acb->acb_done, !=, NULL);
5417				acb->acb_zio_head = head_zio;
5418				acb->acb_next = hdr->b_l1hdr.b_acb;
5419				hdr->b_l1hdr.b_acb = acb;
5420				mutex_exit(hash_lock);
5421				return (0);
5422			}
5423			mutex_exit(hash_lock);
5424			return (0);
5425		}
5426
5427		ASSERT(hdr->b_l1hdr.b_state == arc_mru ||
5428		    hdr->b_l1hdr.b_state == arc_mfu);
5429
5430		if (done) {
5431			if (hdr->b_flags & ARC_FLAG_PREDICTIVE_PREFETCH) {
5432				/*
5433				 * This is a demand read which does not have to
5434				 * wait for i/o because we did a predictive
5435				 * prefetch i/o for it, which has completed.
5436				 */
5437				DTRACE_PROBE1(
5438				    arc__demand__hit__predictive__prefetch,
5439				    arc_buf_hdr_t *, hdr);
5440				ARCSTAT_BUMP(
5441				    arcstat_demand_hit_predictive_prefetch);
5442				arc_hdr_clear_flags(hdr,
5443				    ARC_FLAG_PREDICTIVE_PREFETCH);
5444			}
5445
5446			if (hdr->b_flags & ARC_FLAG_PRESCIENT_PREFETCH) {
5447				ARCSTAT_BUMP(
5448                                    arcstat_demand_hit_prescient_prefetch);
5449				arc_hdr_clear_flags(hdr,
5450                                    ARC_FLAG_PRESCIENT_PREFETCH);
5451			}
5452
5453			ASSERT(!BP_IS_EMBEDDED(bp) || !BP_IS_HOLE(bp));
5454			/* Get a buf with the desired data in it. */
5455			rc = arc_buf_alloc_impl(hdr, private,
5456			   compressed_read, B_TRUE, &buf);
5457			if (rc != 0) {
5458				arc_buf_destroy(buf, private);
5459				buf = NULL;
5460			}
5461			ASSERT((zio_flags & ZIO_FLAG_SPECULATIVE) ||
5462                            rc == 0 || rc != ENOENT);
5463		} else if (*arc_flags & ARC_FLAG_PREFETCH &&
5464		    refcount_count(&hdr->b_l1hdr.b_refcnt) == 0) {
5465			arc_hdr_set_flags(hdr, ARC_FLAG_PREFETCH);
5466		}
5467		DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
5468		arc_access(hdr, hash_lock);
5469		if (*arc_flags & ARC_FLAG_PRESCIENT_PREFETCH)
5470                        arc_hdr_set_flags(hdr, ARC_FLAG_PRESCIENT_PREFETCH);
5471		if (*arc_flags & ARC_FLAG_L2CACHE)
5472			arc_hdr_set_flags(hdr, ARC_FLAG_L2CACHE);
5473		mutex_exit(hash_lock);
5474		ARCSTAT_BUMP(arcstat_hits);
5475		ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr),
5476		    demand, prefetch, !HDR_ISTYPE_METADATA(hdr),
5477		    data, metadata, hits);
5478
5479		if (done)
5480			done(NULL, zb, bp, buf, private);
5481	} else {
5482		uint64_t lsize = BP_GET_LSIZE(bp);
5483		uint64_t psize = BP_GET_PSIZE(bp);
5484		arc_callback_t *acb;
5485		vdev_t *vd = NULL;
5486		uint64_t addr = 0;
5487		boolean_t devw = B_FALSE;
5488		uint64_t size;
5489
5490		if (hdr == NULL) {
5491			/* this block is not in the cache */
5492			arc_buf_hdr_t *exists = NULL;
5493			arc_buf_contents_t type = BP_GET_BUFC_TYPE(bp);
5494			hdr = arc_hdr_alloc(spa_load_guid(spa), psize, lsize,
5495			    BP_GET_COMPRESS(bp), type);
5496
5497			if (!BP_IS_EMBEDDED(bp)) {
5498				hdr->b_dva = *BP_IDENTITY(bp);
5499				hdr->b_birth = BP_PHYSICAL_BIRTH(bp);
5500				exists = buf_hash_insert(hdr, &hash_lock);
5501			}
5502			if (exists != NULL) {
5503				/* somebody beat us to the hash insert */
5504				mutex_exit(hash_lock);
5505				buf_discard_identity(hdr);
5506				arc_hdr_destroy(hdr);
5507				goto top; /* restart the IO request */
5508			}
5509		} else {
5510			/*
5511			 * This block is in the ghost cache. If it was L2-only
5512			 * (and thus didn't have an L1 hdr), we realloc the
5513			 * header to add an L1 hdr.
5514			 */
5515			if (!HDR_HAS_L1HDR(hdr)) {
5516				hdr = arc_hdr_realloc(hdr, hdr_l2only_cache,
5517				    hdr_full_cache);
5518			}
5519			ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
5520			ASSERT(GHOST_STATE(hdr->b_l1hdr.b_state));
5521			ASSERT(!HDR_IO_IN_PROGRESS(hdr));
5522			ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
5523			ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
5524			ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL);
5525
5526			/*
5527			 * This is a delicate dance that we play here.
5528			 * This hdr is in the ghost list so we access it
5529			 * to move it out of the ghost list before we
5530			 * initiate the read. If it's a prefetch then
5531			 * it won't have a callback so we'll remove the
5532			 * reference that arc_buf_alloc_impl() created. We
5533			 * do this after we've called arc_access() to
5534			 * avoid hitting an assert in remove_reference().
5535			 */
5536			arc_access(hdr, hash_lock);
5537			arc_hdr_alloc_pabd(hdr);
5538		}
5539		ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
5540		size = arc_hdr_size(hdr);
5541
5542		/*
		 * If compression is enabled on the hdr, then we will do
5544		 * RAW I/O and will store the compressed data in the hdr's
5545		 * data block. Otherwise, the hdr's data block will contain
5546		 * the uncompressed data.
5547		 */
5548		if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF) {
5549			zio_flags |= ZIO_FLAG_RAW;
5550		}
5551
5552		if (*arc_flags & ARC_FLAG_PREFETCH)
5553			arc_hdr_set_flags(hdr, ARC_FLAG_PREFETCH);
5554		if (*arc_flags & ARC_FLAG_PRESCIENT_PREFETCH)
5555			arc_hdr_set_flags(hdr, ARC_FLAG_PRESCIENT_PREFETCH);
5556
5557		if (*arc_flags & ARC_FLAG_L2CACHE)
5558			arc_hdr_set_flags(hdr, ARC_FLAG_L2CACHE);
5559		if (BP_GET_LEVEL(bp) > 0)
5560			arc_hdr_set_flags(hdr, ARC_FLAG_INDIRECT);
5561		if (*arc_flags & ARC_FLAG_PREDICTIVE_PREFETCH)
5562			arc_hdr_set_flags(hdr, ARC_FLAG_PREDICTIVE_PREFETCH);
5563		ASSERT(!GHOST_STATE(hdr->b_l1hdr.b_state));
5564
5565		acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP);
5566		acb->acb_done = done;
5567		acb->acb_private = private;
5568		acb->acb_compressed = compressed_read;
5569
5570		ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL);
5571		hdr->b_l1hdr.b_acb = acb;
5572		arc_hdr_set_flags(hdr, ARC_FLAG_IO_IN_PROGRESS);
5573
5574		if (HDR_HAS_L2HDR(hdr) &&
5575		    (vd = hdr->b_l2hdr.b_dev->l2ad_vdev) != NULL) {
5576			devw = hdr->b_l2hdr.b_dev->l2ad_writing;
5577			addr = hdr->b_l2hdr.b_daddr;
5578			/*
5579			 * Lock out L2ARC device removal.
5580			 */
5581			if (vdev_is_dead(vd) ||
5582			    !spa_config_tryenter(spa, SCL_L2ARC, vd, RW_READER))
5583				vd = NULL;
5584		}
5585
5586		/*
5587		 * We count both async reads and scrub IOs as asynchronous so
5588		 * that both can be upgraded in the event of a cache hit while
5589		 * the read IO is still in-flight.
5590		 */
5591		if (priority == ZIO_PRIORITY_ASYNC_READ ||
5592		    priority == ZIO_PRIORITY_SCRUB)
5593			arc_hdr_set_flags(hdr, ARC_FLAG_PRIO_ASYNC_READ);
5594		else
5595			arc_hdr_clear_flags(hdr, ARC_FLAG_PRIO_ASYNC_READ);
5596
5597		/*
5598		 * At this point, we have a level 1 cache miss.  Try again in
5599		 * L2ARC if possible.
5600		 */
5601		ASSERT3U(HDR_GET_LSIZE(hdr), ==, lsize);
5602
5603		DTRACE_PROBE4(arc__miss, arc_buf_hdr_t *, hdr, blkptr_t *, bp,
5604		    uint64_t, lsize, zbookmark_phys_t *, zb);
5605		ARCSTAT_BUMP(arcstat_misses);
5606		ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr),
5607		    demand, prefetch, !HDR_ISTYPE_METADATA(hdr),
5608		    data, metadata, misses);
5609#ifdef _KERNEL
5610#ifdef RACCT
5611		if (racct_enable) {
5612			PROC_LOCK(curproc);
5613			racct_add_force(curproc, RACCT_READBPS, size);
5614			racct_add_force(curproc, RACCT_READIOPS, 1);
5615			PROC_UNLOCK(curproc);
5616		}
5617#endif /* RACCT */
5618		curthread->td_ru.ru_inblock++;
5619#endif
5620
5621		if (vd != NULL && l2arc_ndev != 0 && !(l2arc_norw && devw)) {
5622			/*
5623			 * Read from the L2ARC if the following are true:
5624			 * 1. The L2ARC vdev was previously cached.
5625			 * 2. This buffer still has L2ARC metadata.
5626			 * 3. This buffer isn't currently writing to the L2ARC.
5627			 * 4. The L2ARC entry wasn't evicted, which may
5628			 *    also have invalidated the vdev.
			 * 5. This isn't prefetch or l2arc_noprefetch is 0.
5630			 */
5631			if (HDR_HAS_L2HDR(hdr) &&
5632			    !HDR_L2_WRITING(hdr) && !HDR_L2_EVICTED(hdr) &&
5633			    !(l2arc_noprefetch && HDR_PREFETCH(hdr))) {
5634				l2arc_read_callback_t *cb;
5635				abd_t *abd;
5636				uint64_t asize;
5637
5638				DTRACE_PROBE1(l2arc__hit, arc_buf_hdr_t *, hdr);
5639				ARCSTAT_BUMP(arcstat_l2_hits);
5640
5641				cb = kmem_zalloc(sizeof (l2arc_read_callback_t),
5642				    KM_SLEEP);
5643				cb->l2rcb_hdr = hdr;
5644				cb->l2rcb_bp = *bp;
5645				cb->l2rcb_zb = *zb;
5646				cb->l2rcb_flags = zio_flags;
5647
5648				asize = vdev_psize_to_asize(vd, size);
5649				if (asize != size) {
5650					abd = abd_alloc_for_io(asize,
5651					    HDR_ISTYPE_METADATA(hdr));
5652					cb->l2rcb_abd = abd;
5653				} else {
5654					abd = hdr->b_l1hdr.b_pabd;
5655				}
5656
5657				ASSERT(addr >= VDEV_LABEL_START_SIZE &&
5658				    addr + asize <= vd->vdev_psize -
5659				    VDEV_LABEL_END_SIZE);
5660
5661				/*
5662				 * l2arc read.  The SCL_L2ARC lock will be
5663				 * released by l2arc_read_done().
5664				 * Issue a null zio if the underlying buffer
5665				 * was squashed to zero size by compression.
5666				 */
5667				ASSERT3U(HDR_GET_COMPRESS(hdr), !=,
5668				    ZIO_COMPRESS_EMPTY);
5669				rzio = zio_read_phys(pio, vd, addr,
5670				    asize, abd,
5671				    ZIO_CHECKSUM_OFF,
5672				    l2arc_read_done, cb, priority,
5673				    zio_flags | ZIO_FLAG_DONT_CACHE |
5674				    ZIO_FLAG_CANFAIL |
5675				    ZIO_FLAG_DONT_PROPAGATE |
5676				    ZIO_FLAG_DONT_RETRY, B_FALSE);
5677				acb->acb_zio_head = rzio;
5678
5679				if (hash_lock != NULL)
5680					mutex_exit(hash_lock);
5681
5682				DTRACE_PROBE2(l2arc__read, vdev_t *, vd,
5683				    zio_t *, rzio);
5684				ARCSTAT_INCR(arcstat_l2_read_bytes, size);
5685
5686				if (*arc_flags & ARC_FLAG_NOWAIT) {
5687					zio_nowait(rzio);
5688					return (0);
5689				}
5690
5691				ASSERT(*arc_flags & ARC_FLAG_WAIT);
5692				if (zio_wait(rzio) == 0)
5693					return (0);
5694
5695				/* l2arc read error; goto zio_read() */
5696				if (hash_lock != NULL)
5697					mutex_enter(hash_lock);
5698			} else {
5699				DTRACE_PROBE1(l2arc__miss,
5700				    arc_buf_hdr_t *, hdr);
5701				ARCSTAT_BUMP(arcstat_l2_misses);
5702				if (HDR_L2_WRITING(hdr))
5703					ARCSTAT_BUMP(arcstat_l2_rw_clash);
5704				spa_config_exit(spa, SCL_L2ARC, vd);
5705			}
5706		} else {
5707			if (vd != NULL)
5708				spa_config_exit(spa, SCL_L2ARC, vd);
5709			if (l2arc_ndev != 0) {
5710				DTRACE_PROBE1(l2arc__miss,
5711				    arc_buf_hdr_t *, hdr);
5712				ARCSTAT_BUMP(arcstat_l2_misses);
5713			}
5714		}
5715
5716		rzio = zio_read(pio, spa, bp, hdr->b_l1hdr.b_pabd, size,
5717		    arc_read_done, hdr, priority, zio_flags, zb);
5718		acb->acb_zio_head = rzio;
5719
5720		if (hash_lock != NULL)
5721			mutex_exit(hash_lock);
5722
5723		if (*arc_flags & ARC_FLAG_WAIT)
5724			return (zio_wait(rzio));
5725
5726		ASSERT(*arc_flags & ARC_FLAG_NOWAIT);
5727		zio_nowait(rzio);
5728	}
5729	return (0);
5730}
5731
5732/*
5733 * Notify the arc that a block was freed, and thus will never be used again.
5734 */
5735void
5736arc_freed(spa_t *spa, const blkptr_t *bp)
5737{
5738	arc_buf_hdr_t *hdr;
5739	kmutex_t *hash_lock;
5740	uint64_t guid = spa_load_guid(spa);
5741
5742	ASSERT(!BP_IS_EMBEDDED(bp));
5743
5744	hdr = buf_hash_find(guid, bp, &hash_lock);
5745	if (hdr == NULL)
5746		return;
5747
5748	/*
5749	 * We might be trying to free a block that is still doing I/O
5750	 * (i.e. prefetch) or has a reference (i.e. a dedup-ed,
5751	 * dmu_sync-ed block). If this block is being prefetched, then it
5752	 * would still have the ARC_FLAG_IO_IN_PROGRESS flag set on the hdr
5753	 * until the I/O completes. A block may also have a reference if it is
5754	 * part of a dedup-ed, dmu_synced write. The dmu_sync() function would
5755	 * have written the new block to its final resting place on disk but
5756	 * without the dedup flag set. This would have left the hdr in the MRU
5757	 * state and discoverable. When the txg finally syncs it detects that
5758	 * the block was overridden in open context and issues an override I/O.
5759	 * Since this is a dedup block, the override I/O will determine if the
5760	 * block is already in the DDT. If so, then it will replace the io_bp
5761	 * with the bp from the DDT and allow the I/O to finish. When the I/O
5762	 * reaches the done callback, dbuf_write_override_done, it will
5763	 * check to see if the io_bp and io_bp_override are identical.
5764	 * If they are not, then it indicates that the bp was replaced with
5765	 * the bp in the DDT and the override bp is freed. This allows
5766	 * us to arrive here with a reference on a block that is being
5767	 * freed. So if we have an I/O in progress, or a reference to
5768	 * this hdr, then we don't destroy the hdr.
5769	 */
5770	if (!HDR_HAS_L1HDR(hdr) || (!HDR_IO_IN_PROGRESS(hdr) &&
5771	    refcount_is_zero(&hdr->b_l1hdr.b_refcnt))) {
5772		arc_change_state(arc_anon, hdr, hash_lock);
5773		arc_hdr_destroy(hdr);
5774		mutex_exit(hash_lock);
5775	} else {
5776		mutex_exit(hash_lock);
5777	}
5778
5779}
5780
5781/*
5782 * Release this buffer from the cache, making it an anonymous buffer.  This
5783 * must be done after a read and prior to modifying the buffer contents.
5784 * If the buffer has more than one reference, we must make
5785 * a new hdr for the buffer.
5786 */
5787void
5788arc_release(arc_buf_t *buf, void *tag)
5789{
5790	arc_buf_hdr_t *hdr = buf->b_hdr;
5791
5792	/*
5793	 * It would be nice to assert that if it's DMU metadata (level >
5794	 * 0 || it's the dnode file), then it must be syncing context.
5795	 * But we don't know that information at this level.
5796	 */
5797
5798	mutex_enter(&buf->b_evict_lock);
5799
5800	ASSERT(HDR_HAS_L1HDR(hdr));
5801
5802	/*
5803	 * We don't grab the hash lock prior to this check, because if
5804	 * the buffer's header is in the arc_anon state, it won't be
5805	 * linked into the hash table.
5806	 */
5807	if (hdr->b_l1hdr.b_state == arc_anon) {
5808		mutex_exit(&buf->b_evict_lock);
5809		ASSERT(!HDR_IO_IN_PROGRESS(hdr));
5810		ASSERT(!HDR_IN_HASH_TABLE(hdr));
5811		ASSERT(!HDR_HAS_L2HDR(hdr));
5812		ASSERT(HDR_EMPTY(hdr));
5813		ASSERT3U(hdr->b_l1hdr.b_bufcnt, ==, 1);
5814		ASSERT3S(refcount_count(&hdr->b_l1hdr.b_refcnt), ==, 1);
5815		ASSERT(!list_link_active(&hdr->b_l1hdr.b_arc_node));
5816
5817		hdr->b_l1hdr.b_arc_access = 0;
5818
5819		/*
5820		 * If the buf is being overridden then it may already
5821		 * have a hdr that is not empty.
5822		 */
5823		buf_discard_identity(hdr);
5824		arc_buf_thaw(buf);
5825
5826		return;
5827	}
5828
5829	kmutex_t *hash_lock = HDR_LOCK(hdr);
5830	mutex_enter(hash_lock);
5831
5832	/*
5833	 * This assignment is only valid as long as the hash_lock is
5834	 * held, we must be careful not to reference state or the
5835	 * b_state field after dropping the lock.
5836	 */
5837	arc_state_t *state = hdr->b_l1hdr.b_state;
5838	ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
5839	ASSERT3P(state, !=, arc_anon);
5840
5841	/* this buffer is not on any list */
5842	ASSERT3S(refcount_count(&hdr->b_l1hdr.b_refcnt), >, 0);
5843
5844	if (HDR_HAS_L2HDR(hdr)) {
5845		mutex_enter(&hdr->b_l2hdr.b_dev->l2ad_mtx);
5846
5847		/*
5848		 * We have to recheck this conditional again now that
5849		 * we're holding the l2ad_mtx to prevent a race with
5850		 * another thread which might be concurrently calling
5851		 * l2arc_evict(). In that case, l2arc_evict() might have
5852		 * destroyed the header's L2 portion as we were waiting
5853		 * to acquire the l2ad_mtx.
5854		 */
5855		if (HDR_HAS_L2HDR(hdr)) {
5856			l2arc_trim(hdr);
5857			arc_hdr_l2hdr_destroy(hdr);
5858		}
5859
5860		mutex_exit(&hdr->b_l2hdr.b_dev->l2ad_mtx);
5861	}
5862
5863	/*
5864	 * Do we have more than one buf?
5865	 */
5866	if (hdr->b_l1hdr.b_bufcnt > 1) {
5867		arc_buf_hdr_t *nhdr;
5868		uint64_t spa = hdr->b_spa;
5869		uint64_t psize = HDR_GET_PSIZE(hdr);
5870		uint64_t lsize = HDR_GET_LSIZE(hdr);
5871		enum zio_compress compress = HDR_GET_COMPRESS(hdr);
5872		arc_buf_contents_t type = arc_buf_type(hdr);
5873		VERIFY3U(hdr->b_type, ==, type);
5874
5875		ASSERT(hdr->b_l1hdr.b_buf != buf || buf->b_next != NULL);
5876		(void) remove_reference(hdr, hash_lock, tag);
5877
5878		if (arc_buf_is_shared(buf) && !ARC_BUF_COMPRESSED(buf)) {
5879			ASSERT3P(hdr->b_l1hdr.b_buf, !=, buf);
5880			ASSERT(ARC_BUF_LAST(buf));
5881		}
5882
5883		/*
5884		 * Pull the data off of this hdr and attach it to
5885		 * a new anonymous hdr. Also find the last buffer
5886		 * in the hdr's buffer list.
5887		 */
5888		arc_buf_t *lastbuf = arc_buf_remove(hdr, buf);
5889		ASSERT3P(lastbuf, !=, NULL);
5890
5891		/*
5892		 * If the current arc_buf_t and the hdr are sharing their data
5893		 * buffer, then we must stop sharing that block.
5894		 */
5895		if (arc_buf_is_shared(buf)) {
5896			VERIFY(!arc_buf_is_shared(lastbuf));
5897
5898			/*
5899			 * First, sever the block sharing relationship between
5900			 * buf and the arc_buf_hdr_t.
5901			 */
5902			arc_unshare_buf(hdr, buf);
5903
5904			/*
5905			 * Now we need to recreate the hdr's b_pabd. Since we
5906			 * have lastbuf handy, we try to share with it, but if
5907			 * we can't then we allocate a new b_pabd and copy the
5908			 * data from buf into it.
5909			 */
5910			if (arc_can_share(hdr, lastbuf)) {
5911				arc_share_buf(hdr, lastbuf);
5912			} else {
5913				arc_hdr_alloc_pabd(hdr);
5914				abd_copy_from_buf(hdr->b_l1hdr.b_pabd,
5915				    buf->b_data, psize);
5916			}
5917			VERIFY3P(lastbuf->b_data, !=, NULL);
5918		} else if (HDR_SHARED_DATA(hdr)) {
5919			/*
5920			 * Uncompressed shared buffers are always at the end
5921			 * of the list. Compressed buffers don't have the
5922			 * same requirements. This makes it hard to
5923			 * simply assert that the lastbuf is shared so
5924			 * we rely on the hdr's compression flags to determine
5925			 * if we have a compressed, shared buffer.
5926			 */
5927			ASSERT(arc_buf_is_shared(lastbuf) ||
5928			    HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF);
5929			ASSERT(!ARC_BUF_SHARED(buf));
5930		}
5931		ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
5932		ASSERT3P(state, !=, arc_l2c_only);
5933
5934		(void) refcount_remove_many(&state->arcs_size,
5935		    arc_buf_size(buf), buf);
5936
5937		if (refcount_is_zero(&hdr->b_l1hdr.b_refcnt)) {
5938			ASSERT3P(state, !=, arc_l2c_only);
5939			(void) refcount_remove_many(&state->arcs_esize[type],
5940			    arc_buf_size(buf), buf);
5941		}
5942
5943		hdr->b_l1hdr.b_bufcnt -= 1;
5944		arc_cksum_verify(buf);
5945#ifdef illumos
5946		arc_buf_unwatch(buf);
5947#endif
5948
5949		mutex_exit(hash_lock);
5950
5951		/*
5952		 * Allocate a new hdr. The new hdr will contain a b_pabd
5953		 * buffer which will be freed in arc_write().
5954		 */
5955		nhdr = arc_hdr_alloc(spa, psize, lsize, compress, type);
5956		ASSERT3P(nhdr->b_l1hdr.b_buf, ==, NULL);
5957		ASSERT0(nhdr->b_l1hdr.b_bufcnt);
5958		ASSERT0(refcount_count(&nhdr->b_l1hdr.b_refcnt));
5959		VERIFY3U(nhdr->b_type, ==, type);
5960		ASSERT(!HDR_SHARED_DATA(nhdr));
5961
5962		nhdr->b_l1hdr.b_buf = buf;
5963		nhdr->b_l1hdr.b_bufcnt = 1;
5964		(void) refcount_add(&nhdr->b_l1hdr.b_refcnt, tag);
5965		buf->b_hdr = nhdr;
5966
5967		mutex_exit(&buf->b_evict_lock);
5968		(void) refcount_add_many(&arc_anon->arcs_size,
5969		    arc_buf_size(buf), buf);
5970	} else {
5971		mutex_exit(&buf->b_evict_lock);
5972		ASSERT(refcount_count(&hdr->b_l1hdr.b_refcnt) == 1);
5973		/* protected by hash lock, or hdr is on arc_anon */
5974		ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node));
5975		ASSERT(!HDR_IO_IN_PROGRESS(hdr));
5976		arc_change_state(arc_anon, hdr, hash_lock);
5977		hdr->b_l1hdr.b_arc_access = 0;
5978		mutex_exit(hash_lock);
5979
5980		buf_discard_identity(hdr);
5981		arc_buf_thaw(buf);
5982	}
5983}
5984
5985int
5986arc_released(arc_buf_t *buf)
5987{
5988	int released;
5989
5990	mutex_enter(&buf->b_evict_lock);
5991	released = (buf->b_data != NULL &&
5992	    buf->b_hdr->b_l1hdr.b_state == arc_anon);
5993	mutex_exit(&buf->b_evict_lock);
5994	return (released);
5995}
5996
5997#ifdef ZFS_DEBUG
5998int
5999arc_referenced(arc_buf_t *buf)
6000{
6001	int referenced;
6002
6003	mutex_enter(&buf->b_evict_lock);
6004	referenced = (refcount_count(&buf->b_hdr->b_l1hdr.b_refcnt));
6005	mutex_exit(&buf->b_evict_lock);
6006	return (referenced);
6007}
6008#endif
6009
/*
 * "Ready" callback for write zios created by arc_write().
 *
 * zio->io_private is the arc_write_callback_t set up by arc_write().  This
 * function:
 *  1. undoes hdr state left by a previous invocation when the zio is being
 *     re-executed (ZIO_FLAG_REEXECUTED);
 *  2. invokes the consumer's awcb_ready callback;
 *  3. computes the buf's checksum and marks the hdr as I/O in progress;
 *  4. records the physical size and compression taken from zio->io_bp in
 *     the hdr;
 *  5. fills the hdr's b_pabd, either by copying or by sharing buf's data.
 */
static void
arc_write_ready(zio_t *zio)
{
	arc_write_callback_t *callback = zio->io_private;
	arc_buf_t *buf = callback->awcb_buf;
	arc_buf_hdr_t *hdr = buf->b_hdr;
	/* A hole has no physical size. */
	uint64_t psize = BP_IS_HOLE(zio->io_bp) ? 0 : BP_GET_PSIZE(zio->io_bp);

	ASSERT(HDR_HAS_L1HDR(hdr));
	ASSERT(!refcount_is_zero(&buf->b_hdr->b_l1hdr.b_refcnt));
	ASSERT(hdr->b_l1hdr.b_bufcnt > 0);

	/*
	 * If we're reexecuting this zio because the pool suspended, then
	 * cleanup any state that was previously set the first time the
	 * callback was invoked.
	 */
	if (zio->io_flags & ZIO_FLAG_REEXECUTED) {
		arc_cksum_free(hdr);
#ifdef illumos
		arc_buf_unwatch(buf);
#endif
		if (hdr->b_l1hdr.b_pabd != NULL) {
			if (arc_buf_is_shared(buf)) {
				arc_unshare_buf(hdr, buf);
			} else {
				arc_hdr_free_pabd(hdr);
			}
		}
	}
	/* From here on the hdr must own no data block and share nothing. */
	ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
	ASSERT(!HDR_SHARED_DATA(hdr));
	ASSERT(!arc_buf_is_shared(buf));

	callback->awcb_ready(zio, buf, callback->awcb_private);

	/* The in-progress flag may only already be set on a re-execution. */
	if (HDR_IO_IN_PROGRESS(hdr))
		ASSERT(zio->io_flags & ZIO_FLAG_REEXECUTED);

	arc_cksum_compute(buf);
	arc_hdr_set_flags(hdr, ARC_FLAG_IO_IN_PROGRESS);

	/* Holes and embedded BPs are recorded as uncompressed in the hdr. */
	enum zio_compress compress;
	if (BP_IS_HOLE(zio->io_bp) || BP_IS_EMBEDDED(zio->io_bp)) {
		compress = ZIO_COMPRESS_OFF;
	} else {
		ASSERT3U(HDR_GET_LSIZE(hdr), ==, BP_GET_LSIZE(zio->io_bp));
		compress = BP_GET_COMPRESS(zio->io_bp);
	}
	HDR_SET_PSIZE(hdr, psize);
	arc_hdr_set_compress(hdr, compress);


	/*
	 * Fill the hdr with data. If the hdr is compressed, the data we want
	 * is available from the zio, otherwise we can take it from the buf.
	 *
	 * We might be able to share the buf's data with the hdr here. However,
	 * doing so would cause the ARC to be full of linear ABDs if we write a
	 * lot of shareable data. As a compromise, we check whether scattered
	 * ABDs are allowed, and assume that if they are then the user wants
	 * the ARC to be primarily filled with them regardless of the data being
	 * written. Therefore, if they're allowed then we allocate one and copy
	 * the data into it; otherwise, we share the data directly if we can.
	 */
	if (zfs_abd_scatter_enabled || !arc_can_share(hdr, buf)) {
		arc_hdr_alloc_pabd(hdr);

		/*
		 * Ideally, we would always copy the io_abd into b_pabd, but the
		 * user may have disabled compressed ARC, thus we must check the
		 * hdr's compression setting rather than the io_bp's.
		 */
		if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF) {
			ASSERT3U(BP_GET_COMPRESS(zio->io_bp), !=,
			    ZIO_COMPRESS_OFF);
			ASSERT3U(psize, >, 0);

			/* Compressed hdr: take psize bytes from the zio. */
			abd_copy(hdr->b_l1hdr.b_pabd, zio->io_abd, psize);
		} else {
			ASSERT3U(zio->io_orig_size, ==, arc_hdr_size(hdr));

			/* Uncompressed hdr: take the data from the buf. */
			abd_copy_from_buf(hdr->b_l1hdr.b_pabd, buf->b_data,
			    arc_buf_size(buf));
		}
	} else {
		ASSERT3P(buf->b_data, ==, abd_to_buf(zio->io_orig_abd));
		ASSERT3U(zio->io_orig_size, ==, arc_buf_size(buf));
		ASSERT3U(hdr->b_l1hdr.b_bufcnt, ==, 1);

		arc_share_buf(hdr, buf);
	}

	arc_hdr_verify(hdr, zio->io_bp);
}
6105
6106static void
6107arc_write_children_ready(zio_t *zio)
6108{
6109	arc_write_callback_t *callback = zio->io_private;
6110	arc_buf_t *buf = callback->awcb_buf;
6111
6112	callback->awcb_children_ready(zio, buf, callback->awcb_private);
6113}
6114
6115/*
6116 * The SPA calls this callback for each physical write that happens on behalf
6117 * of a logical write.  See the comment in dbuf_write_physdone() for details.
6118 */
6119static void
6120arc_write_physdone(zio_t *zio)
6121{
6122	arc_write_callback_t *cb = zio->io_private;
6123	if (cb->awcb_physdone != NULL)
6124		cb->awcb_physdone(zio, cb->awcb_buf, cb->awcb_private);
6125}
6126
/*
 * "Done" callback for write zios created by arc_write().
 *
 * On success the hdr takes its identity (DVA and birth txg) from the written
 * bp, unless the bp is a hole or embedded, in which case the identity is
 * discarded.  A hdr that ends up with an identity is inserted into the hash
 * table, with any collision with an existing hdr handled below.  In all
 * cases ARC_FLAG_IO_IN_PROGRESS is cleared, the consumer's awcb_done
 * callback is invoked, and the zio's abd wrapper and the
 * arc_write_callback_t are released.
 */
static void
arc_write_done(zio_t *zio)
{
	arc_write_callback_t *callback = zio->io_private;
	arc_buf_t *buf = callback->awcb_buf;
	arc_buf_hdr_t *hdr = buf->b_hdr;

	ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL);

	if (zio->io_error == 0) {
		arc_hdr_verify(hdr, zio->io_bp);

		if (BP_IS_HOLE(zio->io_bp) || BP_IS_EMBEDDED(zio->io_bp)) {
			buf_discard_identity(hdr);
		} else {
			hdr->b_dva = *BP_IDENTITY(zio->io_bp);
			hdr->b_birth = BP_PHYSICAL_BIRTH(zio->io_bp);
		}
	} else {
		/* A failed write must not have given the hdr an identity. */
		ASSERT(HDR_EMPTY(hdr));
	}

	/*
	 * If the block to be written was all-zero or compressed enough to be
	 * embedded in the BP, no write was performed so there will be no
	 * dva/birth/checksum.  The buffer must therefore remain anonymous
	 * (and uncached).
	 */
	if (!HDR_EMPTY(hdr)) {
		arc_buf_hdr_t *exists;
		kmutex_t *hash_lock;

		ASSERT3U(zio->io_error, ==, 0);

		arc_cksum_verify(buf);

		/*
		 * buf_hash_insert() hands back the hash lock through
		 * hash_lock; it is released with mutex_exit() below.  A
		 * non-NULL return is a hdr already present for this identity.
		 */
		exists = buf_hash_insert(hdr, &hash_lock);
		if (exists != NULL) {
			/*
			 * This can only happen if we overwrite for
			 * sync-to-convergence, because we remove
			 * buffers from the hash table when we arc_free().
			 */
			if (zio->io_flags & ZIO_FLAG_IO_REWRITE) {
				if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp))
					panic("bad overwrite, hdr=%p exists=%p",
					    (void *)hdr, (void *)exists);
				ASSERT(refcount_is_zero(
				    &exists->b_l1hdr.b_refcnt));
				/*
				 * Evict the stale hdr, then retry the insert;
				 * the retry takes the hash lock again.
				 */
				arc_change_state(arc_anon, exists, hash_lock);
				mutex_exit(hash_lock);
				arc_hdr_destroy(exists);
				exists = buf_hash_insert(hdr, &hash_lock);
				ASSERT3P(exists, ==, NULL);
			} else if (zio->io_flags & ZIO_FLAG_NOPWRITE) {
				/* nopwrite */
				ASSERT(zio->io_prop.zp_nopwrite);
				if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp))
					panic("bad nopwrite, hdr=%p exists=%p",
					    (void *)hdr, (void *)exists);
			} else {
				/* Dedup */
				ASSERT(hdr->b_l1hdr.b_bufcnt == 1);
				ASSERT(hdr->b_l1hdr.b_state == arc_anon);
				ASSERT(BP_GET_DEDUP(zio->io_bp));
				ASSERT(BP_GET_LEVEL(zio->io_bp) == 0);
			}
		}
		arc_hdr_clear_flags(hdr, ARC_FLAG_IO_IN_PROGRESS);
		/* if it's not anon, we are doing a scrub */
		if (exists == NULL && hdr->b_l1hdr.b_state == arc_anon)
			arc_access(hdr, hash_lock);
		mutex_exit(hash_lock);
	} else {
		/* No identity, no hash insertion: just end the I/O state. */
		arc_hdr_clear_flags(hdr, ARC_FLAG_IO_IN_PROGRESS);
	}

	ASSERT(!refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
	callback->awcb_done(zio, buf, callback->awcb_private);

	/* Release the abd wrapper obtained with abd_get_from_buf() in arc_write(). */
	abd_put(zio->io_abd);
	kmem_free(callback, sizeof (arc_write_callback_t));
}
6210
6211zio_t *
6212arc_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, arc_buf_t *buf,
6213    boolean_t l2arc, const zio_prop_t *zp, arc_write_done_func_t *ready,
6214    arc_write_done_func_t *children_ready, arc_write_done_func_t *physdone,
6215    arc_write_done_func_t *done, void *private, zio_priority_t priority,
6216    int zio_flags, const zbookmark_phys_t *zb)
6217{
6218	arc_buf_hdr_t *hdr = buf->b_hdr;
6219	arc_write_callback_t *callback;
6220	zio_t *zio;
6221	zio_prop_t localprop = *zp;
6222
6223	ASSERT3P(ready, !=, NULL);
6224	ASSERT3P(done, !=, NULL);
6225	ASSERT(!HDR_IO_ERROR(hdr));
6226	ASSERT(!HDR_IO_IN_PROGRESS(hdr));
6227	ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL);
6228	ASSERT3U(hdr->b_l1hdr.b_bufcnt, >, 0);
6229	if (l2arc)
6230		arc_hdr_set_flags(hdr, ARC_FLAG_L2CACHE);
6231	if (ARC_BUF_COMPRESSED(buf)) {
6232		/*
6233		 * We're writing a pre-compressed buffer.  Make the
6234		 * compression algorithm requested by the zio_prop_t match
6235		 * the pre-compressed buffer's compression algorithm.
6236		 */
6237		localprop.zp_compress = HDR_GET_COMPRESS(hdr);
6238
6239		ASSERT3U(HDR_GET_LSIZE(hdr), !=, arc_buf_size(buf));
6240		zio_flags |= ZIO_FLAG_RAW;
6241	}
6242	callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP);
6243	callback->awcb_ready = ready;
6244	callback->awcb_children_ready = children_ready;
6245	callback->awcb_physdone = physdone;
6246	callback->awcb_done = done;
6247	callback->awcb_private = private;
6248	callback->awcb_buf = buf;
6249
6250	/*
6251	 * The hdr's b_pabd is now stale, free it now. A new data block
6252	 * will be allocated when the zio pipeline calls arc_write_ready().
6253	 */
6254	if (hdr->b_l1hdr.b_pabd != NULL) {
6255		/*
6256		 * If the buf is currently sharing the data block with
6257		 * the hdr then we need to break that relationship here.
6258		 * The hdr will remain with a NULL data pointer and the
6259		 * buf will take sole ownership of the block.
6260		 */
6261		if (arc_buf_is_shared(buf)) {
6262			arc_unshare_buf(hdr, buf);
6263		} else {
6264			arc_hdr_free_pabd(hdr);
6265		}
6266		VERIFY3P(buf->b_data, !=, NULL);
6267		arc_hdr_set_compress(hdr, ZIO_COMPRESS_OFF);
6268	}
6269	ASSERT(!arc_buf_is_shared(buf));
6270	ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
6271
6272	zio = zio_write(pio, spa, txg, bp,
6273	    abd_get_from_buf(buf->b_data, HDR_GET_LSIZE(hdr)),
6274	    HDR_GET_LSIZE(hdr), arc_buf_size(buf), &localprop, arc_write_ready,
6275	    (children_ready != NULL) ? arc_write_children_ready : NULL,
6276	    arc_write_physdone, arc_write_done, callback,
6277	    priority, zio_flags, zb);
6278
6279	return (zio);
6280}
6281
/*
 * Decide whether a write reserving "reserve" bytes in transaction group
 * "txg" should be throttled because memory is tight.
 *
 * Returns 0 when the write may proceed, ERESTART when the pageout process
 * has accumulated too much page load, and EAGAIN when page load is pending
 * and the ARC needs to reclaim.  Outside of the kernel this is a no-op
 * that always returns 0.
 */
static int
arc_memory_throttle(uint64_t reserve, uint64_t txg)
{
#ifdef _KERNEL
	static uint64_t last_txg = 0;
	static uint64_t page_load = 0;
	uint64_t avail = ptob(freemem);

#if defined(__i386) || !defined(UMA_MD_SMALL_ALLOC)
	avail = MIN(avail, uma_avail());
#endif

	/* Free memory is above the arc_lotsfree_percent mark: no throttle. */
	if (freemem > (uint64_t)physmem * arc_lotsfree_percent / 100)
		return (0);

	/* The accumulated page load starts over with every newer txg. */
	if (txg > last_txg) {
		last_txg = txg;
		page_load = 0;
	}

	/*
	 * Being called from pageout means memory is already tight and the
	 * arc is already going to be evicting, so page writes are allowed
	 * to continue as quickly as possible until the load limit is hit.
	 */
	if (curproc == pageproc) {
		if (page_load > MAX(ptob(minfree), avail) / 4)
			return (SET_ERROR(ERESTART));
		/* Note: reserve is inflated, so we deflate */
		page_load += reserve / 8;
		return (0);
	}

	if (page_load > 0 && arc_reclaim_needed()) {
		/* memory is low, delay before restarting */
		ARCSTAT_INCR(arcstat_memory_throttle_count, 1);
		return (SET_ERROR(EAGAIN));
	}

	page_load = 0;
#endif
	return (0);
}
6321
6322void
6323arc_tempreserve_clear(uint64_t reserve)
6324{
6325	atomic_add_64(&arc_tempreserve, -reserve);
6326	ASSERT((int64_t)arc_tempreserve >= 0);
6327}
6328
6329int
6330arc_tempreserve_space(uint64_t reserve, uint64_t txg)
6331{
6332	int error;
6333	uint64_t anon_size;
6334
6335	if (reserve > arc_c/4 && !arc_no_grow) {
6336		arc_c = MIN(arc_c_max, reserve * 4);
6337		DTRACE_PROBE1(arc__set_reserve, uint64_t, arc_c);
6338	}
6339	if (reserve > arc_c)
6340		return (SET_ERROR(ENOMEM));
6341
6342	/*
6343	 * Don't count loaned bufs as in flight dirty data to prevent long
6344	 * network delays from blocking transactions that are ready to be
6345	 * assigned to a txg.
6346	 */
6347
6348	/* assert that it has not wrapped around */
6349	ASSERT3S(atomic_add_64_nv(&arc_loaned_bytes, 0), >=, 0);
6350
6351	anon_size = MAX((int64_t)(refcount_count(&arc_anon->arcs_size) -
6352	    arc_loaned_bytes), 0);
6353
6354	/*
6355	 * Writes will, almost always, require additional memory allocations
6356	 * in order to compress/encrypt/etc the data.  We therefore need to
6357	 * make sure that there is sufficient available memory for this.
6358	 */
6359	error = arc_memory_throttle(reserve, txg);
6360	if (error != 0)
6361		return (error);
6362
6363	/*
6364	 * Throttle writes when the amount of dirty data in the cache
6365	 * gets too large.  We try to keep the cache less than half full
6366	 * of dirty blocks so that our sync times don't grow too large.
6367	 * Note: if two requests come in concurrently, we might let them
6368	 * both succeed, when one of them should fail.  Not a huge deal.
6369	 */
6370
6371	if (reserve + arc_tempreserve + anon_size > arc_c / 2 &&
6372	    anon_size > arc_c / 4) {
6373		uint64_t meta_esize =
6374		    refcount_count(&arc_anon->arcs_esize[ARC_BUFC_METADATA]);
6375		uint64_t data_esize =
6376		    refcount_count(&arc_anon->arcs_esize[ARC_BUFC_DATA]);
6377		dprintf("failing, arc_tempreserve=%lluK anon_meta=%lluK "
6378		    "anon_data=%lluK tempreserve=%lluK arc_c=%lluK\n",
6379		    arc_tempreserve >> 10, meta_esize >> 10,
6380		    data_esize >> 10, reserve >> 10, arc_c >> 10);
6381		return (SET_ERROR(ERESTART));
6382	}
6383	atomic_add_64(&arc_tempreserve, reserve);
6384	return (0);
6385}
6386
6387static void
6388arc_kstat_update_state(arc_state_t *state, kstat_named_t *size,
6389    kstat_named_t *evict_data, kstat_named_t *evict_metadata)
6390{
6391	size->value.ui64 = refcount_count(&state->arcs_size);
6392	evict_data->value.ui64 =
6393	    refcount_count(&state->arcs_esize[ARC_BUFC_DATA]);
6394	evict_metadata->value.ui64 =
6395	    refcount_count(&state->arcs_esize[ARC_BUFC_METADATA]);
6396}
6397
6398static int
6399arc_kstat_update(kstat_t *ksp, int rw)
6400{
6401	arc_stats_t *as = ksp->ks_data;
6402
6403	if (rw == KSTAT_WRITE) {
6404		return (EACCES);
6405	} else {
6406		arc_kstat_update_state(arc_anon,
6407		    &as->arcstat_anon_size,
6408		    &as->arcstat_anon_evictable_data,
6409		    &as->arcstat_anon_evictable_metadata);
6410		arc_kstat_update_state(arc_mru,
6411		    &as->arcstat_mru_size,
6412		    &as->arcstat_mru_evictable_data,
6413		    &as->arcstat_mru_evictable_metadata);
6414		arc_kstat_update_state(arc_mru_ghost,
6415		    &as->arcstat_mru_ghost_size,
6416		    &as->arcstat_mru_ghost_evictable_data,
6417		    &as->arcstat_mru_ghost_evictable_metadata);
6418		arc_kstat_update_state(arc_mfu,
6419		    &as->arcstat_mfu_size,
6420		    &as->arcstat_mfu_evictable_data,
6421		    &as->arcstat_mfu_evictable_metadata);
6422		arc_kstat_update_state(arc_mfu_ghost,
6423		    &as->arcstat_mfu_ghost_size,
6424		    &as->arcstat_mfu_ghost_evictable_data,
6425		    &as->arcstat_mfu_ghost_evictable_metadata);
6426
6427		ARCSTAT(arcstat_size) = aggsum_value(&arc_size);
6428		ARCSTAT(arcstat_meta_used) = aggsum_value(&arc_meta_used);
6429		ARCSTAT(arcstat_data_size) = aggsum_value(&astat_data_size);
6430		ARCSTAT(arcstat_metadata_size) =
6431		    aggsum_value(&astat_metadata_size);
6432		ARCSTAT(arcstat_hdr_size) = aggsum_value(&astat_hdr_size);
6433		ARCSTAT(arcstat_other_size) = aggsum_value(&astat_other_size);
6434		ARCSTAT(arcstat_l2_hdr_size) = aggsum_value(&astat_l2_hdr_size);
6435	}
6436
6437	return (0);
6438}
6439
6440/*
6441 * This function *must* return indices evenly distributed between all
6442 * sublists of the multilist. This is needed due to how the ARC eviction
6443 * code is laid out; arc_evict_state() assumes ARC buffers are evenly
6444 * distributed between all sublists and uses this assumption when
6445 * deciding which sublist to evict from and how much to evict from it.
6446 */
6447unsigned int
6448arc_state_multilist_index_func(multilist_t *ml, void *obj)
6449{
6450	arc_buf_hdr_t *hdr = obj;
6451
6452	/*
6453	 * We rely on b_dva to generate evenly distributed index
6454	 * numbers using buf_hash below. So, as an added precaution,
6455	 * let's make sure we never add empty buffers to the arc lists.
6456	 */
6457	ASSERT(!HDR_EMPTY(hdr));
6458
6459	/*
6460	 * The assumption here, is the hash value for a given
6461	 * arc_buf_hdr_t will remain constant throughout it's lifetime
6462	 * (i.e. it's b_spa, b_dva, and b_birth fields don't change).
6463	 * Thus, we don't need to store the header's sublist index
6464	 * on insertion, as this index can be recalculated on removal.
6465	 *
6466	 * Also, the low order bits of the hash value are thought to be
6467	 * distributed evenly. Otherwise, in the case that the multilist
6468	 * has a power of two number of sublists, each sublists' usage
6469	 * would not be evenly distributed.
6470	 */
6471	return (buf_hash(hdr->b_spa, &hdr->b_dva, hdr->b_birth) %
6472	    multilist_get_num_sublists(ml));
6473}
6474
6475#ifdef _KERNEL
6476static eventhandler_tag arc_event_lowmem = NULL;
6477
6478static void
6479arc_lowmem(void *arg __unused, int howto __unused)
6480{
6481
6482	mutex_enter(&arc_reclaim_lock);
6483	DTRACE_PROBE1(arc__needfree, int64_t, ((int64_t)freemem - zfs_arc_free_target) * PAGESIZE);
6484