xref: /illumos-gate/usr/src/uts/common/fs/zfs/arc.c (revision f43aa5fa)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23  * Copyright (c) 2019, Joyent, Inc.
24  * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
25  * Copyright (c) 2014 by Saso Kiselkov. All rights reserved.
26  * Copyright 2017 Nexenta Systems, Inc.  All rights reserved.
27  * Copyright (c) 2011, 2019, Delphix. All rights reserved.
28  * Copyright (c) 2020, George Amanakis. All rights reserved.
29  * Copyright (c) 2020, The FreeBSD Foundation [1]
30  *
31  * [1] Portions of this software were developed by Allan Jude
32  *     under sponsorship from the FreeBSD Foundation.
33  */
34 
35 /*
36  * DVA-based Adjustable Replacement Cache
37  *
38  * While much of the theory of operation used here is
39  * based on the self-tuning, low overhead replacement cache
40  * presented by Megiddo and Modha at FAST 2003, there are some
41  * significant differences:
42  *
43  * 1. The Megiddo and Modha model assumes any page is evictable.
44  * Pages in its cache cannot be "locked" into memory.  This makes
45  * the eviction algorithm simple: evict the last page in the list.
46  * This also makes the performance characteristics easy to reason
47  * about.  Our cache is not so simple.  At any given moment, some
48  * subset of the blocks in the cache are un-evictable because we
49  * have handed out a reference to them.  Blocks are only evictable
50  * when there are no external references active.  This makes
51  * eviction far more problematic:  we choose to evict the evictable
52  * blocks that are the "lowest" in the list.
53  *
54  * There are times when it is not possible to evict the requested
55  * space.  In these circumstances we are unable to adjust the cache
56  * size.  To prevent the cache growing unbounded at these times we
57  * implement a "cache throttle" that slows the flow of new data
58  * into the cache until we can make space available.
59  *
60  * 2. The Megiddo and Modha model assumes a fixed cache size.
61  * Pages are evicted when the cache is full and there is a cache
62  * miss.  Our model has a variable sized cache.  It grows with
63  * high use, but also tries to react to memory pressure from the
64  * operating system: decreasing its size when system memory is
65  * tight.
66  *
67  * 3. The Megiddo and Modha model assumes a fixed page size. All
68  * elements of the cache are therefore exactly the same size.  So
69  * when adjusting the cache size following a cache miss, it's simply
70  * a matter of choosing a single page to evict.  In our model, we
71  * have variable sized cache blocks (ranging from 512 bytes to
72  * 128K bytes).  We therefore choose a set of blocks to evict to make
73  * space for a cache miss that approximates as closely as possible
74  * the space used by the new block.
75  *
76  * See also:  "ARC: A Self-Tuning, Low Overhead Replacement Cache"
77  * by N. Megiddo & D. Modha, FAST 2003
78  */
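/*
 * A minimal sketch (schematic only, not code from this file) of the
 * variable-size eviction idea in (3) above: evict evictable blocks,
 * whatever their individual sizes, until roughly the space needed by the
 * missed block has been recovered. next_evictable() and evict() are
 * hypothetical helpers used purely for illustration.
 *
 *	arc_buf_hdr_t *hdr;
 *	uint64_t freed = 0;
 *	while (freed < needed && (hdr = next_evictable()) != NULL)
 *		freed += evict(hdr);
 */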
79 
80 /*
81  * The locking model:
82  *
83  * A new reference to a cache buffer can be obtained in two
84  * ways: 1) via a hash table lookup using the DVA as a key,
85  * or 2) via one of the ARC lists.  The arc_read() interface
86  * uses method 1, while the internal ARC algorithms for
87  * adjusting the cache use method 2.  We therefore provide two
88  * types of locks: 1) the hash table lock array, and 2) the
89  * ARC list locks.
90  *
91  * Buffers do not have their own mutexes; rather, they rely on the
92  * hash table mutexes for the bulk of their protection (i.e. most
93  * fields in the arc_buf_hdr_t are protected by these mutexes).
94  *
95  * buf_hash_find() returns the appropriate mutex (held) when it
96  * locates the requested buffer in the hash table.  It returns
97  * NULL for the mutex if the buffer was not in the table.
98  *
99  * buf_hash_remove() expects the appropriate hash mutex to be
100  * already held before it is invoked.
101  *
102  * Each ARC state also has a mutex which is used to protect the
103  * buffer list associated with the state.  When attempting to
104  * obtain a hash table lock while holding an ARC list lock you
105  * must use mutex_tryenter() to avoid deadlock.  Also note that
106  * the active state mutex must be held before the ghost state mutex.
107  *
108  * Note that the majority of the performance stats are manipulated
109  * with atomic operations.
110  *
111  * The L2ARC uses the l2ad_mtx on each vdev for the following:
112  *
113  *	- L2ARC buflist creation
114  *	- L2ARC buflist eviction
115  *	- L2ARC write completion, which walks L2ARC buflists
116  *	- ARC header destruction, as it removes from L2ARC buflists
117  *	- ARC header release, as it removes from L2ARC buflists
118  */
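/*
 * A minimal sketch (schematic only, not code from this file) of the
 * lock-ordering rule above: a thread that already holds an ARC list lock
 * must use mutex_tryenter() on the hash table lock, backing off on
 * contention (within its eviction loop) rather than blocking and risking
 * deadlock.
 *
 *	kmutex_t *hash_lock = HDR_LOCK(hdr);
 *	if (!mutex_tryenter(hash_lock)) {
 *		ARCSTAT_BUMP(arcstat_mutex_miss);
 *		continue;			// skip this header for now
 *	}
 *	// ... work on the header ...
 *	mutex_exit(hash_lock);
 */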
119 
120 /*
121  * ARC operation:
122  *
123  * Every block that is in the ARC is tracked by an arc_buf_hdr_t structure.
124  * This structure can point either to a block that is still in the cache or to
125  * one that is only accessible in an L2 ARC device, or it can provide
126  * information about a block that was recently evicted. If a block is
127  * only accessible in the L2ARC, then the arc_buf_hdr_t only has enough
128  * information to retrieve it from the L2ARC device. This information is
129  * stored in the l2arc_buf_hdr_t sub-structure of the arc_buf_hdr_t. The data
130  * of a block in this state cannot be accessed directly.
131  *
132  * Blocks that are actively being referenced or have not been evicted
133  * are cached in the L1ARC. The L1ARC (l1arc_buf_hdr_t) is a structure within
134  * the arc_buf_hdr_t that will point to the data block in memory. A block can
135  * only be read by a consumer if it has an l1arc_buf_hdr_t. The L1ARC
136  * caches data in two ways -- in a list of ARC buffers (arc_buf_t) and
137  * also in the arc_buf_hdr_t's private physical data block pointer (b_pabd).
138  *
139  * The L1ARC's data pointer may or may not be uncompressed. The ARC has the
140  * ability to store the physical data (b_pabd) associated with the DVA of the
141  * arc_buf_hdr_t. Since the b_pabd is a copy of the on-disk physical block,
142  * it will match its on-disk compression characteristics. This behavior can be
143  * disabled by setting 'zfs_compressed_arc_enabled' to B_FALSE. When the
144  * compressed ARC functionality is disabled, the b_pabd will point to an
145  * uncompressed version of the on-disk data.
146  *
147  * Data in the L1ARC is not accessed by consumers of the ARC directly. Each
148  * arc_buf_hdr_t can have multiple ARC buffers (arc_buf_t) which reference it.
149  * Each ARC buffer (arc_buf_t) is being actively accessed by a specific ARC
150  * consumer. The ARC will provide references to this data and will keep it
151  * cached until it is no longer in use. The ARC caches only the L1ARC's physical
152  * data block and will evict any arc_buf_t that is no longer referenced. The
153  * amount of memory consumed by the arc_buf_ts' data buffers can be seen via the
154  * "overhead_size" kstat.
155  *
156  * Depending on the consumer, an arc_buf_t can be requested in uncompressed or
157  * compressed form. The typical case is that consumers will want uncompressed
158  * data, and when that happens a new data buffer is allocated where the data is
159  * decompressed for them to use. Currently the only consumer who wants
160  * compressed arc_buf_t's is "zfs send", when it streams data exactly as it
161  * exists on disk. When this happens, the arc_buf_t's data buffer is shared
162  * with the arc_buf_hdr_t.
163  *
164  * Here is a diagram showing an arc_buf_hdr_t referenced by two arc_buf_t's. The
165  * first one is owned by a compressed send consumer (and therefore references
166  * the same compressed data buffer as the arc_buf_hdr_t) and the second could be
167  * used by any other consumer (and has its own uncompressed copy of the data
168  * buffer).
169  *
170  *   arc_buf_hdr_t
171  *   +-----------+
172  *   | fields    |
173  *   | common to |
174  *   | L1- and   |
175  *   | L2ARC     |
176  *   +-----------+
177  *   | l2arc_buf_hdr_t
178  *   |           |
179  *   +-----------+
180  *   | l1arc_buf_hdr_t
181  *   |           |              arc_buf_t
182  *   | b_buf     +------------>+-----------+      arc_buf_t
183  *   | b_pabd    +-+           |b_next     +---->+-----------+
184  *   +-----------+ |           |-----------|     |b_next     +-->NULL
185  *                 |           |b_comp = T |     +-----------+
186  *                 |           |b_data     +-+   |b_comp = F |
187  *                 |           +-----------+ |   |b_data     +-+
188  *                 +->+------+               |   +-----------+ |
189  *        compressed  |      |               |                 |
190  *           data     |      |<--------------+                 | uncompressed
191  *                    +------+          compressed,            |     data
192  *                                        shared               +-->+------+
193  *                                         data                    |      |
194  *                                                                 |      |
195  *                                                                 +------+
196  *
197  * When a consumer reads a block, the ARC must first look to see if the
198  * arc_buf_hdr_t is cached. If the hdr is cached then the ARC allocates a new
199  * arc_buf_t and either copies uncompressed data into a new data buffer from an
200  * existing uncompressed arc_buf_t, decompresses the hdr's b_pabd buffer into a
201  * new data buffer, or shares the hdr's b_pabd buffer, depending on whether the
202  * hdr is compressed and the desired compression characteristics of the
203  * arc_buf_t consumer. If the arc_buf_t ends up sharing data with the
204  * arc_buf_hdr_t and both of them are uncompressed then the arc_buf_t must be
205  * the last buffer in the hdr's b_buf list, however a shared compressed buf can
206  * be anywhere in the hdr's list.
207  *
208  * The diagram below shows an example of an uncompressed ARC hdr that is
209  * sharing its data with an arc_buf_t (note that the shared uncompressed buf is
210  * the last element in the buf list):
211  *
212  *                arc_buf_hdr_t
213  *                +-----------+
214  *                |           |
215  *                |           |
216  *                |           |
217  *                +-----------+
218  * l2arc_buf_hdr_t|           |
219  *                |           |
220  *                +-----------+
221  * l1arc_buf_hdr_t|           |
222  *                |           |                 arc_buf_t    (shared)
223  *                |    b_buf  +------------>+---------+      arc_buf_t
224  *                |           |             |b_next   +---->+---------+
225  *                |  b_pabd   +-+           |---------|     |b_next   +-->NULL
226  *                +-----------+ |           |         |     +---------+
227  *                              |           |b_data   +-+   |         |
228  *                              |           +---------+ |   |b_data   +-+
229  *                              +->+------+             |   +---------+ |
230  *                                 |      |             |               |
231  *                   uncompressed  |      |             |               |
232  *                        data     +------+             |               |
233  *                                    ^                 +->+------+     |
234  *                                    |       uncompressed |      |     |
235  *                                    |           data     |      |     |
236  *                                    |                    +------+     |
237  *                                    +---------------------------------+
238  *
239  * Writing to the ARC requires that the ARC first discard the hdr's b_pabd
240  * since the physical block is about to be rewritten. The new data contents
241  * will be contained in the arc_buf_t. As the I/O pipeline performs the write,
242  * it may compress the data before writing it to disk. The ARC will be called
243  * with the transformed data and will bcopy the transformed on-disk block into
244  * a newly allocated b_pabd. Writes are always done into buffers which have
245  * either been loaned (and hence are new and don't have other readers) or
246  * buffers which have been released (and hence have their own hdr, if there
247  * were originally other readers of the buf's original hdr). This ensures that
248  * the ARC only needs to update a single buf and its hdr after a write occurs.
249  *
250  * When the L2ARC is in use, it will also take advantage of the b_pabd. The
251  * L2ARC will always write the contents of b_pabd to the L2ARC. This means
252  * that when compressed ARC is enabled, the L2ARC blocks are identical
253  * to the on-disk block in the main data pool. This provides a significant
254  * advantage since the ARC can leverage the bp's checksum when reading from the
255  * L2ARC to determine if the contents are valid. However, if the compressed
256  * ARC is disabled, then the L2ARC's block must be transformed to look
257  * like the physical block in the main data pool before comparing the
258  * checksum and determining its validity.
259  *
260  * The L1ARC has a slightly different system for storing encrypted data.
261  * Raw (encrypted + possibly compressed) data has a few subtle differences from
262  * data that is just compressed. The biggest difference is that it is not
263  * possible to decrypt encrypted data (or vice versa) if the keys aren't loaded.
264  * The other difference is that encryption cannot be treated as a suggestion.
265  * If a caller would prefer compressed data, but they actually wind up with
266  * uncompressed data, the worst thing that could happen is there might be a
267  * performance hit. If the caller requests encrypted data, however, we must be
268  * sure they actually get it or else secret information could be leaked. Raw
269  * data is stored in hdr->b_crypt_hdr.b_rabd. An encrypted header, therefore,
270  * may have both an encrypted version and a decrypted version of its data at
271  * once. When a caller needs a raw arc_buf_t, it is allocated and the data is
272  * copied out of this header. To avoid complications with b_pabd, raw buffers
273  * cannot be shared.
274  */
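/*
 * A minimal sketch (schematic only, not code from this file) tying the
 * diagrams above to the structures: walking a header's buffer list via
 * b_next to reach the last arc_buf_t, the only position at which an
 * uncompressed buffer may share the hdr's b_pabd.
 *
 *	arc_buf_t *buf = hdr->b_l1hdr.b_buf;
 *	while (!ARC_BUF_LAST(buf))
 *		buf = buf->b_next;
 *	// "buf" is now the last buffer in the hdr's b_buf list.
 */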
275 
276 #include <sys/spa.h>
277 #include <sys/zio.h>
278 #include <sys/spa_impl.h>
279 #include <sys/zio_compress.h>
280 #include <sys/zio_checksum.h>
281 #include <sys/zfs_context.h>
282 #include <sys/arc.h>
283 #include <sys/refcount.h>
284 #include <sys/vdev.h>
285 #include <sys/vdev_impl.h>
286 #include <sys/dsl_pool.h>
287 #include <sys/zio_checksum.h>
288 #include <sys/multilist.h>
289 #include <sys/abd.h>
290 #include <sys/zil.h>
291 #include <sys/fm/fs/zfs.h>
292 #ifdef _KERNEL
293 #include <sys/vmsystm.h>
294 #include <vm/anon.h>
295 #include <sys/fs/swapnode.h>
296 #include <sys/dnlc.h>
297 #endif
298 #include <sys/callb.h>
299 #include <sys/kstat.h>
300 #include <sys/zthr.h>
301 #include <zfs_fletcher.h>
302 #include <sys/arc_impl.h>
303 #include <sys/aggsum.h>
304 #include <sys/cityhash.h>
305 #include <sys/param.h>
306 
307 #ifndef _KERNEL
308 /* set with ZFS_DEBUG=watch, to enable watchpoints on frozen buffers */
309 boolean_t arc_watch = B_FALSE;
310 int arc_procfd;
311 #endif
312 
313 /*
314  * This thread's job is to keep enough free memory in the system, by
315  * calling arc_kmem_reap_soon() plus arc_shrink(), which improves
316  * arc_available_memory().
317  */
318 static zthr_t		*arc_reap_zthr;
319 
320 /*
321  * This thread's job is to keep arc_size under arc_c, by calling
322  * arc_adjust(), which improves arc_is_overflowing().
323  */
324 static zthr_t		*arc_adjust_zthr;
325 
326 static kmutex_t		arc_adjust_lock;
327 static kcondvar_t	arc_adjust_waiters_cv;
328 static boolean_t	arc_adjust_needed = B_FALSE;
329 
330 uint_t arc_reduce_dnlc_percent = 3;
331 
332 /*
333  * The number of headers to evict in arc_evict_state_impl() before
334  * dropping the sublist lock and evicting from another sublist. A lower
335  * value means we're more likely to evict the "correct" header (i.e. the
336  * oldest header in the arc state), but comes with higher overhead
337  * (i.e. more invocations of arc_evict_state_impl()).
338  */
339 int zfs_arc_evict_batch_limit = 10;
340 
341 /* number of seconds before growing cache again */
342 int arc_grow_retry = 60;
343 
344 /*
345  * Minimum time between calls to arc_kmem_reap_soon().  Note that this will
346  * be converted to ticks, so with the default hz=100, a setting of 15 ms
347  * will actually wait 2 ticks, or 20ms.
348  */
349 int arc_kmem_cache_reap_retry_ms = 1000;
350 
351 /* shift of arc_c for calculating overflow limit in arc_get_data_impl */
352 int zfs_arc_overflow_shift = 8;
353 
354 /* shift of arc_c for calculating both min and max arc_p */
355 int arc_p_min_shift = 4;
356 
357 /* log2(fraction of arc to reclaim) */
358 int arc_shrink_shift = 7;
359 
360 /*
361  * log2(fraction of ARC which must be free to allow growing).
362  * I.e., if there is less than arc_c >> arc_no_grow_shift free memory,
363  * when reading a new block into the ARC, we will evict an equal-sized block
364  * from the ARC.
365  *
366  * This must be less than arc_shrink_shift, so that when we shrink the ARC,
367  * we will still not allow it to grow.
368  */
369 int			arc_no_grow_shift = 5;
370 
371 
372 /*
373  * minimum lifespan of a prefetch block in clock ticks
374  * (initialized in arc_init())
375  */
376 static int		zfs_arc_min_prefetch_ms = 1;
377 static int		zfs_arc_min_prescient_prefetch_ms = 6;
378 
379 /*
380  * If this percent of memory is free, don't throttle.
381  */
382 int arc_lotsfree_percent = 10;
383 
384 static boolean_t arc_initialized;
385 
386 /*
387  * The arc has filled available memory and has now warmed up.
388  */
389 static boolean_t arc_warm;
390 
391 /*
392  * log2 fraction of the zio arena to keep free.
393  */
394 int arc_zio_arena_free_shift = 2;
395 
396 /*
397  * These tunables are for performance analysis.
398  */
399 uint64_t zfs_arc_max;
400 uint64_t zfs_arc_min;
401 uint64_t zfs_arc_meta_limit = 0;
402 uint64_t zfs_arc_meta_min = 0;
403 int zfs_arc_grow_retry = 0;
404 int zfs_arc_shrink_shift = 0;
405 int zfs_arc_p_min_shift = 0;
406 int zfs_arc_average_blocksize = 8 * 1024; /* 8KB */
407 
408 /*
409  * ARC dirty data constraints for arc_tempreserve_space() throttle
410  */
411 uint_t zfs_arc_dirty_limit_percent = 50;	/* total dirty data limit */
412 uint_t zfs_arc_anon_limit_percent = 25;		/* anon block dirty limit */
413 uint_t zfs_arc_pool_dirty_percent = 20;		/* each pool's anon allowance */
414 
415 boolean_t zfs_compressed_arc_enabled = B_TRUE;
416 
417 /* The 6 states: */
418 static arc_state_t ARC_anon;
419 static arc_state_t ARC_mru;
420 static arc_state_t ARC_mru_ghost;
421 static arc_state_t ARC_mfu;
422 static arc_state_t ARC_mfu_ghost;
423 static arc_state_t ARC_l2c_only;
424 
425 arc_stats_t arc_stats = {
426 	{ "hits",			KSTAT_DATA_UINT64 },
427 	{ "misses",			KSTAT_DATA_UINT64 },
428 	{ "demand_data_hits",		KSTAT_DATA_UINT64 },
429 	{ "demand_data_misses",		KSTAT_DATA_UINT64 },
430 	{ "demand_metadata_hits",	KSTAT_DATA_UINT64 },
431 	{ "demand_metadata_misses",	KSTAT_DATA_UINT64 },
432 	{ "prefetch_data_hits",		KSTAT_DATA_UINT64 },
433 	{ "prefetch_data_misses",	KSTAT_DATA_UINT64 },
434 	{ "prefetch_metadata_hits",	KSTAT_DATA_UINT64 },
435 	{ "prefetch_metadata_misses",	KSTAT_DATA_UINT64 },
436 	{ "mru_hits",			KSTAT_DATA_UINT64 },
437 	{ "mru_ghost_hits",		KSTAT_DATA_UINT64 },
438 	{ "mfu_hits",			KSTAT_DATA_UINT64 },
439 	{ "mfu_ghost_hits",		KSTAT_DATA_UINT64 },
440 	{ "deleted",			KSTAT_DATA_UINT64 },
441 	{ "mutex_miss",			KSTAT_DATA_UINT64 },
442 	{ "access_skip",		KSTAT_DATA_UINT64 },
443 	{ "evict_skip",			KSTAT_DATA_UINT64 },
444 	{ "evict_not_enough",		KSTAT_DATA_UINT64 },
445 	{ "evict_l2_cached",		KSTAT_DATA_UINT64 },
446 	{ "evict_l2_eligible",		KSTAT_DATA_UINT64 },
447 	{ "evict_l2_eligible_mfu",	KSTAT_DATA_UINT64 },
448 	{ "evict_l2_eligible_mru",	KSTAT_DATA_UINT64 },
449 	{ "evict_l2_ineligible",	KSTAT_DATA_UINT64 },
450 	{ "evict_l2_skip",		KSTAT_DATA_UINT64 },
451 	{ "hash_elements",		KSTAT_DATA_UINT64 },
452 	{ "hash_elements_max",		KSTAT_DATA_UINT64 },
453 	{ "hash_collisions",		KSTAT_DATA_UINT64 },
454 	{ "hash_chains",		KSTAT_DATA_UINT64 },
455 	{ "hash_chain_max",		KSTAT_DATA_UINT64 },
456 	{ "p",				KSTAT_DATA_UINT64 },
457 	{ "c",				KSTAT_DATA_UINT64 },
458 	{ "c_min",			KSTAT_DATA_UINT64 },
459 	{ "c_max",			KSTAT_DATA_UINT64 },
460 	{ "size",			KSTAT_DATA_UINT64 },
461 	{ "compressed_size",		KSTAT_DATA_UINT64 },
462 	{ "uncompressed_size",		KSTAT_DATA_UINT64 },
463 	{ "overhead_size",		KSTAT_DATA_UINT64 },
464 	{ "hdr_size",			KSTAT_DATA_UINT64 },
465 	{ "data_size",			KSTAT_DATA_UINT64 },
466 	{ "metadata_size",		KSTAT_DATA_UINT64 },
467 	{ "other_size",			KSTAT_DATA_UINT64 },
468 	{ "anon_size",			KSTAT_DATA_UINT64 },
469 	{ "anon_evictable_data",	KSTAT_DATA_UINT64 },
470 	{ "anon_evictable_metadata",	KSTAT_DATA_UINT64 },
471 	{ "mru_size",			KSTAT_DATA_UINT64 },
472 	{ "mru_evictable_data",		KSTAT_DATA_UINT64 },
473 	{ "mru_evictable_metadata",	KSTAT_DATA_UINT64 },
474 	{ "mru_ghost_size",		KSTAT_DATA_UINT64 },
475 	{ "mru_ghost_evictable_data",	KSTAT_DATA_UINT64 },
476 	{ "mru_ghost_evictable_metadata", KSTAT_DATA_UINT64 },
477 	{ "mfu_size",			KSTAT_DATA_UINT64 },
478 	{ "mfu_evictable_data",		KSTAT_DATA_UINT64 },
479 	{ "mfu_evictable_metadata",	KSTAT_DATA_UINT64 },
480 	{ "mfu_ghost_size",		KSTAT_DATA_UINT64 },
481 	{ "mfu_ghost_evictable_data",	KSTAT_DATA_UINT64 },
482 	{ "mfu_ghost_evictable_metadata", KSTAT_DATA_UINT64 },
483 	{ "l2_hits",			KSTAT_DATA_UINT64 },
484 	{ "l2_misses",			KSTAT_DATA_UINT64 },
485 	{ "l2_prefetch_asize",		KSTAT_DATA_UINT64 },
486 	{ "l2_mru_asize",		KSTAT_DATA_UINT64 },
487 	{ "l2_mfu_asize",		KSTAT_DATA_UINT64 },
488 	{ "l2_bufc_data_asize",		KSTAT_DATA_UINT64 },
489 	{ "l2_bufc_metadata_asize",	KSTAT_DATA_UINT64 },
490 	{ "l2_feeds",			KSTAT_DATA_UINT64 },
491 	{ "l2_rw_clash",		KSTAT_DATA_UINT64 },
492 	{ "l2_read_bytes",		KSTAT_DATA_UINT64 },
493 	{ "l2_write_bytes",		KSTAT_DATA_UINT64 },
494 	{ "l2_writes_sent",		KSTAT_DATA_UINT64 },
495 	{ "l2_writes_done",		KSTAT_DATA_UINT64 },
496 	{ "l2_writes_error",		KSTAT_DATA_UINT64 },
497 	{ "l2_writes_lock_retry",	KSTAT_DATA_UINT64 },
498 	{ "l2_evict_lock_retry",	KSTAT_DATA_UINT64 },
499 	{ "l2_evict_reading",		KSTAT_DATA_UINT64 },
500 	{ "l2_evict_l1cached",		KSTAT_DATA_UINT64 },
501 	{ "l2_free_on_write",		KSTAT_DATA_UINT64 },
502 	{ "l2_abort_lowmem",		KSTAT_DATA_UINT64 },
503 	{ "l2_cksum_bad",		KSTAT_DATA_UINT64 },
504 	{ "l2_io_error",		KSTAT_DATA_UINT64 },
505 	{ "l2_size",			KSTAT_DATA_UINT64 },
506 	{ "l2_asize",			KSTAT_DATA_UINT64 },
507 	{ "l2_hdr_size",		KSTAT_DATA_UINT64 },
508 	{ "l2_log_blk_writes",		KSTAT_DATA_UINT64 },
509 	{ "l2_log_blk_avg_asize",	KSTAT_DATA_UINT64 },
510 	{ "l2_log_blk_asize",		KSTAT_DATA_UINT64 },
511 	{ "l2_log_blk_count",		KSTAT_DATA_UINT64 },
512 	{ "l2_data_to_meta_ratio",	KSTAT_DATA_UINT64 },
513 	{ "l2_rebuild_success",		KSTAT_DATA_UINT64 },
514 	{ "l2_rebuild_unsupported",	KSTAT_DATA_UINT64 },
515 	{ "l2_rebuild_io_errors",	KSTAT_DATA_UINT64 },
516 	{ "l2_rebuild_dh_errors",	KSTAT_DATA_UINT64 },
517 	{ "l2_rebuild_cksum_lb_errors",	KSTAT_DATA_UINT64 },
518 	{ "l2_rebuild_lowmem",		KSTAT_DATA_UINT64 },
519 	{ "l2_rebuild_size",		KSTAT_DATA_UINT64 },
520 	{ "l2_rebuild_asize",		KSTAT_DATA_UINT64 },
521 	{ "l2_rebuild_bufs",		KSTAT_DATA_UINT64 },
522 	{ "l2_rebuild_bufs_precached",	KSTAT_DATA_UINT64 },
523 	{ "l2_rebuild_log_blks",	KSTAT_DATA_UINT64 },
524 	{ "memory_throttle_count",	KSTAT_DATA_UINT64 },
525 	{ "arc_meta_used",		KSTAT_DATA_UINT64 },
526 	{ "arc_meta_limit",		KSTAT_DATA_UINT64 },
527 	{ "arc_meta_max",		KSTAT_DATA_UINT64 },
528 	{ "arc_meta_min",		KSTAT_DATA_UINT64 },
529 	{ "async_upgrade_sync",		KSTAT_DATA_UINT64 },
530 	{ "demand_hit_predictive_prefetch", KSTAT_DATA_UINT64 },
531 	{ "demand_hit_prescient_prefetch", KSTAT_DATA_UINT64 },
532 };
533 
534 #define	ARCSTAT_MAX(stat, val) {					\
535 	uint64_t m;							\
536 	while ((val) > (m = arc_stats.stat.value.ui64) &&		\
537 	    (m != atomic_cas_64(&arc_stats.stat.value.ui64, m, (val))))	\
538 		continue;						\
539 }
540 
541 #define	ARCSTAT_MAXSTAT(stat) \
542 	ARCSTAT_MAX(stat##_max, arc_stats.stat.value.ui64)
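/*
 * For example, ARCSTAT_MAX(arcstat_hash_chain_max, i) (as used below in
 * buf_hash_insert()) raises the recorded maximum only when i exceeds the
 * current value, retrying the compare-and-swap if another thread updated
 * the kstat concurrently.
 */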
543 
544 /*
545  * We define a macro to allow ARC hits/misses to be easily broken down by
546  * two separate conditions, giving a total of four different subtypes for
547  * each of hits and misses (so eight statistics total).
548  */
549 #define	ARCSTAT_CONDSTAT(cond1, stat1, notstat1, cond2, stat2, notstat2, stat) \
550 	if (cond1) {							\
551 		if (cond2) {						\
552 			ARCSTAT_BUMP(arcstat_##stat1##_##stat2##_##stat); \
553 		} else {						\
554 			ARCSTAT_BUMP(arcstat_##stat1##_##notstat2##_##stat); \
555 		}							\
556 	} else {							\
557 		if (cond2) {						\
558 			ARCSTAT_BUMP(arcstat_##notstat1##_##stat2##_##stat); \
559 		} else {						\
560 			ARCSTAT_BUMP(arcstat_##notstat1##_##notstat2##_##stat);\
561 		}							\
562 	}
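/*
 * For example (illustrative usage, not copied from a particular call
 * site), an invocation such as:
 *
 *	ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr), demand, prefetch,
 *	    !HDR_ISTYPE_METADATA(hdr), data, metadata, hits);
 *
 * bumps exactly one of arcstat_demand_data_hits,
 * arcstat_demand_metadata_hits, arcstat_prefetch_data_hits or
 * arcstat_prefetch_metadata_hits, matching the kstat names above.
 */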
563 
564 /*
565  * This macro allows us to use kstats as floating averages. Each time we
566  * update this kstat, we first factor it and the update value by
567  * ARCSTAT_F_AVG_FACTOR to shrink the new value's contribution to the overall
568  * average. This macro assumes that integer loads and stores are atomic, but
569  * is not safe for multiple writers updating the kstat in parallel (only the
570  * last writer's update will remain).
571  */
572 #define	ARCSTAT_F_AVG_FACTOR	3
573 #define	ARCSTAT_F_AVG(stat, value) \
574 	do { \
575 		uint64_t x = ARCSTAT(stat); \
576 		x = x - x / ARCSTAT_F_AVG_FACTOR + \
577 		    (value) / ARCSTAT_F_AVG_FACTOR; \
578 		ARCSTAT(stat) = x; \
579 		_NOTE(CONSTCOND) \
580 	} while (0)
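/*
 * Worked example (illustrative arithmetic only): with the factor of 3,
 * a stored average of 90 and a new sample of 30 yield
 * 90 - 90/3 + 30/3 = 90 - 30 + 10 = 70, i.e. each update keeps roughly
 * two thirds of the old value and blends in one third of the new one.
 */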
581 
582 kstat_t			*arc_ksp;
583 static arc_state_t	*arc_anon;
584 static arc_state_t	*arc_mru;
585 static arc_state_t	*arc_mru_ghost;
586 static arc_state_t	*arc_mfu;
587 static arc_state_t	*arc_mfu_ghost;
588 static arc_state_t	*arc_l2c_only;
589 
590 /*
591  * There are also some ARC variables that we want to export, but that are
592  * updated so often that having the canonical representation be the statistic
593  * variable causes a performance bottleneck. We want to use aggsum_t's for these
594  * instead, but still be able to export the kstat in the same way as before.
595  * The solution is to always use the aggsum version, except in the kstat update
596  * callback.
597  */
598 aggsum_t arc_size;
599 aggsum_t arc_meta_used;
600 aggsum_t astat_data_size;
601 aggsum_t astat_metadata_size;
602 aggsum_t astat_hdr_size;
603 aggsum_t astat_other_size;
604 aggsum_t astat_l2_hdr_size;
605 
606 static int		arc_no_grow;	/* Don't try to grow cache size */
607 static hrtime_t		arc_growtime;
608 static uint64_t		arc_tempreserve;
609 static uint64_t		arc_loaned_bytes;
610 
611 #define	GHOST_STATE(state)	\
612 	((state) == arc_mru_ghost || (state) == arc_mfu_ghost ||	\
613 	(state) == arc_l2c_only)
614 
615 #define	HDR_IN_HASH_TABLE(hdr)	((hdr)->b_flags & ARC_FLAG_IN_HASH_TABLE)
616 #define	HDR_IO_IN_PROGRESS(hdr)	((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS)
617 #define	HDR_IO_ERROR(hdr)	((hdr)->b_flags & ARC_FLAG_IO_ERROR)
618 #define	HDR_PREFETCH(hdr)	((hdr)->b_flags & ARC_FLAG_PREFETCH)
619 #define	HDR_PRESCIENT_PREFETCH(hdr)	\
620 	((hdr)->b_flags & ARC_FLAG_PRESCIENT_PREFETCH)
621 #define	HDR_COMPRESSION_ENABLED(hdr)	\
622 	((hdr)->b_flags & ARC_FLAG_COMPRESSED_ARC)
623 
624 #define	HDR_L2CACHE(hdr)	((hdr)->b_flags & ARC_FLAG_L2CACHE)
625 #define	HDR_L2_READING(hdr)	\
626 	(((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS) &&	\
627 	((hdr)->b_flags & ARC_FLAG_HAS_L2HDR))
628 #define	HDR_L2_WRITING(hdr)	((hdr)->b_flags & ARC_FLAG_L2_WRITING)
629 #define	HDR_L2_EVICTED(hdr)	((hdr)->b_flags & ARC_FLAG_L2_EVICTED)
630 #define	HDR_L2_WRITE_HEAD(hdr)	((hdr)->b_flags & ARC_FLAG_L2_WRITE_HEAD)
631 #define	HDR_PROTECTED(hdr)	((hdr)->b_flags & ARC_FLAG_PROTECTED)
632 #define	HDR_NOAUTH(hdr)		((hdr)->b_flags & ARC_FLAG_NOAUTH)
633 #define	HDR_SHARED_DATA(hdr)	((hdr)->b_flags & ARC_FLAG_SHARED_DATA)
634 
635 #define	HDR_ISTYPE_METADATA(hdr)	\
636 	((hdr)->b_flags & ARC_FLAG_BUFC_METADATA)
637 #define	HDR_ISTYPE_DATA(hdr)	(!HDR_ISTYPE_METADATA(hdr))
638 
639 #define	HDR_HAS_L1HDR(hdr)	((hdr)->b_flags & ARC_FLAG_HAS_L1HDR)
640 #define	HDR_HAS_L2HDR(hdr)	((hdr)->b_flags & ARC_FLAG_HAS_L2HDR)
641 #define	HDR_HAS_RABD(hdr)	\
642 	(HDR_HAS_L1HDR(hdr) && HDR_PROTECTED(hdr) &&	\
643 	(hdr)->b_crypt_hdr.b_rabd != NULL)
644 #define	HDR_ENCRYPTED(hdr)	\
645 	(HDR_PROTECTED(hdr) && DMU_OT_IS_ENCRYPTED((hdr)->b_crypt_hdr.b_ot))
646 #define	HDR_AUTHENTICATED(hdr)	\
647 	(HDR_PROTECTED(hdr) && !DMU_OT_IS_ENCRYPTED((hdr)->b_crypt_hdr.b_ot))
648 
649 /* For storing compression mode in b_flags */
650 #define	HDR_COMPRESS_OFFSET	(highbit64(ARC_FLAG_COMPRESS_0) - 1)
651 
652 #define	HDR_GET_COMPRESS(hdr)	((enum zio_compress)BF32_GET((hdr)->b_flags, \
653 	HDR_COMPRESS_OFFSET, SPA_COMPRESSBITS))
654 #define	HDR_SET_COMPRESS(hdr, cmp) BF32_SET((hdr)->b_flags, \
655 	HDR_COMPRESS_OFFSET, SPA_COMPRESSBITS, (cmp));
656 
657 #define	ARC_BUF_LAST(buf)	((buf)->b_next == NULL)
658 #define	ARC_BUF_SHARED(buf)	((buf)->b_flags & ARC_BUF_FLAG_SHARED)
659 #define	ARC_BUF_COMPRESSED(buf)	((buf)->b_flags & ARC_BUF_FLAG_COMPRESSED)
660 #define	ARC_BUF_ENCRYPTED(buf)	((buf)->b_flags & ARC_BUF_FLAG_ENCRYPTED)
661 
662 /*
663  * Other sizes
664  */
665 
666 #define	HDR_FULL_CRYPT_SIZE ((int64_t)sizeof (arc_buf_hdr_t))
667 #define	HDR_FULL_SIZE ((int64_t)offsetof(arc_buf_hdr_t, b_crypt_hdr))
668 #define	HDR_L2ONLY_SIZE ((int64_t)offsetof(arc_buf_hdr_t, b_l1hdr))
669 
670 /*
671  * Hash table routines
672  */
673 
674 #define	HT_LOCK_PAD	64
675 
676 struct ht_lock {
677 	kmutex_t	ht_lock;
678 #ifdef _KERNEL
679 	unsigned char	pad[(HT_LOCK_PAD - sizeof (kmutex_t))];
680 #endif
681 };
682 
683 #define	BUF_LOCKS 256
684 typedef struct buf_hash_table {
685 	uint64_t ht_mask;
686 	arc_buf_hdr_t **ht_table;
687 	struct ht_lock ht_locks[BUF_LOCKS];
688 } buf_hash_table_t;
689 
690 static buf_hash_table_t buf_hash_table;
691 
692 #define	BUF_HASH_INDEX(spa, dva, birth) \
693 	(buf_hash(spa, dva, birth) & buf_hash_table.ht_mask)
694 #define	BUF_HASH_LOCK_NTRY(idx) (buf_hash_table.ht_locks[idx & (BUF_LOCKS-1)])
695 #define	BUF_HASH_LOCK(idx)	(&(BUF_HASH_LOCK_NTRY(idx).ht_lock))
696 #define	HDR_LOCK(hdr) \
697 	(BUF_HASH_LOCK(BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth)))
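/*
 * Illustrative note on the lock striping above: BUF_HASH_INDEX() selects
 * a bucket in ht_table, while idx & (BUF_LOCKS-1) maps many buckets onto
 * the 256 striped ht_locks; e.g. buckets 5, 261 and 517 all share
 * ht_locks[5].
 */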
698 
699 uint64_t zfs_crc64_table[256];
700 
701 /*
702  * Level 2 ARC
703  */
704 
705 #define	L2ARC_WRITE_SIZE	(8 * 1024 * 1024)	/* initial write max */
706 #define	L2ARC_HEADROOM		2			/* num of writes */
707 /*
708  * If we discover during an ARC scan any buffers to be compressed, we boost
709  * our headroom for the next scanning cycle by this percentage multiple.
710  */
711 #define	L2ARC_HEADROOM_BOOST	200
712 #define	L2ARC_FEED_SECS		1		/* caching interval secs */
713 #define	L2ARC_FEED_MIN_MS	200		/* min caching interval ms */
714 
715 /*
716  * We can feed L2ARC from two states of ARC buffers, mru and mfu,
717  * and each of the states has two types: data and metadata.
718  */
719 #define	L2ARC_FEED_TYPES	4
720 
721 
722 #define	l2arc_writes_sent	ARCSTAT(arcstat_l2_writes_sent)
723 #define	l2arc_writes_done	ARCSTAT(arcstat_l2_writes_done)
724 
725 /* L2ARC Performance Tunables */
726 uint64_t l2arc_write_max = L2ARC_WRITE_SIZE;	/* default max write size */
727 uint64_t l2arc_write_boost = L2ARC_WRITE_SIZE;	/* extra write during warmup */
728 uint64_t l2arc_headroom = L2ARC_HEADROOM;	/* number of dev writes */
729 uint64_t l2arc_headroom_boost = L2ARC_HEADROOM_BOOST;
730 uint64_t l2arc_feed_secs = L2ARC_FEED_SECS;	/* interval seconds */
731 uint64_t l2arc_feed_min_ms = L2ARC_FEED_MIN_MS;	/* min interval milliseconds */
732 boolean_t l2arc_noprefetch = B_TRUE;		/* don't cache prefetch bufs */
733 boolean_t l2arc_feed_again = B_TRUE;		/* turbo warmup */
734 boolean_t l2arc_norw = B_TRUE;			/* no reads during writes */
735 int l2arc_meta_percent = 33;			/* limit on headers size */
736 
737 /*
738  * L2ARC Internals
739  */
740 static list_t L2ARC_dev_list;			/* device list */
741 static list_t *l2arc_dev_list;			/* device list pointer */
742 static kmutex_t l2arc_dev_mtx;			/* device list mutex */
743 static l2arc_dev_t *l2arc_dev_last;		/* last device used */
744 static list_t L2ARC_free_on_write;		/* free after write buf list */
745 static list_t *l2arc_free_on_write;		/* free after write list ptr */
746 static kmutex_t l2arc_free_on_write_mtx;	/* mutex for list */
747 static uint64_t l2arc_ndev;			/* number of devices */
748 
749 typedef struct l2arc_read_callback {
750 	arc_buf_hdr_t		*l2rcb_hdr;		/* read header */
751 	blkptr_t		l2rcb_bp;		/* original blkptr */
752 	zbookmark_phys_t	l2rcb_zb;		/* original bookmark */
753 	int			l2rcb_flags;		/* original flags */
754 	abd_t			*l2rcb_abd;		/* temporary buffer */
755 } l2arc_read_callback_t;
756 
757 typedef struct l2arc_data_free {
758 	/* protected by l2arc_free_on_write_mtx */
759 	abd_t		*l2df_abd;
760 	size_t		l2df_size;
761 	arc_buf_contents_t l2df_type;
762 	list_node_t	l2df_list_node;
763 } l2arc_data_free_t;
764 
765 static kmutex_t l2arc_feed_thr_lock;
766 static kcondvar_t l2arc_feed_thr_cv;
767 static uint8_t l2arc_thread_exit;
768 
769 static kmutex_t l2arc_rebuild_thr_lock;
770 static kcondvar_t l2arc_rebuild_thr_cv;
771 
772 enum arc_hdr_alloc_flags {
773 	ARC_HDR_ALLOC_RDATA = 0x1,
774 	ARC_HDR_DO_ADAPT = 0x2,
775 };
776 
777 
778 static abd_t *arc_get_data_abd(arc_buf_hdr_t *, uint64_t, void *, boolean_t);
779 typedef enum arc_fill_flags {
780 	ARC_FILL_LOCKED		= 1 << 0, /* hdr lock is held */
781 	ARC_FILL_COMPRESSED	= 1 << 1, /* fill with compressed data */
782 	ARC_FILL_ENCRYPTED	= 1 << 2, /* fill with encrypted data */
783 	ARC_FILL_NOAUTH		= 1 << 3, /* don't attempt to authenticate */
784 	ARC_FILL_IN_PLACE	= 1 << 4  /* fill in place (special case) */
785 } arc_fill_flags_t;
786 
787 static void *arc_get_data_buf(arc_buf_hdr_t *, uint64_t, void *);
788 static void arc_get_data_impl(arc_buf_hdr_t *, uint64_t, void *, boolean_t);
789 static void arc_free_data_abd(arc_buf_hdr_t *, abd_t *, uint64_t, void *);
790 static void arc_free_data_buf(arc_buf_hdr_t *, void *, uint64_t, void *);
791 static void arc_free_data_impl(arc_buf_hdr_t *hdr, uint64_t size, void *tag);
792 static void arc_hdr_free_pabd(arc_buf_hdr_t *, boolean_t);
793 static void arc_hdr_alloc_pabd(arc_buf_hdr_t *, int);
794 static void arc_access(arc_buf_hdr_t *, kmutex_t *);
795 static boolean_t arc_is_overflowing();
796 static void arc_buf_watch(arc_buf_t *);
797 static l2arc_dev_t *l2arc_vdev_get(vdev_t *vd);
798 
799 static arc_buf_contents_t arc_buf_type(arc_buf_hdr_t *);
800 static uint32_t arc_bufc_to_flags(arc_buf_contents_t);
801 static inline void arc_hdr_set_flags(arc_buf_hdr_t *hdr, arc_flags_t flags);
802 static inline void arc_hdr_clear_flags(arc_buf_hdr_t *hdr, arc_flags_t flags);
803 
804 static boolean_t l2arc_write_eligible(uint64_t, arc_buf_hdr_t *);
805 static void l2arc_read_done(zio_t *);
806 static void l2arc_do_free_on_write(void);
807 static void l2arc_hdr_arcstats_update(arc_buf_hdr_t *hdr, boolean_t incr,
808     boolean_t state_only);
809 
810 #define	l2arc_hdr_arcstats_increment(hdr) \
811 	l2arc_hdr_arcstats_update((hdr), B_TRUE, B_FALSE)
812 #define	l2arc_hdr_arcstats_decrement(hdr) \
813 	l2arc_hdr_arcstats_update((hdr), B_FALSE, B_FALSE)
814 #define	l2arc_hdr_arcstats_increment_state(hdr) \
815 	l2arc_hdr_arcstats_update((hdr), B_TRUE, B_TRUE)
816 #define	l2arc_hdr_arcstats_decrement_state(hdr) \
817 	l2arc_hdr_arcstats_update((hdr), B_FALSE, B_TRUE)
818 
819 /*
820  * The arc_all_memory function is a ZoL enhancement that lives in their OSL
821  * code. In user-space code, which is used primarily for testing, we return
822  * half of all memory.
823  */
824 uint64_t
825 arc_all_memory(void)
826 {
827 #ifdef _KERNEL
828 	return (ptob(physmem));
829 #else
830 	return ((sysconf(_SC_PAGESIZE) * sysconf(_SC_PHYS_PAGES)) / 2);
831 #endif
832 }
833 
834 /*
835  * We use Cityhash for this. It's fast, and has good hash properties without
836  * requiring any large static buffers.
837  */
838 static uint64_t
839 buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth)
840 {
841 	return (cityhash4(spa, dva->dva_word[0], dva->dva_word[1], birth));
842 }
843 
844 #define	HDR_EMPTY(hdr)						\
845 	((hdr)->b_dva.dva_word[0] == 0 &&			\
846 	(hdr)->b_dva.dva_word[1] == 0)
847 
848 #define	HDR_EMPTY_OR_LOCKED(hdr)				\
849 	(HDR_EMPTY(hdr) || MUTEX_HELD(HDR_LOCK(hdr)))
850 
851 #define	HDR_EQUAL(spa, dva, birth, hdr)				\
852 	((hdr)->b_dva.dva_word[0] == (dva)->dva_word[0]) &&	\
853 	((hdr)->b_dva.dva_word[1] == (dva)->dva_word[1]) &&	\
854 	((hdr)->b_birth == birth) && ((hdr)->b_spa == spa)
855 
856 static void
857 buf_discard_identity(arc_buf_hdr_t *hdr)
858 {
859 	hdr->b_dva.dva_word[0] = 0;
860 	hdr->b_dva.dva_word[1] = 0;
861 	hdr->b_birth = 0;
862 }
863 
864 static arc_buf_hdr_t *
865 buf_hash_find(uint64_t spa, const blkptr_t *bp, kmutex_t **lockp)
866 {
867 	const dva_t *dva = BP_IDENTITY(bp);
868 	uint64_t birth = BP_PHYSICAL_BIRTH(bp);
869 	uint64_t idx = BUF_HASH_INDEX(spa, dva, birth);
870 	kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
871 	arc_buf_hdr_t *hdr;
872 
873 	mutex_enter(hash_lock);
874 	for (hdr = buf_hash_table.ht_table[idx]; hdr != NULL;
875 	    hdr = hdr->b_hash_next) {
876 		if (HDR_EQUAL(spa, dva, birth, hdr)) {
877 			*lockp = hash_lock;
878 			return (hdr);
879 		}
880 	}
881 	mutex_exit(hash_lock);
882 	*lockp = NULL;
883 	return (NULL);
884 }
885 
886 /*
887  * Insert an entry into the hash table.  If there is already an element
888  * equal to hdr in the hash table, then the already existing element
889  * will be returned and the new element will not be inserted.
890  * Otherwise returns NULL.
891  * If lockp == NULL, the caller is assumed to already hold the hash lock.
892  */
893 static arc_buf_hdr_t *
894 buf_hash_insert(arc_buf_hdr_t *hdr, kmutex_t **lockp)
895 {
896 	uint64_t idx = BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth);
897 	kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
898 	arc_buf_hdr_t *fhdr;
899 	uint32_t i;
900 
901 	ASSERT(!DVA_IS_EMPTY(&hdr->b_dva));
902 	ASSERT(hdr->b_birth != 0);
903 	ASSERT(!HDR_IN_HASH_TABLE(hdr));
904 
905 	if (lockp != NULL) {
906 		*lockp = hash_lock;
907 		mutex_enter(hash_lock);
908 	} else {
909 		ASSERT(MUTEX_HELD(hash_lock));
910 	}
911 
912 	for (fhdr = buf_hash_table.ht_table[idx], i = 0; fhdr != NULL;
913 	    fhdr = fhdr->b_hash_next, i++) {
914 		if (HDR_EQUAL(hdr->b_spa, &hdr->b_dva, hdr->b_birth, fhdr))
915 			return (fhdr);
916 	}
917 
918 	hdr->b_hash_next = buf_hash_table.ht_table[idx];
919 	buf_hash_table.ht_table[idx] = hdr;
920 	arc_hdr_set_flags(hdr, ARC_FLAG_IN_HASH_TABLE);
921 
922 	/* collect some hash table performance data */
923 	if (i > 0) {
924 		ARCSTAT_BUMP(arcstat_hash_collisions);
925 		if (i == 1)
926 			ARCSTAT_BUMP(arcstat_hash_chains);
927 
928 		ARCSTAT_MAX(arcstat_hash_chain_max, i);
929 	}
930 
931 	ARCSTAT_BUMP(arcstat_hash_elements);
932 	ARCSTAT_MAXSTAT(arcstat_hash_elements);
933 
934 	return (NULL);
935 }
936 
937 static void
938 buf_hash_remove(arc_buf_hdr_t *hdr)
939 {
940 	arc_buf_hdr_t *fhdr, **hdrp;
941 	uint64_t idx = BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth);
942 
943 	ASSERT(MUTEX_HELD(BUF_HASH_LOCK(idx)));
944 	ASSERT(HDR_IN_HASH_TABLE(hdr));
945 
946 	hdrp = &buf_hash_table.ht_table[idx];
947 	while ((fhdr = *hdrp) != hdr) {
948 		ASSERT3P(fhdr, !=, NULL);
949 		hdrp = &fhdr->b_hash_next;
950 	}
951 	*hdrp = hdr->b_hash_next;
952 	hdr->b_hash_next = NULL;
953 	arc_hdr_clear_flags(hdr, ARC_FLAG_IN_HASH_TABLE);
954 
955 	/* collect some hash table performance data */
956 	ARCSTAT_BUMPDOWN(arcstat_hash_elements);
957 
958 	if (buf_hash_table.ht_table[idx] &&
959 	    buf_hash_table.ht_table[idx]->b_hash_next == NULL)
960 		ARCSTAT_BUMPDOWN(arcstat_hash_chains);
961 }
962 
963 /*
964  * l2arc_mfuonly : A ZFS module parameter that controls whether only MFU
965  *		metadata and data are cached from ARC into L2ARC.
966  */
967 int l2arc_mfuonly = 0;
968 
969 /*
970  * Global data structures and functions for the buf kmem cache.
971  */
972 
973 static kmem_cache_t *hdr_full_cache;
974 static kmem_cache_t *hdr_full_crypt_cache;
975 static kmem_cache_t *hdr_l2only_cache;
976 static kmem_cache_t *buf_cache;
977 
978 static void
979 buf_fini(void)
980 {
981 	int i;
982 
983 	kmem_free(buf_hash_table.ht_table,
984 	    (buf_hash_table.ht_mask + 1) * sizeof (void *));
985 	for (i = 0; i < BUF_LOCKS; i++)
986 		mutex_destroy(&buf_hash_table.ht_locks[i].ht_lock);
987 	kmem_cache_destroy(hdr_full_cache);
988 	kmem_cache_destroy(hdr_full_crypt_cache);
989 	kmem_cache_destroy(hdr_l2only_cache);
990 	kmem_cache_destroy(buf_cache);
991 }
992 
993 /*
994  * Constructor callback - called when the cache is empty
995  * and a new buf is requested.
996  */
997 /* ARGSUSED */
998 static int
999 hdr_full_cons(void *vbuf, void *unused, int kmflag)
1000 {
1001 	arc_buf_hdr_t *hdr = vbuf;
1002 
1003 	bzero(hdr, HDR_FULL_SIZE);
1004 	hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS;
1005 	cv_init(&hdr->b_l1hdr.b_cv, NULL, CV_DEFAULT, NULL);
1006 	zfs_refcount_create(&hdr->b_l1hdr.b_refcnt);
1007 	mutex_init(&hdr->b_l1hdr.b_freeze_lock, NULL, MUTEX_DEFAULT, NULL);
1008 	multilist_link_init(&hdr->b_l1hdr.b_arc_node);
1009 	arc_space_consume(HDR_FULL_SIZE, ARC_SPACE_HDRS);
1010 
1011 	return (0);
1012 }
1013 
1014 /* ARGSUSED */
1015 static int
1016 hdr_full_crypt_cons(void *vbuf, void *unused, int kmflag)
1017 {
1018 	arc_buf_hdr_t *hdr = vbuf;
1019 
1020 	(void) hdr_full_cons(vbuf, unused, kmflag);
1021 	bzero(&hdr->b_crypt_hdr, sizeof (hdr->b_crypt_hdr));
1022 	arc_space_consume(sizeof (hdr->b_crypt_hdr), ARC_SPACE_HDRS);
1023 
1024 	return (0);
1025 }
1026 
1027 /* ARGSUSED */
1028 static int
1029 hdr_l2only_cons(void *vbuf, void *unused, int kmflag)
1030 {
1031 	arc_buf_hdr_t *hdr = vbuf;
1032 
1033 	bzero(hdr, HDR_L2ONLY_SIZE);
1034 	arc_space_consume(HDR_L2ONLY_SIZE, ARC_SPACE_L2HDRS);
1035 
1036 	return (0);
1037 }
1038 
1039 /* ARGSUSED */
1040 static int
1041 buf_cons(void *vbuf, void *unused, int kmflag)
1042 {
1043 	arc_buf_t *buf = vbuf;
1044 
1045 	bzero(buf, sizeof (arc_buf_t));
1046 	mutex_init(&buf->b_evict_lock, NULL, MUTEX_DEFAULT, NULL);
1047 	arc_space_consume(sizeof (arc_buf_t), ARC_SPACE_HDRS);
1048 
1049 	return (0);
1050 }
1051 
1052 /*
1053  * Destructor callback - called when a cached buf is
1054  * no longer required.
1055  */
1056 /* ARGSUSED */
1057 static void
1058 hdr_full_dest(void *vbuf, void *unused)
1059 {
1060 	arc_buf_hdr_t *hdr = vbuf;
1061 
1062 	ASSERT(HDR_EMPTY(hdr));
1063 	cv_destroy(&hdr->b_l1hdr.b_cv);
1064 	zfs_refcount_destroy(&hdr->b_l1hdr.b_refcnt);
1065 	mutex_destroy(&hdr->b_l1hdr.b_freeze_lock);
1066 	ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node));
1067 	arc_space_return(HDR_FULL_SIZE, ARC_SPACE_HDRS);
1068 }
1069 
1070 /* ARGSUSED */
1071 static void
1072 hdr_full_crypt_dest(void *vbuf, void *unused)
1073 {
1074 	arc_buf_hdr_t *hdr = vbuf;
1075 
1076 	hdr_full_dest(hdr, unused);
1077 	arc_space_return(sizeof (hdr->b_crypt_hdr), ARC_SPACE_HDRS);
1078 }
1079 
1080 /* ARGSUSED */
1081 static void
1082 hdr_l2only_dest(void *vbuf, void *unused)
1083 {
1084 	arc_buf_hdr_t *hdr = vbuf;
1085 
1086 	ASSERT(HDR_EMPTY(hdr));
1087 	arc_space_return(HDR_L2ONLY_SIZE, ARC_SPACE_L2HDRS);
1088 }
1089 
1090 /* ARGSUSED */
1091 static void
1092 buf_dest(void *vbuf, void *unused)
1093 {
1094 	arc_buf_t *buf = vbuf;
1095 
1096 	mutex_destroy(&buf->b_evict_lock);
1097 	arc_space_return(sizeof (arc_buf_t), ARC_SPACE_HDRS);
1098 }
1099 
1100 /*
1101  * Reclaim callback -- invoked when memory is low.
1102  */
1103 /* ARGSUSED */
1104 static void
1105 hdr_recl(void *unused)
1106 {
1107 	dprintf("hdr_recl called\n");
1108 	/*
1109 	 * umem calls the reclaim func when we destroy the buf cache,
1110 	 * which is after we do arc_fini().
1111 	 */
1112 	if (arc_initialized)
1113 		zthr_wakeup(arc_reap_zthr);
1114 }
1115 
1116 static void
1117 buf_init(void)
1118 {
1119 	uint64_t *ct;
1120 	uint64_t hsize = 1ULL << 12;
1121 	int i, j;
1122 
1123 	/*
1124 	 * The hash table is big enough to fill all of physical memory
1125 	 * with an average block size of zfs_arc_average_blocksize (default 8K).
1126 	 * By default, the table will take up
1127 	 * totalmem * sizeof(void*) / 8K (1MB per GB with 8-byte pointers).
1128 	 */
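	/*
	 * Worked example (illustrative arithmetic only): with 8 GB of
	 * physical memory and the default 8K average block size, the loop
	 * below doubles hsize from 2^12 up to 2^20 entries -- an 8 MB table
	 * of 8-byte pointers, matching the 1 MB per GB estimate above.
	 */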
1129 	while (hsize * zfs_arc_average_blocksize < physmem * PAGESIZE)
1130 		hsize <<= 1;
1131 retry:
1132 	buf_hash_table.ht_mask = hsize - 1;
1133 	buf_hash_table.ht_table =
1134 	    kmem_zalloc(hsize * sizeof (void*), KM_NOSLEEP);
1135 	if (buf_hash_table.ht_table == NULL) {
1136 		ASSERT(hsize > (1ULL << 8));
1137 		hsize >>= 1;
1138 		goto retry;
1139 	}
1140 
1141 	hdr_full_cache = kmem_cache_create("arc_buf_hdr_t_full", HDR_FULL_SIZE,
1142 	    0, hdr_full_cons, hdr_full_dest, hdr_recl, NULL, NULL, 0);
1143 	hdr_full_crypt_cache = kmem_cache_create("arc_buf_hdr_t_full_crypt",
1144 	    HDR_FULL_CRYPT_SIZE, 0, hdr_full_crypt_cons, hdr_full_crypt_dest,
1145 	    hdr_recl, NULL, NULL, 0);
1146 	hdr_l2only_cache = kmem_cache_create("arc_buf_hdr_t_l2only",
1147 	    HDR_L2ONLY_SIZE, 0, hdr_l2only_cons, hdr_l2only_dest, hdr_recl,
1148 	    NULL, NULL, 0);
1149 	buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t),
1150 	    0, buf_cons, buf_dest, NULL, NULL, NULL, 0);
1151 
1152 	for (i = 0; i < 256; i++)
1153 		for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--)
1154 			*ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY);
1155 
1156 	for (i = 0; i < BUF_LOCKS; i++) {
1157 		mutex_init(&buf_hash_table.ht_locks[i].ht_lock,
1158 		    NULL, MUTEX_DEFAULT, NULL);
1159 	}
1160 }
1161 
1162 /*
1163  * This is the size that the buf occupies in memory. If the buf is compressed,
1164  * it will correspond to the compressed size. You should use this method of
1165  * getting the buf size unless you explicitly need the logical size.
1166  */
1167 int32_t
1168 arc_buf_size(arc_buf_t *buf)
1169 {
1170 	return (ARC_BUF_COMPRESSED(buf) ?
1171 	    HDR_GET_PSIZE(buf->b_hdr) : HDR_GET_LSIZE(buf->b_hdr));
1172 }
1173 
1174 int32_t
1175 arc_buf_lsize(arc_buf_t *buf)
1176 {
1177 	return (HDR_GET_LSIZE(buf->b_hdr));
1178 }
1179 
1180 /*
1181  * This function will return B_TRUE if the buffer is encrypted in memory.
1182  * This buffer can be decrypted by calling arc_untransform().
1183  */
1184 boolean_t
1185 arc_is_encrypted(arc_buf_t *buf)
1186 {
1187 	return (ARC_BUF_ENCRYPTED(buf) != 0);
1188 }
1189 
1190 /*
1191  * Returns B_TRUE if the buffer represents data that has not had its MAC
1192  * verified yet.
1193  */
1194 boolean_t
1195 arc_is_unauthenticated(arc_buf_t *buf)
1196 {
1197 	return (HDR_NOAUTH(buf->b_hdr) != 0);
1198 }
1199 
1200 void
1201 arc_get_raw_params(arc_buf_t *buf, boolean_t *byteorder, uint8_t *salt,
1202     uint8_t *iv, uint8_t *mac)
1203 {
1204 	arc_buf_hdr_t *hdr = buf->b_hdr;
1205 
1206 	ASSERT(HDR_PROTECTED(hdr));
1207 
1208 	bcopy(hdr->b_crypt_hdr.b_salt, salt, ZIO_DATA_SALT_LEN);
1209 	bcopy(hdr->b_crypt_hdr.b_iv, iv, ZIO_DATA_IV_LEN);
1210 	bcopy(hdr->b_crypt_hdr.b_mac, mac, ZIO_DATA_MAC_LEN);
1211 	*byteorder = (hdr->b_l1hdr.b_byteswap == DMU_BSWAP_NUMFUNCS) ?
1212 	    /* CONSTCOND */
1213 	    ZFS_HOST_BYTEORDER : !ZFS_HOST_BYTEORDER;
1214 }
1215 
1216 /*
1217  * Indicates how this buffer is compressed in memory. If it is not compressed
1218  * the value will be ZIO_COMPRESS_OFF. It can be made normally readable with
1219  * arc_untransform() as long as it is also unencrypted.
1220  */
1221 enum zio_compress
1222 arc_get_compression(arc_buf_t *buf)
1223 {
1224 	return (ARC_BUF_COMPRESSED(buf) ?
1225 	    HDR_GET_COMPRESS(buf->b_hdr) : ZIO_COMPRESS_OFF);
1226 }
1227 
1228 #define	ARC_MINTIME	(hz>>4) /* 62 ms */
1229 
1230 /*
1231  * Return the compression algorithm used to store this data in the ARC. If ARC
1232  * compression is enabled or this is an encrypted block, this will be the same
1233  * as what's used to store it on-disk. Otherwise, this will be ZIO_COMPRESS_OFF.
1234  */
1235 static inline enum zio_compress
1236 arc_hdr_get_compress(arc_buf_hdr_t *hdr)
1237 {
1238 	return (HDR_COMPRESSION_ENABLED(hdr) ?
1239 	    HDR_GET_COMPRESS(hdr) : ZIO_COMPRESS_OFF);
1240 }
1241 
1242 static inline boolean_t
1243 arc_buf_is_shared(arc_buf_t *buf)
1244 {
1245 	boolean_t shared = (buf->b_data != NULL &&
1246 	    buf->b_hdr->b_l1hdr.b_pabd != NULL &&
1247 	    abd_is_linear(buf->b_hdr->b_l1hdr.b_pabd) &&
1248 	    buf->b_data == abd_to_buf(buf->b_hdr->b_l1hdr.b_pabd));
1249 	IMPLY(shared, HDR_SHARED_DATA(buf->b_hdr));
1250 	IMPLY(shared, ARC_BUF_SHARED(buf));
1251 	IMPLY(shared, ARC_BUF_COMPRESSED(buf) || ARC_BUF_LAST(buf));
1252 
1253 	/*
1254 	 * It would be nice to assert arc_can_share() too, but the "hdr isn't
1255 	 * already being shared" requirement prevents us from doing that.
1256 	 */
1257 
1258 	return (shared);
1259 }
1260 
1261 /*
1262  * Free the checksum associated with this header. If there is no checksum, this
1263  * is a no-op.
1264  */
1265 static inline void
1266 arc_cksum_free(arc_buf_hdr_t *hdr)
1267 {
1268 	ASSERT(HDR_HAS_L1HDR(hdr));
1269 
1270 	mutex_enter(&hdr->b_l1hdr.b_freeze_lock);
1271 	if (hdr->b_l1hdr.b_freeze_cksum != NULL) {
1272 		kmem_free(hdr->b_l1hdr.b_freeze_cksum, sizeof (zio_cksum_t));
1273 		hdr->b_l1hdr.b_freeze_cksum = NULL;
1274 	}
1275 	mutex_exit(&hdr->b_l1hdr.b_freeze_lock);
1276 }
1277 
1278 /*
1279  * Return true iff at least one of the bufs on hdr is not compressed.
1280  * Encrypted buffers count as compressed.
1281  */
1282 static boolean_t
1283 arc_hdr_has_uncompressed_buf(arc_buf_hdr_t *hdr)
1284 {
1285 	ASSERT(hdr->b_l1hdr.b_state == arc_anon || HDR_EMPTY_OR_LOCKED(hdr));
1286 
1287 	for (arc_buf_t *b = hdr->b_l1hdr.b_buf; b != NULL; b = b->b_next) {
1288 		if (!ARC_BUF_COMPRESSED(b)) {
1289 			return (B_TRUE);
1290 		}
1291 	}
1292 	return (B_FALSE);
1293 }
1294 
1295 /*
1296  * If we've turned on the ZFS_DEBUG_MODIFY flag, verify that the buf's data
1297  * matches the checksum that is stored in the hdr. If there is no checksum,
1298  * or if the buf is compressed, this is a no-op.
1299  */
1300 static void
1301 arc_cksum_verify(arc_buf_t *buf)
1302 {
1303 	arc_buf_hdr_t *hdr = buf->b_hdr;
1304 	zio_cksum_t zc;
1305 
1306 	if (!(zfs_flags & ZFS_DEBUG_MODIFY))
1307 		return;
1308 
1309 	if (ARC_BUF_COMPRESSED(buf))
1310 		return;
1311 
1312 	ASSERT(HDR_HAS_L1HDR(hdr));
1313 
1314 	mutex_enter(&hdr->b_l1hdr.b_freeze_lock);
1315 
1316 	if (hdr->b_l1hdr.b_freeze_cksum == NULL || HDR_IO_ERROR(hdr)) {
1317 		mutex_exit(&hdr->b_l1hdr.b_freeze_lock);
1318 		return;
1319 	}
1320 
1321 	fletcher_2_native(buf->b_data, arc_buf_size(buf), NULL, &zc);
1322 	if (!ZIO_CHECKSUM_EQUAL(*hdr->b_l1hdr.b_freeze_cksum, zc))
1323 		panic("buffer modified while frozen!");
1324 	mutex_exit(&hdr->b_l1hdr.b_freeze_lock);
1325 }
1326 
1327 /*
1328  * This function makes the assumption that data stored in the L2ARC
1329  * will be transformed exactly as it is in the main pool. Because of
1330  * this we can verify the checksum against the reading process's bp.
1331  */
1332 static boolean_t
1333 arc_cksum_is_equal(arc_buf_hdr_t *hdr, zio_t *zio)
1334 {
1335 	enum zio_compress compress = BP_GET_COMPRESS(zio->io_bp);
1336 	boolean_t valid_cksum;
1337 
1338 	ASSERT(!BP_IS_EMBEDDED(zio->io_bp));
1339 	VERIFY3U(BP_GET_PSIZE(zio->io_bp), ==, HDR_GET_PSIZE(hdr));
1340 
1341 	/*
1342 	 * We rely on the blkptr's checksum to determine if the block
1343 	 * is valid or not. When compressed arc is enabled, the l2arc
1344 	 * writes the block to the l2arc just as it appears in the pool.
1345 	 * This allows us to use the blkptr's checksum to validate the
1346 	 * data that we just read off of the l2arc without having to store
1347 	 * a separate checksum in the arc_buf_hdr_t. However, if compressed
1348 	 * arc is disabled, then the data written to the l2arc is always
1349 	 * uncompressed and won't match the block as it exists in the main
1350 	 * pool. When this is the case, we must first compress it if it is
1351 	 * compressed on the main pool before we can validate the checksum.
1352 	 */
1353 	if (!HDR_COMPRESSION_ENABLED(hdr) && compress != ZIO_COMPRESS_OFF) {
1354 		ASSERT3U(HDR_GET_COMPRESS(hdr), ==, ZIO_COMPRESS_OFF);
1355 		uint64_t lsize = HDR_GET_LSIZE(hdr);
1356 		uint64_t csize;
1357 
1358 		abd_t *cdata = abd_alloc_linear(HDR_GET_PSIZE(hdr), B_TRUE);
1359 		csize = zio_compress_data(compress, zio->io_abd,
1360 		    abd_to_buf(cdata), lsize);
1361 
1362 		ASSERT3U(csize, <=, HDR_GET_PSIZE(hdr));
1363 		if (csize < HDR_GET_PSIZE(hdr)) {
1364 			/*
1365 			 * Compressed blocks are always a multiple of the
1366 			 * smallest ashift in the pool. Ideally, we would
1367 			 * like to round up the csize to the next
1368 			 * spa_min_ashift but that value may have changed
1369 			 * since the block was last written. Instead,
1370 			 * we rely on the fact that the hdr's psize
1371 			 * was set to the psize of the block when it was
1372 			 * last written. We set the csize to that value
1373 			 * and zero out any part that should not contain
1374 			 * data.
1375 			 */
1376 			abd_zero_off(cdata, csize, HDR_GET_PSIZE(hdr) - csize);
1377 			csize = HDR_GET_PSIZE(hdr);
1378 		}
1379 		zio_push_transform(zio, cdata, csize, HDR_GET_PSIZE(hdr), NULL);
1380 	}
1381 
1382 	/*
1383 	 * Block pointers always store the checksum for the logical data.
1384 	 * If the block pointer has the gang bit set, then the checksum
1385 	 * it represents is for the reconstituted data and not for an
1386 	 * individual gang member. The zio pipeline, however, must be able to
1387 	 * determine the checksum of each of the gang constituents so it
1388 	 * treats the checksum comparison differently than what we need
1389 	 * for l2arc blocks. This prevents us from using the
1390 	 * zio_checksum_error() interface directly. Instead we must call the
1391 	 * zio_checksum_error_impl() so that we can ensure the checksum is
1392 	 * generated using the correct checksum algorithm and accounts for the
1393 	 * logical I/O size and not just a gang fragment.
1394 	 */
1395 	valid_cksum = (zio_checksum_error_impl(zio->io_spa, zio->io_bp,
1396 	    BP_GET_CHECKSUM(zio->io_bp), zio->io_abd, zio->io_size,
1397 	    zio->io_offset, NULL) == 0);
1398 	zio_pop_transforms(zio);
1399 	return (valid_cksum);
1400 }
1401 
1402 /*
1403  * Given a buf full of data, if ZFS_DEBUG_MODIFY is enabled this computes a
1404  * checksum and attaches it to the buf's hdr so that we can ensure that the buf
1405  * isn't modified later on. If buf is compressed or there is already a checksum
1406  * on the hdr, this is a no-op (we only checksum uncompressed bufs).
1407  */
1408 static void
1409 arc_cksum_compute(arc_buf_t *buf)
1410 {
1411 	arc_buf_hdr_t *hdr = buf->b_hdr;
1412 
1413 	if (!(zfs_flags & ZFS_DEBUG_MODIFY))
1414 		return;
1415 
1416 	ASSERT(HDR_HAS_L1HDR(hdr));
1417 
1418 	mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock);
1419 	if (hdr->b_l1hdr.b_freeze_cksum != NULL || ARC_BUF_COMPRESSED(buf)) {
1420 		mutex_exit(&hdr->b_l1hdr.b_freeze_lock);
1421 		return;
1422 	}
1423 
1424 	ASSERT(!ARC_BUF_ENCRYPTED(buf));
1425 	ASSERT(!ARC_BUF_COMPRESSED(buf));
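	/*
	 * The debug checksum is a fletcher-2 of the buf's uncompressed
	 * contents; arc_cksum_verify() recomputes it later to catch any
	 * modification of a frozen buf.
	 */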
1426 	hdr->b_l1hdr.b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t),
1427 	    KM_SLEEP);
1428 	fletcher_2_native(buf->b_data, arc_buf_size(buf), NULL,
1429 	    hdr->b_l1hdr.b_freeze_cksum);
1430 	mutex_exit(&hdr->b_l1hdr.b_freeze_lock);
1431 	arc_buf_watch(buf);
1432 }
1433 
1434 #ifndef _KERNEL
1435 typedef struct procctl {
1436 	long cmd;
1437 	prwatch_t prwatch;
1438 } procctl_t;
1439 #endif
1440 
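/*
 * In userland builds (e.g. ztest), arc_buf_watch()/arc_buf_unwatch() use
 * /proc PCWATCH write watchpoints on the buf's memory so that any stray
 * modification of a frozen buf faults immediately; setting pr_size to 0
 * clears the watched area again.
 */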
1441 /* ARGSUSED */
1442 static void
1443 arc_buf_unwatch(arc_buf_t *buf)
1444 {
1445 #ifndef _KERNEL
1446 	if (arc_watch) {
1447 		int result;
1448 		procctl_t ctl;
1449 		ctl.cmd = PCWATCH;
1450 		ctl.prwatch.pr_vaddr = (uintptr_t)buf->b_data;
1451 		ctl.prwatch.pr_size = 0;
1452 		ctl.prwatch.pr_wflags = 0;
1453 		result = write(arc_procfd, &ctl, sizeof (ctl));
1454 		ASSERT3U(result, ==, sizeof (ctl));
1455 	}
1456 #endif
1457 }
1458 
1459 /* ARGSUSED */
1460 static void
1461 arc_buf_watch(arc_buf_t *buf)
1462 {
1463 #ifndef _KERNEL
1464 	if (arc_watch) {
1465 		int result;
1466 		procctl_t ctl;
1467 		ctl.cmd = PCWATCH;
1468 		ctl.prwatch.pr_vaddr = (uintptr_t)buf->b_data;
1469 		ctl.prwatch.pr_size = arc_buf_size(buf);
1470 		ctl.prwatch.pr_wflags = WA_WRITE;
1471 		result = write(arc_procfd, &ctl, sizeof (ctl));
1472 		ASSERT3U(result, ==, sizeof (ctl));
1473 	}
1474 #endif
1475 }
1476 
1477 static arc_buf_contents_t
1478 arc_buf_type(arc_buf_hdr_t *hdr)
1479 {
1480 	arc_buf_contents_t type;
1481 	if (HDR_ISTYPE_METADATA(hdr)) {
1482 		type = ARC_BUFC_METADATA;
1483 	} else {
1484 		type = ARC_BUFC_DATA;
1485 	}
1486 	VERIFY3U(hdr->b_type, ==, type);
1487 	return (type);
1488 }
1489 
1490 boolean_t
1491 arc_is_metadata(arc_buf_t *buf)
1492 {
1493 	return (HDR_ISTYPE_METADATA(buf->b_hdr) != 0);
1494 }
1495 
1496 static uint32_t
1497 arc_bufc_to_flags(arc_buf_contents_t type)
1498 {
1499 	switch (type) {
1500 	case ARC_BUFC_DATA:
1501 		/* metadata field is 0 if buffer contains normal data */
1502 		return (0);
1503 	case ARC_BUFC_METADATA:
1504 		return (ARC_FLAG_BUFC_METADATA);
1505 	default:
1506 		break;
1507 	}
1508 	panic("undefined ARC buffer type!");
1509 	return ((uint32_t)-1);
1510 }
1511 
1512 void
1513 arc_buf_thaw(arc_buf_t *buf)
1514 {
1515 	arc_buf_hdr_t *hdr = buf->b_hdr;
1516 
1517 	ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon);
1518 	ASSERT(!HDR_IO_IN_PROGRESS(hdr));
1519 
1520 	arc_cksum_verify(buf);
1521 
1522 	/*
1523 	 * Compressed buffers do not manipulate the b_freeze_cksum.
1524 	 */
1525 	if (ARC_BUF_COMPRESSED(buf))
1526 		return;
1527 
1528 	ASSERT(HDR_HAS_L1HDR(hdr));
1529 	arc_cksum_free(hdr);
1530 
1531 	mutex_enter(&hdr->b_l1hdr.b_freeze_lock);
1532 #ifdef ZFS_DEBUG
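	/*
	 * The 1-byte b_thawed allocation below carries no data of its own;
	 * with kmem debugging enabled its audit record preserves the stack
	 * of the most recent thaw, which helps when chasing a
	 * modify-after-freeze bug.
	 */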
1533 	if (zfs_flags & ZFS_DEBUG_MODIFY) {
1534 		if (hdr->b_l1hdr.b_thawed != NULL)
1535 			kmem_free(hdr->b_l1hdr.b_thawed, 1);
1536 		hdr->b_l1hdr.b_thawed = kmem_alloc(1, KM_SLEEP);
1537 	}
1538 #endif
1539 
1540 	mutex_exit(&hdr->b_l1hdr.b_freeze_lock);
1541 
1542 	arc_buf_unwatch(buf);
1543 }
1544 
1545 void
1546 arc_buf_freeze(arc_buf_t *buf)
1547 {
1548 	if (!(zfs_flags & ZFS_DEBUG_MODIFY))
1549 		return;
1550 
1551 	if (ARC_BUF_COMPRESSED(buf))
1552 		return;
1553 
1554 	ASSERT(HDR_HAS_L1HDR(buf->b_hdr));
1555 	arc_cksum_compute(buf);
1556 }
1557 
1558 /*
1559  * The arc_buf_hdr_t's b_flags should never be modified directly. Instead,
1560  * the following functions should be used to ensure that the flags are
1561  * updated in a thread-safe way. When manipulating the flags either
1562  * the hash_lock must be held or the hdr must be undiscoverable. This
1563  * ensures that we're not racing with any other threads when updating
1564  * the flags.
1565  */
1566 static inline void
1567 arc_hdr_set_flags(arc_buf_hdr_t *hdr, arc_flags_t flags)
1568 {
1569 	ASSERT(HDR_EMPTY_OR_LOCKED(hdr));
1570 	hdr->b_flags |= flags;
1571 }
1572 
1573 static inline void
1574 arc_hdr_clear_flags(arc_buf_hdr_t *hdr, arc_flags_t flags)
1575 {
1576 	ASSERT(HDR_EMPTY_OR_LOCKED(hdr));
1577 	hdr->b_flags &= ~flags;
1578 }
1579 
1580 /*
1581  * Setting the compression bits in the arc_buf_hdr_t's b_flags is
1582  * done in a special way since we have to clear and set bits
1583  * at the same time. Consumers that wish to set the compression bits
1584  * must use this function to ensure that the flags are updated in
1585  * a thread-safe manner.
1586  */
1587 static void
1588 arc_hdr_set_compress(arc_buf_hdr_t *hdr, enum zio_compress cmp)
1589 {
1590 	ASSERT(HDR_EMPTY_OR_LOCKED(hdr));
1591 
1592 	/*
1593 	 * Holes and embedded blocks will always have a psize = 0 so
1594 	 * we ignore the compression of the blkptr and set the
1595 	 * arc_buf_hdr_t's compression to ZIO_COMPRESS_OFF.
1596 	 * Holes and embedded blocks remain anonymous so we don't
1597 	 * want to uncompress them. Mark them as uncompressed.
1598 	 */
1599 	if (!zfs_compressed_arc_enabled || HDR_GET_PSIZE(hdr) == 0) {
1600 		arc_hdr_clear_flags(hdr, ARC_FLAG_COMPRESSED_ARC);
1601 		ASSERT(!HDR_COMPRESSION_ENABLED(hdr));
1602 	} else {
1603 		arc_hdr_set_flags(hdr, ARC_FLAG_COMPRESSED_ARC);
1604 		ASSERT(HDR_COMPRESSION_ENABLED(hdr));
1605 	}
1606 
1607 	HDR_SET_COMPRESS(hdr, cmp);
1608 	ASSERT3U(HDR_GET_COMPRESS(hdr), ==, cmp);
1609 }
1610 
1611 /*
1612  * Looks for another buf on the same hdr which has the data decompressed, copies
1613  * from it, and returns true. If no such buf exists, returns false.
1614  */
1615 static boolean_t
1616 arc_buf_try_copy_decompressed_data(arc_buf_t *buf)
1617 {
1618 	arc_buf_hdr_t *hdr = buf->b_hdr;
1619 	boolean_t copied = B_FALSE;
1620 
1621 	ASSERT(HDR_HAS_L1HDR(hdr));
1622 	ASSERT3P(buf->b_data, !=, NULL);
1623 	ASSERT(!ARC_BUF_COMPRESSED(buf));
1624 
1625 	for (arc_buf_t *from = hdr->b_l1hdr.b_buf; from != NULL;
1626 	    from = from->b_next) {
1627 		/* can't use our own data buffer */
1628 		if (from == buf) {
1629 			continue;
1630 		}
1631 
1632 		if (!ARC_BUF_COMPRESSED(from)) {
1633 			bcopy(from->b_data, buf->b_data, arc_buf_size(buf));
1634 			copied = B_TRUE;
1635 			break;
1636 		}
1637 	}
1638 
1639 	/*
1640 	 * Note: With encryption support, the following assertion is no longer
1641 	 * necessarily valid. If we receive two back to back raw snapshots
1642 	 * (send -w), the second receive can use a hdr with a cksum already
1643 	 * calculated. This happens via:
1644 	 *    dmu_recv_stream() -> receive_read_record() -> arc_loan_raw_buf()
1645 	 * The rsend/send_mixed_raw test case exercises this code path.
1646 	 *
1647 	 * There were no decompressed bufs, so there should not be a
1648 	 * checksum on the hdr either.
1649 	 * EQUIV(!copied, hdr->b_l1hdr.b_freeze_cksum == NULL);
1650 	 */
1651 
1652 	return (copied);
1653 }
1654 
1655 /*
1656  * Return the size of the block, b_pabd, that is stored in the arc_buf_hdr_t.
1657  */
1658 static uint64_t
1659 arc_hdr_size(arc_buf_hdr_t *hdr)
1660 {
1661 	uint64_t size;
1662 
1663 	if (arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF &&
1664 	    HDR_GET_PSIZE(hdr) > 0) {
1665 		size = HDR_GET_PSIZE(hdr);
1666 	} else {
1667 		ASSERT3U(HDR_GET_LSIZE(hdr), !=, 0);
1668 		size = HDR_GET_LSIZE(hdr);
1669 	}
1670 	return (size);
1671 }
1672 
1673 static int
1674 arc_hdr_authenticate(arc_buf_hdr_t *hdr, spa_t *spa, uint64_t dsobj)
1675 {
1676 	int ret;
1677 	uint64_t csize;
1678 	uint64_t lsize = HDR_GET_LSIZE(hdr);
1679 	uint64_t psize = HDR_GET_PSIZE(hdr);
1680 	void *tmpbuf = NULL;
1681 	abd_t *abd = hdr->b_l1hdr.b_pabd;
1682 
1683 	ASSERT(HDR_EMPTY_OR_LOCKED(hdr));
1684 	ASSERT(HDR_AUTHENTICATED(hdr));
1685 	ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
1686 
1687 	/*
1688 	 * The MAC is calculated on the compressed data that is stored on disk.
1689 	 * However, if compressed arc is disabled we will only have the
1690 	 * decompressed data available to us now. Compress it into a temporary
1691 	 * abd so we can verify the MAC. The performance overhead of this will
1692 	 * be relatively low, since most objects in an encrypted objset will
1693 	 * be encrypted (instead of authenticated) anyway.
1694 	 */
1695 	if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF &&
1696 	    !HDR_COMPRESSION_ENABLED(hdr)) {
1697 		tmpbuf = zio_buf_alloc(lsize);
1698 		abd = abd_get_from_buf(tmpbuf, lsize);
1699 		abd_take_ownership_of_buf(abd, B_TRUE);
1700 
1701 		csize = zio_compress_data(HDR_GET_COMPRESS(hdr),
1702 		    hdr->b_l1hdr.b_pabd, tmpbuf, lsize);
1703 		ASSERT3U(csize, <=, psize);
1704 		abd_zero_off(abd, csize, psize - csize);
1705 	}
1706 
1707 	/*
1708 	 * Authentication is best effort. We authenticate whenever the key is
1709 	 * available. If we succeed we clear ARC_FLAG_NOAUTH.
1710 	 */
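	/*
	 * Objset blocks are special-cased because their MACs live in the
	 * objset_phys_t itself rather than in b_mac, so they are verified
	 * via the objset-specific routine below.
	 */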
1711 	if (hdr->b_crypt_hdr.b_ot == DMU_OT_OBJSET) {
1712 		ASSERT3U(HDR_GET_COMPRESS(hdr), ==, ZIO_COMPRESS_OFF);
1713 		ASSERT3U(lsize, ==, psize);
1714 		ret = spa_do_crypt_objset_mac_abd(B_FALSE, spa, dsobj, abd,
1715 		    psize, hdr->b_l1hdr.b_byteswap != DMU_BSWAP_NUMFUNCS);
1716 	} else {
1717 		ret = spa_do_crypt_mac_abd(B_FALSE, spa, dsobj, abd, psize,
1718 		    hdr->b_crypt_hdr.b_mac);
1719 	}
1720 
1721 	if (ret == 0)
1722 		arc_hdr_clear_flags(hdr, ARC_FLAG_NOAUTH);
1723 	else if (ret != ENOENT)
1724 		goto error;
1725 
1726 	if (tmpbuf != NULL)
1727 		abd_free(abd);
1728 
1729 	return (0);
1730 
1731 error:
1732 	if (tmpbuf != NULL)
1733 		abd_free(abd);
1734 
1735 	return (ret);
1736 }
1737 
1738 /*
1739  * This function will take a header that only has raw encrypted data in
1740  * b_crypt_hdr.b_rabd and decrypt it into a new buffer which is stored in
1741  * b_l1hdr.b_pabd. If designated in the header flags, this function will
1742  * also decompress the data.
1743  */
1744 static int
1745 arc_hdr_decrypt(arc_buf_hdr_t *hdr, spa_t *spa, const zbookmark_phys_t *zb)
1746 {
1747 	int ret;
1748 	abd_t *cabd = NULL;
1749 	void *tmp = NULL;
1750 	boolean_t no_crypt = B_FALSE;
1751 	boolean_t bswap = (hdr->b_l1hdr.b_byteswap != DMU_BSWAP_NUMFUNCS);
1752 
1753 	ASSERT(HDR_EMPTY_OR_LOCKED(hdr));
1754 	ASSERT(HDR_ENCRYPTED(hdr));
1755 
1756 	arc_hdr_alloc_pabd(hdr, ARC_HDR_DO_ADAPT);
1757 
1758 	ret = spa_do_crypt_abd(B_FALSE, spa, zb, hdr->b_crypt_hdr.b_ot,
1759 	    B_FALSE, bswap, hdr->b_crypt_hdr.b_salt, hdr->b_crypt_hdr.b_iv,
1760 	    hdr->b_crypt_hdr.b_mac, HDR_GET_PSIZE(hdr), hdr->b_l1hdr.b_pabd,
1761 	    hdr->b_crypt_hdr.b_rabd, &no_crypt);
1762 	if (ret != 0)
1763 		goto error;
1764 
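	/*
	 * If the crypto layer reports that no decryption was actually
	 * required, the raw data is already plaintext; copy it across as-is.
	 */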
1765 	if (no_crypt) {
1766 		abd_copy(hdr->b_l1hdr.b_pabd, hdr->b_crypt_hdr.b_rabd,
1767 		    HDR_GET_PSIZE(hdr));
1768 	}
1769 
1770 	/*
1771 	 * If this header has disabled arc compression but the b_pabd is
1772 	 * compressed after decrypting it, we need to decompress the newly
1773 	 * decrypted data.
1774 	 */
1775 	if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF &&
1776 	    !HDR_COMPRESSION_ENABLED(hdr)) {
1777 		/*
1778 		 * We want to make sure that we are correctly honoring the
1779 		 * zfs_abd_scatter_enabled setting, so we allocate an abd here
1780 		 * and then loan a buffer from it, rather than allocating a
1781 		 * linear buffer and wrapping it in an abd later.
1782 		 */
1783 		cabd = arc_get_data_abd(hdr, arc_hdr_size(hdr), hdr, B_TRUE);
1784 		tmp = abd_borrow_buf(cabd, arc_hdr_size(hdr));
1785 
1786 		ret = zio_decompress_data(HDR_GET_COMPRESS(hdr),
1787 		    hdr->b_l1hdr.b_pabd, tmp, HDR_GET_PSIZE(hdr),
1788 		    HDR_GET_LSIZE(hdr));
1789 		if (ret != 0) {
1790 			abd_return_buf(cabd, tmp, arc_hdr_size(hdr));
1791 			goto error;
1792 		}
1793 
1794 		abd_return_buf_copy(cabd, tmp, arc_hdr_size(hdr));
1795 		arc_free_data_abd(hdr, hdr->b_l1hdr.b_pabd,
1796 		    arc_hdr_size(hdr), hdr);
1797 		hdr->b_l1hdr.b_pabd = cabd;
1798 	}
1799 
1800 	return (0);
1801 
1802 error:
1803 	arc_hdr_free_pabd(hdr, B_FALSE);
1804 	if (cabd != NULL)
1805 		arc_free_data_buf(hdr, cabd, arc_hdr_size(hdr), hdr);
1806 
1807 	return (ret);
1808 }
1809 
1810 /*
1811  * This function is called during arc_buf_fill() to prepare the header's
1812  * abd plaintext pointer for use. This involves authenticated protected
1813  * abd plaintext pointer for use. This involves authenticating protected
1814  */
1815 static int
1816 arc_fill_hdr_crypt(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, spa_t *spa,
1817     const zbookmark_phys_t *zb, boolean_t noauth)
1818 {
1819 	int ret;
1820 
1821 	ASSERT(HDR_PROTECTED(hdr));
1822 
1823 	if (hash_lock != NULL)
1824 		mutex_enter(hash_lock);
1825 
1826 	if (HDR_NOAUTH(hdr) && !noauth) {
1827 		/*
1828 		 * The caller requested authenticated data but our data has
1829 		 * not been authenticated yet. Verify the MAC now if we can.
1830 		 */
1831 		ret = arc_hdr_authenticate(hdr, spa, zb->zb_objset);
1832 		if (ret != 0)
1833 			goto error;
1834 	} else if (HDR_HAS_RABD(hdr) && hdr->b_l1hdr.b_pabd == NULL) {
1835 		/*
1836 		 * If we only have the encrypted version of the data, but the
1837 		 * unencrypted version was requested we take this opportunity
1838 		 * to store the decrypted version in the header for future use.
1839 		 */
1840 		ret = arc_hdr_decrypt(hdr, spa, zb);
1841 		if (ret != 0)
1842 			goto error;
1843 	}
1844 
1845 	ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
1846 
1847 	if (hash_lock != NULL)
1848 		mutex_exit(hash_lock);
1849 
1850 	return (0);
1851 
1852 error:
1853 	if (hash_lock != NULL)
1854 		mutex_exit(hash_lock);
1855 
1856 	return (ret);
1857 }
1858 
1859 /*
1860  * This function is used by the dbuf code to decrypt bonus buffers in place.
1861  * The dbuf code itself doesn't have any locking for decrypting a shared dnode
1862  * block, so we use the hash lock here to protect against concurrent calls to
1863  * arc_buf_fill().
1864  */
1865 /* ARGSUSED */
1866 static void
1867 arc_buf_untransform_in_place(arc_buf_t *buf, kmutex_t *hash_lock)
1868 {
1869 	arc_buf_hdr_t *hdr = buf->b_hdr;
1870 
1871 	ASSERT(HDR_ENCRYPTED(hdr));
1872 	ASSERT3U(hdr->b_crypt_hdr.b_ot, ==, DMU_OT_DNODE);
1873 	ASSERT(HDR_EMPTY_OR_LOCKED(hdr));
1874 	ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
1875 
1876 	zio_crypt_copy_dnode_bonus(hdr->b_l1hdr.b_pabd, buf->b_data,
1877 	    arc_buf_size(buf));
1878 	buf->b_flags &= ~ARC_BUF_FLAG_ENCRYPTED;
1879 	buf->b_flags &= ~ARC_BUF_FLAG_COMPRESSED;
1880 	hdr->b_crypt_hdr.b_ebufcnt -= 1;
1881 }
1882 
1883 /*
1884  * Given a buf that has a data buffer attached to it, this function will
1885  * efficiently fill the buf with data of the specified compression setting from
1886  * the hdr and update the hdr's b_freeze_cksum if necessary. If the buf and hdr
1887  * are already sharing a data buf, no copy is performed.
1888  *
1889  * If the buf is marked as compressed but uncompressed data was requested, this
1890  * will allocate a new data buffer for the buf, remove that flag, and fill the
1891  * buf with uncompressed data. You can't request a compressed buf on a hdr with
1892  * uncompressed data, and (since we haven't added support for it yet) if you
1893  * want compressed data your buf must already be marked as compressed and have
1894  * the correct-sized data buffer.
1895  */
1896 static int
1897 arc_buf_fill(arc_buf_t *buf, spa_t *spa, const zbookmark_phys_t *zb,
1898     arc_fill_flags_t flags)
1899 {
1900 	int error = 0;
1901 	arc_buf_hdr_t *hdr = buf->b_hdr;
1902 	boolean_t hdr_compressed =
1903 	    (arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF);
1904 	boolean_t compressed = (flags & ARC_FILL_COMPRESSED) != 0;
1905 	boolean_t encrypted = (flags & ARC_FILL_ENCRYPTED) != 0;
1906 	dmu_object_byteswap_t bswap = hdr->b_l1hdr.b_byteswap;
1907 	kmutex_t *hash_lock = (flags & ARC_FILL_LOCKED) ? NULL : HDR_LOCK(hdr);
1908 
1909 	ASSERT3P(buf->b_data, !=, NULL);
1910 	IMPLY(compressed, hdr_compressed || ARC_BUF_ENCRYPTED(buf));
1911 	IMPLY(compressed, ARC_BUF_COMPRESSED(buf));
1912 	IMPLY(encrypted, HDR_ENCRYPTED(hdr));
1913 	IMPLY(encrypted, ARC_BUF_ENCRYPTED(buf));
1914 	IMPLY(encrypted, ARC_BUF_COMPRESSED(buf));
1915 	IMPLY(encrypted, !ARC_BUF_SHARED(buf));
1916 
1917 	/*
1918 	 * If the caller wanted encrypted data we just need to copy it from
1919 	 * b_rabd and potentially byteswap it. We won't be able to do any
1920 	 * further transforms on it.
1921 	 */
1922 	if (encrypted) {
1923 		ASSERT(HDR_HAS_RABD(hdr));
1924 		abd_copy_to_buf(buf->b_data, hdr->b_crypt_hdr.b_rabd,
1925 		    HDR_GET_PSIZE(hdr));
1926 		goto byteswap;
1927 	}
1928 
1929 	/*
1930 	 * Adjust encrypted and authenticated headers to accommodate
1931 	 * the request if needed. Dnode blocks (ARC_FILL_IN_PLACE) are
1932 	 * allowed to fail decryption (due to keys not being loaded)
1933 	 * without being marked as an I/O error.
1934 	 */
1935 	if (HDR_PROTECTED(hdr)) {
1936 		error = arc_fill_hdr_crypt(hdr, hash_lock, spa,
1937 		    zb, !!(flags & ARC_FILL_NOAUTH));
1938 		if (error == EACCES && (flags & ARC_FILL_IN_PLACE) != 0) {
1939 			return (error);
1940 		} else if (error != 0) {
1941 			if (hash_lock != NULL)
1942 				mutex_enter(hash_lock);
1943 			arc_hdr_set_flags(hdr, ARC_FLAG_IO_ERROR);
1944 			if (hash_lock != NULL)
1945 				mutex_exit(hash_lock);
1946 			return (error);
1947 		}
1948 	}
1949 
1950 	/*
1951 	 * There is a special case here for dnode blocks which are
1952 	 * decrypting their bonus buffers. These blocks may request to
1953 	 * be decrypted in-place. This is necessary because there may
1954 	 * be many dnodes pointing into this buffer and there is
1955 	 * currently no method to synchronize replacing the backing
1956 	 * b_data buffer and updating all of the pointers. Here we use
1957 	 * the hash lock to ensure there are no races. If the need
1958 	 * arises for other types to be decrypted in-place, they must
1959 	 * add handling here as well.
1960 	 */
1961 	if ((flags & ARC_FILL_IN_PLACE) != 0) {
1962 		ASSERT(!hdr_compressed);
1963 		ASSERT(!compressed);
1964 		ASSERT(!encrypted);
1965 
1966 		if (HDR_ENCRYPTED(hdr) && ARC_BUF_ENCRYPTED(buf)) {
1967 			ASSERT3U(hdr->b_crypt_hdr.b_ot, ==, DMU_OT_DNODE);
1968 
1969 			if (hash_lock != NULL)
1970 				mutex_enter(hash_lock);
1971 			arc_buf_untransform_in_place(buf, hash_lock);
1972 			if (hash_lock != NULL)
1973 				mutex_exit(hash_lock);
1974 
1975 			/* Compute the hdr's checksum if necessary */
1976 			arc_cksum_compute(buf);
1977 		}
1978 
1979 		return (0);
1980 	}
1981 
1982 	if (hdr_compressed == compressed) {
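		/*
		 * The caller wants the data in the same form the hdr already
		 * stores it, so unless the buf is sharing the hdr's buffer a
		 * straight copy out of b_pabd is all that's needed.
		 */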
1983 		if (!arc_buf_is_shared(buf)) {
1984 			abd_copy_to_buf(buf->b_data, hdr->b_l1hdr.b_pabd,
1985 			    arc_buf_size(buf));
1986 		}
1987 	} else {
1988 		ASSERT(hdr_compressed);
1989 		ASSERT(!compressed);
1990 		ASSERT3U(HDR_GET_LSIZE(hdr), !=, HDR_GET_PSIZE(hdr));
1991 
1992 		/*
1993 		 * If the buf is sharing its data with the hdr, unlink it and
1994 		 * allocate a new data buffer for the buf.
1995 		 */
1996 		if (arc_buf_is_shared(buf)) {
1997 			ASSERT(ARC_BUF_COMPRESSED(buf));
1998 
1999 			/* We need to give the buf its own b_data */
2000 			buf->b_flags &= ~ARC_BUF_FLAG_SHARED;
2001 			buf->b_data =
2002 			    arc_get_data_buf(hdr, HDR_GET_LSIZE(hdr), buf);
2003 			arc_hdr_clear_flags(hdr, ARC_FLAG_SHARED_DATA);
2004 
2005 			/* Previously overhead was 0; just add new overhead */
2006 			ARCSTAT_INCR(arcstat_overhead_size, HDR_GET_LSIZE(hdr));
2007 		} else if (ARC_BUF_COMPRESSED(buf)) {
2008 			/* We need to reallocate the buf's b_data */
2009 			arc_free_data_buf(hdr, buf->b_data, HDR_GET_PSIZE(hdr),
2010 			    buf);
2011 			buf->b_data =
2012 			    arc_get_data_buf(hdr, HDR_GET_LSIZE(hdr), buf);
2013 
2014 			/* We increased the size of b_data; update overhead */
2015 			ARCSTAT_INCR(arcstat_overhead_size,
2016 			    HDR_GET_LSIZE(hdr) - HDR_GET_PSIZE(hdr));
2017 		}
2018 
2019 		/*
2020 		 * Regardless of the buf's previous compression settings, it
2021 		 * should not be compressed at the end of this function.
2022 		 */
2023 		buf->b_flags &= ~ARC_BUF_FLAG_COMPRESSED;
2024 
2025 		/*
2026 		 * Try copying the data from another buf which already has a
2027 		 * decompressed version. If that's not possible, it's time to
2028 		 * bite the bullet and decompress the data from the hdr.
2029 		 */
2030 		if (arc_buf_try_copy_decompressed_data(buf)) {
2031 			/* Skip byteswapping and checksumming (already done) */
2032 			ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, !=, NULL);
2033 			return (0);
2034 		} else {
2035 			error = zio_decompress_data(HDR_GET_COMPRESS(hdr),
2036 			    hdr->b_l1hdr.b_pabd, buf->b_data,
2037 			    HDR_GET_PSIZE(hdr), HDR_GET_LSIZE(hdr));
2038 
2039 			/*
2040 			 * Absent hardware errors or software bugs, this should
2041 			 * be impossible, but log it anyway so we can debug it.
2042 			 */
2043 			if (error != 0) {
2044 				zfs_dbgmsg(
2045 				    "hdr %p, compress %d, psize %d, lsize %d",
2046 				    hdr, arc_hdr_get_compress(hdr),
2047 				    HDR_GET_PSIZE(hdr), HDR_GET_LSIZE(hdr));
2048 				if (hash_lock != NULL)
2049 					mutex_enter(hash_lock);
2050 				arc_hdr_set_flags(hdr, ARC_FLAG_IO_ERROR);
2051 				if (hash_lock != NULL)
2052 					mutex_exit(hash_lock);
2053 				return (SET_ERROR(EIO));
2054 			}
2055 		}
2056 	}
2057 
2058 byteswap:
2059 	/* Byteswap the buf's data if necessary */
2060 	if (bswap != DMU_BSWAP_NUMFUNCS) {
2061 		ASSERT(!HDR_SHARED_DATA(hdr));
2062 		ASSERT3U(bswap, <, DMU_BSWAP_NUMFUNCS);
2063 		dmu_ot_byteswap[bswap].ob_func(buf->b_data, HDR_GET_LSIZE(hdr));
2064 	}
2065 
2066 	/* Compute the hdr's checksum if necessary */
2067 	arc_cksum_compute(buf);
2068 
2069 	return (0);
2070 }
2071 
2072 /*
2073  * If this function is being called to decrypt an encrypted buffer or verify an
2074  * authenticated one, the key must be loaded and a mapping must be made
2075  * available in the keystore via spa_keystore_create_mapping() or one of its
2076  * callers.
2077  */
2078 int
2079 arc_untransform(arc_buf_t *buf, spa_t *spa, const zbookmark_phys_t *zb,
2080     boolean_t in_place)
2081 {
2082 	int ret;
2083 	arc_fill_flags_t flags = 0;
2084 
2085 	if (in_place)
2086 		flags |= ARC_FILL_IN_PLACE;
2087 
2088 	ret = arc_buf_fill(buf, spa, zb, flags);
2089 	if (ret == ECKSUM) {
2090 		/*
2091 		 * Convert authentication and decryption errors to EIO
2092 		 * (and generate an ereport) before leaving the ARC.
2093 		 */
2094 		ret = SET_ERROR(EIO);
2095 		spa_log_error(spa, zb);
2096 		(void) zfs_ereport_post(FM_EREPORT_ZFS_AUTHENTICATION,
2097 		    spa, NULL, zb, NULL, 0, 0);
2098 	}
2099 
2100 	return (ret);
2101 }
2102 
2103 /*
2104  * Increment the amount of evictable space in the arc_state_t's refcount.
2105  * We account for the space used by the hdr and the arc buf individually
2106  * so that we can add and remove them from the refcount individually.
2107  */
2108 static void
2109 arc_evictable_space_increment(arc_buf_hdr_t *hdr, arc_state_t *state)
2110 {
2111 	arc_buf_contents_t type = arc_buf_type(hdr);
2112 
2113 	ASSERT(HDR_HAS_L1HDR(hdr));
2114 
2115 	if (GHOST_STATE(state)) {
2116 		ASSERT0(hdr->b_l1hdr.b_bufcnt);
2117 		ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
2118 		ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
2119 		ASSERT(!HDR_HAS_RABD(hdr));
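		/*
		 * Ghost headers carry no data, so account the logical size
		 * of the block they describe.
		 */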
2120 		(void) zfs_refcount_add_many(&state->arcs_esize[type],
2121 		    HDR_GET_LSIZE(hdr), hdr);
2122 		return;
2123 	}
2124 
2125 	ASSERT(!GHOST_STATE(state));
2126 	if (hdr->b_l1hdr.b_pabd != NULL) {
2127 		(void) zfs_refcount_add_many(&state->arcs_esize[type],
2128 		    arc_hdr_size(hdr), hdr);
2129 	}
2130 	if (HDR_HAS_RABD(hdr)) {
2131 		(void) zfs_refcount_add_many(&state->arcs_esize[type],
2132 		    HDR_GET_PSIZE(hdr), hdr);
2133 	}
2134 	for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL;
2135 	    buf = buf->b_next) {
2136 		if (arc_buf_is_shared(buf))
2137 			continue;
2138 		(void) zfs_refcount_add_many(&state->arcs_esize[type],
2139 		    arc_buf_size(buf), buf);
2140 	}
2141 }
2142 
2143 /*
2144  * Decrement the amount of evictable space in the arc_state_t's refcount.
2145  * We account for the space used by the hdr and the arc buf individually
2146  * so that we can add and remove them from the refcount individually.
2147  */
2148 static void
2149 arc_evictable_space_decrement(arc_buf_hdr_t *hdr, arc_state_t *state)
2150 {
2151 	arc_buf_contents_t type = arc_buf_type(hdr);
2152 
2153 	ASSERT(HDR_HAS_L1HDR(hdr));
2154 
2155 	if (GHOST_STATE(state)) {
2156 		ASSERT0(hdr->b_l1hdr.b_bufcnt);
2157 		ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
2158 		ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
2159 		ASSERT(!HDR_HAS_RABD(hdr));
2160 		(void) zfs_refcount_remove_many(&state->arcs_esize[type],
2161 		    HDR_GET_LSIZE(hdr), hdr);
2162 		return;
2163 	}
2164 
2165 	ASSERT(!GHOST_STATE(state));
2166 	if (hdr->b_l1hdr.b_pabd != NULL) {
2167 		(void) zfs_refcount_remove_many(&state->arcs_esize[type],
2168 		    arc_hdr_size(hdr), hdr);
2169 	}
2170 	if (HDR_HAS_RABD(hdr)) {
2171 		(void) zfs_refcount_remove_many(&state->arcs_esize[type],
2172 		    HDR_GET_PSIZE(hdr), hdr);
2173 	}
2174 	for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL;
2175 	    buf = buf->b_next) {
2176 		if (arc_buf_is_shared(buf))
2177 			continue;
2178 		(void) zfs_refcount_remove_many(&state->arcs_esize[type],
2179 		    arc_buf_size(buf), buf);
2180 	}
2181 }
2182 
2183 /*
2184  * Add a reference to this hdr indicating that someone is actively
2185  * referencing that memory. When the refcount transitions from 0 to 1,
2186  * we remove it from the respective arc_state_t list to indicate that
2187  * it is not evictable.
2188  */
2189 static void
2190 add_reference(arc_buf_hdr_t *hdr, void *tag)
2191 {
2192 	ASSERT(HDR_HAS_L1HDR(hdr));
2193 	if (!HDR_EMPTY(hdr) && !MUTEX_HELD(HDR_LOCK(hdr))) {
2194 		ASSERT(hdr->b_l1hdr.b_state == arc_anon);
2195 		ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
2196 		ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
2197 	}
2198 
2199 	arc_state_t *state = hdr->b_l1hdr.b_state;
2200 
2201 	if ((zfs_refcount_add(&hdr->b_l1hdr.b_refcnt, tag) == 1) &&
2202 	    (state != arc_anon)) {
2203 		/* We don't use the L2-only state list. */
2204 		if (state != arc_l2c_only) {
2205 			multilist_remove(state->arcs_list[arc_buf_type(hdr)],
2206 			    hdr);
2207 			arc_evictable_space_decrement(hdr, state);
2208 		}
2209 		/* remove the prefetch flag if we get a reference */
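		/*
		 * The L2ARC arcstats are bucketed by the header's state and
		 * prefetch status, so the header is removed from those stats
		 * before the flag changes and re-added afterwards.
		 */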
2210 		if (HDR_HAS_L2HDR(hdr))
2211 			l2arc_hdr_arcstats_decrement_state(hdr);
2212 		arc_hdr_clear_flags(hdr, ARC_FLAG_PREFETCH);
2213 		if (HDR_HAS_L2HDR(hdr))
2214 			l2arc_hdr_arcstats_increment_state(hdr);
2215 	}
2216 }
2217 
2218 /*
2219  * Remove a reference from this hdr. When the reference transitions from
2220  * 1 to 0 and we're not anonymous, then we add this hdr to the arc_state_t's
2221  * list making it eligible for eviction.
2222  */
2223 static int
2224 remove_reference(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, void *tag)
2225 {
2226 	int cnt;
2227 	arc_state_t *state = hdr->b_l1hdr.b_state;
2228 
2229 	ASSERT(HDR_HAS_L1HDR(hdr));
2230 	ASSERT(state == arc_anon || MUTEX_HELD(hash_lock));
2231 	ASSERT(!GHOST_STATE(state));
2232 
2233 	/*
2234 	 * arc_l2c_only counts as a ghost state so we don't need to explicitly
2235 	 * check to prevent usage of the arc_l2c_only list.
2236 	 */
2237 	if (((cnt = zfs_refcount_remove(&hdr->b_l1hdr.b_refcnt, tag)) == 0) &&
2238 	    (state != arc_anon)) {
2239 		multilist_insert(state->arcs_list[arc_buf_type(hdr)], hdr);
2240 		ASSERT3U(hdr->b_l1hdr.b_bufcnt, >, 0);
2241 		arc_evictable_space_increment(hdr, state);
2242 	}
2243 	return (cnt);
2244 }
2245 
2246 /*
2247  * Move the supplied buffer to the indicated state. The hash lock
2248  * for the buffer must be held by the caller.
2249  */
2250 static void
2251 arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr,
2252     kmutex_t *hash_lock)
2253 {
2254 	arc_state_t *old_state;
2255 	int64_t refcnt;
2256 	uint32_t bufcnt;
2257 	boolean_t update_old, update_new;
2258 	arc_buf_contents_t buftype = arc_buf_type(hdr);
2259 
2260 	/*
2261 	 * We almost always have an L1 hdr here, since we call arc_hdr_realloc()
2262 	 * in arc_read() when bringing a buffer out of the L2ARC.  However, the
2263 	 * L1 hdr doesn't always exist when we change state to arc_anon before
2264 	 * destroying a header, in which case reallocating to add the L1 hdr is
2265 	 * pointless.
2266 	 */
2267 	if (HDR_HAS_L1HDR(hdr)) {
2268 		old_state = hdr->b_l1hdr.b_state;
2269 		refcnt = zfs_refcount_count(&hdr->b_l1hdr.b_refcnt);
2270 		bufcnt = hdr->b_l1hdr.b_bufcnt;
2271 
2272 		update_old = (bufcnt > 0 || hdr->b_l1hdr.b_pabd != NULL ||
2273 		    HDR_HAS_RABD(hdr));
2274 	} else {
2275 		old_state = arc_l2c_only;
2276 		refcnt = 0;
2277 		bufcnt = 0;
2278 		update_old = B_FALSE;
2279 	}
2280 	update_new = update_old;
2281 
2282 	ASSERT(MUTEX_HELD(hash_lock));
2283 	ASSERT3P(new_state, !=, old_state);
2284 	ASSERT(!GHOST_STATE(new_state) || bufcnt == 0);
2285 	ASSERT(old_state != arc_anon || bufcnt <= 1);
2286 
2287 	/*
2288 	 * If this buffer is evictable, transfer it from the
2289 	 * old state list to the new state list.
2290 	 */
2291 	if (refcnt == 0) {
2292 		if (old_state != arc_anon && old_state != arc_l2c_only) {
2293 			ASSERT(HDR_HAS_L1HDR(hdr));
2294 			multilist_remove(old_state->arcs_list[buftype], hdr);
2295 
2296 			if (GHOST_STATE(old_state)) {
2297 				ASSERT0(bufcnt);
2298 				ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
2299 				update_old = B_TRUE;
2300 			}
2301 			arc_evictable_space_decrement(hdr, old_state);
2302 		}
2303 		if (new_state != arc_anon && new_state != arc_l2c_only) {
2304 
2305 			/*
2306 			 * An L1 header always exists here, since if we're
2307 			 * moving to some L1-cached state (i.e. not l2c_only or
2308 			 * anonymous), we realloc the header to add an L1hdr
2309 			 * beforehand.
2310 			 */
2311 			ASSERT(HDR_HAS_L1HDR(hdr));
2312 			multilist_insert(new_state->arcs_list[buftype], hdr);
2313 
2314 			if (GHOST_STATE(new_state)) {
2315 				ASSERT0(bufcnt);
2316 				ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
2317 				update_new = B_TRUE;
2318 			}
2319 			arc_evictable_space_increment(hdr, new_state);
2320 		}
2321 	}
2322 
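	/*
	 * Anonymous headers are not discoverable through the hash table,
	 * so drop the hdr from it when transitioning to arc_anon.
	 */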
2323 	ASSERT(!HDR_EMPTY(hdr));
2324 	if (new_state == arc_anon && HDR_IN_HASH_TABLE(hdr))
2325 		buf_hash_remove(hdr);
2326 
2327 	/* adjust state sizes (ignore arc_l2c_only) */
2328 
2329 	if (update_new && new_state != arc_l2c_only) {
2330 		ASSERT(HDR_HAS_L1HDR(hdr));
2331 		if (GHOST_STATE(new_state)) {
2332 			ASSERT0(bufcnt);
2333 
2334 			/*
2335 			 * When moving a header to a ghost state, we first
2336 			 * remove all arc buffers. Thus, we'll have a
2337 			 * bufcnt of zero, and no arc buffer to use for
2338 			 * the reference. As a result, we use the arc
2339 			 * header pointer for the reference.
2340 			 */
2341 			(void) zfs_refcount_add_many(&new_state->arcs_size,
2342 			    HDR_GET_LSIZE(hdr), hdr);
2343 			ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
2344 			ASSERT(!HDR_HAS_RABD(hdr));
2345 		} else {
2346 			uint32_t buffers = 0;
2347 
2348 			/*
2349 			 * Each individual buffer holds a unique reference,
2350 			 * thus we must remove each of these references one
2351 			 * at a time.
2352 			 */
2353 			for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL;
2354 			    buf = buf->b_next) {
2355 				ASSERT3U(bufcnt, !=, 0);
2356 				buffers++;
2357 
2358 				/*
2359 				 * When the arc_buf_t is sharing the data
2360 				 * block with the hdr, the owner of the
2361 				 * reference belongs to the hdr. Only
2362 				 * add to the refcount if the arc_buf_t is
2363 				 * not shared.
2364 				 */
2365 				if (arc_buf_is_shared(buf))
2366 					continue;
2367 
2368 				(void) zfs_refcount_add_many(
2369 				    &new_state->arcs_size,
2370 				    arc_buf_size(buf), buf);
2371 			}
2372 			ASSERT3U(bufcnt, ==, buffers);
2373 
2374 			if (hdr->b_l1hdr.b_pabd != NULL) {
2375 				(void) zfs_refcount_add_many(
2376 				    &new_state->arcs_size,
2377 				    arc_hdr_size(hdr), hdr);
2378 			}
2379 
2380 			if (HDR_HAS_RABD(hdr)) {
2381 				(void) zfs_refcount_add_many(
2382 				    &new_state->arcs_size,
2383 				    HDR_GET_PSIZE(hdr), hdr);
2384 			}
2385 		}
2386 	}
2387 
2388 	if (update_old && old_state != arc_l2c_only) {
2389 		ASSERT(HDR_HAS_L1HDR(hdr));
2390 		if (GHOST_STATE(old_state)) {
2391 			ASSERT0(bufcnt);
2392 			ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
2393 			ASSERT(!HDR_HAS_RABD(hdr));
2394 
2395 			/*
2396 			 * When moving a header off of a ghost state,
2397 			 * the header will not contain any arc buffers.
2398 			 * We use the arc header pointer for the reference
2399 			 * which is exactly what we did when we put the
2400 			 * header on the ghost state.
2401 			 */
2402 
2403 			(void) zfs_refcount_remove_many(&old_state->arcs_size,
2404 			    HDR_GET_LSIZE(hdr), hdr);
2405 		} else {
2406 			uint32_t buffers = 0;
2407 
2408 			/*
2409 			 * Each individual buffer holds a unique reference,
2410 			 * thus we must remove each of these references one
2411 			 * at a time.
2412 			 */
2413 			for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL;
2414 			    buf = buf->b_next) {
2415 				ASSERT3U(bufcnt, !=, 0);
2416 				buffers++;
2417 
2418 				/*
2419 				 * When the arc_buf_t is sharing the data
2420 				 * block with the hdr, the owner of the
2421 				 * reference belongs to the hdr. Only
2422 				 * add to the refcount if the arc_buf_t is
2423 				 * not shared.
2424 				 */
2425 				if (arc_buf_is_shared(buf))
2426 					continue;
2427 
2428 				(void) zfs_refcount_remove_many(
2429 				    &old_state->arcs_size, arc_buf_size(buf),
2430 				    buf);
2431 			}
2432 			ASSERT3U(bufcnt, ==, buffers);
2433 			ASSERT(hdr->b_l1hdr.b_pabd != NULL ||
2434 			    HDR_HAS_RABD(hdr));
2435 
2436 			if (hdr->b_l1hdr.b_pabd != NULL) {
2437 				(void) zfs_refcount_remove_many(
2438 				    &old_state->arcs_size, arc_hdr_size(hdr),
2439 				    hdr);
2440 			}
2441 
2442 			if (HDR_HAS_RABD(hdr)) {
2443 				(void) zfs_refcount_remove_many(
2444 				    &old_state->arcs_size, HDR_GET_PSIZE(hdr),
2445 				    hdr);
2446 			}
2447 		}
2448 	}
2449 
2450 	if (HDR_HAS_L1HDR(hdr)) {
2451 		hdr->b_l1hdr.b_state = new_state;
2452 
2453 		if (HDR_HAS_L2HDR(hdr) && new_state != arc_l2c_only) {
2454 			l2arc_hdr_arcstats_decrement_state(hdr);
2455 			hdr->b_l2hdr.b_arcs_state = new_state->arcs_state;
2456 			l2arc_hdr_arcstats_increment_state(hdr);
2457 		}
2458 	}
2459 
2460 	/*
2461 	 * L2 headers should never be on the L2 state list since they don't
2462 	 * have L1 headers allocated.
2463 	 */
2464 	ASSERT(multilist_is_empty(arc_l2c_only->arcs_list[ARC_BUFC_DATA]) &&
2465 	    multilist_is_empty(arc_l2c_only->arcs_list[ARC_BUFC_METADATA]));
2466 }
2467 
2468 void
2469 arc_space_consume(uint64_t space, arc_space_type_t type)
2470 {
2471 	ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES);
2472 
2473 	switch (type) {
2474 	case ARC_SPACE_DATA:
2475 		aggsum_add(&astat_data_size, space);
2476 		break;
2477 	case ARC_SPACE_META:
2478 		aggsum_add(&astat_metadata_size, space);
2479 		break;
2480 	case ARC_SPACE_OTHER:
2481 		aggsum_add(&astat_other_size, space);
2482 		break;
2483 	case ARC_SPACE_HDRS:
2484 		aggsum_add(&astat_hdr_size, space);
2485 		break;
2486 	case ARC_SPACE_L2HDRS:
2487 		aggsum_add(&astat_l2_hdr_size, space);
2488 		break;
2489 	}
2490 
2491 	if (type != ARC_SPACE_DATA)
2492 		aggsum_add(&arc_meta_used, space);
2493 
2494 	aggsum_add(&arc_size, space);
2495 }
2496 
2497 void
2498 arc_space_return(uint64_t space, arc_space_type_t type)
2499 {
2500 	ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES);
2501 
2502 	switch (type) {
2503 	case ARC_SPACE_DATA:
2504 		aggsum_add(&astat_data_size, -space);
2505 		break;
2506 	case ARC_SPACE_META:
2507 		aggsum_add(&astat_metadata_size, -space);
2508 		break;
2509 	case ARC_SPACE_OTHER:
2510 		aggsum_add(&astat_other_size, -space);
2511 		break;
2512 	case ARC_SPACE_HDRS:
2513 		aggsum_add(&astat_hdr_size, -space);
2514 		break;
2515 	case ARC_SPACE_L2HDRS:
2516 		aggsum_add(&astat_l2_hdr_size, -space);
2517 		break;
2518 	}
2519 
2520 	if (type != ARC_SPACE_DATA) {
2521 		ASSERT(aggsum_compare(&arc_meta_used, space) >= 0);
2522 		/*
2523 		 * We use the upper bound here rather than the precise value
2524 		 * because the arc_meta_max value doesn't need to be
2525 		 * precise. It's only consumed by humans via arcstats.
2526 		 */
2527 		if (arc_meta_max < aggsum_upper_bound(&arc_meta_used))
2528 			arc_meta_max = aggsum_upper_bound(&arc_meta_used);
2529 		aggsum_add(&arc_meta_used, -space);
2530 	}
2531 
2532 	ASSERT(aggsum_compare(&arc_size, space) >= 0);
2533 	aggsum_add(&arc_size, -space);
2534 }
2535 
2536 /*
2537  * Given a hdr and a buf, returns whether that buf can share its b_data buffer
2538  * with the hdr's b_pabd.
2539  */
2540 static boolean_t
2541 arc_can_share(arc_buf_hdr_t *hdr, arc_buf_t *buf)
2542 {
2543 	/*
2544 	 * The criteria for sharing a hdr's data are:
2545 	 * 1. the buffer is not encrypted
2546 	 * 2. the hdr's compression matches the buf's compression
2547 	 * 3. the hdr doesn't need to be byteswapped
2548 	 * 4. the hdr isn't already being shared
2549 	 * 5. the buf is either compressed or it is the last buf in the hdr list
2550 	 *
2551 	 * Criterion #5 maintains the invariant that shared uncompressed
2552 	 * bufs must be the final buf in the hdr's b_buf list. Reading this, you
2553 	 * might ask, "if a compressed buf is allocated first, won't that be the
2554 	 * last thing in the list?", but in that case it's impossible to create
2555 	 * a shared uncompressed buf anyway (because the hdr must be compressed
2556 	 * to have the compressed buf). You might also think that #3 is
2557 	 * sufficient to make this guarantee, however it's possible
2558 	 * (specifically in the rare L2ARC write race mentioned in
2559 	 * arc_buf_alloc_impl()) there will be an existing uncompressed buf that
2560 	 * is sharable, but wasn't at the time of its allocation. Rather than
2561 	 * allow a new shared uncompressed buf to be created and then shuffle
2562 	 * the list around to make it the last element, this simply disallows
2563 	 * sharing if the new buf isn't the first to be added.
2564 	 */
2565 	ASSERT3P(buf->b_hdr, ==, hdr);
2566 	boolean_t hdr_compressed = arc_hdr_get_compress(hdr) !=
2567 	    ZIO_COMPRESS_OFF;
2568 	boolean_t buf_compressed = ARC_BUF_COMPRESSED(buf) != 0;
2569 	return (!ARC_BUF_ENCRYPTED(buf) &&
2570 	    buf_compressed == hdr_compressed &&
2571 	    hdr->b_l1hdr.b_byteswap == DMU_BSWAP_NUMFUNCS &&
2572 	    !HDR_SHARED_DATA(hdr) &&
2573 	    (ARC_BUF_LAST(buf) || ARC_BUF_COMPRESSED(buf)));
2574 }
2575 
2576 /*
2577  * Allocate a buf for this hdr. If you care about the data that's in the hdr,
2578  * or if you want a compressed buffer, pass those flags in. Returns 0 if the
2579  * copy was made successfully, or an error code otherwise.
2580  */
2581 static int
2582 arc_buf_alloc_impl(arc_buf_hdr_t *hdr, spa_t *spa, const zbookmark_phys_t *zb,
2583     void *tag, boolean_t encrypted, boolean_t compressed, boolean_t noauth,
2584     boolean_t fill, arc_buf_t **ret)
2585 {
2586 	arc_buf_t *buf;
2587 	arc_fill_flags_t flags = ARC_FILL_LOCKED;
2588 
2589 	ASSERT(HDR_HAS_L1HDR(hdr));
2590 	ASSERT3U(HDR_GET_LSIZE(hdr), >, 0);
2591 	VERIFY(hdr->b_type == ARC_BUFC_DATA ||
2592 	    hdr->b_type == ARC_BUFC_METADATA);
2593 	ASSERT3P(ret, !=, NULL);
2594 	ASSERT3P(*ret, ==, NULL);
2595 	IMPLY(encrypted, compressed);
2596 
2597 	buf = *ret = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
2598 	buf->b_hdr = hdr;
2599 	buf->b_data = NULL;
2600 	buf->b_next = hdr->b_l1hdr.b_buf;
2601 	buf->b_flags = 0;
2602 
2603 	add_reference(hdr, tag);
2604 
2605 	/*
2606 	 * We're about to change the hdr's b_flags. We must either
2607 	 * hold the hash_lock or be undiscoverable.
2608 	 */
2609 	ASSERT(HDR_EMPTY_OR_LOCKED(hdr));
2610 
2611 	/*
2612 	 * Only honor requests for compressed bufs if the hdr is actually
2613 	 * compressed. This must be overridden if the buffer is encrypted since
2614 	 * encrypted buffers cannot be decompressed.
2615 	 */
2616 	if (encrypted) {
2617 		buf->b_flags |= ARC_BUF_FLAG_COMPRESSED;
2618 		buf->b_flags |= ARC_BUF_FLAG_ENCRYPTED;
2619 		flags |= ARC_FILL_COMPRESSED | ARC_FILL_ENCRYPTED;
2620 	} else if (compressed &&
2621 	    arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF) {
2622 		buf->b_flags |= ARC_BUF_FLAG_COMPRESSED;
2623 		flags |= ARC_FILL_COMPRESSED;
2624 	}
2625 
2626 	if (noauth) {
2627 		ASSERT0(encrypted);
2628 		flags |= ARC_FILL_NOAUTH;
2629 	}
2630 
2631 	/*
2632 	 * If the hdr's data can be shared then we share the data buffer and
2633 	 * set the appropriate bit in the hdr's b_flags to indicate the hdr is
2634 	 * sharing its data with the buf; otherwise, we allocate a new buffer to store the buf's data.
2635 	 *
2636 	 * There are two additional restrictions here because we're sharing
2637 	 * hdr -> buf instead of the usual buf -> hdr. First, the hdr can't be
2638 	 * actively involved in an L2ARC write, because if this buf is used by
2639 	 * an arc_write() then the hdr's data buffer will be released when the
2640 	 * write completes, even though the L2ARC write might still be using it.
2641 	 * Second, the hdr's ABD must be linear so that the buf's user doesn't
2642 	 * need to be ABD-aware.
2643 	 */
2644 	boolean_t can_share = arc_can_share(hdr, buf) && !HDR_L2_WRITING(hdr) &&
2645 	    hdr->b_l1hdr.b_pabd != NULL && abd_is_linear(hdr->b_l1hdr.b_pabd);
2646 
2647 	/* Set up b_data and sharing */
2648 	if (can_share) {
2649 		buf->b_data = abd_to_buf(hdr->b_l1hdr.b_pabd);
2650 		buf->b_flags |= ARC_BUF_FLAG_SHARED;
2651 		arc_hdr_set_flags(hdr, ARC_FLAG_SHARED_DATA);
2652 	} else {
2653 		buf->b_data =
2654 		    arc_get_data_buf(hdr, arc_buf_size(buf), buf);
2655 		ARCSTAT_INCR(arcstat_overhead_size, arc_buf_size(buf));
2656 	}
2657 	VERIFY3P(buf->b_data, !=, NULL);
2658 
2659 	hdr->b_l1hdr.b_buf = buf;
2660 	hdr->b_l1hdr.b_bufcnt += 1;
2661 	if (encrypted)
2662 		hdr->b_crypt_hdr.b_ebufcnt += 1;
2663 
2664 	/*
2665 	 * If the user wants the data from the hdr, we need to either copy or
2666 	 * decompress the data.
2667 	 */
2668 	if (fill) {
2669 		ASSERT3P(zb, !=, NULL);
2670 		return (arc_buf_fill(buf, spa, zb, flags));
2671 	}
2672 
2673 	return (0);
2674 }
2675 
2676 static char *arc_onloan_tag = "onloan";
2677 
2678 static inline void
2679 arc_loaned_bytes_update(int64_t delta)
2680 {
2681 	atomic_add_64(&arc_loaned_bytes, delta);
2682 
2683 	/* assert that it did not wrap around */
2684 	ASSERT3S(atomic_add_64_nv(&arc_loaned_bytes, 0), >=, 0);
2685 }
2686 
2687 /*
2688  * Loan out an anonymous arc buffer. Loaned buffers are not counted as in
2689  * flight data by arc_tempreserve_space() until they are "returned". Loaned
2690  * buffers must be returned to the arc before they can be used by the DMU or
2691  * freed.
2692  */
2693 arc_buf_t *
2694 arc_loan_buf(spa_t *spa, boolean_t is_metadata, int size)
2695 {
2696 	arc_buf_t *buf = arc_alloc_buf(spa, arc_onloan_tag,
2697 	    is_metadata ? ARC_BUFC_METADATA : ARC_BUFC_DATA, size);
2698 
2699 	arc_loaned_bytes_update(arc_buf_size(buf));
2700 
2701 	return (buf);
2702 }
2703 
2704 arc_buf_t *
2705 arc_loan_compressed_buf(spa_t *spa, uint64_t psize, uint64_t lsize,
2706     enum zio_compress compression_type)
2707 {
2708 	arc_buf_t *buf = arc_alloc_compressed_buf(spa, arc_onloan_tag,
2709 	    psize, lsize, compression_type);
2710 
2711 	arc_loaned_bytes_update(arc_buf_size(buf));
2712 
2713 	return (buf);
2714 }
2715 
2716 arc_buf_t *
2717 arc_loan_raw_buf(spa_t *spa, uint64_t dsobj, boolean_t byteorder,
2718     const uint8_t *salt, const uint8_t *iv, const uint8_t *mac,
2719     dmu_object_type_t ot, uint64_t psize, uint64_t lsize,
2720     enum zio_compress compression_type)
2721 {
2722 	arc_buf_t *buf = arc_alloc_raw_buf(spa, arc_onloan_tag, dsobj,
2723 	    byteorder, salt, iv, mac, ot, psize, lsize, compression_type);
2724 
2725 	atomic_add_64(&arc_loaned_bytes, psize);
2726 	return (buf);
2727 }
2728 
2729 /*
2730  * Performance tuning of L2ARC persistence:
2731  *
2732  * l2arc_rebuild_enabled : A ZFS module parameter that controls whether adding
2733  *		an L2ARC device (either at pool import or later) will attempt
2734  *		to rebuild L2ARC buffer contents.
2735  * l2arc_rebuild_blocks_min_l2size : A ZFS module parameter that controls
2736  *		whether log blocks are written to the L2ARC device. If the L2ARC
2737  *		device is less than 1GB, the amount of data l2arc_evict()
2738  *		evicts is significant compared to the amount of restored L2ARC
2739  *		data. In this case do not write log blocks in L2ARC in order
2740  *		not to waste space.
2741  */
2742 int l2arc_rebuild_enabled = B_TRUE;
2743 unsigned long l2arc_rebuild_blocks_min_l2size = 1024 * 1024 * 1024;
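
/*
 * As an illustrative sketch only: on illumos these module parameters can
 * typically be set persistently via /etc/system, e.g. to disable L2ARC
 * rebuilds or lower the log-block threshold:
 *
 *	set zfs:l2arc_rebuild_enabled = 0
 *	set zfs:l2arc_rebuild_blocks_min_l2size = 0x40000000
 */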
2744 
2745 /* L2ARC persistence rebuild control routines. */
2746 void l2arc_rebuild_vdev(vdev_t *vd, boolean_t reopen);
2747 static void l2arc_dev_rebuild_start(l2arc_dev_t *dev);
2748 static int l2arc_rebuild(l2arc_dev_t *dev);
2749 
2750 /* L2ARC persistence read I/O routines. */
2751 static int l2arc_dev_hdr_read(l2arc_dev_t *dev);
2752 static int l2arc_log_blk_read(l2arc_dev_t *dev,
2753     const l2arc_log_blkptr_t *this_lp, const l2arc_log_blkptr_t *next_lp,
2754     l2arc_log_blk_phys_t *this_lb, l2arc_log_blk_phys_t *next_lb,
2755     zio_t *this_io, zio_t **next_io);
2756 static zio_t *l2arc_log_blk_fetch(vdev_t *vd,
2757     const l2arc_log_blkptr_t *lp, l2arc_log_blk_phys_t *lb);
2758 static void l2arc_log_blk_fetch_abort(zio_t *zio);
2759 
2760 /* L2ARC persistence block restoration routines. */
2761 static void l2arc_log_blk_restore(l2arc_dev_t *dev,
2762     const l2arc_log_blk_phys_t *lb, uint64_t lb_asize);
2763 static void l2arc_hdr_restore(const l2arc_log_ent_phys_t *le,
2764     l2arc_dev_t *dev);
2765 
2766 /* L2ARC persistence write I/O routines. */
2767 static void l2arc_dev_hdr_update(l2arc_dev_t *dev);
2768 static void l2arc_log_blk_commit(l2arc_dev_t *dev, zio_t *pio,
2769     l2arc_write_callback_t *cb);
2770 
2771 /* L2ARC persistence auxiliary routines. */
2772 boolean_t l2arc_log_blkptr_valid(l2arc_dev_t *dev,
2773     const l2arc_log_blkptr_t *lbp);
2774 static boolean_t l2arc_log_blk_insert(l2arc_dev_t *dev,
2775     const arc_buf_hdr_t *ab);
2776 boolean_t l2arc_range_check_overlap(uint64_t bottom,
2777     uint64_t top, uint64_t check);
2778 static void l2arc_blk_fetch_done(zio_t *zio);
2779 static inline uint64_t
2780     l2arc_log_blk_overhead(uint64_t write_sz, l2arc_dev_t *dev);
2781 
2782 /*
2783  * Return a loaned arc buffer to the arc.
2784  */
2785 void
2786 arc_return_buf(arc_buf_t *buf, void *tag)
2787 {
2788 	arc_buf_hdr_t *hdr = buf->b_hdr;
2789 
2790 	ASSERT3P(buf->b_data, !=, NULL);
2791 	ASSERT(HDR_HAS_L1HDR(hdr));
2792 	(void) zfs_refcount_add(&hdr->b_l1hdr.b_refcnt, tag);
2793 	(void) zfs_refcount_remove(&hdr->b_l1hdr.b_refcnt, arc_onloan_tag);
2794 
2795 	arc_loaned_bytes_update(-arc_buf_size(buf));
2796 }
2797 
2798 /* Detach an arc_buf from a dbuf (tag) */
2799 void
2800 arc_loan_inuse_buf(arc_buf_t *buf, void *tag)
2801 {
2802 	arc_buf_hdr_t *hdr = buf->b_hdr;
2803 
2804 	ASSERT3P(buf->b_data, !=, NULL);
2805 	ASSERT(HDR_HAS_L1HDR(hdr));
2806 	(void) zfs_refcount_add(&hdr->b_l1hdr.b_refcnt, arc_onloan_tag);
2807 	(void) zfs_refcount_remove(&hdr->b_l1hdr.b_refcnt, tag);
2808 
2809 	arc_loaned_bytes_update(arc_buf_size(buf));
2810 }
2811 
2812 static void
2813 l2arc_free_abd_on_write(abd_t *abd, size_t size, arc_buf_contents_t type)
2814 {
2815 	l2arc_data_free_t *df = kmem_alloc(sizeof (*df), KM_SLEEP);
2816 
2817 	df->l2df_abd = abd;
2818 	df->l2df_size = size;
2819 	df->l2df_type = type;
2820 	mutex_enter(&l2arc_free_on_write_mtx);
2821 	list_insert_head(l2arc_free_on_write, df);
2822 	mutex_exit(&l2arc_free_on_write_mtx);
2823 }
2824 
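/*
 * Free-on-write: the hdr's data may still be referenced by an in-flight
 * L2ARC write, so rather than freeing the abd immediately we adjust the
 * space accounting here and queue the abd on the l2arc_free_on_write list,
 * where the L2ARC layer releases it once it is done with the data.
 */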
2825 static void
2826 arc_hdr_free_on_write(arc_buf_hdr_t *hdr, boolean_t free_rdata)
2827 {
2828 	arc_state_t *state = hdr->b_l1hdr.b_state;
2829 	arc_buf_contents_t type = arc_buf_type(hdr);
2830 	uint64_t size = (free_rdata) ? HDR_GET_PSIZE(hdr) : arc_hdr_size(hdr);
2831 
2832 	/* protected by hash lock, if in the hash table */
2833 	if (multilist_link_active(&hdr->b_l1hdr.b_arc_node)) {
2834 		ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
2835 		ASSERT(state != arc_anon && state != arc_l2c_only);
2836 
2837 		(void) zfs_refcount_remove_many(&state->arcs_esize[type],
2838 		    size, hdr);
2839 	}
2840 	(void) zfs_refcount_remove_many(&state->arcs_size, size, hdr);
2841 	if (type == ARC_BUFC_METADATA) {
2842 		arc_space_return(size, ARC_SPACE_META);
2843 	} else {
2844 		ASSERT(type == ARC_BUFC_DATA);
2845 		arc_space_return(size, ARC_SPACE_DATA);
2846 	}
2847 
2848 	if (free_rdata) {
2849 		l2arc_free_abd_on_write(hdr->b_crypt_hdr.b_rabd, size, type);
2850 	} else {
2851 		l2arc_free_abd_on_write(hdr->b_l1hdr.b_pabd, size, type);
2852 	}
2853 }
2854 
2855 /*
2856  * Share the arc_buf_t's data with the hdr. Whenever we are sharing the
2857  * data buffer, we transfer the refcount ownership to the hdr and update
2858  * the appropriate kstats.
2859  */
2860 static void
2861 arc_share_buf(arc_buf_hdr_t *hdr, arc_buf_t *buf)
2862 {
2863 	/* LINTED */
2864 	arc_state_t *state = hdr->b_l1hdr.b_state;
2865 
2866 	ASSERT(arc_can_share(hdr, buf));
2867 	ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
2868 	ASSERT(!ARC_BUF_ENCRYPTED(buf));
2869 	ASSERT(HDR_EMPTY_OR_LOCKED(hdr));
2870 
2871 	/*
2872 	 * Start sharing the data buffer. We transfer the
2873 	 * refcount ownership to the hdr since it always owns
2874 	 * the refcount whenever an arc_buf_t is shared.
2875 	 */
2876 	zfs_refcount_transfer_ownership_many(&hdr->b_l1hdr.b_state->arcs_size,
2877 	    arc_hdr_size(hdr), buf, hdr);
2878 	hdr->b_l1hdr.b_pabd = abd_get_from_buf(buf->b_data, arc_buf_size(buf));
2879 	abd_take_ownership_of_buf(hdr->b_l1hdr.b_pabd,
2880 	    HDR_ISTYPE_METADATA(hdr));
2881 	arc_hdr_set_flags(hdr, ARC_FLAG_SHARED_DATA);
2882 	buf->b_flags |= ARC_BUF_FLAG_SHARED;
2883 
2884 	/*
2885 	 * Since we've transferred ownership to the hdr we need
2886 	 * to increment its compressed and uncompressed kstats and
2887 	 * decrement the overhead size.
2888 	 */
2889 	ARCSTAT_INCR(arcstat_compressed_size, arc_hdr_size(hdr));
2890 	ARCSTAT_INCR(arcstat_uncompressed_size, HDR_GET_LSIZE(hdr));
2891 	ARCSTAT_INCR(arcstat_overhead_size, -arc_buf_size(buf));
2892 }
2893 
2894 static void
2895 arc_unshare_buf(arc_buf_hdr_t *hdr, arc_buf_t *buf)
2896 {
2897 	/* LINTED */
2898 	arc_state_t *state = hdr->b_l1hdr.b_state;
2899 
2900 	ASSERT(arc_buf_is_shared(buf));
2901 	ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
2902 	ASSERT(HDR_EMPTY_OR_LOCKED(hdr));
2903 
2904 	/*
2905 	 * We are no longer sharing this buffer so we need
2906 	 * to transfer its ownership to the rightful owner.
2907 	 */
2908 	zfs_refcount_transfer_ownership_many(&hdr->b_l1hdr.b_state->arcs_size,
2909 	    arc_hdr_size(hdr), hdr, buf);
2910 	arc_hdr_clear_flags(hdr, ARC_FLAG_SHARED_DATA);
2911 	abd_release_ownership_of_buf(hdr->b_l1hdr.b_pabd);
2912 	abd_put(hdr->b_l1hdr.b_pabd);
2913 	hdr->b_l1hdr.b_pabd = NULL;
2914 	buf->b_flags &= ~ARC_BUF_FLAG_SHARED;
2915 
2916 	/*
2917 	 * Since the buffer is no longer shared between
2918 	 * the arc buf and the hdr, count it as overhead.
2919 	 */
2920 	ARCSTAT_INCR(arcstat_compressed_size, -arc_hdr_size(hdr));
2921 	ARCSTAT_INCR(arcstat_uncompressed_size, -HDR_GET_LSIZE(hdr));
2922 	ARCSTAT_INCR(arcstat_overhead_size, arc_buf_size(buf));
2923 }
2924 
2925 /*
2926  * Remove an arc_buf_t from the hdr's buf list and return the last
2927  * arc_buf_t on the list. If no buffers remain on the list then return
2928  * NULL.
2929  */
2930 static arc_buf_t *
2931 arc_buf_remove(arc_buf_hdr_t *hdr, arc_buf_t *buf)
2932 {
2933 	arc_buf_t **bufp = &hdr->b_l1hdr.b_buf;
2934 	arc_buf_t *lastbuf = NULL;
2935 
2936 	ASSERT(HDR_HAS_L1HDR(hdr));
2937 	ASSERT(HDR_EMPTY_OR_LOCKED(hdr));
2938 
2939 	/*
2940 	 * Remove the buf from the hdr list and locate the last
2941 	 * remaining buffer on the list.
2942 	 */
2943 	while (*bufp != NULL) {
2944 		if (*bufp == buf)
2945 			*bufp = buf->b_next;
2946 
2947 		/*
2948 		 * If we've removed a buffer in the middle of
2949 		 * the list then update the lastbuf and update
2950 		 * bufp.
2951 		 */
2952 		if (*bufp != NULL) {
2953 			lastbuf = *bufp;
2954 			bufp = &(*bufp)->b_next;
2955 		}
2956 	}
2957 	buf->b_next = NULL;
2958 	ASSERT3P(lastbuf, !=, buf);
2959 	IMPLY(hdr->b_l1hdr.b_bufcnt > 0, lastbuf != NULL);
2960 	IMPLY(hdr->b_l1hdr.b_bufcnt > 0, hdr->b_l1hdr.b_buf != NULL);
2961 	IMPLY(lastbuf != NULL, ARC_BUF_LAST(lastbuf));
2962 
2963 	return (lastbuf);
2964 }
2965 
2966 /*
2967  * Free up buf->b_data and pull the arc_buf_t off of the arc_buf_hdr_t's
2968  * list and free it.
2969  */
2970 static void
2971 arc_buf_destroy_impl(arc_buf_t *buf)
2972 {
2973 	arc_buf_hdr_t *hdr = buf->b_hdr;
2974 
2975 	/*
2976 	 * Free up the data associated with the buf but only if we're not
2977 	 * sharing this with the hdr. If we are sharing it with the hdr, the
2978 	 * hdr is responsible for doing the free.
2979 	 */
2980 	if (buf->b_data != NULL) {
2981 		/*
2982 		 * We're about to change the hdr's b_flags. We must either
2983 		 * hold the hash_lock or be undiscoverable.
2984 		 */
2985 		ASSERT(HDR_EMPTY_OR_LOCKED(hdr));
2986 
2987 		arc_cksum_verify(buf);
2988 		arc_buf_unwatch(buf);
2989 
2990 		if (arc_buf_is_shared(buf)) {
2991 			arc_hdr_clear_flags(hdr, ARC_FLAG_SHARED_DATA);
2992 		} else {
2993 			uint64_t size = arc_buf_size(buf);
2994 			arc_free_data_buf(hdr, buf->b_data, size, buf);
2995 			ARCSTAT_INCR(arcstat_overhead_size, -size);
2996 		}
2997 		buf->b_data = NULL;
2998 
2999 		ASSERT(hdr->b_l1hdr.b_bufcnt > 0);
3000 		hdr->b_l1hdr.b_bufcnt -= 1;
3001 
3002 		if (ARC_BUF_ENCRYPTED(buf)) {
3003 			hdr->b_crypt_hdr.b_ebufcnt -= 1;
3004 
3005 			/*
3006 			 * If we have no more encrypted buffers and we've
3007 			 * already gotten a copy of the decrypted data we can
3008 			 * free b_rabd to save some space.
3009 			 */
3010 			if (hdr->b_crypt_hdr.b_ebufcnt == 0 &&
3011 			    HDR_HAS_RABD(hdr) && hdr->b_l1hdr.b_pabd != NULL &&
3012 			    !HDR_IO_IN_PROGRESS(hdr)) {
3013 				arc_hdr_free_pabd(hdr, B_TRUE);
3014 			}
3015 		}
3016 	}
3017 
3018 	arc_buf_t *lastbuf = arc_buf_remove(hdr, buf);
3019 
3020 	if (ARC_BUF_SHARED(buf) && !ARC_BUF_COMPRESSED(buf)) {
3021 		/*
3022 		 * If the current arc_buf_t is sharing its data buffer with the
3023 		 * hdr, then reassign the hdr's b_pabd to share it with the new
3024 		 * buffer at the end of the list. The shared buffer is always
3025 		 * the last one on the hdr's buffer list.
3026 		 *
3027 		 * There is an equivalent case for compressed bufs, but since
3028 		 * they aren't guaranteed to be the last buf in the list and
3029 		 * that is an exceedingly rare case, we just allow that space
3030 		 * to be wasted temporarily. We must also be careful not to share
3031 		 * encrypted buffers, since they cannot be shared.
3032 		 */
3033 		if (lastbuf != NULL && !ARC_BUF_ENCRYPTED(lastbuf)) {
3034 			/* Only one buf can be shared at once */
3035 			VERIFY(!arc_buf_is_shared(lastbuf));
3036 			/* hdr is uncompressed so can't have compressed buf */
3037 			VERIFY(!ARC_BUF_COMPRESSED(lastbuf));
3038 
3039 			ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
3040 			arc_hdr_free_pabd(hdr, B_FALSE);
3041 
3042 			/*
3043 			 * We must set up a new shared block between the
3044 			 * last buffer and the hdr. The data would have
3045 			 * been allocated by the arc buf so we need to transfer
3046 			 * ownership to the hdr since it's now being shared.
3047 			 */
3048 			arc_share_buf(hdr, lastbuf);
3049 		}
3050 	} else if (HDR_SHARED_DATA(hdr)) {
3051 		/*
3052 		 * Uncompressed shared buffers are always at the end
3053 		 * of the list. Compressed buffers don't have the
3054 		 * same requirements. This makes it hard to
3055 		 * simply assert that the lastbuf is shared so
3056 		 * we rely on the hdr's compression flags to determine
3057 		 * if we have a compressed, shared buffer.
3058 		 */
3059 		ASSERT3P(lastbuf, !=, NULL);
3060 		ASSERT(arc_buf_is_shared(lastbuf) ||
3061 		    arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF);
3062 	}
3063 
3064 	/*
3065 	 * Free the checksum if we're removing the last uncompressed buf from
3066 	 * this hdr.
3067 	 */
3068 	if (!arc_hdr_has_uncompressed_buf(hdr)) {
3069 		arc_cksum_free(hdr);
3070 	}
3071 
3072 	/* clean up the buf */
3073 	buf->b_hdr = NULL;
3074 	kmem_cache_free(buf_cache, buf);
3075 }
3076 
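/*
 * Allocate the backing abd for this hdr. When ARC_HDR_ALLOC_RDATA is
 * set, the raw (encrypted) abd (b_rabd) is allocated at the physical
 * block size; otherwise the normal b_pabd is allocated. The new space
 * is accounted for in the compressed/uncompressed ARC stats.
 */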
3077 static void
3078 arc_hdr_alloc_pabd(arc_buf_hdr_t *hdr, int alloc_flags)
3079 {
3080 	uint64_t size;
3081 	boolean_t alloc_rdata = ((alloc_flags & ARC_HDR_ALLOC_RDATA) != 0);
3082 	boolean_t do_adapt = ((alloc_flags & ARC_HDR_DO_ADAPT) != 0);
3083 
3084 	ASSERT3U(HDR_GET_LSIZE(hdr), >, 0);
3085 	ASSERT(HDR_HAS_L1HDR(hdr));
3086 	ASSERT(!HDR_SHARED_DATA(hdr) || alloc_rdata);
3087 	IMPLY(alloc_rdata, HDR_PROTECTED(hdr));
3088 
3089 	if (alloc_rdata) {
3090 		size = HDR_GET_PSIZE(hdr);
3091 		ASSERT3P(hdr->b_crypt_hdr.b_rabd, ==, NULL);
3092 		hdr->b_crypt_hdr.b_rabd = arc_get_data_abd(hdr, size, hdr,
3093 		    do_adapt);
3094 		ASSERT3P(hdr->b_crypt_hdr.b_rabd, !=, NULL);
3095 	} else {
3096 		size = arc_hdr_size(hdr);
3097 		ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
3098 		hdr->b_l1hdr.b_pabd = arc_get_data_abd(hdr, size, hdr,
3099 		    do_adapt);
3100 		ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
3101 	}
3102 
3103 	ARCSTAT_INCR(arcstat_compressed_size, size);
3104 	ARCSTAT_INCR(arcstat_uncompressed_size, HDR_GET_LSIZE(hdr));
3105 }
3106 
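/*
 * Free the abd backing this hdr and update the ARC space accounting:
 * b_rabd when free_rdata is set, otherwise b_pabd.
 */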
3107 static void
3108 arc_hdr_free_pabd(arc_buf_hdr_t *hdr, boolean_t free_rdata)
3109 {
3110 	uint64_t size = (free_rdata) ? HDR_GET_PSIZE(hdr) : arc_hdr_size(hdr);
3111 
3112 	ASSERT(HDR_HAS_L1HDR(hdr));
3113 	ASSERT(hdr->b_l1hdr.b_pabd != NULL || HDR_HAS_RABD(hdr));
3114 	IMPLY(free_rdata, HDR_HAS_RABD(hdr));
3115 
3116 
3117 	/*
3118 	 * If the hdr is currently being written to the l2arc then
3119 	 * we defer freeing the data by adding it to the l2arc_free_on_write
3120 	 * list. The l2arc will free the data once it's finished
3121 	 * writing it to the l2arc device.
3122 	 */
3123 	if (HDR_L2_WRITING(hdr)) {
3124 		arc_hdr_free_on_write(hdr, free_rdata);
3125 		ARCSTAT_BUMP(arcstat_l2_free_on_write);
3126 	} else if (free_rdata) {
3127 		arc_free_data_abd(hdr, hdr->b_crypt_hdr.b_rabd, size, hdr);
3128 	} else {
3129 		arc_free_data_abd(hdr, hdr->b_l1hdr.b_pabd,
3130 		    size, hdr);
3131 	}
3132 
3133 	if (free_rdata) {
3134 		hdr->b_crypt_hdr.b_rabd = NULL;
3135 	} else {
3136 		hdr->b_l1hdr.b_pabd = NULL;
3137 	}
3138 
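	/*
	 * If this hdr no longer has any data attached, reset the byteswap
	 * function to the "unset" sentinel (DMU_BSWAP_NUMFUNCS).
	 */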
3139 	if (hdr->b_l1hdr.b_pabd == NULL && !HDR_HAS_RABD(hdr))
3140 		hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS;
3141 
3142 	ARCSTAT_INCR(arcstat_compressed_size, -size);
3143 	ARCSTAT_INCR(arcstat_uncompressed_size, -HDR_GET_LSIZE(hdr));
3144 }
3145 
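/*
 * Allocate a new, anonymous arc_buf_hdr_t and its backing abd.
 * Protected (encrypted) headers are allocated from the full crypt
 * cache; all others come from the full header cache.
 */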
3146 static arc_buf_hdr_t *
3147 arc_hdr_alloc(uint64_t spa, int32_t psize, int32_t lsize,
3148     boolean_t protected, enum zio_compress compression_type,
3149     arc_buf_contents_t type, boolean_t alloc_rdata)
3150 {
3151 	arc_buf_hdr_t *hdr;
3152 	int flags = ARC_HDR_DO_ADAPT;
3153 
3154 	VERIFY(type == ARC_BUFC_DATA || type == ARC_BUFC_METADATA);
3155 	if (protected) {
3156 		hdr = kmem_cache_alloc(hdr_full_crypt_cache, KM_PUSHPAGE);
3157 	} else {
3158 		hdr = kmem_cache_alloc(hdr_full_cache, KM_PUSHPAGE);
3159 	}
3160 	flags |= alloc_rdata ? ARC_HDR_ALLOC_RDATA : 0;
3161 	ASSERT(HDR_EMPTY(hdr));
3162 	ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL);
3163 	ASSERT3P(hdr->b_l1hdr.b_thawed, ==, NULL);
3164 	HDR_SET_PSIZE(hdr, psize);
3165 	HDR_SET_LSIZE(hdr, lsize);
3166 	hdr->b_spa = spa;
3167 	hdr->b_type = type;
3168 	hdr->b_flags = 0;
3169 	arc_hdr_set_flags(hdr, arc_bufc_to_flags(type) | ARC_FLAG_HAS_L1HDR);
3170 	arc_hdr_set_compress(hdr, compression_type);
3171 	if (protected)
3172 		arc_hdr_set_flags(hdr, ARC_FLAG_PROTECTED);
3173 
3174 	hdr->b_l1hdr.b_state = arc_anon;
3175 	hdr->b_l1hdr.b_arc_access = 0;
3176 	hdr->b_l1hdr.b_bufcnt = 0;
3177 	hdr->b_l1hdr.b_buf = NULL;
3178 
3179 	/*
3180 	 * Allocate the hdr's buffer. This will contain either
3181 	 * the compressed or uncompressed data, depending on the block
3182 	 * it references and whether the compressed ARC is enabled.
3183 	 */
3184 	arc_hdr_alloc_pabd(hdr, flags);
3185 	ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
3186 
3187 	return (hdr);
3188 }
3189 
3190 /*
3191  * Transition between the two allocation states for the arc_buf_hdr struct.
3192  * The arc_buf_hdr struct can be allocated with (hdr_full_cache) or without
3193  * (hdr_l2only_cache) the fields necessary for the L1 cache - the smaller
3194  * version is used when a cache buffer is only in the L2ARC in order to reduce
3195  * memory usage.
3196  */
3197 static arc_buf_hdr_t *
3198 arc_hdr_realloc(arc_buf_hdr_t *hdr, kmem_cache_t *old, kmem_cache_t *new)
3199 {
3200 	ASSERT(HDR_HAS_L2HDR(hdr));
3201 
3202 	arc_buf_hdr_t *nhdr;
3203 	l2arc_dev_t *dev = hdr->b_l2hdr.b_dev;
3204 
3205 	ASSERT((old == hdr_full_cache && new == hdr_l2only_cache) ||
3206 	    (old == hdr_l2only_cache && new == hdr_full_cache));
3207 
3208 	/*
3209 	 * If the caller wanted a new full header and the header is to be
3210 	 * encrypted, we actually allocate it from the full crypt cache
3211 	 * instead. The same applies when freeing from the old cache.
3212 	 */
3213 	if (HDR_PROTECTED(hdr) && new == hdr_full_cache)
3214 		new = hdr_full_crypt_cache;
3215 	if (HDR_PROTECTED(hdr) && old == hdr_full_cache)
3216 		old = hdr_full_crypt_cache;
3217 
3218 	nhdr = kmem_cache_alloc(new, KM_PUSHPAGE);
3219 
3220 	ASSERT(MUTEX_HELD(HDR_LOCK(hdr)));
3221 	buf_hash_remove(hdr);
3222 
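	/*
	 * Copy the portion of the header that is common to both the full
	 * and l2only layouts; the L1-only fields are initialized below
	 * when needed.
	 */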
3223 	bcopy(hdr, nhdr, HDR_L2ONLY_SIZE);
3224 
3225 	if (new == hdr_full_cache || new == hdr_full_crypt_cache) {
3226 		arc_hdr_set_flags(nhdr, ARC_FLAG_HAS_L1HDR);
3227 		/*
3228 		 * arc_access and arc_change_state need to be aware that a
3229 		 * header has just come out of L2ARC, so we set its state to
3230 		 * l2c_only even though it's about to change.
3231 		 */
3232 		nhdr->b_l1hdr.b_state = arc_l2c_only;
3233 
3234 		/* Verify previous threads set these to NULL before freeing */
3235 		ASSERT3P(nhdr->b_l1hdr.b_pabd, ==, NULL);
3236 		ASSERT(!HDR_HAS_RABD(hdr));
3237 	} else {
3238 		ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
3239 		ASSERT0(hdr->b_l1hdr.b_bufcnt);
3240 		ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL);
3241 
3242 		/*
3243 		 * If we've reached here, we must have been called from
3244 		 * arc_evict_hdr(); as such, we should have already been
3245 		 * removed from any ghost list we were previously on
3246 		 * (which protects us from racing with arc_evict_state),
3247 		 * so no locking is needed during this check.
3248 		 */
3249 		ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node));
3250 
3251 		/*
3252 		 * A buffer must not be moved into the arc_l2c_only
3253 		 * state if it's not finished being written out to the
3254 		 * l2arc device. Otherwise, the b_l1hdr.b_pabd field
3255 		 * might be accessed even though it has been removed.
3256 		 */
3257 		VERIFY(!HDR_L2_WRITING(hdr));
3258 		VERIFY3P(hdr->b_l1hdr.b_pabd, ==, NULL);
3259 		ASSERT(!HDR_HAS_RABD(hdr));
3260 
3261 #ifdef ZFS_DEBUG
3262 		if (hdr->b_l1hdr.b_thawed != NULL) {
3263 			kmem_free(hdr->b_l1hdr.b_thawed, 1);
3264 			hdr->b_l1hdr.b_thawed = NULL;
3265 		}
3266 #endif
3267 
3268 		arc_hdr_clear_flags(nhdr, ARC_FLAG_HAS_L1HDR);
3269 	}
3270 	/*
3271 	 * The header has been reallocated so we need to re-insert it into any
3272 	 * lists it was on.
3273 	 */
3274 	(void) buf_hash_insert(nhdr, NULL);
3275 
3276 	ASSERT(list_link_active(&hdr->b_l2hdr.b_l2node));
3277 
3278 	mutex_enter(&dev->l2ad_mtx);
3279 
3280 	/*
3281 	 * We must place the realloc'ed header back into the list at
3282 	 * the same spot. Otherwise, if it's placed earlier in the list,
3283 	 * l2arc_write_buffers() could find it during the function's
3284 	 * write phase, and try to write it out to the l2arc.
3285 	 */
3286 	list_insert_after(&dev->l2ad_buflist, hdr, nhdr);
3287 	list_remove(&dev->l2ad_buflist, hdr);
3288 
3289 	mutex_exit(&dev->l2ad_mtx);
3290 
3291 	/*
3292 	 * Since we're using the pointer address as the tag when
3293 	 * incrementing and decrementing the l2ad_alloc refcount, we
3294 	 * must remove the old pointer (that we're about to destroy) and
3295 	 * add the new pointer to the refcount. Otherwise we'd remove
3296 	 * the wrong pointer address when calling arc_hdr_destroy() later.
3297 	 */
3298 
3299 	(void) zfs_refcount_remove_many(&dev->l2ad_alloc, arc_hdr_size(hdr),
3300 	    hdr);
3301 	(void) zfs_refcount_add_many(&dev->l2ad_alloc, arc_hdr_size(nhdr),
3302 	    nhdr);
3303 
3304 	buf_discard_identity(hdr);
3305 	kmem_cache_free(old, hdr);
3306 
3307 	return (nhdr);
3308 }
3309 
3310 /*
3311  * This function allows an L1 header to be reallocated as a crypt
3312  * header and vice versa. If we are going to a crypt header, the
3313  * new fields will be zeroed out.
3314  */
3315 static arc_buf_hdr_t *
3316 arc_hdr_realloc_crypt(arc_buf_hdr_t *hdr, boolean_t need_crypt)
3317 {
3318 	arc_buf_hdr_t *nhdr;
3319 	arc_buf_t *buf;
3320 	kmem_cache_t *ncache, *ocache;
3321 
3322 	ASSERT(HDR_HAS_L1HDR(hdr));
3323 	ASSERT3U(!!HDR_PROTECTED(hdr), !=, need_crypt);
3324 	ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon);
3325 	ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node));
3326 	ASSERT(!list_link_active(&hdr->b_l2hdr.b_l2node));
3327 	ASSERT3P(hdr->b_hash_next, ==, NULL);
3328 
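	/*
	 * Select the destination (ncache) and source (ocache) kmem caches
	 * according to the direction of the conversion.
	 */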
3329 	if (need_crypt) {
3330 		ncache = hdr_full_crypt_cache;
3331 		ocache = hdr_full_cache;
3332 	} else {
3333 		ncache = hdr_full_cache;
3334 		ocache = hdr_full_crypt_cache;
3335 	}
3336 
3337 	nhdr = kmem_cache_alloc(ncache,