1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23 * Copyright (c) 2019, Joyent, Inc.
24 * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
25 * Copyright (c) 2014 by Saso Kiselkov. All rights reserved.
26 * Copyright 2017 Nexenta Systems, Inc. All rights reserved.
27 * Copyright (c) 2011, 2019, Delphix. All rights reserved.
28 * Copyright (c) 2020, George Amanakis. All rights reserved.
29 * Copyright (c) 2020, The FreeBSD Foundation [1]
30 *
31 * [1] Portions of this software were developed by Allan Jude
32 * under sponsorship from the FreeBSD Foundation.
33 */
34
35 /*
36 * DVA-based Adjustable Replacement Cache
37 *
38 * While much of the theory of operation used here is
39 * based on the self-tuning, low overhead replacement cache
40 * presented by Megiddo and Modha at FAST 2003, there are some
41 * significant differences:
42 *
43 * 1. The Megiddo and Modha model assumes any page is evictable.
44 * Pages in its cache cannot be "locked" into memory. This makes
45 * the eviction algorithm simple: evict the last page in the list.
46 * This also makes the performance characteristics easy to reason
47 * about. Our cache is not so simple. At any given moment, some
48 * subset of the blocks in the cache are un-evictable because we
49 * have handed out a reference to them. Blocks are only evictable
50 * when there are no external references active. This makes
51 * eviction far more problematic: we choose to evict the evictable
52 * blocks that are the "lowest" in the list.
53 *
54 * There are times when it is not possible to evict the requested
55 * space. In these circumstances we are unable to adjust the cache
56 * size. To prevent the cache growing unbounded at these times we
57 * implement a "cache throttle" that slows the flow of new data
58 * into the cache until we can make space available.
59 *
60 * 2. The Megiddo and Modha model assumes a fixed cache size.
61 * Pages are evicted when the cache is full and there is a cache
62 * miss. Our model has a variable sized cache. It grows with
63 * high use, but also tries to react to memory pressure from the
64 * operating system: decreasing its size when system memory is
65 * tight.
66 *
67 * 3. The Megiddo and Modha model assumes a fixed page size. All
68 * elements of the cache are therefore exactly the same size. So
69 * when adjusting the cache size following a cache miss, it's simply
70 * a matter of choosing a single page to evict. In our model, we
71 * have variable sized cache blocks (ranging from 512 bytes to
72 * 128K bytes). We therefore choose a set of blocks to evict to make
73 * space for a cache miss that approximates as closely as possible
74 * the space used by the new block.
75 *
76 * See also: "ARC: A Self-Tuning, Low Overhead Replacement Cache"
77 * by N. Megiddo & D. Modha, FAST 2003
78 */
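/*
 * Illustrative sketch of the variable-size eviction idea in point 3
 * above: rather than evicting a single fixed-size page, we walk the
 * evictable end of a list and keep freeing blocks until roughly enough
 * space has been reclaimed. This is a simplified, hypothetical loop --
 * list_tail(), list_prev(), is_evictable(), blk_size() and
 * evict_block() stand in for the real multilist walk done by the
 * eviction code.
 *
 *	uint64_t freed = 0;
 *	blk_t *b = list_tail(list);
 *	while (b != NULL && freed < needed) {
 *		blk_t *prev = list_prev(list, b);
 *		if (is_evictable(b)) {		// no external references
 *			freed += blk_size(b);	// 512 bytes up to 128K
 *			evict_block(b);
 *		}
 *		b = prev;
 *	}
 */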
79
80 /*
81 * The locking model:
82 *
83 * A new reference to a cache buffer can be obtained in two
84 * ways: 1) via a hash table lookup using the DVA as a key,
85 * or 2) via one of the ARC lists. The arc_read() interface
86 * uses method 1, while the internal ARC algorithms for
87 * adjusting the cache use method 2. We therefore provide two
88 * types of locks: 1) the hash table lock array, and 2) the
89 * ARC list locks.
90 *
91 * Buffers do not have their own mutexes, rather they rely on the
92 * hash table mutexes for the bulk of their protection (i.e. most
93 * fields in the arc_buf_hdr_t are protected by these mutexes).
94 *
95 * buf_hash_find() returns the appropriate mutex (held) when it
96 * locates the requested buffer in the hash table. It returns
97 * NULL for the mutex if the buffer was not in the table.
98 *
99 * buf_hash_remove() expects the appropriate hash mutex to be
100 * already held before it is invoked.
101 *
102 * Each ARC state also has a mutex which is used to protect the
103 * buffer list associated with the state. When attempting to
104 * obtain a hash table lock while holding an ARC list lock you
105 * must use mutex_tryenter() to avoid deadlock. Also note that
106 * the active state mutex must be held before the ghost state mutex.
107 *
108 * Note that the majority of the performance stats are manipulated
109 * with atomic operations.
110 *
111 * The L2ARC uses the l2ad_mtx on each vdev for the following:
112 *
113 * - L2ARC buflist creation
114 * - L2ARC buflist eviction
115 * - L2ARC write completion, which walks L2ARC buflists
116 * - ARC header destruction, as it removes from L2ARC buflists
117 * - ARC header release, as it removes from L2ARC buflists
118 */
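/*
 * A minimal sketch of the hash-table lookup pattern implied above:
 * buf_hash_find() returns with the covering hash mutex held, so the
 * caller inspects the header and drops the lock itself (spa_guid and
 * bp are placeholders for the caller's identity values).
 *
 *	kmutex_t *hash_lock = NULL;
 *	arc_buf_hdr_t *hdr = buf_hash_find(spa_guid, bp, &hash_lock);
 *	if (hdr != NULL) {
 *		// hash_lock is held; most arc_buf_hdr_t fields are now
 *		// safe to examine or modify.
 *		...
 *		mutex_exit(hash_lock);
 *	} else {
 *		// not cached; the lock pointer was set to NULL for us.
 *	}
 */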
119
120 /*
121 * ARC operation:
122 *
123 * Every block that is in the ARC is tracked by an arc_buf_hdr_t structure.
124 * This structure can point either to a block that is still in the cache or to
125 * one that is only accessible in an L2 ARC device, or it can provide
126 * information about a block that was recently evicted. If a block is
127 * only accessible in the L2ARC, then the arc_buf_hdr_t only has enough
128 * information to retrieve it from the L2ARC device. This information is
129 * stored in the l2arc_buf_hdr_t sub-structure of the arc_buf_hdr_t. A block
130 * that is in this state cannot have its data accessed directly.
131 *
132 * Blocks that are actively being referenced or have not been evicted
133 * are cached in the L1ARC. The L1ARC (l1arc_buf_hdr_t) is a structure within
134 * the arc_buf_hdr_t that will point to the data block in memory. A block can
135 * only be read by a consumer if it has an l1arc_buf_hdr_t. The L1ARC
136 * caches data in two ways -- in a list of ARC buffers (arc_buf_t) and
137 * also in the arc_buf_hdr_t's private physical data block pointer (b_pabd).
138 *
139 * The L1ARC's data pointer may or may not be uncompressed. The ARC has the
140 * ability to store the physical data (b_pabd) associated with the DVA of the
141 * arc_buf_hdr_t. Since the b_pabd is a copy of the on-disk physical block,
142 * it will match its on-disk compression characteristics. This behavior can be
143 * disabled by setting 'zfs_compressed_arc_enabled' to B_FALSE. When the
144 * compressed ARC functionality is disabled, the b_pabd will point to an
145 * uncompressed version of the on-disk data.
146 *
147 * Data in the L1ARC is not accessed by consumers of the ARC directly. Each
148 * arc_buf_hdr_t can have multiple ARC buffers (arc_buf_t) which reference it.
149 * Each ARC buffer (arc_buf_t) is being actively accessed by a specific ARC
150 * consumer. The ARC will provide references to this data and will keep it
151 * cached until it is no longer in use. The ARC caches only the L1ARC's physical
152 * data block and will evict any arc_buf_t that is no longer referenced. The
153 * amount of memory consumed by the arc_buf_ts' data buffers can be seen via the
154 * "overhead_size" kstat.
155 *
156 * Depending on the consumer, an arc_buf_t can be requested in uncompressed or
157 * compressed form. The typical case is that consumers will want uncompressed
158 * data, and when that happens a new data buffer is allocated where the data is
159 * decompressed for them to use. Currently the only consumer who wants
160 * compressed arc_buf_t's is "zfs send", when it streams data exactly as it
161 * exists on disk. When this happens, the arc_buf_t's data buffer is shared
162 * with the arc_buf_hdr_t.
163 *
164 * Here is a diagram showing an arc_buf_hdr_t referenced by two arc_buf_t's. The
165 * first one is owned by a compressed send consumer (and therefore references
166 * the same compressed data buffer as the arc_buf_hdr_t) and the second could be
167 * used by any other consumer (and has its own uncompressed copy of the data
168 * buffer).
169 *
170 * arc_buf_hdr_t
171 * +-----------+
172 * | fields |
173 * | common to |
174 * | L1- and |
175 * | L2ARC |
176 * +-----------+
177 * | l2arc_buf_hdr_t
178 * | |
179 * +-----------+
180 * | l1arc_buf_hdr_t
181 * | | arc_buf_t
182 * | b_buf +------------>+-----------+ arc_buf_t
183 * | b_pabd +-+ |b_next +---->+-----------+
184 * +-----------+ | |-----------| |b_next +-->NULL
185 * | |b_comp = T | +-----------+
186 * | |b_data +-+ |b_comp = F |
187 * | +-----------+ | |b_data +-+
188 * +->+------+ | +-----------+ |
189 * compressed | | | |
190 * data | |<--------------+ | uncompressed
191 * +------+ compressed, | data
192 * shared +-->+------+
193 * data | |
194 * | |
195 * +------+
196 *
197 * When a consumer reads a block, the ARC must first look to see if the
198 * arc_buf_hdr_t is cached. If the hdr is cached then the ARC allocates a new
199 * arc_buf_t and either copies uncompressed data into a new data buffer from an
200 * existing uncompressed arc_buf_t, decompresses the hdr's b_pabd buffer into a
201 * new data buffer, or shares the hdr's b_pabd buffer, depending on whether the
202 * hdr is compressed and the desired compression characteristics of the
203 * arc_buf_t consumer. If the arc_buf_t ends up sharing data with the
204 * arc_buf_hdr_t and both of them are uncompressed then the arc_buf_t must be
205 * the last buffer in the hdr's b_buf list; however, a shared compressed buf can
206 * be anywhere in the hdr's list.
207 *
208 * The diagram below shows an example of an uncompressed ARC hdr that is
209 * sharing its data with an arc_buf_t (note that the shared uncompressed buf is
210 * the last element in the buf list):
211 *
212 * arc_buf_hdr_t
213 * +-----------+
214 * | |
215 * | |
216 * | |
217 * +-----------+
218 * l2arc_buf_hdr_t| |
219 * | |
220 * +-----------+
221 * l1arc_buf_hdr_t| |
222 * | | arc_buf_t (shared)
223 * | b_buf +------------>+---------+ arc_buf_t
224 * | | |b_next +---->+---------+
225 * | b_pabd +-+ |---------| |b_next +-->NULL
226 * +-----------+ | | | +---------+
227 * | |b_data +-+ | |
228 * | +---------+ | |b_data +-+
229 * +->+------+ | +---------+ |
230 * | | | |
231 * uncompressed | | | |
232 * data +------+ | |
233 * ^ +->+------+ |
234 * | uncompressed | | |
235 * | data | | |
236 * | +------+ |
237 * +---------------------------------+
238 *
239 * Writing to the ARC requires that the ARC first discard the hdr's b_pabd
240 * since the physical block is about to be rewritten. The new data contents
241 * will be contained in the arc_buf_t. As the I/O pipeline performs the write,
242 * it may compress the data before writing it to disk. The ARC will be called
243 * with the transformed data and will bcopy the transformed on-disk block into
244 * a newly allocated b_pabd. Writes are always done into buffers which have
245 * either been loaned (and hence are new and don't have other readers) or
246 * buffers which have been released (and hence have their own hdr, if there
247 * were originally other readers of the buf's original hdr). This ensures that
248 * the ARC only needs to update a single buf and its hdr after a write occurs.
249 *
250 * When the L2ARC is in use, it will also take advantage of the b_pabd. The
251 * L2ARC will always write the contents of b_pabd to the L2ARC. This means
252 * that when compressed ARC is enabled, the L2ARC blocks are identical
253 * to the on-disk block in the main data pool. This provides a significant
254 * advantage since the ARC can leverage the bp's checksum when reading from the
255 * L2ARC to determine if the contents are valid. However, if the compressed
256 * ARC is disabled, then the L2ARC's block must be transformed to look
257 * like the physical block in the main data pool before comparing the
258 * checksum and determining its validity.
259 *
260 * The L1ARC has a slightly different system for storing encrypted data.
261 * Raw (encrypted + possibly compressed) data has a few subtle differences from
262 * data that is just compressed. The biggest difference is that it is not
263 * possible to decrypt encrypted data (or vice versa) if the keys aren't loaded.
264 * The other difference is that encryption cannot be treated as a suggestion.
265 * If a caller would prefer compressed data, but they actually wind up with
266 * uncompressed data, the worst thing that could happen is there might be a
267 * performance hit. If the caller requests encrypted data, however, we must be
268 * sure they actually get it or else secret information could be leaked. Raw
269 * data is stored in hdr->b_crypt_hdr.b_rabd. An encrypted header, therefore,
270 * may have both an encrypted version and a decrypted version of its data at
271 * once. When a caller needs a raw arc_buf_t, it is allocated and the data is
272 * copied out of this header. To avoid complications with b_pabd, raw buffers
273 * cannot be shared.
274 */
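/*
 * A condensed, hypothetical sketch of the read-hit decision described
 * above. The real logic (arc_buf_alloc_impl() and friends) handles many
 * more cases -- encryption, shared uncompressed bufs, etc. -- so the
 * helpers named here are placeholders, not actual functions:
 *
 *	if (consumer_wants_compressed && hdr_is_compressed(hdr)) {
 *		// share the hdr's b_pabd directly (e.g. "zfs send")
 *		share_pabd(hdr, buf);
 *	} else if (has_uncompressed_buf(hdr)) {
 *		// copy from an existing uncompressed arc_buf_t
 *		copy_from_existing_buf(hdr, buf);
 *	} else {
 *		// decompress the hdr's b_pabd into the new buffer
 *		decompress_pabd(hdr, buf);
 *	}
 */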
275
276 #include <sys/spa.h>
277 #include <sys/zio.h>
278 #include <sys/spa_impl.h>
279 #include <sys/zio_compress.h>
280 #include <sys/zio_checksum.h>
281 #include <sys/zfs_context.h>
282 #include <sys/arc.h>
283 #include <sys/refcount.h>
284 #include <sys/vdev.h>
285 #include <sys/vdev_impl.h>
286 #include <sys/dsl_pool.h>
287 #include <sys/zio_checksum.h>
288 #include <sys/multilist.h>
289 #include <sys/abd.h>
290 #include <sys/zil.h>
291 #include <sys/fm/fs/zfs.h>
292 #ifdef _KERNEL
293 #include <sys/vmsystm.h>
294 #include <vm/anon.h>
295 #include <sys/fs/swapnode.h>
296 #include <sys/dnlc.h>
297 #endif
298 #include <sys/callb.h>
299 #include <sys/kstat.h>
300 #include <sys/zthr.h>
301 #include <zfs_fletcher.h>
302 #include <sys/arc_impl.h>
303 #include <sys/aggsum.h>
304 #include <sys/cityhash.h>
305 #include <sys/param.h>
306
307 #ifndef _KERNEL
308 /* set with ZFS_DEBUG=watch, to enable watchpoints on frozen buffers */
309 boolean_t arc_watch = B_FALSE;
310 int arc_procfd;
311 #endif
312
313 /*
314 * This thread's job is to keep enough free memory in the system, by
315 * calling arc_kmem_reap_now() plus arc_shrink(), which improves
316 * arc_available_memory().
317 */
318 static zthr_t *arc_reap_zthr;
319
320 /*
321 * This thread's job is to keep arc_size under arc_c, by calling
322 * arc_adjust(), which improves arc_is_overflowing().
323 */
324 static zthr_t *arc_adjust_zthr;
325
326 static kmutex_t arc_adjust_lock;
327 static kcondvar_t arc_adjust_waiters_cv;
328 static boolean_t arc_adjust_needed = B_FALSE;
329
330 uint_t arc_reduce_dnlc_percent = 3;
331
332 /*
333 * The number of headers to evict in arc_evict_state_impl() before
334 * dropping the sublist lock and evicting from another sublist. A lower
335 * value means we're more likely to evict the "correct" header (i.e. the
336 * oldest header in the arc state), but comes with higher overhead
337 * (i.e. more invocations of arc_evict_state_impl()).
338 */
339 int zfs_arc_evict_batch_limit = 10;
340
341 /* number of seconds before growing cache again */
342 int arc_grow_retry = 60;
343
344 /*
345 * Minimum time between calls to arc_kmem_reap_soon(). Note that this will
346 * be converted to ticks, so with the default hz=100, a setting of 15 ms
347 * will actually wait 2 ticks, or 20ms.
348 */
349 int arc_kmem_cache_reap_retry_ms = 1000;
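/*
 * Worked example of the ms-to-ticks conversion noted above, assuming
 * hz = 100 (one tick = 10 ms): 15 ms rounds up to (15 + 9) / 10 = 2
 * ticks, an effective wait of 20 ms, while the default of 1000 ms
 * converts evenly to 100 ticks.
 */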
350
351 /* shift of arc_c for calculating overflow limit in arc_get_data_impl */
352 int zfs_arc_overflow_shift = 8;
353
354 /* shift of arc_c for calculating both min and max arc_p */
355 int arc_p_min_shift = 4;
356
357 /* log2(fraction of arc to reclaim) */
358 int arc_shrink_shift = 7;
359
360 /*
361 * log2(fraction of ARC which must be free to allow growing).
362 * I.e., if there is less than arc_c >> arc_no_grow_shift free memory,
363 * when reading a new block into the ARC, we will evict an equal-sized block
364 * from the ARC.
365 *
366 * This must be less than arc_shrink_shift, so that when we shrink the ARC,
367 * we will still not allow it to grow.
368 */
369 int arc_no_grow_shift = 5;
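/*
 * Worked example of the two shifts above, assuming arc_c = 4 GB:
 * growth is blocked when free memory drops below
 * arc_c >> arc_no_grow_shift (4 GB >> 5 = 128 MB), while one shrink
 * step reclaims arc_c >> arc_shrink_shift (4 GB >> 7 = 32 MB).
 * Keeping arc_no_grow_shift < arc_shrink_shift keeps the shrink
 * amount (32 MB) below the no-grow threshold (128 MB).
 */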
370
371
372 /*
373 * minimum lifespan of a prefetch block in clock ticks
374 * (initialized in arc_init())
375 */
376 static int zfs_arc_min_prefetch_ms = 1;
377 static int zfs_arc_min_prescient_prefetch_ms = 6;
378
379 /*
380 * If this percent of memory is free, don't throttle.
381 */
382 int arc_lotsfree_percent = 10;
383
384 static boolean_t arc_initialized;
385
386 /*
387 * The arc has filled available memory and has now warmed up.
388 */
389 static boolean_t arc_warm;
390
391 /*
392 * log2 fraction of the zio arena to keep free.
393 */
394 int arc_zio_arena_free_shift = 2;
395
396 /*
397 * These tunables are for performance analysis.
398 */
399 uint64_t zfs_arc_max;
400 uint64_t zfs_arc_min;
401 uint64_t zfs_arc_meta_limit = 0;
402 uint64_t zfs_arc_meta_min = 0;
403 int zfs_arc_grow_retry = 0;
404 int zfs_arc_shrink_shift = 0;
405 int zfs_arc_p_min_shift = 0;
406 int zfs_arc_average_blocksize = 8 * 1024; /* 8KB */
407
408 /*
409 * ARC dirty data constraints for arc_tempreserve_space() throttle
410 */
411 uint_t zfs_arc_dirty_limit_percent = 50; /* total dirty data limit */
412 uint_t zfs_arc_anon_limit_percent = 25; /* anon block dirty limit */
413 uint_t zfs_arc_pool_dirty_percent = 20; /* each pool's anon allowance */
414
415 boolean_t zfs_compressed_arc_enabled = B_TRUE;
416
417 /* The 6 states: */
418 static arc_state_t ARC_anon;
419 static arc_state_t ARC_mru;
420 static arc_state_t ARC_mru_ghost;
421 static arc_state_t ARC_mfu;
422 static arc_state_t ARC_mfu_ghost;
423 static arc_state_t ARC_l2c_only;
424
425 arc_stats_t arc_stats = {
426 { "hits", KSTAT_DATA_UINT64 },
427 { "misses", KSTAT_DATA_UINT64 },
428 { "demand_data_hits", KSTAT_DATA_UINT64 },
429 { "demand_data_misses", KSTAT_DATA_UINT64 },
430 { "demand_metadata_hits", KSTAT_DATA_UINT64 },
431 { "demand_metadata_misses", KSTAT_DATA_UINT64 },
432 { "prefetch_data_hits", KSTAT_DATA_UINT64 },
433 { "prefetch_data_misses", KSTAT_DATA_UINT64 },
434 { "prefetch_metadata_hits", KSTAT_DATA_UINT64 },
435 { "prefetch_metadata_misses", KSTAT_DATA_UINT64 },
436 { "mru_hits", KSTAT_DATA_UINT64 },
437 { "mru_ghost_hits", KSTAT_DATA_UINT64 },
438 { "mfu_hits", KSTAT_DATA_UINT64 },
439 { "mfu_ghost_hits", KSTAT_DATA_UINT64 },
440 { "deleted", KSTAT_DATA_UINT64 },
441 { "mutex_miss", KSTAT_DATA_UINT64 },
442 { "access_skip", KSTAT_DATA_UINT64 },
443 { "evict_skip", KSTAT_DATA_UINT64 },
444 { "evict_not_enough", KSTAT_DATA_UINT64 },
445 { "evict_l2_cached", KSTAT_DATA_UINT64 },
446 { "evict_l2_eligible", KSTAT_DATA_UINT64 },
447 { "evict_l2_eligible_mfu", KSTAT_DATA_UINT64 },
448 { "evict_l2_eligible_mru", KSTAT_DATA_UINT64 },
449 { "evict_l2_ineligible", KSTAT_DATA_UINT64 },
450 { "evict_l2_skip", KSTAT_DATA_UINT64 },
451 { "hash_elements", KSTAT_DATA_UINT64 },
452 { "hash_elements_max", KSTAT_DATA_UINT64 },
453 { "hash_collisions", KSTAT_DATA_UINT64 },
454 { "hash_chains", KSTAT_DATA_UINT64 },
455 { "hash_chain_max", KSTAT_DATA_UINT64 },
456 { "p", KSTAT_DATA_UINT64 },
457 { "c", KSTAT_DATA_UINT64 },
458 { "c_min", KSTAT_DATA_UINT64 },
459 { "c_max", KSTAT_DATA_UINT64 },
460 { "size", KSTAT_DATA_UINT64 },
461 { "compressed_size", KSTAT_DATA_UINT64 },
462 { "uncompressed_size", KSTAT_DATA_UINT64 },
463 { "overhead_size", KSTAT_DATA_UINT64 },
464 { "hdr_size", KSTAT_DATA_UINT64 },
465 { "data_size", KSTAT_DATA_UINT64 },
466 { "metadata_size", KSTAT_DATA_UINT64 },
467 { "other_size", KSTAT_DATA_UINT64 },
468 { "anon_size", KSTAT_DATA_UINT64 },
469 { "anon_evictable_data", KSTAT_DATA_UINT64 },
470 { "anon_evictable_metadata", KSTAT_DATA_UINT64 },
471 { "mru_size", KSTAT_DATA_UINT64 },
472 { "mru_evictable_data", KSTAT_DATA_UINT64 },
473 { "mru_evictable_metadata", KSTAT_DATA_UINT64 },
474 { "mru_ghost_size", KSTAT_DATA_UINT64 },
475 { "mru_ghost_evictable_data", KSTAT_DATA_UINT64 },
476 { "mru_ghost_evictable_metadata", KSTAT_DATA_UINT64 },
477 { "mfu_size", KSTAT_DATA_UINT64 },
478 { "mfu_evictable_data", KSTAT_DATA_UINT64 },
479 { "mfu_evictable_metadata", KSTAT_DATA_UINT64 },
480 { "mfu_ghost_size", KSTAT_DATA_UINT64 },
481 { "mfu_ghost_evictable_data", KSTAT_DATA_UINT64 },
482 { "mfu_ghost_evictable_metadata", KSTAT_DATA_UINT64 },
483 { "l2_hits", KSTAT_DATA_UINT64 },
484 { "l2_misses", KSTAT_DATA_UINT64 },
485 { "l2_prefetch_asize", KSTAT_DATA_UINT64 },
486 { "l2_mru_asize", KSTAT_DATA_UINT64 },
487 { "l2_mfu_asize", KSTAT_DATA_UINT64 },
488 { "l2_bufc_data_asize", KSTAT_DATA_UINT64 },
489 { "l2_bufc_metadata_asize", KSTAT_DATA_UINT64 },
490 { "l2_feeds", KSTAT_DATA_UINT64 },
491 { "l2_rw_clash", KSTAT_DATA_UINT64 },
492 { "l2_read_bytes", KSTAT_DATA_UINT64 },
493 { "l2_write_bytes", KSTAT_DATA_UINT64 },
494 { "l2_writes_sent", KSTAT_DATA_UINT64 },
495 { "l2_writes_done", KSTAT_DATA_UINT64 },
496 { "l2_writes_error", KSTAT_DATA_UINT64 },
497 { "l2_writes_lock_retry", KSTAT_DATA_UINT64 },
498 { "l2_evict_lock_retry", KSTAT_DATA_UINT64 },
499 { "l2_evict_reading", KSTAT_DATA_UINT64 },
500 { "l2_evict_l1cached", KSTAT_DATA_UINT64 },
501 { "l2_free_on_write", KSTAT_DATA_UINT64 },
502 { "l2_abort_lowmem", KSTAT_DATA_UINT64 },
503 { "l2_cksum_bad", KSTAT_DATA_UINT64 },
504 { "l2_io_error", KSTAT_DATA_UINT64 },
505 { "l2_size", KSTAT_DATA_UINT64 },
506 { "l2_asize", KSTAT_DATA_UINT64 },
507 { "l2_hdr_size", KSTAT_DATA_UINT64 },
508 { "l2_log_blk_writes", KSTAT_DATA_UINT64 },
509 { "l2_log_blk_avg_asize", KSTAT_DATA_UINT64 },
510 { "l2_log_blk_asize", KSTAT_DATA_UINT64 },
511 { "l2_log_blk_count", KSTAT_DATA_UINT64 },
512 { "l2_data_to_meta_ratio", KSTAT_DATA_UINT64 },
513 { "l2_rebuild_success", KSTAT_DATA_UINT64 },
514 { "l2_rebuild_unsupported", KSTAT_DATA_UINT64 },
515 { "l2_rebuild_io_errors", KSTAT_DATA_UINT64 },
516 { "l2_rebuild_dh_errors", KSTAT_DATA_UINT64 },
517 { "l2_rebuild_cksum_lb_errors", KSTAT_DATA_UINT64 },
518 { "l2_rebuild_lowmem", KSTAT_DATA_UINT64 },
519 { "l2_rebuild_size", KSTAT_DATA_UINT64 },
520 { "l2_rebuild_asize", KSTAT_DATA_UINT64 },
521 { "l2_rebuild_bufs", KSTAT_DATA_UINT64 },
522 { "l2_rebuild_bufs_precached", KSTAT_DATA_UINT64 },
523 { "l2_rebuild_log_blks", KSTAT_DATA_UINT64 },
524 { "memory_throttle_count", KSTAT_DATA_UINT64 },
525 { "arc_meta_used", KSTAT_DATA_UINT64 },
526 { "arc_meta_limit", KSTAT_DATA_UINT64 },
527 { "arc_meta_max", KSTAT_DATA_UINT64 },
528 { "arc_meta_min", KSTAT_DATA_UINT64 },
529 { "async_upgrade_sync", KSTAT_DATA_UINT64 },
530 { "demand_hit_predictive_prefetch", KSTAT_DATA_UINT64 },
531 { "demand_hit_prescient_prefetch", KSTAT_DATA_UINT64 },
532 };
533
534 #define ARCSTAT_MAX(stat, val) { \
535 uint64_t m; \
536 while ((val) > (m = arc_stats.stat.value.ui64) && \
537 (m != atomic_cas_64(&arc_stats.stat.value.ui64, m, (val)))) \
538 continue; \
539 }
540
541 #define ARCSTAT_MAXSTAT(stat) \
542 ARCSTAT_MAX(stat##_max, arc_stats.stat.value.ui64)
543
544 /*
545 * We define a macro to allow ARC hits/misses to be easily broken down by
546 * two separate conditions, giving a total of four different subtypes for
547 * each of hits and misses (so eight statistics total).
548 */
549 #define ARCSTAT_CONDSTAT(cond1, stat1, notstat1, cond2, stat2, notstat2, stat) \
550 if (cond1) { \
551 if (cond2) { \
552 ARCSTAT_BUMP(arcstat_##stat1##_##stat2##_##stat); \
553 } else { \
554 ARCSTAT_BUMP(arcstat_##stat1##_##notstat2##_##stat); \
555 } \
556 } else { \
557 if (cond2) { \
558 ARCSTAT_BUMP(arcstat_##notstat1##_##stat2##_##stat); \
559 } else { \
560 ARCSTAT_BUMP(arcstat_##notstat1##_##notstat2##_##stat);\
561 } \
562 }
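/*
 * Example use of ARCSTAT_CONDSTAT, roughly as it is used for hit
 * accounting later in this file: the two conditions select one of the
 * four demand/prefetch x data/metadata counters.
 *
 *	ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr), demand, prefetch,
 *	    !HDR_ISTYPE_METADATA(hdr), data, metadata, hits);
 *
 * A demand metadata hit bumps arcstat_demand_metadata_hits.
 */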
563
564 /*
565 * This macro allows us to use kstats as floating averages. Each time we
566 * update this kstat, we first factor it and the update value by
567 * ARCSTAT_F_AVG_FACTOR to shrink the new value's contribution to the overall
568 * average. This macro assumes that integer loads and stores are atomic, but
569 * is not safe for multiple writers updating the kstat in parallel (only the
570 * last writer's update will remain).
571 */
572 #define ARCSTAT_F_AVG_FACTOR 3
573 #define ARCSTAT_F_AVG(stat, value) \
574 do { \
575 uint64_t x = ARCSTAT(stat); \
576 x = x - x / ARCSTAT_F_AVG_FACTOR + \
577 (value) / ARCSTAT_F_AVG_FACTOR; \
578 ARCSTAT(stat) = x; \
579 _NOTE(CONSTCOND) \
580 } while (0)
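/*
 * Worked example of ARCSTAT_F_AVG with the factor of 3: if the stored
 * average is 900 and the new sample is 300, the update computes
 * 900 - 900/3 + 300/3 = 900 - 300 + 100 = 700, i.e. roughly two thirds
 * of the old value plus one third of the new sample (with integer
 * division throughout).
 */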
581
582 kstat_t *arc_ksp;
583 static arc_state_t *arc_anon;
584 static arc_state_t *arc_mru;
585 static arc_state_t *arc_mru_ghost;
586 static arc_state_t *arc_mfu;
587 static arc_state_t *arc_mfu_ghost;
588 static arc_state_t *arc_l2c_only;
589
590 /*
591 * There are also some ARC variables that we want to export, but that are
592 * updated so often that having the canonical representation be the statistic
593 * variable causes a performance bottleneck. We want to use aggsum_t's for these
594 * instead, but still be able to export the kstat in the same way as before.
595 * The solution is to always use the aggsum version, except in the kstat update
596 * callback.
597 */
598 aggsum_t arc_size;
599 aggsum_t arc_meta_used;
600 aggsum_t astat_data_size;
601 aggsum_t astat_metadata_size;
602 aggsum_t astat_hdr_size;
603 aggsum_t astat_other_size;
604 aggsum_t astat_l2_hdr_size;
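/*
 * Sketch of how these aggsums are used (see arc_kstat_update() and the
 * arc_space_consume()/arc_space_return() paths for the real code): hot
 * paths apply cheap deltas, and only the kstat callback pays for an
 * exact read.
 *
 *	aggsum_add(&arc_size, space);		// fast per-update path
 *	...
 *	// exact read, done only in the kstat update callback
 *	ARCSTAT(arcstat_size) = aggsum_value(&arc_size);
 */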
605
606 static int arc_no_grow; /* Don't try to grow cache size */
607 static hrtime_t arc_growtime;
608 static uint64_t arc_tempreserve;
609 static uint64_t arc_loaned_bytes;
610
611 #define GHOST_STATE(state) \
612 ((state) == arc_mru_ghost || (state) == arc_mfu_ghost || \
613 (state) == arc_l2c_only)
614
615 #define HDR_IN_HASH_TABLE(hdr) ((hdr)->b_flags & ARC_FLAG_IN_HASH_TABLE)
616 #define HDR_IO_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS)
617 #define HDR_IO_ERROR(hdr) ((hdr)->b_flags & ARC_FLAG_IO_ERROR)
618 #define HDR_PREFETCH(hdr) ((hdr)->b_flags & ARC_FLAG_PREFETCH)
619 #define HDR_PRESCIENT_PREFETCH(hdr) \
620 ((hdr)->b_flags & ARC_FLAG_PRESCIENT_PREFETCH)
621 #define HDR_COMPRESSION_ENABLED(hdr) \
622 ((hdr)->b_flags & ARC_FLAG_COMPRESSED_ARC)
623
624 #define HDR_L2CACHE(hdr) ((hdr)->b_flags & ARC_FLAG_L2CACHE)
625 #define HDR_L2_READING(hdr) \
626 (((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS) && \
627 ((hdr)->b_flags & ARC_FLAG_HAS_L2HDR))
628 #define HDR_L2_WRITING(hdr) ((hdr)->b_flags & ARC_FLAG_L2_WRITING)
629 #define HDR_L2_EVICTED(hdr) ((hdr)->b_flags & ARC_FLAG_L2_EVICTED)
630 #define HDR_L2_WRITE_HEAD(hdr) ((hdr)->b_flags & ARC_FLAG_L2_WRITE_HEAD)
631 #define HDR_PROTECTED(hdr) ((hdr)->b_flags & ARC_FLAG_PROTECTED)
632 #define HDR_NOAUTH(hdr) ((hdr)->b_flags & ARC_FLAG_NOAUTH)
633 #define HDR_SHARED_DATA(hdr) ((hdr)->b_flags & ARC_FLAG_SHARED_DATA)
634
635 #define HDR_ISTYPE_METADATA(hdr) \
636 ((hdr)->b_flags & ARC_FLAG_BUFC_METADATA)
637 #define HDR_ISTYPE_DATA(hdr) (!HDR_ISTYPE_METADATA(hdr))
638
639 #define HDR_HAS_L1HDR(hdr) ((hdr)->b_flags & ARC_FLAG_HAS_L1HDR)
640 #define HDR_HAS_L2HDR(hdr) ((hdr)->b_flags & ARC_FLAG_HAS_L2HDR)
641 #define HDR_HAS_RABD(hdr) \
642 (HDR_HAS_L1HDR(hdr) && HDR_PROTECTED(hdr) && \
643 (hdr)->b_crypt_hdr.b_rabd != NULL)
644 #define HDR_ENCRYPTED(hdr) \
645 (HDR_PROTECTED(hdr) && DMU_OT_IS_ENCRYPTED((hdr)->b_crypt_hdr.b_ot))
646 #define HDR_AUTHENTICATED(hdr) \
647 (HDR_PROTECTED(hdr) && !DMU_OT_IS_ENCRYPTED((hdr)->b_crypt_hdr.b_ot))
648
649 /* For storing compression mode in b_flags */
650 #define HDR_COMPRESS_OFFSET (highbit64(ARC_FLAG_COMPRESS_0) - 1)
651
652 #define HDR_GET_COMPRESS(hdr) ((enum zio_compress)BF32_GET((hdr)->b_flags, \
653 HDR_COMPRESS_OFFSET, SPA_COMPRESSBITS))
654 #define HDR_SET_COMPRESS(hdr, cmp) BF32_SET((hdr)->b_flags, \
655 HDR_COMPRESS_OFFSET, SPA_COMPRESSBITS, (cmp));
656
657 #define ARC_BUF_LAST(buf) ((buf)->b_next == NULL)
658 #define ARC_BUF_SHARED(buf) ((buf)->b_flags & ARC_BUF_FLAG_SHARED)
659 #define ARC_BUF_COMPRESSED(buf) ((buf)->b_flags & ARC_BUF_FLAG_COMPRESSED)
660 #define ARC_BUF_ENCRYPTED(buf) ((buf)->b_flags & ARC_BUF_FLAG_ENCRYPTED)
661
662 /*
663 * Other sizes
664 */
665
666 #define HDR_FULL_CRYPT_SIZE ((int64_t)sizeof (arc_buf_hdr_t))
667 #define HDR_FULL_SIZE ((int64_t)offsetof(arc_buf_hdr_t, b_crypt_hdr))
668 #define HDR_L2ONLY_SIZE ((int64_t)offsetof(arc_buf_hdr_t, b_l1hdr))
669
670 /*
671 * Hash table routines
672 */
673
674 #define HT_LOCK_PAD 64
675
676 struct ht_lock {
677 kmutex_t ht_lock;
678 #ifdef _KERNEL
679 unsigned char pad[(HT_LOCK_PAD - sizeof (kmutex_t))];
680 #endif
681 };
682
683 #define BUF_LOCKS 256
684 typedef struct buf_hash_table {
685 uint64_t ht_mask;
686 arc_buf_hdr_t **ht_table;
687 struct ht_lock ht_locks[BUF_LOCKS];
688 } buf_hash_table_t;
689
690 static buf_hash_table_t buf_hash_table;
691
692 #define BUF_HASH_INDEX(spa, dva, birth) \
693 (buf_hash(spa, dva, birth) & buf_hash_table.ht_mask)
694 #define BUF_HASH_LOCK_NTRY(idx) (buf_hash_table.ht_locks[idx & (BUF_LOCKS-1)])
695 #define BUF_HASH_LOCK(idx) (&(BUF_HASH_LOCK_NTRY(idx).ht_lock))
696 #define HDR_LOCK(hdr) \
697 (BUF_HASH_LOCK(BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth)))
698
699 uint64_t zfs_crc64_table[256];
700
701 /*
702 * Level 2 ARC
703 */
704
705 #define L2ARC_WRITE_SIZE (8 * 1024 * 1024) /* initial write max */
706 #define L2ARC_HEADROOM 2 /* num of writes */
707 /*
708 * If we discover during ARC scan any buffers to be compressed, we boost
709 * our headroom for the next scanning cycle by this percentage multiple.
710 */
711 #define L2ARC_HEADROOM_BOOST 200
712 #define L2ARC_FEED_SECS 1 /* caching interval secs */
713 #define L2ARC_FEED_MIN_MS 200 /* min caching interval ms */
714
715 /*
716 * We can feed L2ARC from two states of ARC buffers, mru and mfu,
717 * and each of those states has two types: data and metadata.
718 */
719 #define L2ARC_FEED_TYPES 4
720
721
722 #define l2arc_writes_sent ARCSTAT(arcstat_l2_writes_sent)
723 #define l2arc_writes_done ARCSTAT(arcstat_l2_writes_done)
724
725 /* L2ARC Performance Tunables */
726 uint64_t l2arc_write_max = L2ARC_WRITE_SIZE; /* default max write size */
727 uint64_t l2arc_write_boost = L2ARC_WRITE_SIZE; /* extra write during warmup */
728 uint64_t l2arc_headroom = L2ARC_HEADROOM; /* number of dev writes */
729 uint64_t l2arc_headroom_boost = L2ARC_HEADROOM_BOOST;
730 uint64_t l2arc_feed_secs = L2ARC_FEED_SECS; /* interval seconds */
731 uint64_t l2arc_feed_min_ms = L2ARC_FEED_MIN_MS; /* min interval milliseconds */
732 boolean_t l2arc_noprefetch = B_TRUE; /* don't cache prefetch bufs */
733 boolean_t l2arc_feed_again = B_TRUE; /* turbo warmup */
734 boolean_t l2arc_norw = B_TRUE; /* no reads during writes */
735 int l2arc_meta_percent = 33; /* limit on headers size */
736
737 /*
738 * L2ARC Internals
739 */
740 static list_t L2ARC_dev_list; /* device list */
741 static list_t *l2arc_dev_list; /* device list pointer */
742 static kmutex_t l2arc_dev_mtx; /* device list mutex */
743 static l2arc_dev_t *l2arc_dev_last; /* last device used */
744 static list_t L2ARC_free_on_write; /* free after write buf list */
745 static list_t *l2arc_free_on_write; /* free after write list ptr */
746 static kmutex_t l2arc_free_on_write_mtx; /* mutex for list */
747 static uint64_t l2arc_ndev; /* number of devices */
748
749 typedef struct l2arc_read_callback {
750 arc_buf_hdr_t *l2rcb_hdr; /* read header */
751 blkptr_t l2rcb_bp; /* original blkptr */
752 zbookmark_phys_t l2rcb_zb; /* original bookmark */
753 int l2rcb_flags; /* original flags */
754 abd_t *l2rcb_abd; /* temporary buffer */
755 } l2arc_read_callback_t;
756
757 typedef struct l2arc_data_free {
758 /* protected by l2arc_free_on_write_mtx */
759 abd_t *l2df_abd;
760 size_t l2df_size;
761 arc_buf_contents_t l2df_type;
762 list_node_t l2df_list_node;
763 } l2arc_data_free_t;
764
765 static kmutex_t l2arc_feed_thr_lock;
766 static kcondvar_t l2arc_feed_thr_cv;
767 static uint8_t l2arc_thread_exit;
768
769 static kmutex_t l2arc_rebuild_thr_lock;
770 static kcondvar_t l2arc_rebuild_thr_cv;
771
772 enum arc_hdr_alloc_flags {
773 ARC_HDR_ALLOC_RDATA = 0x1,
774 ARC_HDR_DO_ADAPT = 0x2,
775 };
776
777
778 static abd_t *arc_get_data_abd(arc_buf_hdr_t *, uint64_t, void *, boolean_t);
779 typedef enum arc_fill_flags {
780 ARC_FILL_LOCKED = 1 << 0, /* hdr lock is held */
781 ARC_FILL_COMPRESSED = 1 << 1, /* fill with compressed data */
782 ARC_FILL_ENCRYPTED = 1 << 2, /* fill with encrypted data */
783 ARC_FILL_NOAUTH = 1 << 3, /* don't attempt to authenticate */
784 ARC_FILL_IN_PLACE = 1 << 4 /* fill in place (special case) */
785 } arc_fill_flags_t;
786
787 static void *arc_get_data_buf(arc_buf_hdr_t *, uint64_t, void *);
788 static void arc_get_data_impl(arc_buf_hdr_t *, uint64_t, void *, boolean_t);
789 static void arc_free_data_abd(arc_buf_hdr_t *, abd_t *, uint64_t, void *);
790 static void arc_free_data_buf(arc_buf_hdr_t *, void *, uint64_t, void *);
791 static void arc_free_data_impl(arc_buf_hdr_t *hdr, uint64_t size, void *tag);
792 static void arc_hdr_free_pabd(arc_buf_hdr_t *, boolean_t);
793 static void arc_hdr_alloc_pabd(arc_buf_hdr_t *, int);
794 static void arc_access(arc_buf_hdr_t *, kmutex_t *);
795 static boolean_t arc_is_overflowing();
796 static void arc_buf_watch(arc_buf_t *);
797 static l2arc_dev_t *l2arc_vdev_get(vdev_t *vd);
798
799 static arc_buf_contents_t arc_buf_type(arc_buf_hdr_t *);
800 static uint32_t arc_bufc_to_flags(arc_buf_contents_t);
801 static inline void arc_hdr_set_flags(arc_buf_hdr_t *hdr, arc_flags_t flags);
802 static inline void arc_hdr_clear_flags(arc_buf_hdr_t *hdr, arc_flags_t flags);
803
804 static boolean_t l2arc_write_eligible(uint64_t, arc_buf_hdr_t *);
805 static void l2arc_read_done(zio_t *);
806 static void l2arc_do_free_on_write(void);
807 static void l2arc_hdr_arcstats_update(arc_buf_hdr_t *hdr, boolean_t incr,
808 boolean_t state_only);
809
810 #define l2arc_hdr_arcstats_increment(hdr) \
811 l2arc_hdr_arcstats_update((hdr), B_TRUE, B_FALSE)
812 #define l2arc_hdr_arcstats_decrement(hdr) \
813 l2arc_hdr_arcstats_update((hdr), B_FALSE, B_FALSE)
814 #define l2arc_hdr_arcstats_increment_state(hdr) \
815 l2arc_hdr_arcstats_update((hdr), B_TRUE, B_TRUE)
816 #define l2arc_hdr_arcstats_decrement_state(hdr) \
817 l2arc_hdr_arcstats_update((hdr), B_FALSE, B_TRUE)
818
819 /*
820 * The arc_all_memory function is a ZoL enhancement that lives in their OSL
821 * code. In user-space code, which is used primarily for testing, we return
822 * half of all memory.
823 */
824 uint64_t
825 arc_all_memory(void)
826 {
827 #ifdef _KERNEL
828 return (ptob(physmem));
829 #else
830 return ((sysconf(_SC_PAGESIZE) * sysconf(_SC_PHYS_PAGES)) / 2);
831 #endif
832 }
833
834 /*
835 * We use Cityhash for this. It's fast, and has good hash properties without
836 * requiring any large static buffers.
837 */
838 static uint64_t
839 buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth)
840 {
841 return (cityhash4(spa, dva->dva_word[0], dva->dva_word[1], birth));
842 }
843
844 #define HDR_EMPTY(hdr) \
845 ((hdr)->b_dva.dva_word[0] == 0 && \
846 (hdr)->b_dva.dva_word[1] == 0)
847
848 #define HDR_EMPTY_OR_LOCKED(hdr) \
849 (HDR_EMPTY(hdr) || MUTEX_HELD(HDR_LOCK(hdr)))
850
851 #define HDR_EQUAL(spa, dva, birth, hdr) \
852 ((hdr)->b_dva.dva_word[0] == (dva)->dva_word[0]) && \
853 ((hdr)->b_dva.dva_word[1] == (dva)->dva_word[1]) && \
854 ((hdr)->b_birth == birth) && ((hdr)->b_spa == spa)
855
856 static void
857 buf_discard_identity(arc_buf_hdr_t *hdr)
858 {
859 hdr->b_dva.dva_word[0] = 0;
860 hdr->b_dva.dva_word[1] = 0;
861 hdr->b_birth = 0;
862 }
863
864 static arc_buf_hdr_t *
865 buf_hash_find(uint64_t spa, const blkptr_t *bp, kmutex_t **lockp)
866 {
867 const dva_t *dva = BP_IDENTITY(bp);
868 uint64_t birth = BP_PHYSICAL_BIRTH(bp);
869 uint64_t idx = BUF_HASH_INDEX(spa, dva, birth);
870 kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
871 arc_buf_hdr_t *hdr;
872
873 mutex_enter(hash_lock);
874 for (hdr = buf_hash_table.ht_table[idx]; hdr != NULL;
875 hdr = hdr->b_hash_next) {
876 if (HDR_EQUAL(spa, dva, birth, hdr)) {
877 *lockp = hash_lock;
878 return (hdr);
879 }
880 }
881 mutex_exit(hash_lock);
882 *lockp = NULL;
883 return (NULL);
884 }
885
886 /*
887 * Insert an entry into the hash table. If there is already an element
888 * equal to hdr in the hash table, then the already existing element
889 * will be returned and the new element will not be inserted.
890 * Otherwise returns NULL.
891 * If lockp == NULL, the caller is assumed to already hold the hash lock.
892 */
893 static arc_buf_hdr_t *
894 buf_hash_insert(arc_buf_hdr_t *hdr, kmutex_t **lockp)
895 {
896 uint64_t idx = BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth);
897 kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
898 arc_buf_hdr_t *fhdr;
899 uint32_t i;
900
901 ASSERT(!DVA_IS_EMPTY(&hdr->b_dva));
902 ASSERT(hdr->b_birth != 0);
903 ASSERT(!HDR_IN_HASH_TABLE(hdr));
904
905 if (lockp != NULL) {
906 *lockp = hash_lock;
907 mutex_enter(hash_lock);
908 } else {
909 ASSERT(MUTEX_HELD(hash_lock));
910 }
911
912 for (fhdr = buf_hash_table.ht_table[idx], i = 0; fhdr != NULL;
913 fhdr = fhdr->b_hash_next, i++) {
914 if (HDR_EQUAL(hdr->b_spa, &hdr->b_dva, hdr->b_birth, fhdr))
915 return (fhdr);
916 }
917
918 hdr->b_hash_next = buf_hash_table.ht_table[idx];
919 buf_hash_table.ht_table[idx] = hdr;
920 arc_hdr_set_flags(hdr, ARC_FLAG_IN_HASH_TABLE);
921
922 /* collect some hash table performance data */
923 if (i > 0) {
924 ARCSTAT_BUMP(arcstat_hash_collisions);
925 if (i == 1)
926 ARCSTAT_BUMP(arcstat_hash_chains);
927
928 ARCSTAT_MAX(arcstat_hash_chain_max, i);
929 }
930
931 ARCSTAT_BUMP(arcstat_hash_elements);
932 ARCSTAT_MAXSTAT(arcstat_hash_elements);
933
934 return (NULL);
935 }
936
937 static void
938 buf_hash_remove(arc_buf_hdr_t *hdr)
939 {
940 arc_buf_hdr_t *fhdr, **hdrp;
941 uint64_t idx = BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth);
942
943 ASSERT(MUTEX_HELD(BUF_HASH_LOCK(idx)));
944 ASSERT(HDR_IN_HASH_TABLE(hdr));
945
946 hdrp = &buf_hash_table.ht_table[idx];
947 while ((fhdr = *hdrp) != hdr) {
948 ASSERT3P(fhdr, !=, NULL);
949 hdrp = &fhdr->b_hash_next;
950 }
951 *hdrp = hdr->b_hash_next;
952 hdr->b_hash_next = NULL;
953 arc_hdr_clear_flags(hdr, ARC_FLAG_IN_HASH_TABLE);
954
955 /* collect some hash table performance data */
956 ARCSTAT_BUMPDOWN(arcstat_hash_elements);
957
958 if (buf_hash_table.ht_table[idx] &&
959 buf_hash_table.ht_table[idx]->b_hash_next == NULL)
960 ARCSTAT_BUMPDOWN(arcstat_hash_chains);
961 }
962
963 /*
964 * l2arc_mfuonly : A ZFS module parameter that controls whether only MFU
965 * metadata and data are cached from ARC into L2ARC.
966 */
967 int l2arc_mfuonly = 0;
968
969 /*
970 * Global data structures and functions for the buf kmem cache.
971 */
972
973 static kmem_cache_t *hdr_full_cache;
974 static kmem_cache_t *hdr_full_crypt_cache;
975 static kmem_cache_t *hdr_l2only_cache;
976 static kmem_cache_t *buf_cache;
977
978 static void
979 buf_fini(void)
980 {
981 int i;
982
983 kmem_free(buf_hash_table.ht_table,
984 (buf_hash_table.ht_mask + 1) * sizeof (void *));
985 for (i = 0; i < BUF_LOCKS; i++)
986 mutex_destroy(&buf_hash_table.ht_locks[i].ht_lock);
987 kmem_cache_destroy(hdr_full_cache);
988 kmem_cache_destroy(hdr_full_crypt_cache);
989 kmem_cache_destroy(hdr_l2only_cache);
990 kmem_cache_destroy(buf_cache);
991 }
992
993 /*
994 * Constructor callback - called when the cache is empty
995 * and a new buf is requested.
996 */
997 /* ARGSUSED */
998 static int
999 hdr_full_cons(void *vbuf, void *unused, int kmflag)
1000 {
1001 arc_buf_hdr_t *hdr = vbuf;
1002
1003 bzero(hdr, HDR_FULL_SIZE);
1004 hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS;
1005 cv_init(&hdr->b_l1hdr.b_cv, NULL, CV_DEFAULT, NULL);
1006 zfs_refcount_create(&hdr->b_l1hdr.b_refcnt);
1007 mutex_init(&hdr->b_l1hdr.b_freeze_lock, NULL, MUTEX_DEFAULT, NULL);
1008 multilist_link_init(&hdr->b_l1hdr.b_arc_node);
1009 arc_space_consume(HDR_FULL_SIZE, ARC_SPACE_HDRS);
1010
1011 return (0);
1012 }
1013
1014 /* ARGSUSED */
1015 static int
1016 hdr_full_crypt_cons(void *vbuf, void *unused, int kmflag)
1017 {
1018 arc_buf_hdr_t *hdr = vbuf;
1019
1020 (void) hdr_full_cons(vbuf, unused, kmflag);
1021 bzero(&hdr->b_crypt_hdr, sizeof (hdr->b_crypt_hdr));
1022 arc_space_consume(sizeof (hdr->b_crypt_hdr), ARC_SPACE_HDRS);
1023
1024 return (0);
1025 }
1026
1027 /* ARGSUSED */
1028 static int
1029 hdr_l2only_cons(void *vbuf, void *unused, int kmflag)
1030 {
1031 arc_buf_hdr_t *hdr = vbuf;
1032
1033 bzero(hdr, HDR_L2ONLY_SIZE);
1034 arc_space_consume(HDR_L2ONLY_SIZE, ARC_SPACE_L2HDRS);
1035
1036 return (0);
1037 }
1038
1039 /* ARGSUSED */
1040 static int
1041 buf_cons(void *vbuf, void *unused, int kmflag)
1042 {
1043 arc_buf_t *buf = vbuf;
1044
1045 bzero(buf, sizeof (arc_buf_t));
1046 mutex_init(&buf->b_evict_lock, NULL, MUTEX_DEFAULT, NULL);
1047 arc_space_consume(sizeof (arc_buf_t), ARC_SPACE_HDRS);
1048
1049 return (0);
1050 }
1051
1052 /*
1053 * Destructor callback - called when a cached buf is
1054 * no longer required.
1055 */
1056 /* ARGSUSED */
1057 static void
1058 hdr_full_dest(void *vbuf, void *unused)
1059 {
1060 arc_buf_hdr_t *hdr = vbuf;
1061
1062 ASSERT(HDR_EMPTY(hdr));
1063 cv_destroy(&hdr->b_l1hdr.b_cv);
1064 zfs_refcount_destroy(&hdr->b_l1hdr.b_refcnt);
1065 mutex_destroy(&hdr->b_l1hdr.b_freeze_lock);
1066 ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node));
1067 arc_space_return(HDR_FULL_SIZE, ARC_SPACE_HDRS);
1068 }
1069
1070 /* ARGSUSED */
1071 static void
1072 hdr_full_crypt_dest(void *vbuf, void *unused)
1073 {
1074 arc_buf_hdr_t *hdr = vbuf;
1075
1076 hdr_full_dest(hdr, unused);
1077 arc_space_return(sizeof (hdr->b_crypt_hdr), ARC_SPACE_HDRS);
1078 }
1079
1080 /* ARGSUSED */
1081 static void
1082 hdr_l2only_dest(void *vbuf, void *unused)
1083 {
1084 arc_buf_hdr_t *hdr = vbuf;
1085
1086 ASSERT(HDR_EMPTY(hdr));
1087 arc_space_return(HDR_L2ONLY_SIZE, ARC_SPACE_L2HDRS);
1088 }
1089
1090 /* ARGSUSED */
1091 static void
1092 buf_dest(void *vbuf, void *unused)
1093 {
1094 arc_buf_t *buf = vbuf;
1095
1096 mutex_destroy(&buf->b_evict_lock);
1097 arc_space_return(sizeof (arc_buf_t), ARC_SPACE_HDRS);
1098 }
1099
1100 /*
1101 * Reclaim callback -- invoked when memory is low.
1102 */
1103 /* ARGSUSED */
1104 static void
1105 hdr_recl(void *unused)
1106 {
1107 dprintf("hdr_recl called\n");
1108 /*
1109 * umem calls the reclaim func when we destroy the buf cache,
1110 * which is after we do arc_fini().
1111 */
1112 if (arc_initialized)
1113 zthr_wakeup(arc_reap_zthr);
1114 }
1115
1116 static void
1117 buf_init(void)
1118 {
1119 uint64_t *ct;
1120 uint64_t hsize = 1ULL << 12;
1121 int i, j;
1122
1123 /*
1124 * The hash table is big enough to fill all of physical memory
1125 * with an average block size of zfs_arc_average_blocksize (default 8K).
1126 * By default, the table will take up
1127 * totalmem * sizeof(void*) / 8K (1MB per GB with 8-byte pointers).
1128 */
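	/*
	 * Worked example of the sizing loop below, assuming 64 GB of
	 * physical memory and the default 8K average block size:
	 * 64 GB / 8 KB = 8M entries, so hsize doubles from 4096 up to
	 * 2^23, and the table then occupies 2^23 * 8 bytes = 64 MB --
	 * the 1 MB per GB noted above.
	 */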
1129 while (hsize * zfs_arc_average_blocksize < physmem * PAGESIZE)
1130 hsize <<= 1;
1131 retry:
1132 buf_hash_table.ht_mask = hsize - 1;
1133 buf_hash_table.ht_table =
1134 kmem_zalloc(hsize * sizeof (void*), KM_NOSLEEP);
1135 if (buf_hash_table.ht_table == NULL) {
1136 ASSERT(hsize > (1ULL << 8));
1137 hsize >>= 1;
1138 goto retry;
1139 }
1140
1141 hdr_full_cache = kmem_cache_create("arc_buf_hdr_t_full", HDR_FULL_SIZE,
1142 0, hdr_full_cons, hdr_full_dest, hdr_recl, NULL, NULL, 0);
1143 hdr_full_crypt_cache = kmem_cache_create("arc_buf_hdr_t_full_crypt",
1144 HDR_FULL_CRYPT_SIZE, 0, hdr_full_crypt_cons, hdr_full_crypt_dest,
1145 hdr_recl, NULL, NULL, 0);
1146 hdr_l2only_cache = kmem_cache_create("arc_buf_hdr_t_l2only",
1147 HDR_L2ONLY_SIZE, 0, hdr_l2only_cons, hdr_l2only_dest, hdr_recl,
1148 NULL, NULL, 0);
1149 buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t),
1150 0, buf_cons, buf_dest, NULL, NULL, NULL, 0);
1151
1152 for (i = 0; i < 256; i++)
1153 for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--)
1154 *ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY);
1155
1156 for (i = 0; i < BUF_LOCKS; i++) {
1157 mutex_init(&buf_hash_table.ht_locks[i].ht_lock,
1158 NULL, MUTEX_DEFAULT, NULL);
1159 }
1160 }
1161
1162 /*
1163 * This is the size that the buf occupies in memory. If the buf is compressed,
1164 * it will correspond to the compressed size. You should use this method of
1165 * getting the buf size unless you explicitly need the logical size.
1166 */
1167 int32_t
1168 arc_buf_size(arc_buf_t *buf)
1169 {
1170 return (ARC_BUF_COMPRESSED(buf) ?
1171 HDR_GET_PSIZE(buf->b_hdr) : HDR_GET_LSIZE(buf->b_hdr));
1172 }
1173
1174 int32_t
1175 arc_buf_lsize(arc_buf_t *buf)
1176 {
1177 return (HDR_GET_LSIZE(buf->b_hdr));
1178 }
1179
1180 /*
1181 * This function will return B_TRUE if the buffer is encrypted in memory.
1182 * This buffer can be decrypted by calling arc_untransform().
1183 */
1184 boolean_t
1185 arc_is_encrypted(arc_buf_t *buf)
1186 {
1187 return (ARC_BUF_ENCRYPTED(buf) != 0);
1188 }
1189
1190 /*
1191 * Returns B_TRUE if the buffer represents data that has not had its MAC
1192 * verified yet.
1193 */
1194 boolean_t
1195 arc_is_unauthenticated(arc_buf_t *buf)
1196 {
1197 return (HDR_NOAUTH(buf->b_hdr) != 0);
1198 }
1199
1200 void
1201 arc_get_raw_params(arc_buf_t *buf, boolean_t *byteorder, uint8_t *salt,
1202 uint8_t *iv, uint8_t *mac)
1203 {
1204 arc_buf_hdr_t *hdr = buf->b_hdr;
1205
1206 ASSERT(HDR_PROTECTED(hdr));
1207
1208 bcopy(hdr->b_crypt_hdr.b_salt, salt, ZIO_DATA_SALT_LEN);
1209 bcopy(hdr->b_crypt_hdr.b_iv, iv, ZIO_DATA_IV_LEN);
1210 bcopy(hdr->b_crypt_hdr.b_mac, mac, ZIO_DATA_MAC_LEN);
1211 *byteorder = (hdr->b_l1hdr.b_byteswap == DMU_BSWAP_NUMFUNCS) ?
1212 /* CONSTCOND */
1213 ZFS_HOST_BYTEORDER : !ZFS_HOST_BYTEORDER;
1214 }
1215
1216 /*
1217 * Indicates how this buffer is compressed in memory. If it is not compressed
1218 * the value will be ZIO_COMPRESS_OFF. It can be made normally readable with
1219 * arc_untransform() as long as it is also unencrypted.
1220 */
1221 enum zio_compress
1222 arc_get_compression(arc_buf_t *buf)
1223 {
1224 return (ARC_BUF_COMPRESSED(buf) ?
1225 HDR_GET_COMPRESS(buf->b_hdr) : ZIO_COMPRESS_OFF);
1226 }
1227
1228 #define ARC_MINTIME (hz>>4) /* 62 ms */
1229
1230 /*
1231 * Return the compression algorithm used to store this data in the ARC. If ARC
1232 * compression is enabled or this is an encrypted block, this will be the same
1233 * as what's used to store it on-disk. Otherwise, this will be ZIO_COMPRESS_OFF.
1234 */
1235 static inline enum zio_compress
1236 arc_hdr_get_compress(arc_buf_hdr_t *hdr)
1237 {
1238 return (HDR_COMPRESSION_ENABLED(hdr) ?
1239 HDR_GET_COMPRESS(hdr) : ZIO_COMPRESS_OFF);
1240 }
1241
1242 static inline boolean_t
1243 arc_buf_is_shared(arc_buf_t *buf)
1244 {
1245 boolean_t shared = (buf->b_data != NULL &&
1246 buf->b_hdr->b_l1hdr.b_pabd != NULL &&
1247 abd_is_linear(buf->b_hdr->b_l1hdr.b_pabd) &&
1248 buf->b_data == abd_to_buf(buf->b_hdr->b_l1hdr.b_pabd));
1249 IMPLY(shared, HDR_SHARED_DATA(buf->b_hdr));
1250 IMPLY(shared, ARC_BUF_SHARED(buf));
1251 IMPLY(shared, ARC_BUF_COMPRESSED(buf) || ARC_BUF_LAST(buf));
1252
1253 /*
1254 * It would be nice to assert arc_can_share() too, but the "hdr isn't
1255 * already being shared" requirement prevents us from doing that.
1256 */
1257
1258 return (shared);
1259 }
1260
1261 /*
1262 * Free the checksum associated with this header. If there is no checksum, this
1263 * is a no-op.
1264 */
1265 static inline void
1266 arc_cksum_free(arc_buf_hdr_t *hdr)
1267 {
1268 ASSERT(HDR_HAS_L1HDR(hdr));
1269
1270 mutex_enter(&hdr->b_l1hdr.b_freeze_lock);
1271 if (hdr->b_l1hdr.b_freeze_cksum != NULL) {
1272 kmem_free(hdr->b_l1hdr.b_freeze_cksum, sizeof (zio_cksum_t));
1273 hdr->b_l1hdr.b_freeze_cksum = NULL;
1274 }
1275 mutex_exit(&hdr->b_l1hdr.b_freeze_lock);
1276 }
1277
1278 /*
1279 * Return true iff at least one of the bufs on hdr is not compressed.
1280 * Encrypted buffers count as compressed.
1281 */
1282 static boolean_t
1283 arc_hdr_has_uncompressed_buf(arc_buf_hdr_t *hdr)
1284 {
1285 ASSERT(hdr->b_l1hdr.b_state == arc_anon || HDR_EMPTY_OR_LOCKED(hdr));
1286
1287 for (arc_buf_t *b = hdr->b_l1hdr.b_buf; b != NULL; b = b->b_next) {
1288 if (!ARC_BUF_COMPRESSED(b)) {
1289 return (B_TRUE);
1290 }
1291 }
1292 return (B_FALSE);
1293 }
1294
1295 /*
1296 * If we've turned on the ZFS_DEBUG_MODIFY flag, verify that the buf's data
1297 * matches the checksum that is stored in the hdr. If there is no checksum,
1298 * or if the buf is compressed, this is a no-op.
1299 */
1300 static void
1301 arc_cksum_verify(arc_buf_t *buf)
1302 {
1303 arc_buf_hdr_t *hdr = buf->b_hdr;
1304 zio_cksum_t zc;
1305
1306 if (!(zfs_flags & ZFS_DEBUG_MODIFY))
1307 return;
1308
1309 if (ARC_BUF_COMPRESSED(buf))
1310 return;
1311
1312 ASSERT(HDR_HAS_L1HDR(hdr));
1313
1314 mutex_enter(&hdr->b_l1hdr.b_freeze_lock);
1315
1316 if (hdr->b_l1hdr.b_freeze_cksum == NULL || HDR_IO_ERROR(hdr)) {
1317 mutex_exit(&hdr->b_l1hdr.b_freeze_lock);
1318 return;
1319 }
1320
1321 fletcher_2_native(buf->b_data, arc_buf_size(buf), NULL, &zc);
1322 if (!ZIO_CHECKSUM_EQUAL(*hdr->b_l1hdr.b_freeze_cksum, zc))
1323 panic("buffer modified while frozen!");
1324 mutex_exit(&hdr->b_l1hdr.b_freeze_lock);
1325 }
1326
1327 /*
1328 * This function makes the assumption that data stored in the L2ARC
1329 * will be transformed exactly as it is in the main pool. Because of
1330 * this we can verify the checksum against the reading process's bp.
1331 */
1332 static boolean_t
1333 arc_cksum_is_equal(arc_buf_hdr_t *hdr, zio_t *zio)
1334 {
1335 enum zio_compress compress = BP_GET_COMPRESS(zio->io_bp);
1336 boolean_t valid_cksum;
1337
1338 ASSERT(!BP_IS_EMBEDDED(zio->io_bp));
1339 VERIFY3U(BP_GET_PSIZE(zio->io_bp), ==, HDR_GET_PSIZE(hdr));
1340
1341 /*
1342 * We rely on the blkptr's checksum to determine if the block
1343 * is valid or not. When compressed arc is enabled, the l2arc
1344 * writes the block to the l2arc just as it appears in the pool.
1345 * This allows us to use the blkptr's checksum to validate the
1346 * data that we just read off of the l2arc without having to store
1347 * a separate checksum in the arc_buf_hdr_t. However, if compressed
1348 * arc is disabled, then the data written to the l2arc is always
1349 * uncompressed and won't match the block as it exists in the main
1350 * pool. When this is the case, we must first compress it if it is
1351 * compressed on the main pool before we can validate the checksum.
1352 */
1353 if (!HDR_COMPRESSION_ENABLED(hdr) && compress != ZIO_COMPRESS_OFF) {
1354 ASSERT3U(HDR_GET_COMPRESS(hdr), ==, ZIO_COMPRESS_OFF);
1355 uint64_t lsize = HDR_GET_LSIZE(hdr);
1356 uint64_t csize;
1357
1358 abd_t *cdata = abd_alloc_linear(HDR_GET_PSIZE(hdr), B_TRUE);
1359 csize = zio_compress_data(compress, zio->io_abd,
1360 abd_to_buf(cdata), lsize);
1361
1362 ASSERT3U(csize, <=, HDR_GET_PSIZE(hdr));
1363 if (csize < HDR_GET_PSIZE(hdr)) {
1364 /*
1365 * Compressed blocks are always a multiple of the
1366 * smallest ashift in the pool. Ideally, we would
1367 * like to round up the csize to the next
1368 * spa_min_ashift but that value may have changed
1369 * since the block was last written. Instead,
1370 * we rely on the fact that the hdr's psize
1371 * was set to the psize of the block when it was
1372 * last written. We set the csize to that value
1373 * and zero out any part that should not contain
1374 * data.
1375 */
1376 abd_zero_off(cdata, csize, HDR_GET_PSIZE(hdr) - csize);
1377 csize = HDR_GET_PSIZE(hdr);
1378 }
1379 zio_push_transform(zio, cdata, csize, HDR_GET_PSIZE(hdr), NULL);
1380 }
1381
1382 /*
1383 * Block pointers always store the checksum for the logical data.
1384 * If the block pointer has the gang bit set, then the checksum
1385 * it represents is for the reconstituted data and not for an
1386 * individual gang member. The zio pipeline, however, must be able to
1387 * determine the checksum of each of the gang constituents so it
1388 * treats the checksum comparison differently than what we need
1389 * for l2arc blocks. This prevents us from using the
1390 * zio_checksum_error() interface directly. Instead we must call the
1391 * zio_checksum_error_impl() so that we can ensure the checksum is
1392 * generated using the correct checksum algorithm and accounts for the
1393 * logical I/O size and not just a gang fragment.
1394 */
1395 valid_cksum = (zio_checksum_error_impl(zio->io_spa, zio->io_bp,
1396 BP_GET_CHECKSUM(zio->io_bp), zio->io_abd, zio->io_size,
1397 zio->io_offset, NULL) == 0);
1398 zio_pop_transforms(zio);
1399 return (valid_cksum);
1400 }
1401
1402 /*
1403 * Given a buf full of data, if ZFS_DEBUG_MODIFY is enabled this computes a
1404 * checksum and attaches it to the buf's hdr so that we can ensure that the buf
1405 * isn't modified later on. If buf is compressed or there is already a checksum
1406 * on the hdr, this is a no-op (we only checksum uncompressed bufs).
1407 */
1408 static void
1409 arc_cksum_compute(arc_buf_t *buf)
1410 {
1411 arc_buf_hdr_t *hdr = buf->b_hdr;
1412
1413 if (!(zfs_flags & ZFS_DEBUG_MODIFY))
1414 return;
1415
1416 ASSERT(HDR_HAS_L1HDR(hdr));
1417
1418 mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock);
1419 if (hdr->b_l1hdr.b_freeze_cksum != NULL || ARC_BUF_COMPRESSED(buf)) {
1420 mutex_exit(&hdr->b_l1hdr.b_freeze_lock);
1421 return;
1422 }
1423
1424 ASSERT(!ARC_BUF_ENCRYPTED(buf));
1425 ASSERT(!ARC_BUF_COMPRESSED(buf));
1426 hdr->b_l1hdr.b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t),
1427 KM_SLEEP);
1428 fletcher_2_native(buf->b_data, arc_buf_size(buf), NULL,
1429 hdr->b_l1hdr.b_freeze_cksum);
1430 mutex_exit(&hdr->b_l1hdr.b_freeze_lock);
1431 arc_buf_watch(buf);
1432 }
1433
1434 #ifndef _KERNEL
1435 typedef struct procctl {
1436 long cmd;
1437 prwatch_t prwatch;
1438 } procctl_t;
1439 #endif
1440
1441 /* ARGSUSED */
1442 static void
1443 arc_buf_unwatch(arc_buf_t *buf)
1444 {
1445 #ifndef _KERNEL
1446 if (arc_watch) {
1447 int result;
1448 procctl_t ctl;
1449 ctl.cmd = PCWATCH;
1450 ctl.prwatch.pr_vaddr = (uintptr_t)buf->b_data;
1451 ctl.prwatch.pr_size = 0;
1452 ctl.prwatch.pr_wflags = 0;
1453 result = write(arc_procfd, &ctl, sizeof (ctl));
1454 ASSERT3U(result, ==, sizeof (ctl));
1455 }
1456 #endif
1457 }
1458
1459 /* ARGSUSED */
1460 static void
1461 arc_buf_watch(arc_buf_t *buf)
1462 {
1463 #ifndef _KERNEL
1464 if (arc_watch) {
1465 int result;
1466 procctl_t ctl;
1467 ctl.cmd = PCWATCH;
1468 ctl.prwatch.pr_vaddr = (uintptr_t)buf->b_data;
1469 ctl.prwatch.pr_size = arc_buf_size(buf);
1470 ctl.prwatch.pr_wflags = WA_WRITE;
1471 result = write(arc_procfd, &ctl, sizeof (ctl));
1472 ASSERT3U(result, ==, sizeof (ctl));
1473 }
1474 #endif
1475 }
1476
1477 static arc_buf_contents_t
1478 arc_buf_type(arc_buf_hdr_t *hdr)
1479 {
1480 arc_buf_contents_t type;
1481 if (HDR_ISTYPE_METADATA(hdr)) {
1482 type = ARC_BUFC_METADATA;
1483 } else {
1484 type = ARC_BUFC_DATA;
1485 }
1486 VERIFY3U(hdr->b_type, ==, type);
1487 return (type);
1488 }
1489
1490 boolean_t
1491 arc_is_metadata(arc_buf_t *buf)
1492 {
1493 return (HDR_ISTYPE_METADATA(buf->b_hdr) != 0);
1494 }
1495
1496 static uint32_t
1497 arc_bufc_to_flags(arc_buf_contents_t type)
1498 {
1499 switch (type) {
1500 case ARC_BUFC_DATA:
1501 /* metadata field is 0 if buffer contains normal data */
1502 return (0);
1503 case ARC_BUFC_METADATA:
1504 return (ARC_FLAG_BUFC_METADATA);
1505 default:
1506 break;
1507 }
1508 panic("undefined ARC buffer type!");
1509 return ((uint32_t)-1);
1510 }
1511
1512 void
1513 arc_buf_thaw(arc_buf_t *buf)
1514 {
1515 arc_buf_hdr_t *hdr = buf->b_hdr;
1516
1517 ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon);
1518 ASSERT(!HDR_IO_IN_PROGRESS(hdr));
1519
1520 arc_cksum_verify(buf);
1521
1522 /*
1523 * Compressed buffers do not manipulate the b_freeze_cksum.
1524 */
1525 if (ARC_BUF_COMPRESSED(buf))
1526 return;
1527
1528 ASSERT(HDR_HAS_L1HDR(hdr));
1529 arc_cksum_free(hdr);
1530
1531 mutex_enter(&hdr->b_l1hdr.b_freeze_lock);
1532 #ifdef ZFS_DEBUG
1533 if (zfs_flags & ZFS_DEBUG_MODIFY) {
1534 if (hdr->b_l1hdr.b_thawed != NULL)
1535 kmem_free(hdr->b_l1hdr.b_thawed, 1);
1536 hdr->b_l1hdr.b_thawed = kmem_alloc(1, KM_SLEEP);
1537 }
1538 #endif
1539
1540 mutex_exit(&hdr->b_l1hdr.b_freeze_lock);
1541
1542 arc_buf_unwatch(buf);
1543 }
1544
1545 void
1546 arc_buf_freeze(arc_buf_t *buf)
1547 {
1548 if (!(zfs_flags & ZFS_DEBUG_MODIFY))
1549 return;
1550
1551 if (ARC_BUF_COMPRESSED(buf))
1552 return;
1553
1554 ASSERT(HDR_HAS_L1HDR(buf->b_hdr));
1555 arc_cksum_compute(buf);
1556 }
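/*
 * Editor's note -- illustrative sketch, not part of the original source.
 * A consumer that legitimately modifies an anonymous buf is expected to
 * bracket the modification with the thaw/freeze pair above so that the
 * ZFS_DEBUG_MODIFY checksum stays consistent: thaw drops the frozen
 * checksum, the caller mutates b_data, and freeze recomputes it. The
 * helper name below is hypothetical; arc_buf_thaw(), arc_buf_freeze()
 * and arc_buf_size() are the functions defined in this file.
 *
 *	static void
 *	example_modify_anon_buf(arc_buf_t *buf, const void *src, uint64_t len)
 *	{
 *		ASSERT3U(len, <=, arc_buf_size(buf));
 *		arc_buf_thaw(buf);
 *		bcopy(src, buf->b_data, len);
 *		arc_buf_freeze(buf);
 *	}
 */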
1557
1558 /*
1559 * The arc_buf_hdr_t's b_flags should never be modified directly. Instead,
1560 * the following functions should be used to ensure that the flags are
1561 * updated in a thread-safe way. When manipulating the flags either
1562 * the hash_lock must be held or the hdr must be undiscoverable. This
1563 * ensures that we're not racing with any other threads when updating
1564 * the flags.
1565 */
1566 static inline void
1567 arc_hdr_set_flags(arc_buf_hdr_t *hdr, arc_flags_t flags)
1568 {
1569 ASSERT(HDR_EMPTY_OR_LOCKED(hdr));
1570 hdr->b_flags |= flags;
1571 }
1572
1573 static inline void
1574 arc_hdr_clear_flags(arc_buf_hdr_t *hdr, arc_flags_t flags)
1575 {
1576 ASSERT(HDR_EMPTY_OR_LOCKED(hdr));
1577 hdr->b_flags &= ~flags;
1578 }
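/*
 * Editor's note -- illustrative sketch, not part of the original source.
 * The locking discipline described above, as it is used later in this
 * file (see arc_buf_fill()): when the hdr is discoverable the caller
 * wraps the flag update in the hash lock, and hash_lock may be NULL
 * only when the hdr is undiscoverable.
 *
 *	if (hash_lock != NULL)
 *		mutex_enter(hash_lock);
 *	arc_hdr_set_flags(hdr, ARC_FLAG_IO_ERROR);
 *	if (hash_lock != NULL)
 *		mutex_exit(hash_lock);
 */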
1579
1580 /*
1581 * Setting the compression bits in the arc_buf_hdr_t's b_flags is
1582 * done in a special way since we have to clear and set bits
1583 * at the same time. Consumers that wish to set the compression bits
1584 * must use this function to ensure that the flags are updated in
1585 * a thread-safe manner.
1586 */
1587 static void
1588 arc_hdr_set_compress(arc_buf_hdr_t *hdr, enum zio_compress cmp)
1589 {
1590 ASSERT(HDR_EMPTY_OR_LOCKED(hdr));
1591
1592 /*
1593 * Holes and embedded blocks will always have a psize = 0 so
1594 * we ignore the compression of the blkptr and set the
1595 * arc_buf_hdr_t's compression to ZIO_COMPRESS_OFF.
1596 * Holes and embedded blocks remain anonymous so we don't
1597 * want to uncompress them. Mark them as uncompressed.
1598 */
1599 if (!zfs_compressed_arc_enabled || HDR_GET_PSIZE(hdr) == 0) {
1600 arc_hdr_clear_flags(hdr, ARC_FLAG_COMPRESSED_ARC);
1601 ASSERT(!HDR_COMPRESSION_ENABLED(hdr));
1602 } else {
1603 arc_hdr_set_flags(hdr, ARC_FLAG_COMPRESSED_ARC);
1604 ASSERT(HDR_COMPRESSION_ENABLED(hdr));
1605 }
1606
1607 HDR_SET_COMPRESS(hdr, cmp);
1608 ASSERT3U(HDR_GET_COMPRESS(hdr), ==, cmp);
1609 }
1610
1611 /*
1612 * Looks for another buf on the same hdr which has the data decompressed, copies
1613 * from it, and returns true. If no such buf exists, returns false.
1614 */
1615 static boolean_t
1616 arc_buf_try_copy_decompressed_data(arc_buf_t *buf)
1617 {
1618 arc_buf_hdr_t *hdr = buf->b_hdr;
1619 boolean_t copied = B_FALSE;
1620
1621 ASSERT(HDR_HAS_L1HDR(hdr));
1622 ASSERT3P(buf->b_data, !=, NULL);
1623 ASSERT(!ARC_BUF_COMPRESSED(buf));
1624
1625 for (arc_buf_t *from = hdr->b_l1hdr.b_buf; from != NULL;
1626 from = from->b_next) {
1627 /* can't use our own data buffer */
1628 if (from == buf) {
1629 continue;
1630 }
1631
1632 if (!ARC_BUF_COMPRESSED(from)) {
1633 bcopy(from->b_data, buf->b_data, arc_buf_size(buf));
1634 copied = B_TRUE;
1635 break;
1636 }
1637 }
1638
1639 /*
1640 * Note: With encryption support, the following assertion is no longer
1641 * necessarily valid. If we receive two back to back raw snapshots
1642 * (send -w), the second receive can use a hdr with a cksum already
1643 * calculated. This happens via:
1644 * dmu_recv_stream() -> receive_read_record() -> arc_loan_raw_buf()
1645 * The rsend/send_mixed_raw test case exercises this code path.
1646 *
1647 * There were no decompressed bufs, so there should not be a
1648 * checksum on the hdr either.
1649 * EQUIV(!copied, hdr->b_l1hdr.b_freeze_cksum == NULL);
1650 */
1651
1652 return (copied);
1653 }
1654
1655 /*
1656 * Return the size of the block, b_pabd, that is stored in the arc_buf_hdr_t.
1657 */
1658 static uint64_t
1659 arc_hdr_size(arc_buf_hdr_t *hdr)
1660 {
1661 uint64_t size;
1662
1663 if (arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF &&
1664 HDR_GET_PSIZE(hdr) > 0) {
1665 size = HDR_GET_PSIZE(hdr);
1666 } else {
1667 ASSERT3U(HDR_GET_LSIZE(hdr), !=, 0);
1668 size = HDR_GET_LSIZE(hdr);
1669 }
1670 return (size);
1671 }
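/*
 * Editor's note -- worked example, not part of the original source.
 * For a 128K logical block that compressed down to a 16K physical block
 * under compressed ARC, HDR_GET_LSIZE(hdr) == 128K and
 * HDR_GET_PSIZE(hdr) == 16K, so arc_hdr_size() returns 16K -- the size
 * of the b_pabd actually held in memory. If the hdr's compression is
 * ZIO_COMPRESS_OFF (or psize is 0), the logical size is charged instead.
 */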
1672
1673 static int
1674 arc_hdr_authenticate(arc_buf_hdr_t *hdr, spa_t *spa, uint64_t dsobj)
1675 {
1676 int ret;
1677 uint64_t csize;
1678 uint64_t lsize = HDR_GET_LSIZE(hdr);
1679 uint64_t psize = HDR_GET_PSIZE(hdr);
1680 void *tmpbuf = NULL;
1681 abd_t *abd = hdr->b_l1hdr.b_pabd;
1682
1683 ASSERT(HDR_EMPTY_OR_LOCKED(hdr));
1684 ASSERT(HDR_AUTHENTICATED(hdr));
1685 ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
1686
1687 /*
1688 * The MAC is calculated on the compressed data that is stored on disk.
1689 * However, if compressed arc is disabled we will only have the
1690 * decompressed data available to us now. Compress it into a temporary
1691 * abd so we can verify the MAC. The performance overhead of this will
1692 * be relatively low, since most objects in an encrypted objset will
1693 * be encrypted (instead of authenticated) anyway.
1694 */
1695 if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF &&
1696 !HDR_COMPRESSION_ENABLED(hdr)) {
1697 tmpbuf = zio_buf_alloc(lsize);
1698 abd = abd_get_from_buf(tmpbuf, lsize);
1699 abd_take_ownership_of_buf(abd, B_TRUE);
1700
1701 csize = zio_compress_data(HDR_GET_COMPRESS(hdr),
1702 hdr->b_l1hdr.b_pabd, tmpbuf, lsize);
1703 ASSERT3U(csize, <=, psize);
1704 abd_zero_off(abd, csize, psize - csize);
1705 }
1706
1707 /*
1708 * Authentication is best effort. We authenticate whenever the key is
1709 * available. If we succeed we clear ARC_FLAG_NOAUTH.
1710 */
1711 if (hdr->b_crypt_hdr.b_ot == DMU_OT_OBJSET) {
1712 ASSERT3U(HDR_GET_COMPRESS(hdr), ==, ZIO_COMPRESS_OFF);
1713 ASSERT3U(lsize, ==, psize);
1714 ret = spa_do_crypt_objset_mac_abd(B_FALSE, spa, dsobj, abd,
1715 psize, hdr->b_l1hdr.b_byteswap != DMU_BSWAP_NUMFUNCS);
1716 } else {
1717 ret = spa_do_crypt_mac_abd(B_FALSE, spa, dsobj, abd, psize,
1718 hdr->b_crypt_hdr.b_mac);
1719 }
1720
1721 if (ret == 0)
1722 arc_hdr_clear_flags(hdr, ARC_FLAG_NOAUTH);
1723 else if (ret != ENOENT)
1724 goto error;
1725
1726 if (tmpbuf != NULL)
1727 abd_free(abd);
1728
1729 return (0);
1730
1731 error:
1732 if (tmpbuf != NULL)
1733 abd_free(abd);
1734
1735 return (ret);
1736 }
1737
1738 /*
1739 * This function will take a header that only has raw encrypted data in
1740 * b_crypt_hdr.b_rabd and decrypt it into a new buffer which is stored in
1741 * b_l1hdr.b_pabd. If designated in the header flags, this function will
1742 * also decompress the data.
1743 */
1744 static int
1745 arc_hdr_decrypt(arc_buf_hdr_t *hdr, spa_t *spa, const zbookmark_phys_t *zb)
1746 {
1747 int ret;
1748 abd_t *cabd = NULL;
1749 void *tmp = NULL;
1750 boolean_t no_crypt = B_FALSE;
1751 boolean_t bswap = (hdr->b_l1hdr.b_byteswap != DMU_BSWAP_NUMFUNCS);
1752
1753 ASSERT(HDR_EMPTY_OR_LOCKED(hdr));
1754 ASSERT(HDR_ENCRYPTED(hdr));
1755
1756 arc_hdr_alloc_pabd(hdr, ARC_HDR_DO_ADAPT);
1757
1758 ret = spa_do_crypt_abd(B_FALSE, spa, zb, hdr->b_crypt_hdr.b_ot,
1759 B_FALSE, bswap, hdr->b_crypt_hdr.b_salt, hdr->b_crypt_hdr.b_iv,
1760 hdr->b_crypt_hdr.b_mac, HDR_GET_PSIZE(hdr), hdr->b_l1hdr.b_pabd,
1761 hdr->b_crypt_hdr.b_rabd, &no_crypt);
1762 if (ret != 0)
1763 goto error;
1764
1765 if (no_crypt) {
1766 abd_copy(hdr->b_l1hdr.b_pabd, hdr->b_crypt_hdr.b_rabd,
1767 HDR_GET_PSIZE(hdr));
1768 }
1769
1770 /*
1771 * If this header has disabled arc compression but the b_pabd is
1772 * compressed after decrypting it, we need to decompress the newly
1773 * decrypted data.
1774 */
1775 if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF &&
1776 !HDR_COMPRESSION_ENABLED(hdr)) {
1777 /*
1778 * We want to make sure that we are correctly honoring the
1779 * zfs_abd_scatter_enabled setting, so we allocate an abd here
1780 * and then loan a buffer from it, rather than allocating a
1781 * linear buffer and wrapping it in an abd later.
1782 */
1783 cabd = arc_get_data_abd(hdr, arc_hdr_size(hdr), hdr, B_TRUE);
1784 tmp = abd_borrow_buf(cabd, arc_hdr_size(hdr));
1785
1786 ret = zio_decompress_data(HDR_GET_COMPRESS(hdr),
1787 hdr->b_l1hdr.b_pabd, tmp, HDR_GET_PSIZE(hdr),
1788 HDR_GET_LSIZE(hdr));
1789 if (ret != 0) {
1790 abd_return_buf(cabd, tmp, arc_hdr_size(hdr));
1791 goto error;
1792 }
1793
1794 abd_return_buf_copy(cabd, tmp, arc_hdr_size(hdr));
1795 arc_free_data_abd(hdr, hdr->b_l1hdr.b_pabd,
1796 arc_hdr_size(hdr), hdr);
1797 hdr->b_l1hdr.b_pabd = cabd;
1798 }
1799
1800 return (0);
1801
1802 error:
1803 arc_hdr_free_pabd(hdr, B_FALSE);
1804 if (cabd != NULL)
1805 arc_free_data_buf(hdr, cabd, arc_hdr_size(hdr), hdr);
1806
1807 return (ret);
1808 }
1809
1810 /*
1811 * This function is called during arc_buf_fill() to prepare the header's
1812 * abd plaintext pointer for use. This involves authenticating protected
1813 * data and decrypting encrypted data into the plaintext abd.
1814 */
1815 static int
1816 arc_fill_hdr_crypt(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, spa_t *spa,
1817 const zbookmark_phys_t *zb, boolean_t noauth)
1818 {
1819 int ret;
1820
1821 ASSERT(HDR_PROTECTED(hdr));
1822
1823 if (hash_lock != NULL)
1824 mutex_enter(hash_lock);
1825
1826 if (HDR_NOAUTH(hdr) && !noauth) {
1827 /*
1828 * The caller requested authenticated data but our data has
1829 * not been authenticated yet. Verify the MAC now if we can.
1830 */
1831 ret = arc_hdr_authenticate(hdr, spa, zb->zb_objset);
1832 if (ret != 0)
1833 goto error;
1834 } else if (HDR_HAS_RABD(hdr) && hdr->b_l1hdr.b_pabd == NULL) {
1835 /*
1836 * If we only have the encrypted version of the data, but the
1837 * unencrypted version was requested we take this opportunity
1838 * to store the decrypted version in the header for future use.
1839 */
1840 ret = arc_hdr_decrypt(hdr, spa, zb);
1841 if (ret != 0)
1842 goto error;
1843 }
1844
1845 ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
1846
1847 if (hash_lock != NULL)
1848 mutex_exit(hash_lock);
1849
1850 return (0);
1851
1852 error:
1853 if (hash_lock != NULL)
1854 mutex_exit(hash_lock);
1855
1856 return (ret);
1857 }
1858
1859 /*
1860 * This function is used by the dbuf code to decrypt bonus buffers in place.
1861 * The dbuf code itself doesn't have any locking for decrypting a shared dnode
1862 * block, so we use the hash lock here to protect against concurrent calls to
1863 * arc_buf_fill().
1864 */
1865 /* ARGSUSED */
1866 static void
1867 arc_buf_untransform_in_place(arc_buf_t *buf, kmutex_t *hash_lock)
1868 {
1869 arc_buf_hdr_t *hdr = buf->b_hdr;
1870
1871 ASSERT(HDR_ENCRYPTED(hdr));
1872 ASSERT3U(hdr->b_crypt_hdr.b_ot, ==, DMU_OT_DNODE);
1873 ASSERT(HDR_EMPTY_OR_LOCKED(hdr));
1874 ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
1875
1876 zio_crypt_copy_dnode_bonus(hdr->b_l1hdr.b_pabd, buf->b_data,
1877 arc_buf_size(buf));
1878 buf->b_flags &= ~ARC_BUF_FLAG_ENCRYPTED;
1879 buf->b_flags &= ~ARC_BUF_FLAG_COMPRESSED;
1880 hdr->b_crypt_hdr.b_ebufcnt -= 1;
1881 }
1882
1883 /*
1884 * Given a buf that has a data buffer attached to it, this function will
1885 * efficiently fill the buf with data of the specified compression setting from
1886 * the hdr and update the hdr's b_freeze_cksum if necessary. If the buf and hdr
1887 * are already sharing a data buf, no copy is performed.
1888 *
1889 * If the buf is marked as compressed but uncompressed data was requested, this
1890 * will allocate a new data buffer for the buf, remove that flag, and fill the
1891 * buf with uncompressed data. You can't request a compressed buf on a hdr with
1892 * uncompressed data, and (since we haven't added support for it yet) if you
1893 * want compressed data your buf must already be marked as compressed and have
1894 * the correct-sized data buffer.
1895 */
1896 static int
1897 arc_buf_fill(arc_buf_t *buf, spa_t *spa, const zbookmark_phys_t *zb,
1898 arc_fill_flags_t flags)
1899 {
1900 int error = 0;
1901 arc_buf_hdr_t *hdr = buf->b_hdr;
1902 boolean_t hdr_compressed =
1903 (arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF);
1904 boolean_t compressed = (flags & ARC_FILL_COMPRESSED) != 0;
1905 boolean_t encrypted = (flags & ARC_FILL_ENCRYPTED) != 0;
1906 dmu_object_byteswap_t bswap = hdr->b_l1hdr.b_byteswap;
1907 kmutex_t *hash_lock = (flags & ARC_FILL_LOCKED) ? NULL : HDR_LOCK(hdr);
1908
1909 ASSERT3P(buf->b_data, !=, NULL);
1910 IMPLY(compressed, hdr_compressed || ARC_BUF_ENCRYPTED(buf));
1911 IMPLY(compressed, ARC_BUF_COMPRESSED(buf));
1912 IMPLY(encrypted, HDR_ENCRYPTED(hdr));
1913 IMPLY(encrypted, ARC_BUF_ENCRYPTED(buf));
1914 IMPLY(encrypted, ARC_BUF_COMPRESSED(buf));
1915 IMPLY(encrypted, !ARC_BUF_SHARED(buf));
1916
1917 /*
1918 * If the caller wanted encrypted data we just need to copy it from
1919 * b_rabd and potentially byteswap it. We won't be able to do any
1920 * further transforms on it.
1921 */
1922 if (encrypted) {
1923 ASSERT(HDR_HAS_RABD(hdr));
1924 abd_copy_to_buf(buf->b_data, hdr->b_crypt_hdr.b_rabd,
1925 HDR_GET_PSIZE(hdr));
1926 goto byteswap;
1927 }
1928
1929 /*
1930 * Adjust encrypted and authenticated headers to accommodate
1931 * the request if needed. Dnode blocks (ARC_FILL_IN_PLACE) are
1932 * allowed to fail decryption due to keys not being loaded
1933 * without being marked as an IO error.
1934 */
1935 if (HDR_PROTECTED(hdr)) {
1936 error = arc_fill_hdr_crypt(hdr, hash_lock, spa,
1937 zb, !!(flags & ARC_FILL_NOAUTH));
1938 if (error == EACCES && (flags & ARC_FILL_IN_PLACE) != 0) {
1939 return (error);
1940 } else if (error != 0) {
1941 if (hash_lock != NULL)
1942 mutex_enter(hash_lock);
1943 arc_hdr_set_flags(hdr, ARC_FLAG_IO_ERROR);
1944 if (hash_lock != NULL)
1945 mutex_exit(hash_lock);
1946 return (error);
1947 }
1948 }
1949
1950 /*
1951 * There is a special case here for dnode blocks which are
1952 * decrypting their bonus buffers. These blocks may request to
1953 * be decrypted in-place. This is necessary because there may
1954 * be many dnodes pointing into this buffer and there is
1955 * currently no method to synchronize replacing the backing
1956 * b_data buffer and updating all of the pointers. Here we use
1957 * the hash lock to ensure there are no races. If the need
1958 * arises for other types to be decrypted in-place, they must
1959 * add handling here as well.
1960 */
1961 if ((flags & ARC_FILL_IN_PLACE) != 0) {
1962 ASSERT(!hdr_compressed);
1963 ASSERT(!compressed);
1964 ASSERT(!encrypted);
1965
1966 if (HDR_ENCRYPTED(hdr) && ARC_BUF_ENCRYPTED(buf)) {
1967 ASSERT3U(hdr->b_crypt_hdr.b_ot, ==, DMU_OT_DNODE);
1968
1969 if (hash_lock != NULL)
1970 mutex_enter(hash_lock);
1971 arc_buf_untransform_in_place(buf, hash_lock);
1972 if (hash_lock != NULL)
1973 mutex_exit(hash_lock);
1974
1975 /* Compute the hdr's checksum if necessary */
1976 arc_cksum_compute(buf);
1977 }
1978
1979 return (0);
1980 }
1981
1982 if (hdr_compressed == compressed) {
1983 if (!arc_buf_is_shared(buf)) {
1984 abd_copy_to_buf(buf->b_data, hdr->b_l1hdr.b_pabd,
1985 arc_buf_size(buf));
1986 }
1987 } else {
1988 ASSERT(hdr_compressed);
1989 ASSERT(!compressed);
1990 ASSERT3U(HDR_GET_LSIZE(hdr), !=, HDR_GET_PSIZE(hdr));
1991
1992 /*
1993 * If the buf is sharing its data with the hdr, unlink it and
1994 * allocate a new data buffer for the buf.
1995 */
1996 if (arc_buf_is_shared(buf)) {
1997 ASSERT(ARC_BUF_COMPRESSED(buf));
1998
1999 /* We need to give the buf its own b_data */
2000 buf->b_flags &= ~ARC_BUF_FLAG_SHARED;
2001 buf->b_data =
2002 arc_get_data_buf(hdr, HDR_GET_LSIZE(hdr), buf);
2003 arc_hdr_clear_flags(hdr, ARC_FLAG_SHARED_DATA);
2004
2005 /* Previously overhead was 0; just add new overhead */
2006 ARCSTAT_INCR(arcstat_overhead_size, HDR_GET_LSIZE(hdr));
2007 } else if (ARC_BUF_COMPRESSED(buf)) {
2008 /* We need to reallocate the buf's b_data */
2009 arc_free_data_buf(hdr, buf->b_data, HDR_GET_PSIZE(hdr),
2010 buf);
2011 buf->b_data =
2012 arc_get_data_buf(hdr, HDR_GET_LSIZE(hdr), buf);
2013
2014 /* We increased the size of b_data; update overhead */
2015 ARCSTAT_INCR(arcstat_overhead_size,
2016 HDR_GET_LSIZE(hdr) - HDR_GET_PSIZE(hdr));
2017 }
2018
2019 /*
2020 * Regardless of the buf's previous compression settings, it
2021 * should not be compressed at the end of this function.
2022 */
2023 buf->b_flags &= ~ARC_BUF_FLAG_COMPRESSED;
2024
2025 /*
2026 * Try copying the data from another buf which already has a
2027 * decompressed version. If that's not possible, it's time to
2028 * bite the bullet and decompress the data from the hdr.
2029 */
2030 if (arc_buf_try_copy_decompressed_data(buf)) {
2031 /* Skip byteswapping and checksumming (already done) */
2032 ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, !=, NULL);
2033 return (0);
2034 } else {
2035 error = zio_decompress_data(HDR_GET_COMPRESS(hdr),
2036 hdr->b_l1hdr.b_pabd, buf->b_data,
2037 HDR_GET_PSIZE(hdr), HDR_GET_LSIZE(hdr));
2038
2039 /*
2040 * Absent hardware errors or software bugs, this should
2041 * be impossible, but log it anyway so we can debug it.
2042 */
2043 if (error != 0) {
2044 zfs_dbgmsg(
2045 "hdr %p, compress %d, psize %d, lsize %d",
2046 hdr, arc_hdr_get_compress(hdr),
2047 HDR_GET_PSIZE(hdr), HDR_GET_LSIZE(hdr));
2048 if (hash_lock != NULL)
2049 mutex_enter(hash_lock);
2050 arc_hdr_set_flags(hdr, ARC_FLAG_IO_ERROR);
2051 if (hash_lock != NULL)
2052 mutex_exit(hash_lock);
2053 return (SET_ERROR(EIO));
2054 }
2055 }
2056 }
2057
2058 byteswap:
2059 /* Byteswap the buf's data if necessary */
2060 if (bswap != DMU_BSWAP_NUMFUNCS) {
2061 ASSERT(!HDR_SHARED_DATA(hdr));
2062 ASSERT3U(bswap, <, DMU_BSWAP_NUMFUNCS);
2063 dmu_ot_byteswap[bswap].ob_func(buf->b_data, HDR_GET_LSIZE(hdr));
2064 }
2065
2066 /* Compute the hdr's checksum if necessary */
2067 arc_cksum_compute(buf);
2068
2069 return (0);
2070 }
2071
2072 /*
2073 * If this function is being called to decrypt an encrypted buffer or verify an
2074 * authenticated one, the key must be loaded and a mapping must be made
2075 * available in the keystore via spa_keystore_create_mapping() or one of its
2076 * callers.
2077 */
2078 int
2079 arc_untransform(arc_buf_t *buf, spa_t *spa, const zbookmark_phys_t *zb,
2080 boolean_t in_place)
2081 {
2082 int ret;
2083 arc_fill_flags_t flags = 0;
2084
2085 if (in_place)
2086 flags |= ARC_FILL_IN_PLACE;
2087
2088 ret = arc_buf_fill(buf, spa, zb, flags);
2089 if (ret == ECKSUM) {
2090 /*
2091 * Convert authentication and decryption errors to EIO
2092 * (and generate an ereport) before leaving the ARC.
2093 */
2094 ret = SET_ERROR(EIO);
2095 spa_log_error(spa, zb);
2096 (void) zfs_ereport_post(FM_EREPORT_ZFS_AUTHENTICATION,
2097 spa, NULL, zb, NULL, 0, 0);
2098 }
2099
2100 return (ret);
2101 }
2102
2103 /*
2104 * Increment the amount of evictable space in the arc_state_t's refcount.
2105 * We account for the space used by the hdr and the arc buf individually
2106 * so that we can add and remove them from the refcount individually.
2107 */
2108 static void
2109 arc_evictable_space_increment(arc_buf_hdr_t *hdr, arc_state_t *state)
2110 {
2111 arc_buf_contents_t type = arc_buf_type(hdr);
2112
2113 ASSERT(HDR_HAS_L1HDR(hdr));
2114
2115 if (GHOST_STATE(state)) {
2116 ASSERT0(hdr->b_l1hdr.b_bufcnt);
2117 ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
2118 ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
2119 ASSERT(!HDR_HAS_RABD(hdr));
2120 (void) zfs_refcount_add_many(&state->arcs_esize[type],
2121 HDR_GET_LSIZE(hdr), hdr);
2122 return;
2123 }
2124
2125 ASSERT(!GHOST_STATE(state));
2126 if (hdr->b_l1hdr.b_pabd != NULL) {
2127 (void) zfs_refcount_add_many(&state->arcs_esize[type],
2128 arc_hdr_size(hdr), hdr);
2129 }
2130 if (HDR_HAS_RABD(hdr)) {
2131 (void) zfs_refcount_add_many(&state->arcs_esize[type],
2132 HDR_GET_PSIZE(hdr), hdr);
2133 }
2134 for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL;
2135 buf = buf->b_next) {
2136 if (arc_buf_is_shared(buf))
2137 continue;
2138 (void) zfs_refcount_add_many(&state->arcs_esize[type],
2139 arc_buf_size(buf), buf);
2140 }
2141 }
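/*
 * Editor's note -- worked example, not part of the original source.
 * For a non-ghost state, a hdr holding a 16K compressed b_pabd plus one
 * non-shared uncompressed 128K buf contributes 16K + 128K to
 * arcs_esize[type]; a shared buf contributes nothing extra because its
 * b_data aliases the hdr's b_pabd. In a ghost state only
 * HDR_GET_LSIZE(hdr) is counted, since no data is actually cached.
 */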
2142
2143 /*
2144 * Decrement the amount of evictable space in the arc_state_t's refcount.
2145 * We account for the space used by the hdr and the arc buf individually
2146 * so that we can add and remove them from the refcount individually.
2147 */
2148 static void
2149 arc_evictable_space_decrement(arc_buf_hdr_t *hdr, arc_state_t *state)
2150 {
2151 arc_buf_contents_t type = arc_buf_type(hdr);
2152
2153 ASSERT(HDR_HAS_L1HDR(hdr));
2154
2155 if (GHOST_STATE(state)) {
2156 ASSERT0(hdr->b_l1hdr.b_bufcnt);
2157 ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
2158 ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
2159 ASSERT(!HDR_HAS_RABD(hdr));
2160 (void) zfs_refcount_remove_many(&state->arcs_esize[type],
2161 HDR_GET_LSIZE(hdr), hdr);
2162 return;
2163 }
2164
2165 ASSERT(!GHOST_STATE(state));
2166 if (hdr->b_l1hdr.b_pabd != NULL) {
2167 (void) zfs_refcount_remove_many(&state->arcs_esize[type],
2168 arc_hdr_size(hdr), hdr);
2169 }
2170 if (HDR_HAS_RABD(hdr)) {
2171 (void) zfs_refcount_remove_many(&state->arcs_esize[type],
2172 HDR_GET_PSIZE(hdr), hdr);
2173 }
2174 for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL;
2175 buf = buf->b_next) {
2176 if (arc_buf_is_shared(buf))
2177 continue;
2178 (void) zfs_refcount_remove_many(&state->arcs_esize[type],
2179 arc_buf_size(buf), buf);
2180 }
2181 }
2182
2183 /*
2184 * Add a reference to this hdr indicating that someone is actively
2185 * referencing that memory. When the refcount transitions from 0 to 1,
2186 * we remove it from the respective arc_state_t list to indicate that
2187 * it is not evictable.
2188 */
2189 static void
2190 add_reference(arc_buf_hdr_t *hdr, void *tag)
2191 {
2192 ASSERT(HDR_HAS_L1HDR(hdr));
2193 if (!HDR_EMPTY(hdr) && !MUTEX_HELD(HDR_LOCK(hdr))) {
2194 ASSERT(hdr->b_l1hdr.b_state == arc_anon);
2195 ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
2196 ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
2197 }
2198
2199 arc_state_t *state = hdr->b_l1hdr.b_state;
2200
2201 if ((zfs_refcount_add(&hdr->b_l1hdr.b_refcnt, tag) == 1) &&
2202 (state != arc_anon)) {
2203 /* We don't use the L2-only state list. */
2204 if (state != arc_l2c_only) {
2205 multilist_remove(state->arcs_list[arc_buf_type(hdr)],
2206 hdr);
2207 arc_evictable_space_decrement(hdr, state);
2208 }
2209 /* remove the prefetch flag if we get a reference */
2210 if (HDR_HAS_L2HDR(hdr))
2211 l2arc_hdr_arcstats_decrement_state(hdr);
2212 arc_hdr_clear_flags(hdr, ARC_FLAG_PREFETCH);
2213 if (HDR_HAS_L2HDR(hdr))
2214 l2arc_hdr_arcstats_increment_state(hdr);
2215 }
2216 }
2217
2218 /*
2219 * Remove a reference from this hdr. When the reference transitions from
2220 * 1 to 0 and we're not anonymous, then we add this hdr to the arc_state_t's
2221 * list making it eligible for eviction.
2222 */
2223 static int
2224 remove_reference(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, void *tag)
2225 {
2226 int cnt;
2227 arc_state_t *state = hdr->b_l1hdr.b_state;
2228
2229 ASSERT(HDR_HAS_L1HDR(hdr));
2230 ASSERT(state == arc_anon || MUTEX_HELD(hash_lock));
2231 ASSERT(!GHOST_STATE(state));
2232
2233 /*
2234 * arc_l2c_only counts as a ghost state so we don't need to explicitly
2235 * check to prevent usage of the arc_l2c_only list.
2236 */
2237 if (((cnt = zfs_refcount_remove(&hdr->b_l1hdr.b_refcnt, tag)) == 0) &&
2238 (state != arc_anon)) {
2239 multilist_insert(state->arcs_list[arc_buf_type(hdr)], hdr);
2240 ASSERT3U(hdr->b_l1hdr.b_bufcnt, >, 0);
2241 arc_evictable_space_increment(hdr, state);
2242 }
2243 return (cnt);
2244 }
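/*
 * Editor's note -- illustrative sketch, not part of the original source.
 * How the two helpers above pair up for a caller that pins a hashed hdr
 * while inspecting it and then releases it. The tag is any stable
 * pointer identifying the holder; the hash lock is held because the hdr
 * is discoverable.
 *
 *	mutex_enter(hash_lock);
 *	add_reference(hdr, tag);
 *	mutex_exit(hash_lock);
 *	... the hdr is now un-evictable while the reference is held ...
 *	mutex_enter(hash_lock);
 *	(void) remove_reference(hdr, hash_lock, tag);
 *	mutex_exit(hash_lock);
 */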
2245
2246 /*
2247 * Move the supplied buffer to the indicated state. The hash lock
2248 * for the buffer must be held by the caller.
2249 */
2250 static void
2251 arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr,
2252 kmutex_t *hash_lock)
2253 {
2254 arc_state_t *old_state;
2255 int64_t refcnt;
2256 uint32_t bufcnt;
2257 boolean_t update_old, update_new;
2258 arc_buf_contents_t buftype = arc_buf_type(hdr);
2259
2260 /*
2261 * We almost always have an L1 hdr here, since we call arc_hdr_realloc()
2262 * in arc_read() when bringing a buffer out of the L2ARC. However, the
2263 * L1 hdr doesn't always exist when we change state to arc_anon before
2264 * destroying a header, in which case reallocating to add the L1 hdr is
2265 * pointless.
2266 */
2267 if (HDR_HAS_L1HDR(hdr)) {
2268 old_state = hdr->b_l1hdr.b_state;
2269 refcnt = zfs_refcount_count(&hdr->b_l1hdr.b_refcnt);
2270 bufcnt = hdr->b_l1hdr.b_bufcnt;
2271
2272 update_old = (bufcnt > 0 || hdr->b_l1hdr.b_pabd != NULL ||
2273 HDR_HAS_RABD(hdr));
2274 } else {
2275 old_state = arc_l2c_only;
2276 refcnt = 0;
2277 bufcnt = 0;
2278 update_old = B_FALSE;
2279 }
2280 update_new = update_old;
2281
2282 ASSERT(MUTEX_HELD(hash_lock));
2283 ASSERT3P(new_state, !=, old_state);
2284 ASSERT(!GHOST_STATE(new_state) || bufcnt == 0);
2285 ASSERT(old_state != arc_anon || bufcnt <= 1);
2286
2287 /*
2288 * If this buffer is evictable, transfer it from the
2289 * old state list to the new state list.
2290 */
2291 if (refcnt == 0) {
2292 if (old_state != arc_anon && old_state != arc_l2c_only) {
2293 ASSERT(HDR_HAS_L1HDR(hdr));
2294 multilist_remove(old_state->arcs_list[buftype], hdr);
2295
2296 if (GHOST_STATE(old_state)) {
2297 ASSERT0(bufcnt);
2298 ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
2299 update_old = B_TRUE;
2300 }
2301 arc_evictable_space_decrement(hdr, old_state);
2302 }
2303 if (new_state != arc_anon && new_state != arc_l2c_only) {
2304
2305 /*
2306 * An L1 header always exists here, since if we're
2307 * moving to some L1-cached state (i.e. not l2c_only or
2308 * anonymous), we realloc the header to add an L1hdr
2309 * beforehand.
2310 */
2311 ASSERT(HDR_HAS_L1HDR(hdr));
2312 multilist_insert(new_state->arcs_list[buftype], hdr);
2313
2314 if (GHOST_STATE(new_state)) {
2315 ASSERT0(bufcnt);
2316 ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
2317 update_new = B_TRUE;
2318 }
2319 arc_evictable_space_increment(hdr, new_state);
2320 }
2321 }
2322
2323 ASSERT(!HDR_EMPTY(hdr));
2324 if (new_state == arc_anon && HDR_IN_HASH_TABLE(hdr))
2325 buf_hash_remove(hdr);
2326
2327 /* adjust state sizes (ignore arc_l2c_only) */
2328
2329 if (update_new && new_state != arc_l2c_only) {
2330 ASSERT(HDR_HAS_L1HDR(hdr));
2331 if (GHOST_STATE(new_state)) {
2332 ASSERT0(bufcnt);
2333
2334 /*
2335 * When moving a header to a ghost state, we first
2336 * remove all arc buffers. Thus, we'll have a
2337 * bufcnt of zero, and no arc buffer to use for
2338 * the reference. As a result, we use the arc
2339 * header pointer for the reference.
2340 */
2341 (void) zfs_refcount_add_many(&new_state->arcs_size,
2342 HDR_GET_LSIZE(hdr), hdr);
2343 ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
2344 ASSERT(!HDR_HAS_RABD(hdr));
2345 } else {
2346 uint32_t buffers = 0;
2347
2348 /*
2349 * Each individual buffer holds a unique reference,
2350 * thus we must add each of these references one
2351 * at a time.
2352 */
2353 for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL;
2354 buf = buf->b_next) {
2355 ASSERT3U(bufcnt, !=, 0);
2356 buffers++;
2357
2358 /*
2359 * When the arc_buf_t is sharing the data
2360 * block with the hdr, the owner of the
2361 * reference belongs to the hdr. Only
2362 * add to the refcount if the arc_buf_t is
2363 * not shared.
2364 */
2365 if (arc_buf_is_shared(buf))
2366 continue;
2367
2368 (void) zfs_refcount_add_many(
2369 &new_state->arcs_size,
2370 arc_buf_size(buf), buf);
2371 }
2372 ASSERT3U(bufcnt, ==, buffers);
2373
2374 if (hdr->b_l1hdr.b_pabd != NULL) {
2375 (void) zfs_refcount_add_many(
2376 &new_state->arcs_size,
2377 arc_hdr_size(hdr), hdr);
2378 }
2379
2380 if (HDR_HAS_RABD(hdr)) {
2381 (void) zfs_refcount_add_many(
2382 &new_state->arcs_size,
2383 HDR_GET_PSIZE(hdr), hdr);
2384 }
2385 }
2386 }
2387
2388 if (update_old && old_state != arc_l2c_only) {
2389 ASSERT(HDR_HAS_L1HDR(hdr));
2390 if (GHOST_STATE(old_state)) {
2391 ASSERT0(bufcnt);
2392 ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
2393 ASSERT(!HDR_HAS_RABD(hdr));
2394
2395 /*
2396 * When moving a header off of a ghost state,
2397 * the header will not contain any arc buffers.
2398 * We use the arc header pointer for the reference
2399 * which is exactly what we did when we put the
2400 * header on the ghost state.
2401 */
2402
2403 (void) zfs_refcount_remove_many(&old_state->arcs_size,
2404 HDR_GET_LSIZE(hdr), hdr);
2405 } else {
2406 uint32_t buffers = 0;
2407
2408 /*
2409 * Each individual buffer holds a unique reference,
2410 * thus we must remove each of these references one
2411 * at a time.
2412 */
2413 for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL;
2414 buf = buf->b_next) {
2415 ASSERT3U(bufcnt, !=, 0);
2416 buffers++;
2417
2418 /*
2419 * When the arc_buf_t is sharing the data
2420 * block with the hdr, the owner of the
2421 * reference belongs to the hdr. Only
2422 * remove from the refcount if the arc_buf_t is
2423 * not shared.
2424 */
2425 if (arc_buf_is_shared(buf))
2426 continue;
2427
2428 (void) zfs_refcount_remove_many(
2429 &old_state->arcs_size, arc_buf_size(buf),
2430 buf);
2431 }
2432 ASSERT3U(bufcnt, ==, buffers);
2433 ASSERT(hdr->b_l1hdr.b_pabd != NULL ||
2434 HDR_HAS_RABD(hdr));
2435
2436 if (hdr->b_l1hdr.b_pabd != NULL) {
2437 (void) zfs_refcount_remove_many(
2438 &old_state->arcs_size, arc_hdr_size(hdr),
2439 hdr);
2440 }
2441
2442 if (HDR_HAS_RABD(hdr)) {
2443 (void) zfs_refcount_remove_many(
2444 &old_state->arcs_size, HDR_GET_PSIZE(hdr),
2445 hdr);
2446 }
2447 }
2448 }
2449
2450 if (HDR_HAS_L1HDR(hdr)) {
2451 hdr->b_l1hdr.b_state = new_state;
2452
2453 if (HDR_HAS_L2HDR(hdr) && new_state != arc_l2c_only) {
2454 l2arc_hdr_arcstats_decrement_state(hdr);
2455 hdr->b_l2hdr.b_arcs_state = new_state->arcs_state;
2456 l2arc_hdr_arcstats_increment_state(hdr);
2457 }
2458 }
2459
2460 /*
2461 * L2 headers should never be on the L2 state list since they don't
2462 * have L1 headers allocated.
2463 */
2464 ASSERT(multilist_is_empty(arc_l2c_only->arcs_list[ARC_BUFC_DATA]) &&
2465 multilist_is_empty(arc_l2c_only->arcs_list[ARC_BUFC_METADATA]));
2466 }
2467
2468 void
2469 arc_space_consume(uint64_t space, arc_space_type_t type)
2470 {
2471 ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES);
2472
2473 switch (type) {
2474 case ARC_SPACE_DATA:
2475 aggsum_add(&astat_data_size, space);
2476 break;
2477 case ARC_SPACE_META:
2478 aggsum_add(&astat_metadata_size, space);
2479 break;
2480 case ARC_SPACE_OTHER:
2481 aggsum_add(&astat_other_size, space);
2482 break;
2483 case ARC_SPACE_HDRS:
2484 aggsum_add(&astat_hdr_size, space);
2485 break;
2486 case ARC_SPACE_L2HDRS:
2487 aggsum_add(&astat_l2_hdr_size, space);
2488 break;
2489 }
2490
2491 if (type != ARC_SPACE_DATA)
2492 aggsum_add(&arc_meta_used, space);
2493
2494 aggsum_add(&arc_size, space);
2495 }
2496
2497 void
2498 arc_space_return(uint64_t space, arc_space_type_t type)
2499 {
2500 ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES);
2501
2502 switch (type) {
2503 case ARC_SPACE_DATA:
2504 aggsum_add(&astat_data_size, -space);
2505 break;
2506 case ARC_SPACE_META:
2507 aggsum_add(&astat_metadata_size, -space);
2508 break;
2509 case ARC_SPACE_OTHER:
2510 aggsum_add(&astat_other_size, -space);
2511 break;
2512 case ARC_SPACE_HDRS:
2513 aggsum_add(&astat_hdr_size, -space);
2514 break;
2515 case ARC_SPACE_L2HDRS:
2516 aggsum_add(&astat_l2_hdr_size, -space);
2517 break;
2518 }
2519
2520 if (type != ARC_SPACE_DATA) {
2521 ASSERT(aggsum_compare(&arc_meta_used, space) >= 0);
2522 /*
2523 * We use the upper bound here rather than the precise value
2524 * because the arc_meta_max value doesn't need to be
2525 * precise. It's only consumed by humans via arcstats.
2526 */
2527 if (arc_meta_max < aggsum_upper_bound(&arc_meta_used))
2528 arc_meta_max = aggsum_upper_bound(&arc_meta_used);
2529 aggsum_add(&arc_meta_used, -space);
2530 }
2531
2532 ASSERT(aggsum_compare(&arc_size, space) >= 0);
2533 aggsum_add(&arc_size, -space);
2534 }
2535
2536 /*
2537 * Given a hdr and a buf, returns whether that buf can share its b_data buffer
2538 * with the hdr's b_pabd.
2539 */
2540 static boolean_t
2541 arc_can_share(arc_buf_hdr_t *hdr, arc_buf_t *buf)
2542 {
2543 /*
2544 * The criteria for sharing a hdr's data are:
2545 * 1. the buffer is not encrypted
2546 * 2. the hdr's compression matches the buf's compression
2547 * 3. the hdr doesn't need to be byteswapped
2548 * 4. the hdr isn't already being shared
2549 * 5. the buf is either compressed or it is the last buf in the hdr list
2550 *
2551 * Criterion #5 maintains the invariant that shared uncompressed
2552 * bufs must be the final buf in the hdr's b_buf list. Reading this, you
2553 * might ask, "if a compressed buf is allocated first, won't that be the
2554 * last thing in the list?", but in that case it's impossible to create
2555 * a shared uncompressed buf anyway (because the hdr must be compressed
2556 * to have the compressed buf). You might also think that #3 is
2557 * sufficient to make this guarantee, however it's possible
2558 * (specifically in the rare L2ARC write race mentioned in
2559 * arc_buf_alloc_impl()) there will be an existing uncompressed buf that
2560 * is sharable, but wasn't at the time of its allocation. Rather than
2561 * allow a new shared uncompressed buf to be created and then shuffle
2562 * the list around to make it the last element, this simply disallows
2563 * sharing if the new buf isn't the first to be added.
2564 */
2565 ASSERT3P(buf->b_hdr, ==, hdr);
2566 boolean_t hdr_compressed = arc_hdr_get_compress(hdr) !=
2567 ZIO_COMPRESS_OFF;
2568 boolean_t buf_compressed = ARC_BUF_COMPRESSED(buf) != 0;
2569 return (!ARC_BUF_ENCRYPTED(buf) &&
2570 buf_compressed == hdr_compressed &&
2571 hdr->b_l1hdr.b_byteswap == DMU_BSWAP_NUMFUNCS &&
2572 !HDR_SHARED_DATA(hdr) &&
2573 (ARC_BUF_LAST(buf) || ARC_BUF_COMPRESSED(buf)));
2574 }
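/*
 * Editor's note -- worked example, not part of the original source.
 * The common shareable case is the first buf allocated against an
 * uncompressed, native-endian hdr that is not sharing anything yet:
 *
 *	!ARC_BUF_ENCRYPTED(buf)			-> true
 *	buf_compressed == hdr_compressed	-> true (both uncompressed)
 *	b_byteswap == DMU_BSWAP_NUMFUNCS	-> true (no byteswap needed)
 *	!HDR_SHARED_DATA(hdr)			-> true (nothing shared yet)
 *	ARC_BUF_LAST(buf)			-> true (only buf on the list)
 *
 * so arc_can_share() returns B_TRUE and the buf's b_data can alias the
 * hdr's b_pabd. A second uncompressed buf allocated against the same
 * hdr fails the last criterion and gets its own copy.
 */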
2575
2576 /*
2577 * Allocate a buf for this hdr. If you care about the data that's in the hdr,
2578 * or if you want a compressed buffer, pass those flags in. Returns 0 if the
2579 * copy was made successfully, or an error code otherwise.
2580 */
2581 static int
2582 arc_buf_alloc_impl(arc_buf_hdr_t *hdr, spa_t *spa, const zbookmark_phys_t *zb,
2583 void *tag, boolean_t encrypted, boolean_t compressed, boolean_t noauth,
2584 boolean_t fill, arc_buf_t **ret)
2585 {
2586 arc_buf_t *buf;
2587 arc_fill_flags_t flags = ARC_FILL_LOCKED;
2588
2589 ASSERT(HDR_HAS_L1HDR(hdr));
2590 ASSERT3U(HDR_GET_LSIZE(hdr), >, 0);
2591 VERIFY(hdr->b_type == ARC_BUFC_DATA ||
2592 hdr->b_type == ARC_BUFC_METADATA);
2593 ASSERT3P(ret, !=, NULL);
2594 ASSERT3P(*ret, ==, NULL);
2595 IMPLY(encrypted, compressed);
2596
2597 buf = *ret = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
2598 buf->b_hdr = hdr;
2599 buf->b_data = NULL;
2600 buf->b_next = hdr->b_l1hdr.b_buf;
2601 buf->b_flags = 0;
2602
2603 add_reference(hdr, tag);
2604
2605 /*
2606 * We're about to change the hdr's b_flags. We must either
2607 * hold the hash_lock or be undiscoverable.
2608 */
2609 ASSERT(HDR_EMPTY_OR_LOCKED(hdr));
2610
2611 /*
2612 * Only honor requests for compressed bufs if the hdr is actually
2613 * compressed. This must be overridden if the buffer is encrypted since
2614 * encrypted buffers cannot be decompressed.
2615 */
2616 if (encrypted) {
2617 buf->b_flags |= ARC_BUF_FLAG_COMPRESSED;
2618 buf->b_flags |= ARC_BUF_FLAG_ENCRYPTED;
2619 flags |= ARC_FILL_COMPRESSED | ARC_FILL_ENCRYPTED;
2620 } else if (compressed &&
2621 arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF) {
2622 buf->b_flags |= ARC_BUF_FLAG_COMPRESSED;
2623 flags |= ARC_FILL_COMPRESSED;
2624 }
2625
2626 if (noauth) {
2627 ASSERT0(encrypted);
2628 flags |= ARC_FILL_NOAUTH;
2629 }
2630
2631 /*
2632 * If the hdr's data can be shared then we share the data buffer and set
2633 * the appropriate bit in the hdr's b_flags to indicate the hdr is sharing
2634 * its b_pabd with the arc_buf_t. Otherwise, we allocate a new buffer to store the buf's data.
2635 *
2636 * There are two additional restrictions here because we're sharing
2637 * hdr -> buf instead of the usual buf -> hdr. First, the hdr can't be
2638 * actively involved in an L2ARC write, because if this buf is used by
2639 * an arc_write() then the hdr's data buffer will be released when the
2640 * write completes, even though the L2ARC write might still be using it.
2641 * Second, the hdr's ABD must be linear so that the buf's user doesn't
2642 * need to be ABD-aware.
2643 */
2644 boolean_t can_share = arc_can_share(hdr, buf) && !HDR_L2_WRITING(hdr) &&
2645 hdr->b_l1hdr.b_pabd != NULL && abd_is_linear(hdr->b_l1hdr.b_pabd);
2646
2647 /* Set up b_data and sharing */
2648 if (can_share) {
2649 buf->b_data = abd_to_buf(hdr->b_l1hdr.b_pabd);
2650 buf->b_flags |= ARC_BUF_FLAG_SHARED;
2651 arc_hdr_set_flags(hdr, ARC_FLAG_SHARED_DATA);
2652 } else {
2653 buf->b_data =
2654 arc_get_data_buf(hdr, arc_buf_size(buf), buf);
2655 ARCSTAT_INCR(arcstat_overhead_size, arc_buf_size(buf));
2656 }
2657 VERIFY3P(buf->b_data, !=, NULL);
2658
2659 hdr->b_l1hdr.b_buf = buf;
2660 hdr->b_l1hdr.b_bufcnt += 1;
2661 if (encrypted)
2662 hdr->b_crypt_hdr.b_ebufcnt += 1;
2663
2664 /*
2665 * If the user wants the data from the hdr, we need to either copy or
2666 * decompress the data.
2667 */
2668 if (fill) {
2669 ASSERT3P(zb, !=, NULL);
2670 return (arc_buf_fill(buf, spa, zb, flags));
2671 }
2672
2673 return (0);
2674 }
2675
2676 static char *arc_onloan_tag = "onloan";
2677
2678 static inline void
2679 arc_loaned_bytes_update(int64_t delta)
2680 {
2681 atomic_add_64(&arc_loaned_bytes, delta);
2682
2683 /* assert that it did not wrap around */
2684 ASSERT3S(atomic_add_64_nv(&arc_loaned_bytes, 0), >=, 0);
2685 }
2686
2687 /*
2688 * Loan out an anonymous arc buffer. Loaned buffers are not counted as in
2689 * flight data by arc_tempreserve_space() until they are "returned". Loaned
2690 * buffers must be returned to the arc before they can be used by the DMU or
2691 * freed.
2692 */
2693 arc_buf_t *
2694 arc_loan_buf(spa_t *spa, boolean_t is_metadata, int size)
2695 {
2696 arc_buf_t *buf = arc_alloc_buf(spa, arc_onloan_tag,
2697 is_metadata ? ARC_BUFC_METADATA : ARC_BUFC_DATA, size);
2698
2699 arc_loaned_bytes_update(arc_buf_size(buf));
2700
2701 return (buf);
2702 }
2703
2704 arc_buf_t *
2705 arc_loan_compressed_buf(spa_t *spa, uint64_t psize, uint64_t lsize,
2706 enum zio_compress compression_type)
2707 {
2708 arc_buf_t *buf = arc_alloc_compressed_buf(spa, arc_onloan_tag,
2709 psize, lsize, compression_type);
2710
2711 arc_loaned_bytes_update(arc_buf_size(buf));
2712
2713 return (buf);
2714 }
2715
2716 arc_buf_t *
2717 arc_loan_raw_buf(spa_t *spa, uint64_t dsobj, boolean_t byteorder,
2718 const uint8_t *salt, const uint8_t *iv, const uint8_t *mac,
2719 dmu_object_type_t ot, uint64_t psize, uint64_t lsize,
2720 enum zio_compress compression_type)
2721 {
2722 arc_buf_t *buf = arc_alloc_raw_buf(spa, arc_onloan_tag, dsobj,
2723 byteorder, salt, iv, mac, ot, psize, lsize, compression_type);
2724
2725 atomic_add_64(&arc_loaned_bytes, psize);
2726 return (buf);
2727 }
2728
2729 /*
2730 * Performance tuning of L2ARC persistence:
2731 *
2732 * l2arc_rebuild_enabled : A ZFS module parameter that controls whether adding
2733 * an L2ARC device (either at pool import or later) will attempt
2734 * to rebuild L2ARC buffer contents.
2735 * l2arc_rebuild_blocks_min_l2size : A ZFS module parameter that controls
2736 * whether log blocks are written to the L2ARC device. If the L2ARC
2737 * device is less than 1GB, the amount of data l2arc_evict()
2738 * evicts is significant compared to the amount of restored L2ARC
2739 * data. In this case do not write log blocks in L2ARC in order
2740 * not to waste space.
2741 */
2742 int l2arc_rebuild_enabled = B_TRUE;
2743 unsigned long l2arc_rebuild_blocks_min_l2size = 1024 * 1024 * 1024;
2744
2745 /* L2ARC persistence rebuild control routines. */
2746 void l2arc_rebuild_vdev(vdev_t *vd, boolean_t reopen);
2747 static void l2arc_dev_rebuild_start(l2arc_dev_t *dev);
2748 static int l2arc_rebuild(l2arc_dev_t *dev);
2749
2750 /* L2ARC persistence read I/O routines. */
2751 static int l2arc_dev_hdr_read(l2arc_dev_t *dev);
2752 static int l2arc_log_blk_read(l2arc_dev_t *dev,
2753 const l2arc_log_blkptr_t *this_lp, const l2arc_log_blkptr_t *next_lp,
2754 l2arc_log_blk_phys_t *this_lb, l2arc_log_blk_phys_t *next_lb,
2755 zio_t *this_io, zio_t **next_io);
2756 static zio_t *l2arc_log_blk_fetch(vdev_t *vd,
2757 const l2arc_log_blkptr_t *lp, l2arc_log_blk_phys_t *lb);
2758 static void l2arc_log_blk_fetch_abort(zio_t *zio);
2759
2760 /* L2ARC persistence block restoration routines. */
2761 static void l2arc_log_blk_restore(l2arc_dev_t *dev,
2762 const l2arc_log_blk_phys_t *lb, uint64_t lb_asize);
2763 static void l2arc_hdr_restore(const l2arc_log_ent_phys_t *le,
2764 l2arc_dev_t *dev);
2765
2766 /* L2ARC persistence write I/O routines. */
2767 static void l2arc_dev_hdr_update(l2arc_dev_t *dev);
2768 static void l2arc_log_blk_commit(l2arc_dev_t *dev, zio_t *pio,
2769 l2arc_write_callback_t *cb);
2770
2771 /* L2ARC persistence auxiliary routines. */
2772 boolean_t l2arc_log_blkptr_valid(l2arc_dev_t *dev,
2773 const l2arc_log_blkptr_t *lbp);
2774 static boolean_t l2arc_log_blk_insert(l2arc_dev_t *dev,
2775 const arc_buf_hdr_t *ab);
2776 boolean_t l2arc_range_check_overlap(uint64_t bottom,
2777 uint64_t top, uint64_t check);
2778 static void l2arc_blk_fetch_done(zio_t *zio);
2779 static inline uint64_t
2780 l2arc_log_blk_overhead(uint64_t write_sz, l2arc_dev_t *dev);
2781
2782 /*
2783 * Return a loaned arc buffer to the arc.
2784 */
2785 void
2786 arc_return_buf(arc_buf_t *buf, void *tag)
2787 {
2788 arc_buf_hdr_t *hdr = buf->b_hdr;
2789
2790 ASSERT3P(buf->b_data, !=, NULL);
2791 ASSERT(HDR_HAS_L1HDR(hdr));
2792 (void) zfs_refcount_add(&hdr->b_l1hdr.b_refcnt, tag);
2793 (void) zfs_refcount_remove(&hdr->b_l1hdr.b_refcnt, arc_onloan_tag);
2794
2795 arc_loaned_bytes_update(-arc_buf_size(buf));
2796 }
2797
2798 /* Detach an arc_buf from a dbuf (tag) */
2799 void
2800 arc_loan_inuse_buf(arc_buf_t *buf, void *tag)
2801 {
2802 arc_buf_hdr_t *hdr = buf->b_hdr;
2803
2804 ASSERT3P(buf->b_data, !=, NULL);
2805 ASSERT(HDR_HAS_L1HDR(hdr));
2806 (void) zfs_refcount_add(&hdr->b_l1hdr.b_refcnt, arc_onloan_tag);
2807 (void) zfs_refcount_remove(&hdr->b_l1hdr.b_refcnt, tag);
2808
2809 arc_loaned_bytes_update(arc_buf_size(buf));
2810 }
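/*
 * Editor's note -- illustrative sketch, not part of the original source.
 * The loaned-buffer lifecycle implemented by arc_loan_buf() and
 * arc_return_buf() above. The receive-stream caller and the "db" tag are
 * hypothetical; only the ARC calls are functions from this file.
 *
 *	// Borrow an anonymous 128K data buf; it is charged to
 *	// arc_loaned_bytes rather than to in-flight dirty data.
 *	arc_buf_t *buf = arc_loan_buf(spa, B_FALSE, SPA_OLD_MAXBLOCKSIZE);
 *
 *	// ... fill buf->b_data, e.g. from a receive stream ...
 *
 *	// Hand ownership back to the ARC under the new holder's tag;
 *	// the loaned byte count drops by arc_buf_size(buf).
 *	arc_return_buf(buf, db);
 */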
2811
2812 static void
2813 l2arc_free_abd_on_write(abd_t *abd, size_t size, arc_buf_contents_t type)
2814 {
2815 l2arc_data_free_t *df = kmem_alloc(sizeof (*df), KM_SLEEP);
2816
2817 df->l2df_abd = abd;
2818 df->l2df_size = size;
2819 df->l2df_type = type;
2820 mutex_enter(&l2arc_free_on_write_mtx);
2821 list_insert_head(l2arc_free_on_write, df);
2822 mutex_exit(&l2arc_free_on_write_mtx);
2823 }
2824
2825 static void
2826 arc_hdr_free_on_write(arc_buf_hdr_t *hdr, boolean_t free_rdata)
2827 {
2828 arc_state_t *state = hdr->b_l1hdr.b_state;
2829 arc_buf_contents_t type = arc_buf_type(hdr);
2830 uint64_t size = (free_rdata) ? HDR_GET_PSIZE(hdr) : arc_hdr_size(hdr);
2831
2832 /* protected by hash lock, if in the hash table */
2833 if (multilist_link_active(&hdr->b_l1hdr.b_arc_node)) {
2834 ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
2835 ASSERT(state != arc_anon && state != arc_l2c_only);
2836
2837 (void) zfs_refcount_remove_many(&state->arcs_esize[type],
2838 size, hdr);
2839 }
2840 (void) zfs_refcount_remove_many(&state->arcs_size, size, hdr);
2841 if (type == ARC_BUFC_METADATA) {
2842 arc_space_return(size, ARC_SPACE_META);
2843 } else {
2844 ASSERT(type == ARC_BUFC_DATA);
2845 arc_space_return(size, ARC_SPACE_DATA);
2846 }
2847
2848 if (free_rdata) {
2849 l2arc_free_abd_on_write(hdr->b_crypt_hdr.b_rabd, size, type);
2850 } else {
2851 l2arc_free_abd_on_write(hdr->b_l1hdr.b_pabd, size, type);
2852 }
2853 }
2854
2855 /*
2856 * Share the arc_buf_t's data with the hdr. Whenever we are sharing the
2857 * data buffer, we transfer the refcount ownership to the hdr and update
2858 * the appropriate kstats.
2859 */
2860 static void
2861 arc_share_buf(arc_buf_hdr_t *hdr, arc_buf_t *buf)
2862 {
2863 /* LINTED */
2864 arc_state_t *state = hdr->b_l1hdr.b_state;
2865
2866 ASSERT(arc_can_share(hdr, buf));
2867 ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
2868 ASSERT(!ARC_BUF_ENCRYPTED(buf));
2869 ASSERT(HDR_EMPTY_OR_LOCKED(hdr));
2870
2871 /*
2872 * Start sharing the data buffer. We transfer the
2873 * refcount ownership to the hdr since it always owns
2874 * the refcount whenever an arc_buf_t is shared.
2875 */
2876 zfs_refcount_transfer_ownership_many(&hdr->b_l1hdr.b_state->arcs_size,
2877 arc_hdr_size(hdr), buf, hdr);
2878 hdr->b_l1hdr.b_pabd = abd_get_from_buf(buf->b_data, arc_buf_size(buf));
2879 abd_take_ownership_of_buf(hdr->b_l1hdr.b_pabd,
2880 HDR_ISTYPE_METADATA(hdr));
2881 arc_hdr_set_flags(hdr, ARC_FLAG_SHARED_DATA);
2882 buf->b_flags |= ARC_BUF_FLAG_SHARED;
2883
2884 /*
2885 * Since we've transferred ownership to the hdr we need
2886 * to increment its compressed and uncompressed kstats and
2887 * decrement the overhead size.
2888 */
2889 ARCSTAT_INCR(arcstat_compressed_size, arc_hdr_size(hdr));
2890 ARCSTAT_INCR(arcstat_uncompressed_size, HDR_GET_LSIZE(hdr));
2891 ARCSTAT_INCR(arcstat_overhead_size, -arc_buf_size(buf));
2892 }
2893
2894 static void
2895 arc_unshare_buf(arc_buf_hdr_t *hdr, arc_buf_t *buf)
2896 {
2897 /* LINTED */
2898 arc_state_t *state = hdr->b_l1hdr.b_state;
2899
2900 ASSERT(arc_buf_is_shared(buf));
2901 ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
2902 ASSERT(HDR_EMPTY_OR_LOCKED(hdr));
2903
2904 /*
2905 * We are no longer sharing this buffer so we need
2906 * to transfer its ownership to the rightful owner.
2907 */
2908 zfs_refcount_transfer_ownership_many(&hdr->b_l1hdr.b_state->arcs_size,
2909 arc_hdr_size(hdr), hdr, buf);
2910 arc_hdr_clear_flags(hdr, ARC_FLAG_SHARED_DATA);
2911 abd_release_ownership_of_buf(hdr->b_l1hdr.b_pabd);
2912 abd_put(hdr->b_l1hdr.b_pabd);
2913 hdr->b_l1hdr.b_pabd = NULL;
2914 buf->b_flags &= ~ARC_BUF_FLAG_SHARED;
2915
2916 /*
2917 * Since the buffer is no longer shared between
2918 * the arc buf and the hdr, count it as overhead.
2919 */
2920 ARCSTAT_INCR(arcstat_compressed_size, -arc_hdr_size(hdr));
2921 ARCSTAT_INCR(arcstat_uncompressed_size, -HDR_GET_LSIZE(hdr));
2922 ARCSTAT_INCR(arcstat_overhead_size, arc_buf_size(buf));
2923 }
2924
2925 /*
2926 * Remove an arc_buf_t from the hdr's buf list and return the last
2927 * arc_buf_t on the list. If no buffers remain on the list then return
2928 * NULL.
2929 */
2930 static arc_buf_t *
2931 arc_buf_remove(arc_buf_hdr_t *hdr, arc_buf_t *buf)
2932 {
2933 arc_buf_t **bufp = &hdr->b_l1hdr.b_buf;
2934 arc_buf_t *lastbuf = NULL;
2935
2936 ASSERT(HDR_HAS_L1HDR(hdr));
2937 ASSERT(HDR_EMPTY_OR_LOCKED(hdr));
2938
2939 /*
2940 * Remove the buf from the hdr list and locate the last
2941 * remaining buffer on the list.
2942 */
2943 while (*bufp != NULL) {
2944 if (*bufp == buf)
2945 *bufp = buf->b_next;
2946
2947 /*
2948 * If we've removed a buffer in the middle of
2949 * the list then update the lastbuf and update
2950 * bufp.
2951 */
2952 if (*bufp != NULL) {
2953 lastbuf = *bufp;
2954 bufp = &(*bufp)->b_next;
2955 }
2956 }
2957 buf->b_next = NULL;
2958 ASSERT3P(lastbuf, !=, buf);
2959 IMPLY(hdr->b_l1hdr.b_bufcnt > 0, lastbuf != NULL);
2960 IMPLY(hdr->b_l1hdr.b_bufcnt > 0, hdr->b_l1hdr.b_buf != NULL);
2961 IMPLY(lastbuf != NULL, ARC_BUF_LAST(lastbuf));
2962
2963 return (lastbuf);
2964 }
2965
2966 /*
2967 * Free up buf->b_data and pull the arc_buf_t off of the arc_buf_hdr_t's
2968 * list and free it.
2969 */
2970 static void
2971 arc_buf_destroy_impl(arc_buf_t *buf)
2972 {
2973 arc_buf_hdr_t *hdr = buf->b_hdr;
2974
2975 /*
2976 * Free up the data associated with the buf but only if we're not
2977 * sharing this with the hdr. If we are sharing it with the hdr, the
2978 * hdr is responsible for doing the free.
2979 */
2980 if (buf->b_data != NULL) {
2981 /*
2982 * We're about to change the hdr's b_flags. We must either
2983 * hold the hash_lock or be undiscoverable.
2984 */
2985 ASSERT(HDR_EMPTY_OR_LOCKED(hdr));
2986
2987 arc_cksum_verify(buf);
2988 arc_buf_unwatch(buf);
2989
2990 if (arc_buf_is_shared(buf)) {
2991 arc_hdr_clear_flags(hdr, ARC_FLAG_SHARED_DATA);
2992 } else {
2993 uint64_t size = arc_buf_size(buf);
2994 arc_free_data_buf(hdr, buf->b_data, size, buf);
2995 ARCSTAT_INCR(arcstat_overhead_size, -size);
2996 }
2997 buf->b_data = NULL;
2998
2999 ASSERT(hdr->b_l1hdr.b_bufcnt > 0);
3000 hdr->b_l1hdr.b_bufcnt -= 1;
3001
3002 if (ARC_BUF_ENCRYPTED(buf)) {
3003 hdr->b_crypt_hdr.b_ebufcnt -= 1;
3004
3005 /*
3006 * If we have no more encrypted buffers and we've
3007 * already gotten a copy of the decrypted data we can
3008 * free b_rabd to save some space.
3009 */
3010 if (hdr->b_crypt_hdr.b_ebufcnt == 0 &&
3011 HDR_HAS_RABD(hdr) && hdr->b_l1hdr.b_pabd != NULL &&
3012 !HDR_IO_IN_PROGRESS(hdr)) {
3013 arc_hdr_free_pabd(hdr, B_TRUE);
3014 }
3015 }
3016 }
3017
3018 arc_buf_t *lastbuf = arc_buf_remove(hdr, buf);
3019
3020 if (ARC_BUF_SHARED(buf) && !ARC_BUF_COMPRESSED(buf)) {
3021 /*
3022 * If the current arc_buf_t is sharing its data buffer with the
3023 * hdr, then reassign the hdr's b_pabd to share it with the new
3024 * buffer at the end of the list. The shared buffer is always
3025 * the last one on the hdr's buffer list.
3026 *
3027 * There is an equivalent case for compressed bufs, but since
3028 * they aren't guaranteed to be the last buf in the list and
3029 * that is an exceedingly rare case, we just allow that space to be
3030 * wasted temporarily. We must also be careful not to share
3031 * encrypted buffers, since they cannot be shared.
3032 */
3033 if (lastbuf != NULL && !ARC_BUF_ENCRYPTED(lastbuf)) {
3034 /* Only one buf can be shared at once */
3035 VERIFY(!arc_buf_is_shared(lastbuf));
3036 /* hdr is uncompressed so can't have compressed buf */
3037 VERIFY(!ARC_BUF_COMPRESSED(lastbuf));
3038
3039 ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
3040 arc_hdr_free_pabd(hdr, B_FALSE);
3041
3042 /*
3043 * We must setup a new shared block between the
3044 * last buffer and the hdr. The data would have
3045 * been allocated by the arc buf so we need to transfer
3046 * ownership to the hdr since it's now being shared.
3047 */
3048 arc_share_buf(hdr, lastbuf);
3049 }
3050 } else if (HDR_SHARED_DATA(hdr)) {
3051 /*
3052 * Uncompressed shared buffers are always at the end
3053 * of the list. Compressed buffers don't have the
3054 * same requirements. This makes it hard to
3055 * simply assert that the lastbuf is shared so
3056 * we rely on the hdr's compression flags to determine
3057 * if we have a compressed, shared buffer.
3058 */
3059 ASSERT3P(lastbuf, !=, NULL);
3060 ASSERT(arc_buf_is_shared(lastbuf) ||
3061 arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF);
3062 }
3063
3064 /*
3065 * Free the checksum if we're removing the last uncompressed buf from
3066 * this hdr.
3067 */
3068 if (!arc_hdr_has_uncompressed_buf(hdr)) {
3069 arc_cksum_free(hdr);
3070 }
3071
3072 /* clean up the buf */
3073 buf->b_hdr = NULL;
3074 kmem_cache_free(buf_cache, buf);
3075 }
3076
3077 static void
3078 arc_hdr_alloc_pabd(arc_buf_hdr_t *hdr, int alloc_flags)
3079 {
3080 uint64_t size;
3081 boolean_t alloc_rdata = ((alloc_flags & ARC_HDR_ALLOC_RDATA) != 0);
3082 boolean_t do_adapt = ((alloc_flags & ARC_HDR_DO_ADAPT) != 0);
3083
3084 ASSERT3U(HDR_GET_LSIZE(hdr), >, 0);
3085 ASSERT(HDR_HAS_L1HDR(hdr));
3086 ASSERT(!HDR_SHARED_DATA(hdr) || alloc_rdata);
3087 IMPLY(alloc_rdata, HDR_PROTECTED(hdr));
3088
3089 if (alloc_rdata) {
3090 size = HDR_GET_PSIZE(hdr);
3091 ASSERT3P(hdr->b_crypt_hdr.b_rabd, ==, NULL);
3092 hdr->b_crypt_hdr.b_rabd = arc_get_data_abd(hdr, size, hdr,
3093 do_adapt);
3094 ASSERT3P(hdr->b_crypt_hdr.b_rabd, !=, NULL);
3095 } else {
3096 size = arc_hdr_size(hdr);
3097 ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
3098 hdr->b_l1hdr.b_pabd = arc_get_data_abd(hdr, size, hdr,
3099 do_adapt);
3100 ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
3101 }
3102
3103 ARCSTAT_INCR(arcstat_compressed_size, size);
3104 ARCSTAT_INCR(arcstat_uncompressed_size, HDR_GET_LSIZE(hdr));
3105 }
3106
3107 static void
3108 arc_hdr_free_pabd(arc_buf_hdr_t *hdr, boolean_t free_rdata)
3109 {
3110 uint64_t size = (free_rdata) ? HDR_GET_PSIZE(hdr) : arc_hdr_size(hdr);
3111
3112 ASSERT(HDR_HAS_L1HDR(hdr));
3113 ASSERT(hdr->b_l1hdr.b_pabd != NULL || HDR_HAS_RABD(hdr));
3114 IMPLY(free_rdata, HDR_HAS_RABD(hdr));
3115
3116
3117 /*
3118 * If the hdr is currently being written to the l2arc then
3119 * we defer freeing the data by adding it to the l2arc_free_on_write
3120 * list. The l2arc will free the data once it's finished
3121 * writing it to the l2arc device.
3122 */
3123 if (HDR_L2_WRITING(hdr)) {
3124 arc_hdr_free_on_write(hdr, free_rdata);
3125 ARCSTAT_BUMP(arcstat_l2_free_on_write);
3126 } else if (free_rdata) {
3127 arc_free_data_abd(hdr, hdr->b_crypt_hdr.b_rabd, size, hdr);
3128 } else {
3129 arc_free_data_abd(hdr, hdr->b_l1hdr.b_pabd,
3130 size, hdr);
3131 }
3132
3133 if (free_rdata) {
3134 hdr->b_crypt_hdr.b_rabd = NULL;
3135 } else {
3136 hdr->b_l1hdr.b_pabd = NULL;
3137 }
3138
3139 if (hdr->b_l1hdr.b_pabd == NULL && !HDR_HAS_RABD(hdr))
3140 hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS;
3141
3142 ARCSTAT_INCR(arcstat_compressed_size, -size);
3143 ARCSTAT_INCR(arcstat_uncompressed_size, -HDR_GET_LSIZE(hdr));
3144 }
3145
3146 static arc_buf_hdr_t *
3147 arc_hdr_alloc(uint64_t spa, int32_t psize, int32_t lsize,
3148 boolean_t protected, enum zio_compress compression_type,
3149 arc_buf_contents_t type, boolean_t alloc_rdata)
3150 {
3151 arc_buf_hdr_t *hdr;
3152 int flags = ARC_HDR_DO_ADAPT;
3153
3154 VERIFY(type == ARC_BUFC_DATA || type == ARC_BUFC_METADATA);
3155 if (protected) {
3156 hdr = kmem_cache_alloc(hdr_full_crypt_cache, KM_PUSHPAGE);
3157 } else {
3158 hdr = kmem_cache_alloc(hdr_full_cache, KM_PUSHPAGE);
3159 }
3160 flags |= alloc_rdata ? ARC_HDR_ALLOC_RDATA : 0;
3161 ASSERT(HDR_EMPTY(hdr));
3162 ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL);
3163 ASSERT3P(hdr->b_l1hdr.b_thawed, ==, NULL);
3164 HDR_SET_PSIZE(hdr, psize);
3165 HDR_SET_LSIZE(hdr, lsize);
3166 hdr->b_spa = spa;
3167 hdr->b_type = type;
3168 hdr->b_flags = 0;
3169 arc_hdr_set_flags(hdr, arc_bufc_to_flags(type) | ARC_FLAG_HAS_L1HDR);
3170 arc_hdr_set_compress(hdr, compression_type);
3171 if (protected)
3172 arc_hdr_set_flags(hdr, ARC_FLAG_PROTECTED);
3173
3174 hdr->b_l1hdr.b_state = arc_anon;
3175 hdr->b_l1hdr.b_arc_access = 0;
3176 hdr->b_l1hdr.b_bufcnt = 0;
3177 hdr->b_l1hdr.b_buf = NULL;
3178
3179 /*
3180 * Allocate the hdr's buffer. This will contain either
3181 * the compressed or uncompressed data, depending on the block
3182 * it references and whether compressed ARC is enabled.
3183 */
3184 arc_hdr_alloc_pabd(hdr, flags);
3185 ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
3186
3187 return (hdr);
3188 }
3189
3190 /*
3191 * Transition between the two allocation states for the arc_buf_hdr struct.
3192 * The arc_buf_hdr struct can be allocated with (hdr_full_cache) or without
3193 * (hdr_l2only_cache) the fields necessary for the L1 cache - the smaller
3194 * version is used when a cache buffer is only in the L2ARC in order to reduce
3195 * memory usage.
3196 */
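/*
 * As an illustrative sketch (mirroring how the eviction and read paths use
 * this function), demoting a header whose data is only resident in the
 * L2ARC looks roughly like:
 *
 *	nhdr = arc_hdr_realloc(hdr, hdr_full_cache, hdr_l2only_cache);
 *
 * and a later read hit on that header promotes it back by swapping the two
 * cache arguments.
 */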
3197 static arc_buf_hdr_t *
3198 arc_hdr_realloc(arc_buf_hdr_t *hdr, kmem_cache_t *old, kmem_cache_t *new)
3199 {
3200 ASSERT(HDR_HAS_L2HDR(hdr));
3201
3202 arc_buf_hdr_t *nhdr;
3203 l2arc_dev_t *dev = hdr->b_l2hdr.b_dev;
3204
3205 ASSERT((old == hdr_full_cache && new == hdr_l2only_cache) ||
3206 (old == hdr_l2only_cache && new == hdr_full_cache));
3207
3208 /*
3209 * If the caller wanted a new full header and the header is to be
3210 * encrypted, we actually allocate the header from the full crypt
3211 * cache instead. The same applies to freeing from the old cache.
3212 */
3213 if (HDR_PROTECTED(hdr) && new == hdr_full_cache)
3214 new = hdr_full_crypt_cache;
3215 if (HDR_PROTECTED(hdr) && old == hdr_full_cache)
3216 old = hdr_full_crypt_cache;
3217
3218 nhdr = kmem_cache_alloc(new, KM_PUSHPAGE);
3219
3220 ASSERT(MUTEX_HELD(HDR_LOCK(hdr)));
3221 buf_hash_remove(hdr);
3222
3223 bcopy(hdr, nhdr, HDR_L2ONLY_SIZE);
3224
3225 if (new == hdr_full_cache || new == hdr_full_crypt_cache) {
3226 arc_hdr_set_flags(nhdr, ARC_FLAG_HAS_L1HDR);
3227 /*
3228 * arc_access and arc_change_state need to be aware that a
3229 * header has just come out of L2ARC, so we set its state to
3230 * l2c_only even though it's about to change.
3231 */
3232 nhdr->b_l1hdr.b_state = arc_l2c_only;
3233
3234 /* Verify previous threads set these to NULL before freeing */
3235 ASSERT3P(nhdr->b_l1hdr.b_pabd, ==, NULL);
3236 ASSERT(!HDR_HAS_RABD(hdr));
3237 } else {
3238 ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
3239 ASSERT0(hdr->b_l1hdr.b_bufcnt);
3240 ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL);
3241
3242 /*
3243 * If we've reached here, we must have been called from
3244 * arc_evict_hdr(), and as such we should already have been
3245 * removed from any ghost list we were previously on
3246 * (which protects us from racing with arc_evict_state),
3247 * thus no locking is needed during this check.
3248 */
3249 ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node));
3250
3251 /*
3252 * A buffer must not be moved into the arc_l2c_only
3253 * state if it's not finished being written out to the
3254 * l2arc device. Otherwise, the b_l1hdr.b_pabd field
3255 * might still be accessed even though it has been removed.
3256 */
3257 VERIFY(!HDR_L2_WRITING(hdr));
3258 VERIFY3P(hdr->b_l1hdr.b_pabd, ==, NULL);
3259 ASSERT(!HDR_HAS_RABD(hdr));
3260
3261 #ifdef ZFS_DEBUG
3262 if (hdr->b_l1hdr.b_thawed != NULL) {
3263 kmem_free(hdr->b_l1hdr.b_thawed, 1);
3264 hdr->b_l1hdr.b_thawed = NULL;
3265 }
3266 #endif
3267
3268 arc_hdr_clear_flags(nhdr, ARC_FLAG_HAS_L1HDR);
3269 }
3270 /*
3271 * The header has been reallocated so we need to re-insert it into any
3272 * lists it was on.
3273 */
3274 (void) buf_hash_insert(nhdr, NULL);
3275
3276 ASSERT(list_link_active(&hdr->b_l2hdr.b_l2node));
3277
3278 mutex_enter(&dev->l2ad_mtx);
3279
3280 /*
3281 * We must place the realloc'ed header back into the list at
3282 * the same spot. Otherwise, if it's placed earlier in the list,
3283 * l2arc_write_buffers() could find it during the function's
3284 * write phase, and try to write it out to the l2arc.
3285 */
3286 list_insert_after(&dev->l2ad_buflist, hdr, nhdr);
3287 list_remove(&dev->l2ad_buflist, hdr);
3288
3289 mutex_exit(&dev->l2ad_mtx);
3290
3291 /*
3292 * Since we're using the pointer address as the tag when
3293 * incrementing and decrementing the l2ad_alloc refcount, we
3294 * must remove the old pointer (that we're about to destroy) and
3295 * add the new pointer to the refcount. Otherwise we'd remove
3296 * the wrong pointer address when calling arc_hdr_destroy() later.
3297 */
3298
3299 (void) zfs_refcount_remove_many(&dev->l2ad_alloc, arc_hdr_size(hdr),
3300 hdr);
3301 (void) zfs_refcount_add_many(&dev->l2ad_alloc, arc_hdr_size(nhdr),
3302 nhdr);
3303
3304 buf_discard_identity(hdr);
3305 kmem_cache_free(old, hdr);
3306
3307 return (nhdr);
3308 }
3309
3310 /*
3311 * This function allows an L1 header to be reallocated as a crypt
3312 * header and vice versa. If we are going to a crypt header, the
3313 * new fields will be zeroed out.
3314 */
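/*
 * Illustrative usage only: a caller attaching encryption parameters to an
 * anonymous header would convert it with
 *
 *	hdr = arc_hdr_realloc_crypt(hdr, B_TRUE);
 *
 * and convert back with need_crypt == B_FALSE once the crypt fields are no
 * longer needed.
 */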
3315 static arc_buf_hdr_t *
3316 arc_hdr_realloc_crypt(arc_buf_hdr_t *hdr, boolean_t need_crypt)
3317 {
3318 arc_buf_hdr_t *nhdr;
3319 arc_buf_t *buf;
3320 kmem_cache_t *ncache, *ocache;
3321
3322 ASSERT(HDR_HAS_L1HDR(hdr));
3323 ASSERT3U(!!HDR_PROTECTED(hdr), !=, need_crypt);
3324 ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon);
3325 ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node));
3326 ASSERT(!list_link_active(&hdr->b_l2hdr.b_l2node));
3327 ASSERT3P(hdr->b_hash_next, ==, NULL);
3328
3329 if (need_crypt) {
3330 ncache = hdr_full_crypt_cache;
3331 ocache = hdr_full_cache;
3332 } else {
3333 ncache = hdr_full_cache;
3334 ocache = hdr_full_crypt_cache;
3335 }
3336
3337 nhdr = kmem_cache_alloc(ncache,