1fa9e4066Sahrens /* 2fa9e4066Sahrens * CDDL HEADER START 3fa9e4066Sahrens * 4fa9e4066Sahrens * The contents of this file are subject to the terms of the 5033f9833Sek * Common Development and Distribution License (the "License"). 6033f9833Sek * You may not use this file except in compliance with the License. 7fa9e4066Sahrens * 8fa9e4066Sahrens * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9fa9e4066Sahrens * or http://www.opensolaris.org/os/licensing. 10fa9e4066Sahrens * See the License for the specific language governing permissions 11fa9e4066Sahrens * and limitations under the License. 12fa9e4066Sahrens * 13fa9e4066Sahrens * When distributing Covered Code, include this CDDL HEADER in each 14fa9e4066Sahrens * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15fa9e4066Sahrens * If applicable, add the following below this CDDL HEADER, with the 16fa9e4066Sahrens * fields enclosed by brackets "[]" replaced with your own identifying 17fa9e4066Sahrens * information: Portions Copyright [yyyy] [name of copyright owner] 18fa9e4066Sahrens * 19fa9e4066Sahrens * CDDL HEADER END 20fa9e4066Sahrens */ 21fa9e4066Sahrens /* 22e6c728e1Sbrendan * Copyright 2008 Sun Microsystems, Inc. All rights reserved. 23fa9e4066Sahrens * Use is subject to license terms. 24fa9e4066Sahrens */ 25fa9e4066Sahrens 26fa9e4066Sahrens #pragma ident "%Z%%M% %I% %E% SMI" 27fa9e4066Sahrens 28fa9e4066Sahrens /* 2944cb6abcSbmc * DVA-based Adjustable Replacement Cache 30fa9e4066Sahrens * 31ea8dc4b6Seschrock * While much of the theory of operation used here is 32ea8dc4b6Seschrock * based on the self-tuning, low overhead replacement cache 33fa9e4066Sahrens * presented by Megiddo and Modha at FAST 2003, there are some 34fa9e4066Sahrens * significant differences: 35fa9e4066Sahrens * 36fa9e4066Sahrens * 1. The Megiddo and Modha model assumes any page is evictable. 37fa9e4066Sahrens * Pages in its cache cannot be "locked" into memory. 
This makes
 * the eviction algorithm simple: evict the last page in the list.
 * This also makes the performance characteristics easy to reason
 * about.  Our cache is not so simple.  At any given moment, some
 * subset of the blocks in the cache are un-evictable because we
 * have handed out a reference to them.  Blocks are only evictable
 * when there are no external references active.  This makes
 * eviction far more problematic: we choose to evict the evictable
 * blocks that are the "lowest" in the list.
 *
 * There are times when it is not possible to evict the requested
 * space.  In these circumstances we are unable to adjust the cache
 * size.  To prevent the cache growing unbounded at these times we
 * implement a "cache throttle" that slows the flow of new data
 * into the cache until we can make space available.
 *
 * 2. The Megiddo and Modha model assumes a fixed cache size.
 * Pages are evicted when the cache is full and there is a cache
 * miss.  Our model has a variable sized cache.  It grows with
 * high use, but also tries to react to memory pressure from the
 * operating system: decreasing its size when system memory is
 * tight.
 *
 * 3. The Megiddo and Modha model assumes a fixed page size.  All
 * elements of the cache are therefore exactly the same size.  So
 * when adjusting the cache size following a cache miss, it's simply
 * a matter of choosing a single page to evict.  In our model, we
 * have variable sized cache blocks (ranging from 512 bytes to
 * 128K bytes).
We therefore choose a set of blocks to evict to make
 * space for a cache miss that approximates as closely as possible
 * the space used by the new block.
 *
 * See also: "ARC: A Self-Tuning, Low Overhead Replacement Cache"
 * by N. Megiddo & D. Modha, FAST 2003
 */

/*
 * The locking model:
 *
 * A new reference to a cache buffer can be obtained in two
 * ways: 1) via a hash table lookup using the DVA as a key,
 * or 2) via one of the ARC lists.  The arc_read() interface
 * uses method 1, while the internal arc algorithms for
 * adjusting the cache use method 2.  We therefore provide two
 * types of locks: 1) the hash table lock array, and 2) the
 * arc list locks.
 *
 * Buffers do not have their own mutexes, rather they rely on the
 * hash table mutexes for the bulk of their protection (i.e. most
 * fields in the arc_buf_hdr_t are protected by these mutexes).
 *
 * buf_hash_find() returns the appropriate mutex (held) when it
 * locates the requested buffer in the hash table.  It returns
 * NULL for the mutex if the buffer was not in the table.
 *
 * buf_hash_remove() expects the appropriate hash mutex to be
 * already held before it is invoked.
 *
 * Each arc state also has a mutex which is used to protect the
 * buffer list associated with the state.  When attempting to
 * obtain a hash table lock while holding an arc list lock you
 * must use: mutex_tryenter() to avoid deadlock.
Also note that 9944eda4d7Smaybee * the active state mutex must be held before the ghost state mutex. 100fa9e4066Sahrens * 101ea8dc4b6Seschrock * Arc buffers may have an associated eviction callback function. 102ea8dc4b6Seschrock * This function will be invoked prior to removing the buffer (e.g. 103ea8dc4b6Seschrock * in arc_do_user_evicts()). Note however that the data associated 104ea8dc4b6Seschrock * with the buffer may be evicted prior to the callback. The callback 105ea8dc4b6Seschrock * must be made with *no locks held* (to prevent deadlock). Additionally, 106ea8dc4b6Seschrock * the users of callbacks must ensure that their private data is 107ea8dc4b6Seschrock * protected from simultaneous callbacks from arc_buf_evict() 108ea8dc4b6Seschrock * and arc_do_user_evicts(). 109ea8dc4b6Seschrock * 110fa9e4066Sahrens * Note that the majority of the performance stats are manipulated 111fa9e4066Sahrens * with atomic operations. 112fa94a07fSbrendan * 113fa94a07fSbrendan * The L2ARC uses the l2arc_buflist_mtx global mutex for the following: 114fa94a07fSbrendan * 115fa94a07fSbrendan * - L2ARC buflist creation 116fa94a07fSbrendan * - L2ARC buflist eviction 117fa94a07fSbrendan * - L2ARC write completion, which walks L2ARC buflists 118fa94a07fSbrendan * - ARC header destruction, as it removes from L2ARC buflists 119fa94a07fSbrendan * - ARC header release, as it removes from L2ARC buflists 120fa9e4066Sahrens */ 121fa9e4066Sahrens 122fa9e4066Sahrens #include <sys/spa.h> 123fa9e4066Sahrens #include <sys/zio.h> 1246b4acc8bSahrens #include <sys/zio_checksum.h> 125fa9e4066Sahrens #include <sys/zfs_context.h> 126fa9e4066Sahrens #include <sys/arc.h> 127fa9e4066Sahrens #include <sys/refcount.h> 128c5904d13Seschrock #include <sys/vdev.h> 129fa9e4066Sahrens #ifdef _KERNEL 130fa9e4066Sahrens #include <sys/vmsystm.h> 131fa9e4066Sahrens #include <vm/anon.h> 132fa9e4066Sahrens #include <sys/fs/swapnode.h> 133033f9833Sek #include <sys/dnlc.h> 134fa9e4066Sahrens #endif 135fa9e4066Sahrens 
#include <sys/callb.h>
#include <sys/kstat.h>

/* Reclaim thread synchronization state. */
static kmutex_t		arc_reclaim_thr_lock;
static kcondvar_t	arc_reclaim_thr_cv;	/* used to signal reclaim thr */
static uint8_t		arc_thread_exit;

extern int zfs_write_limit_shift;
extern uint64_t zfs_write_limit_max;
extern uint64_t zfs_write_limit_inflated;

#define	ARC_REDUCE_DNLC_PERCENT	3
uint_t arc_reduce_dnlc_percent = ARC_REDUCE_DNLC_PERCENT;

typedef enum arc_reclaim_strategy {
	ARC_RECLAIM_AGGR,		/* Aggressive reclaim strategy */
	ARC_RECLAIM_CONS		/* Conservative reclaim strategy */
} arc_reclaim_strategy_t;

/* number of seconds before growing cache again */
static int		arc_grow_retry = 60;

/*
 * minimum lifespan of a prefetch block in clock ticks
 * (initialized in arc_init())
 */
static int		arc_min_prefetch_lifespan;

static int		arc_dead;

/*
 * The arc has filled available memory and has now warmed up.
 */
static boolean_t	arc_warm;

/*
 * These tunables are for performance analysis.
 */
uint64_t zfs_arc_max;
uint64_t zfs_arc_min;
uint64_t zfs_arc_meta_limit = 0;

/*
 * Note that buffers can be in one of 6 states:
 *	ARC_anon	- anonymous (discussed below)
 *	ARC_mru		- recently used, currently cached
 *	ARC_mru_ghost	- recently used, no longer in cache
 *	ARC_mfu		- frequently used, currently cached
 *	ARC_mfu_ghost	- frequently used, no longer in cache
 *	ARC_l2c_only	- exists in L2ARC but not other states
 * When there are no active references to the buffer, they are
 * linked onto a list in one of these arc states.  These are
 * the only buffers that can be evicted or deleted.  Within each
 * state there are multiple lists, one for meta-data and one for
 * non-meta-data.  Meta-data (indirect blocks, blocks of dnodes,
 * etc.) is tracked separately so that it can be managed more
 * explicitly: favored over data, limited explicitly.
 *
 * Anonymous buffers are buffers that are not associated with
 * a DVA.  These are buffers that hold dirty block copies
 * before they are written to stable storage.  By definition,
 * they are "ref'd" and are considered part of arc_mru
 * that cannot be freed.  Generally, they will acquire a DVA
 * as they are written and migrate onto the arc_mru list.
 *
 * The ARC_l2c_only state is for buffers that are in the second
 * level ARC but no longer in any of the ARC_m* lists.  The second
 * level ARC itself may also contain buffers that are in any of
 * the ARC_m* states - meaning that a buffer can exist in two
 * places.  The reason for the ARC_l2c_only state is to keep the
 * buffer header in the hash table, so that reads that hit the
 * second level ARC benefit from these fast lookups.
 */

typedef struct arc_state {
	list_t	arcs_list[ARC_BUFC_NUMTYPES];	/* list of evictable buffers */
	uint64_t arcs_lsize[ARC_BUFC_NUMTYPES];	/* amount of evictable data */
	uint64_t arcs_size;	/* total amount of data in this state */
	kmutex_t arcs_mtx;
} arc_state_t;

/* The 6 states: */
static arc_state_t ARC_anon;
static arc_state_t ARC_mru;
static arc_state_t ARC_mru_ghost;
static arc_state_t ARC_mfu;
static arc_state_t ARC_mfu_ghost;
static arc_state_t ARC_l2c_only;

typedef struct arc_stats {
	kstat_named_t arcstat_hits;
	kstat_named_t arcstat_misses;
	kstat_named_t arcstat_demand_data_hits;
	kstat_named_t arcstat_demand_data_misses;
	kstat_named_t arcstat_demand_metadata_hits;
	kstat_named_t arcstat_demand_metadata_misses;
	kstat_named_t arcstat_prefetch_data_hits;
	kstat_named_t arcstat_prefetch_data_misses;
	kstat_named_t arcstat_prefetch_metadata_hits;
	kstat_named_t arcstat_prefetch_metadata_misses;
	kstat_named_t arcstat_mru_hits;
	kstat_named_t arcstat_mru_ghost_hits;
	kstat_named_t arcstat_mfu_hits;
	kstat_named_t arcstat_mfu_ghost_hits;
	kstat_named_t arcstat_deleted;
	kstat_named_t arcstat_recycle_miss;
	kstat_named_t arcstat_mutex_miss;
	kstat_named_t arcstat_evict_skip;
	kstat_named_t arcstat_hash_elements;
	kstat_named_t arcstat_hash_elements_max;
	kstat_named_t arcstat_hash_collisions;
	kstat_named_t arcstat_hash_chains;
	kstat_named_t arcstat_hash_chain_max;
	kstat_named_t arcstat_p;
	kstat_named_t arcstat_c;
	kstat_named_t arcstat_c_min;
	kstat_named_t arcstat_c_max;
	kstat_named_t arcstat_size;
	kstat_named_t arcstat_hdr_size;
	kstat_named_t arcstat_l2_hits;
	kstat_named_t arcstat_l2_misses;
	kstat_named_t arcstat_l2_feeds;
	kstat_named_t arcstat_l2_rw_clash;
	kstat_named_t arcstat_l2_writes_sent;
	kstat_named_t arcstat_l2_writes_done;
	kstat_named_t arcstat_l2_writes_error;
	kstat_named_t arcstat_l2_writes_hdr_miss;
	kstat_named_t arcstat_l2_evict_lock_retry;
	kstat_named_t arcstat_l2_evict_reading;
	kstat_named_t arcstat_l2_free_on_write;
	kstat_named_t arcstat_l2_abort_lowmem;
	kstat_named_t arcstat_l2_cksum_bad;
	kstat_named_t arcstat_l2_io_error;
	kstat_named_t arcstat_l2_size;
	kstat_named_t arcstat_l2_hdr_size;
	kstat_named_t arcstat_memory_throttle_count;
} arc_stats_t;

/*
 * Initializer order must match the field order of arc_stats_t above;
 * each entry names the kstat as exported to userland.
 */
static arc_stats_t arc_stats = {
	{ "hits",			KSTAT_DATA_UINT64 },
	{ "misses",			KSTAT_DATA_UINT64 },
	{ "demand_data_hits",		KSTAT_DATA_UINT64 },
	{ "demand_data_misses",		KSTAT_DATA_UINT64 },
	{ "demand_metadata_hits",	KSTAT_DATA_UINT64 },
	{ "demand_metadata_misses",	KSTAT_DATA_UINT64 },
	{ "prefetch_data_hits",		KSTAT_DATA_UINT64 },
	{ "prefetch_data_misses",	KSTAT_DATA_UINT64 },
	{ "prefetch_metadata_hits",	KSTAT_DATA_UINT64 },
	{ "prefetch_metadata_misses",	KSTAT_DATA_UINT64 },
	{ "mru_hits",			KSTAT_DATA_UINT64 },
	{ "mru_ghost_hits",		KSTAT_DATA_UINT64 },
	{ "mfu_hits",			KSTAT_DATA_UINT64 },
	{ "mfu_ghost_hits",		KSTAT_DATA_UINT64 },
	{ "deleted",			KSTAT_DATA_UINT64 },
	{ "recycle_miss",		KSTAT_DATA_UINT64 },
	{ "mutex_miss",			KSTAT_DATA_UINT64 },
	{ "evict_skip",			KSTAT_DATA_UINT64 },
	{ "hash_elements",		KSTAT_DATA_UINT64 },
	{ "hash_elements_max",		KSTAT_DATA_UINT64 },
	{ "hash_collisions",		KSTAT_DATA_UINT64 },
	{ "hash_chains",		KSTAT_DATA_UINT64 },
	{ "hash_chain_max",		KSTAT_DATA_UINT64 },
	{ "p",				KSTAT_DATA_UINT64 },
	{ "c",				KSTAT_DATA_UINT64 },
	{ "c_min",			KSTAT_DATA_UINT64 },
	{ "c_max",			KSTAT_DATA_UINT64 },
	{ "size",			KSTAT_DATA_UINT64 },
	{ "hdr_size",			KSTAT_DATA_UINT64 },
	{ "l2_hits",			KSTAT_DATA_UINT64 },
	{ "l2_misses",			KSTAT_DATA_UINT64 },
	{ "l2_feeds",			KSTAT_DATA_UINT64 },
	{ "l2_rw_clash",		KSTAT_DATA_UINT64 },
	{ "l2_writes_sent",		KSTAT_DATA_UINT64 },
	{ "l2_writes_done",		KSTAT_DATA_UINT64 },
	{ "l2_writes_error",		KSTAT_DATA_UINT64 },
	{ "l2_writes_hdr_miss",		KSTAT_DATA_UINT64 },
	{ "l2_evict_lock_retry",	KSTAT_DATA_UINT64 },
	{ "l2_evict_reading",		KSTAT_DATA_UINT64 },
	{ "l2_free_on_write",		KSTAT_DATA_UINT64 },
	{ "l2_abort_lowmem",		KSTAT_DATA_UINT64 },
	{ "l2_cksum_bad",		KSTAT_DATA_UINT64 },
	{ "l2_io_error",		KSTAT_DATA_UINT64 },
	{ "l2_size",			KSTAT_DATA_UINT64 },
	{ "l2_hdr_size",		KSTAT_DATA_UINT64 },
	{ "memory_throttle_count",	KSTAT_DATA_UINT64 }
};

#define	ARCSTAT(stat)	(arc_stats.stat.value.ui64)

/*
 * No trailing semicolon in the expansion: callers supply their own,
 * so the macro is safe inside an unbraced if/else (CERT PRE11-C).
 */
#define	ARCSTAT_INCR(stat, val) \
	atomic_add_64(&arc_stats.stat.value.ui64, (val))

#define	ARCSTAT_BUMP(stat)	ARCSTAT_INCR(stat, 1)
#define	ARCSTAT_BUMPDOWN(stat)	ARCSTAT_INCR(stat, -1)

/*
 * Atomically raise the stat to val if val is larger; the CAS loop
 * retries until either the store succeeds or another thread has
 * already published a value >= val.  Wrapped in do/while (0) so the
 * multi-statement body behaves as a single statement (CERT PRE10-C).
 */
#define	ARCSTAT_MAX(stat, val) do {					\
	uint64_t m;							\
	while ((val) > (m = arc_stats.stat.value.ui64) &&		\
	    (m != atomic_cas_64(&arc_stats.stat.value.ui64, m, (val)))) \
		continue;						\
} while (0)

#define	ARCSTAT_MAXSTAT(stat) \
	ARCSTAT_MAX(stat##_max, arc_stats.stat.value.ui64)

/*
 * We define a macro to allow ARC hits/misses to be easily broken down by
 * two separate conditions, giving a total of four different subtypes for
 * each of hits and misses (so eight statistics total).  The do/while (0)
 * wrapper makes the if/else body hygienic when the macro itself is used
 * inside an unbraced if/else.
 */
#define	ARCSTAT_CONDSTAT(cond1, stat1, notstat1, cond2, stat2, notstat2, stat) \
	do {								\
		if (cond1) {						\
			if (cond2) {					\
				ARCSTAT_BUMP(				\
				    arcstat_##stat1##_##stat2##_##stat); \
			} else {					\
				ARCSTAT_BUMP(				\
				    arcstat_##stat1##_##notstat2##_##stat); \
			}						\
		} else {						\
			if (cond2) {					\
				ARCSTAT_BUMP(				\
				    arcstat_##notstat1##_##stat2##_##stat); \
			} else {					\
				ARCSTAT_BUMP(				\
				    arcstat_##notstat1##_##notstat2##_##stat); \
			}						\
		}							\
	} while (0)

kstat_t			*arc_ksp;
static arc_state_t	*arc_anon;
static arc_state_t	*arc_mru;
static arc_state_t	*arc_mru_ghost;
static arc_state_t	*arc_mfu;
static arc_state_t	*arc_mfu_ghost;
static arc_state_t	*arc_l2c_only;

/*
 * There are several ARC variables that are critical to export as kstats --
 * but we don't want to have to grovel around in the kstat whenever we wish to
 * manipulate them.  For these variables, we therefore define them to be in
 * terms of the statistic variable.  This assures that we are not introducing
 * the possibility of inconsistency by having shadow copies of the variables,
 * while still allowing the code to be readable.
 */
#define	arc_size	ARCSTAT(arcstat_size)	/* actual total arc size */
#define	arc_p		ARCSTAT(arcstat_p)	/* target size of MRU */
#define	arc_c		ARCSTAT(arcstat_c)	/* target size of cache */
#define	arc_c_min	ARCSTAT(arcstat_c_min)	/* min target cache size */
#define	arc_c_max	ARCSTAT(arcstat_c_max)	/* max target cache size */

static int		arc_no_grow;	/* Don't try to grow cache size */
static uint64_t		arc_tempreserve;
static uint64_t		arc_meta_used;
static uint64_t		arc_meta_limit;
static uint64_t		arc_meta_max = 0;

typedef struct l2arc_buf_hdr l2arc_buf_hdr_t;

typedef struct arc_callback arc_callback_t;

/* Per-read callback state threaded onto arc_buf_hdr_t::b_acb. */
struct arc_callback {
	void			*acb_private;
	arc_done_func_t		*acb_done;
	arc_byteswap_func_t	*acb_byteswap;
	arc_buf_t		*acb_buf;
	zio_t			*acb_zio_dummy;
	arc_callback_t		*acb_next;
};

typedef struct arc_write_callback arc_write_callback_t;

/* Per-write callback state (ready + done callbacks). */
struct arc_write_callback {
	void		*awcb_private;
	arc_done_func_t	*awcb_ready;
	arc_done_func_t	*awcb_done;
	arc_buf_t	*awcb_buf;
};

struct arc_buf_hdr {
	/* protected by hash lock */
	dva_t			b_dva;
	uint64_t		b_birth;
	uint64_t		b_cksum0;

	kmutex_t		b_freeze_lock;
	zio_cksum_t		*b_freeze_cksum;

	arc_buf_hdr_t		*b_hash_next;
	arc_buf_t		*b_buf;
	uint32_t		b_flags;
	uint32_t		b_datacnt;

	arc_callback_t		*b_acb;
	kcondvar_t		b_cv;

	/* immutable */
	arc_buf_contents_t	b_type;
	uint64_t		b_size;
	spa_t			*b_spa;

	/* protected by arc state mutex */
	arc_state_t		*b_state;
	list_node_t		b_arc_node;

	/* updated atomically */
	clock_t			b_arc_access;

	/* self protecting */
	refcount_t		b_refcnt;

	l2arc_buf_hdr_t		*b_l2hdr;
	list_node_t		b_l2node;
};

static arc_buf_t *arc_eviction_list;
static kmutex_t arc_eviction_mtx;
static arc_buf_hdr_t arc_eviction_hdr;
static void arc_get_data_buf(arc_buf_t *buf);
static void arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock);
static int arc_evict_needed(arc_buf_contents_t type);
static void arc_evict_ghost(arc_state_t *state, spa_t *spa, int64_t bytes);

#define	GHOST_STATE(state)	\
	((state) == arc_mru_ghost || (state) == arc_mfu_ghost ||	\
	(state) == arc_l2c_only)

/*
 * Private ARC flags.  These flags are private ARC only flags that will show up
 * in b_flags in the arc_hdr_buf_t.  Some flags are publicly declared, and can
 * be passed in as arc_flags in things like arc_read.  However, these flags
 * should never be passed and should only be set by ARC code.  When adding new
 * public flags, make sure not to smash the private ones.
 */

#define	ARC_IN_HASH_TABLE	(1 << 9)	/* this buffer is hashed */
#define	ARC_IO_IN_PROGRESS	(1 << 10)	/* I/O in progress for buf */
#define	ARC_IO_ERROR		(1 << 11)	/* I/O failed for buf */
#define	ARC_FREED_IN_READ	(1 << 12)	/* buf freed while in read */
#define	ARC_BUF_AVAILABLE	(1 << 13)	/* block not in active use */
#define	ARC_INDIRECT		(1 << 14)	/* this is an indirect block */
#define	ARC_FREE_IN_PROGRESS	(1 << 15)	/* hdr about to be freed */
#define	ARC_DONT_L2CACHE	(1 << 16)	/* originated by prefetch */
#define	ARC_L2_WRITING		(1 << 17)	/* L2ARC write in progress */
#define	ARC_L2_EVICTED		(1 << 18)	/* evicted during I/O */
#define	ARC_L2_WRITE_HEAD	(1 << 19)	/* head of write list */

#define	HDR_IN_HASH_TABLE(hdr)	((hdr)->b_flags & ARC_IN_HASH_TABLE)
#define	HDR_IO_IN_PROGRESS(hdr)	((hdr)->b_flags & ARC_IO_IN_PROGRESS)
#define	HDR_IO_ERROR(hdr)	((hdr)->b_flags & ARC_IO_ERROR)
#define	HDR_FREED_IN_READ(hdr)	((hdr)->b_flags & ARC_FREED_IN_READ)
#define	HDR_BUF_AVAILABLE(hdr)	((hdr)->b_flags & ARC_BUF_AVAILABLE)
#define	HDR_FREE_IN_PROGRESS(hdr)	((hdr)->b_flags & ARC_FREE_IN_PROGRESS)
#define	HDR_DONT_L2CACHE(hdr)	((hdr)->b_flags & ARC_DONT_L2CACHE)
#define	HDR_L2_READING(hdr)	(((hdr)->b_flags & ARC_IO_IN_PROGRESS) && \
				    (hdr)->b_l2hdr != NULL)
#define	HDR_L2_WRITING(hdr)	((hdr)->b_flags & ARC_L2_WRITING)
#define	HDR_L2_EVICTED(hdr)	((hdr)->b_flags & ARC_L2_EVICTED)
#define HDR_L2_WRITE_HEAD(hdr) ((hdr)->b_flags & ARC_L2_WRITE_HEAD) 490fa9e4066Sahrens 491e6c728e1Sbrendan /* 492e6c728e1Sbrendan * Other sizes 493e6c728e1Sbrendan */ 494e6c728e1Sbrendan 495e6c728e1Sbrendan #define HDR_SIZE ((int64_t)sizeof (arc_buf_hdr_t)) 496e6c728e1Sbrendan #define L2HDR_SIZE ((int64_t)sizeof (l2arc_buf_hdr_t)) 497e6c728e1Sbrendan 498fa9e4066Sahrens /* 499fa9e4066Sahrens * Hash table routines 500fa9e4066Sahrens */ 501fa9e4066Sahrens 502fa9e4066Sahrens #define HT_LOCK_PAD 64 503fa9e4066Sahrens 504fa9e4066Sahrens struct ht_lock { 505fa9e4066Sahrens kmutex_t ht_lock; 506fa9e4066Sahrens #ifdef _KERNEL 507fa9e4066Sahrens unsigned char pad[(HT_LOCK_PAD - sizeof (kmutex_t))]; 508fa9e4066Sahrens #endif 509fa9e4066Sahrens }; 510fa9e4066Sahrens 511fa9e4066Sahrens #define BUF_LOCKS 256 512fa9e4066Sahrens typedef struct buf_hash_table { 513fa9e4066Sahrens uint64_t ht_mask; 514fa9e4066Sahrens arc_buf_hdr_t **ht_table; 515fa9e4066Sahrens struct ht_lock ht_locks[BUF_LOCKS]; 516fa9e4066Sahrens } buf_hash_table_t; 517fa9e4066Sahrens 518fa9e4066Sahrens static buf_hash_table_t buf_hash_table; 519fa9e4066Sahrens 520fa9e4066Sahrens #define BUF_HASH_INDEX(spa, dva, birth) \ 521fa9e4066Sahrens (buf_hash(spa, dva, birth) & buf_hash_table.ht_mask) 522fa9e4066Sahrens #define BUF_HASH_LOCK_NTRY(idx) (buf_hash_table.ht_locks[idx & (BUF_LOCKS-1)]) 523fa9e4066Sahrens #define BUF_HASH_LOCK(idx) (&(BUF_HASH_LOCK_NTRY(idx).ht_lock)) 524fa9e4066Sahrens #define HDR_LOCK(buf) \ 525fa9e4066Sahrens (BUF_HASH_LOCK(BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth))) 526fa9e4066Sahrens 527fa9e4066Sahrens uint64_t zfs_crc64_table[256]; 528fa9e4066Sahrens 529fa94a07fSbrendan /* 530fa94a07fSbrendan * Level 2 ARC 531fa94a07fSbrendan */ 532fa94a07fSbrendan 533fa94a07fSbrendan #define L2ARC_WRITE_SIZE (8 * 1024 * 1024) /* initial write max */ 534fa94a07fSbrendan #define L2ARC_HEADROOM 4 /* num of writes */ 535fa94a07fSbrendan #define L2ARC_FEED_SECS 1 /* caching interval */ 
536fa94a07fSbrendan 537fa94a07fSbrendan #define l2arc_writes_sent ARCSTAT(arcstat_l2_writes_sent) 538fa94a07fSbrendan #define l2arc_writes_done ARCSTAT(arcstat_l2_writes_done) 539fa94a07fSbrendan 540fa94a07fSbrendan /* 541fa94a07fSbrendan * L2ARC Performance Tunables 542fa94a07fSbrendan */ 543fa94a07fSbrendan uint64_t l2arc_write_max = L2ARC_WRITE_SIZE; /* default max write size */ 544*3a737e0dSbrendan uint64_t l2arc_write_boost = L2ARC_WRITE_SIZE; /* extra write during warmup */ 545fa94a07fSbrendan uint64_t l2arc_headroom = L2ARC_HEADROOM; /* number of dev writes */ 546fa94a07fSbrendan uint64_t l2arc_feed_secs = L2ARC_FEED_SECS; /* interval seconds */ 547fa94a07fSbrendan boolean_t l2arc_noprefetch = B_TRUE; /* don't cache prefetch bufs */ 548fa94a07fSbrendan 549fa94a07fSbrendan /* 550fa94a07fSbrendan * L2ARC Internals 551fa94a07fSbrendan */ 552fa94a07fSbrendan typedef struct l2arc_dev { 553fa94a07fSbrendan vdev_t *l2ad_vdev; /* vdev */ 554fa94a07fSbrendan spa_t *l2ad_spa; /* spa */ 555fa94a07fSbrendan uint64_t l2ad_hand; /* next write location */ 556fa94a07fSbrendan uint64_t l2ad_write; /* desired write size, bytes */ 557*3a737e0dSbrendan uint64_t l2ad_boost; /* warmup write boost, bytes */ 558fa94a07fSbrendan uint64_t l2ad_start; /* first addr on device */ 559fa94a07fSbrendan uint64_t l2ad_end; /* last addr on device */ 560fa94a07fSbrendan uint64_t l2ad_evict; /* last addr eviction reached */ 561fa94a07fSbrendan boolean_t l2ad_first; /* first sweep through */ 562fa94a07fSbrendan list_t *l2ad_buflist; /* buffer list */ 563fa94a07fSbrendan list_node_t l2ad_node; /* device list node */ 564fa94a07fSbrendan } l2arc_dev_t; 565fa94a07fSbrendan 566fa94a07fSbrendan static list_t L2ARC_dev_list; /* device list */ 567fa94a07fSbrendan static list_t *l2arc_dev_list; /* device list pointer */ 568fa94a07fSbrendan static kmutex_t l2arc_dev_mtx; /* device list mutex */ 569fa94a07fSbrendan static l2arc_dev_t *l2arc_dev_last; /* last device used */ 570fa94a07fSbrendan static 
kmutex_t l2arc_buflist_mtx; /* mutex for all buflists */ 571fa94a07fSbrendan static list_t L2ARC_free_on_write; /* free after write buf list */ 572fa94a07fSbrendan static list_t *l2arc_free_on_write; /* free after write list ptr */ 573fa94a07fSbrendan static kmutex_t l2arc_free_on_write_mtx; /* mutex for list */ 574fa94a07fSbrendan static uint64_t l2arc_ndev; /* number of devices */ 575fa94a07fSbrendan 576fa94a07fSbrendan typedef struct l2arc_read_callback { 577fa94a07fSbrendan arc_buf_t *l2rcb_buf; /* read buffer */ 578fa94a07fSbrendan spa_t *l2rcb_spa; /* spa */ 579fa94a07fSbrendan blkptr_t l2rcb_bp; /* original blkptr */ 580fa94a07fSbrendan zbookmark_t l2rcb_zb; /* original bookmark */ 581fa94a07fSbrendan int l2rcb_flags; /* original flags */ 582fa94a07fSbrendan } l2arc_read_callback_t; 583fa94a07fSbrendan 584fa94a07fSbrendan typedef struct l2arc_write_callback { 585fa94a07fSbrendan l2arc_dev_t *l2wcb_dev; /* device info */ 586fa94a07fSbrendan arc_buf_hdr_t *l2wcb_head; /* head of write buflist */ 587fa94a07fSbrendan } l2arc_write_callback_t; 588fa94a07fSbrendan 589fa94a07fSbrendan struct l2arc_buf_hdr { 590fa94a07fSbrendan /* protected by arc_buf_hdr mutex */ 591fa94a07fSbrendan l2arc_dev_t *b_dev; /* L2ARC device */ 592fa94a07fSbrendan daddr_t b_daddr; /* disk address, offset byte */ 593fa94a07fSbrendan }; 594fa94a07fSbrendan 595fa94a07fSbrendan typedef struct l2arc_data_free { 596fa94a07fSbrendan /* protected by l2arc_free_on_write_mtx */ 597fa94a07fSbrendan void *l2df_data; 598fa94a07fSbrendan size_t l2df_size; 599fa94a07fSbrendan void (*l2df_func)(void *, size_t); 600fa94a07fSbrendan list_node_t l2df_list_node; 601fa94a07fSbrendan } l2arc_data_free_t; 602fa94a07fSbrendan 603fa94a07fSbrendan static kmutex_t l2arc_feed_thr_lock; 604fa94a07fSbrendan static kcondvar_t l2arc_feed_thr_cv; 605fa94a07fSbrendan static uint8_t l2arc_thread_exit; 606fa94a07fSbrendan 607fa94a07fSbrendan static void l2arc_read_done(zio_t *zio); 608fa94a07fSbrendan static void 
l2arc_hdr_stat_add(void); 609fa94a07fSbrendan static void l2arc_hdr_stat_remove(void); 610fa94a07fSbrendan 611fa9e4066Sahrens static uint64_t 612fa9e4066Sahrens buf_hash(spa_t *spa, dva_t *dva, uint64_t birth) 613fa9e4066Sahrens { 614fa9e4066Sahrens uintptr_t spav = (uintptr_t)spa; 615fa9e4066Sahrens uint8_t *vdva = (uint8_t *)dva; 616fa9e4066Sahrens uint64_t crc = -1ULL; 617fa9e4066Sahrens int i; 618fa9e4066Sahrens 619fa9e4066Sahrens ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY); 620fa9e4066Sahrens 621fa9e4066Sahrens for (i = 0; i < sizeof (dva_t); i++) 622fa9e4066Sahrens crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ vdva[i]) & 0xFF]; 623fa9e4066Sahrens 624fa9e4066Sahrens crc ^= (spav>>8) ^ birth; 625fa9e4066Sahrens 626fa9e4066Sahrens return (crc); 627fa9e4066Sahrens } 628fa9e4066Sahrens 629fa9e4066Sahrens #define BUF_EMPTY(buf) \ 630fa9e4066Sahrens ((buf)->b_dva.dva_word[0] == 0 && \ 631fa9e4066Sahrens (buf)->b_dva.dva_word[1] == 0 && \ 632fa9e4066Sahrens (buf)->b_birth == 0) 633fa9e4066Sahrens 634fa9e4066Sahrens #define BUF_EQUAL(spa, dva, birth, buf) \ 635fa9e4066Sahrens ((buf)->b_dva.dva_word[0] == (dva)->dva_word[0]) && \ 636fa9e4066Sahrens ((buf)->b_dva.dva_word[1] == (dva)->dva_word[1]) && \ 637fa9e4066Sahrens ((buf)->b_birth == birth) && ((buf)->b_spa == spa) 638fa9e4066Sahrens 639fa9e4066Sahrens static arc_buf_hdr_t * 640fa9e4066Sahrens buf_hash_find(spa_t *spa, dva_t *dva, uint64_t birth, kmutex_t **lockp) 641fa9e4066Sahrens { 642fa9e4066Sahrens uint64_t idx = BUF_HASH_INDEX(spa, dva, birth); 643fa9e4066Sahrens kmutex_t *hash_lock = BUF_HASH_LOCK(idx); 644fa9e4066Sahrens arc_buf_hdr_t *buf; 645fa9e4066Sahrens 646fa9e4066Sahrens mutex_enter(hash_lock); 647fa9e4066Sahrens for (buf = buf_hash_table.ht_table[idx]; buf != NULL; 648fa9e4066Sahrens buf = buf->b_hash_next) { 649fa9e4066Sahrens if (BUF_EQUAL(spa, dva, birth, buf)) { 650fa9e4066Sahrens *lockp = hash_lock; 651fa9e4066Sahrens return (buf); 652fa9e4066Sahrens } 653fa9e4066Sahrens } 654fa9e4066Sahrens 
mutex_exit(hash_lock); 655fa9e4066Sahrens *lockp = NULL; 656fa9e4066Sahrens return (NULL); 657fa9e4066Sahrens } 658fa9e4066Sahrens 659fa9e4066Sahrens /* 660fa9e4066Sahrens * Insert an entry into the hash table. If there is already an element 661fa9e4066Sahrens * equal to elem in the hash table, then the already existing element 662fa9e4066Sahrens * will be returned and the new element will not be inserted. 663fa9e4066Sahrens * Otherwise returns NULL. 664fa9e4066Sahrens */ 665fa9e4066Sahrens static arc_buf_hdr_t * 666fa9e4066Sahrens buf_hash_insert(arc_buf_hdr_t *buf, kmutex_t **lockp) 667fa9e4066Sahrens { 668fa9e4066Sahrens uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth); 669fa9e4066Sahrens kmutex_t *hash_lock = BUF_HASH_LOCK(idx); 670fa9e4066Sahrens arc_buf_hdr_t *fbuf; 67144cb6abcSbmc uint32_t i; 672fa9e4066Sahrens 673ea8dc4b6Seschrock ASSERT(!HDR_IN_HASH_TABLE(buf)); 674fa9e4066Sahrens *lockp = hash_lock; 675fa9e4066Sahrens mutex_enter(hash_lock); 676fa9e4066Sahrens for (fbuf = buf_hash_table.ht_table[idx], i = 0; fbuf != NULL; 677fa9e4066Sahrens fbuf = fbuf->b_hash_next, i++) { 678fa9e4066Sahrens if (BUF_EQUAL(buf->b_spa, &buf->b_dva, buf->b_birth, fbuf)) 679fa9e4066Sahrens return (fbuf); 680fa9e4066Sahrens } 681fa9e4066Sahrens 682fa9e4066Sahrens buf->b_hash_next = buf_hash_table.ht_table[idx]; 683fa9e4066Sahrens buf_hash_table.ht_table[idx] = buf; 684ea8dc4b6Seschrock buf->b_flags |= ARC_IN_HASH_TABLE; 685fa9e4066Sahrens 686fa9e4066Sahrens /* collect some hash table performance data */ 687fa9e4066Sahrens if (i > 0) { 68844cb6abcSbmc ARCSTAT_BUMP(arcstat_hash_collisions); 689fa9e4066Sahrens if (i == 1) 69044cb6abcSbmc ARCSTAT_BUMP(arcstat_hash_chains); 69144cb6abcSbmc 69244cb6abcSbmc ARCSTAT_MAX(arcstat_hash_chain_max, i); 693fa9e4066Sahrens } 69444cb6abcSbmc 69544cb6abcSbmc ARCSTAT_BUMP(arcstat_hash_elements); 69644cb6abcSbmc ARCSTAT_MAXSTAT(arcstat_hash_elements); 697fa9e4066Sahrens 698fa9e4066Sahrens return (NULL); 699fa9e4066Sahrens } 
700fa9e4066Sahrens 701fa9e4066Sahrens static void 702fa9e4066Sahrens buf_hash_remove(arc_buf_hdr_t *buf) 703fa9e4066Sahrens { 704fa9e4066Sahrens arc_buf_hdr_t *fbuf, **bufp; 705fa9e4066Sahrens uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth); 706fa9e4066Sahrens 707fa9e4066Sahrens ASSERT(MUTEX_HELD(BUF_HASH_LOCK(idx))); 708ea8dc4b6Seschrock ASSERT(HDR_IN_HASH_TABLE(buf)); 709fa9e4066Sahrens 710fa9e4066Sahrens bufp = &buf_hash_table.ht_table[idx]; 711fa9e4066Sahrens while ((fbuf = *bufp) != buf) { 712fa9e4066Sahrens ASSERT(fbuf != NULL); 713fa9e4066Sahrens bufp = &fbuf->b_hash_next; 714fa9e4066Sahrens } 715fa9e4066Sahrens *bufp = buf->b_hash_next; 716fa9e4066Sahrens buf->b_hash_next = NULL; 717ea8dc4b6Seschrock buf->b_flags &= ~ARC_IN_HASH_TABLE; 718fa9e4066Sahrens 719fa9e4066Sahrens /* collect some hash table performance data */ 72044cb6abcSbmc ARCSTAT_BUMPDOWN(arcstat_hash_elements); 72144cb6abcSbmc 722fa9e4066Sahrens if (buf_hash_table.ht_table[idx] && 723fa9e4066Sahrens buf_hash_table.ht_table[idx]->b_hash_next == NULL) 72444cb6abcSbmc ARCSTAT_BUMPDOWN(arcstat_hash_chains); 725fa9e4066Sahrens } 726fa9e4066Sahrens 727fa9e4066Sahrens /* 728fa9e4066Sahrens * Global data structures and functions for the buf kmem cache. 
729fa9e4066Sahrens */ 730fa9e4066Sahrens static kmem_cache_t *hdr_cache; 731fa9e4066Sahrens static kmem_cache_t *buf_cache; 732fa9e4066Sahrens 733fa9e4066Sahrens static void 734fa9e4066Sahrens buf_fini(void) 735fa9e4066Sahrens { 736fa9e4066Sahrens int i; 737fa9e4066Sahrens 738fa9e4066Sahrens kmem_free(buf_hash_table.ht_table, 739fa9e4066Sahrens (buf_hash_table.ht_mask + 1) * sizeof (void *)); 740fa9e4066Sahrens for (i = 0; i < BUF_LOCKS; i++) 741fa9e4066Sahrens mutex_destroy(&buf_hash_table.ht_locks[i].ht_lock); 742fa9e4066Sahrens kmem_cache_destroy(hdr_cache); 743fa9e4066Sahrens kmem_cache_destroy(buf_cache); 744fa9e4066Sahrens } 745fa9e4066Sahrens 746fa9e4066Sahrens /* 747fa9e4066Sahrens * Constructor callback - called when the cache is empty 748fa9e4066Sahrens * and a new buf is requested. 749fa9e4066Sahrens */ 750fa9e4066Sahrens /* ARGSUSED */ 751fa9e4066Sahrens static int 752fa9e4066Sahrens hdr_cons(void *vbuf, void *unused, int kmflag) 753fa9e4066Sahrens { 754fa9e4066Sahrens arc_buf_hdr_t *buf = vbuf; 755fa9e4066Sahrens 756fa9e4066Sahrens bzero(buf, sizeof (arc_buf_hdr_t)); 757fa9e4066Sahrens refcount_create(&buf->b_refcnt); 758fa9e4066Sahrens cv_init(&buf->b_cv, NULL, CV_DEFAULT, NULL); 759c25056deSgw mutex_init(&buf->b_freeze_lock, NULL, MUTEX_DEFAULT, NULL); 760fa94a07fSbrendan 761e6c728e1Sbrendan ARCSTAT_INCR(arcstat_hdr_size, HDR_SIZE); 762fa9e4066Sahrens return (0); 763fa9e4066Sahrens } 764fa9e4066Sahrens 765fa9e4066Sahrens /* 766fa9e4066Sahrens * Destructor callback - called when a cached buf is 767fa9e4066Sahrens * no longer required. 
768fa9e4066Sahrens */ 769fa9e4066Sahrens /* ARGSUSED */ 770fa9e4066Sahrens static void 771fa9e4066Sahrens hdr_dest(void *vbuf, void *unused) 772fa9e4066Sahrens { 773fa9e4066Sahrens arc_buf_hdr_t *buf = vbuf; 774fa9e4066Sahrens 775fa9e4066Sahrens refcount_destroy(&buf->b_refcnt); 776fa9e4066Sahrens cv_destroy(&buf->b_cv); 777c25056deSgw mutex_destroy(&buf->b_freeze_lock); 778fa94a07fSbrendan 779e6c728e1Sbrendan ARCSTAT_INCR(arcstat_hdr_size, -HDR_SIZE); 780fa9e4066Sahrens } 781fa9e4066Sahrens 782fa9e4066Sahrens /* 783fa9e4066Sahrens * Reclaim callback -- invoked when memory is low. 784fa9e4066Sahrens */ 785fa9e4066Sahrens /* ARGSUSED */ 786fa9e4066Sahrens static void 787fa9e4066Sahrens hdr_recl(void *unused) 788fa9e4066Sahrens { 789fa9e4066Sahrens dprintf("hdr_recl called\n"); 79049e3519aSmaybee /* 79149e3519aSmaybee * umem calls the reclaim func when we destroy the buf cache, 79249e3519aSmaybee * which is after we do arc_fini(). 79349e3519aSmaybee */ 79449e3519aSmaybee if (!arc_dead) 79549e3519aSmaybee cv_signal(&arc_reclaim_thr_cv); 796fa9e4066Sahrens } 797fa9e4066Sahrens 798fa9e4066Sahrens static void 799fa9e4066Sahrens buf_init(void) 800fa9e4066Sahrens { 801fa9e4066Sahrens uint64_t *ct; 802ea8dc4b6Seschrock uint64_t hsize = 1ULL << 12; 803fa9e4066Sahrens int i, j; 804fa9e4066Sahrens 805fa9e4066Sahrens /* 806fa9e4066Sahrens * The hash table is big enough to fill all of physical memory 807ea8dc4b6Seschrock * with an average 64K block size. The table will take up 808ea8dc4b6Seschrock * totalmem*sizeof(void*)/64K (eg. 128KB/GB with 8-byte pointers). 
809fa9e4066Sahrens */ 810ea8dc4b6Seschrock while (hsize * 65536 < physmem * PAGESIZE) 811fa9e4066Sahrens hsize <<= 1; 812ea8dc4b6Seschrock retry: 813fa9e4066Sahrens buf_hash_table.ht_mask = hsize - 1; 814ea8dc4b6Seschrock buf_hash_table.ht_table = 815ea8dc4b6Seschrock kmem_zalloc(hsize * sizeof (void*), KM_NOSLEEP); 816ea8dc4b6Seschrock if (buf_hash_table.ht_table == NULL) { 817ea8dc4b6Seschrock ASSERT(hsize > (1ULL << 8)); 818ea8dc4b6Seschrock hsize >>= 1; 819ea8dc4b6Seschrock goto retry; 820ea8dc4b6Seschrock } 821fa9e4066Sahrens 822fa9e4066Sahrens hdr_cache = kmem_cache_create("arc_buf_hdr_t", sizeof (arc_buf_hdr_t), 823fa9e4066Sahrens 0, hdr_cons, hdr_dest, hdr_recl, NULL, NULL, 0); 824fa9e4066Sahrens buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t), 825fa9e4066Sahrens 0, NULL, NULL, NULL, NULL, NULL, 0); 826fa9e4066Sahrens 827fa9e4066Sahrens for (i = 0; i < 256; i++) 828fa9e4066Sahrens for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--) 829fa9e4066Sahrens *ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY); 830fa9e4066Sahrens 831fa9e4066Sahrens for (i = 0; i < BUF_LOCKS; i++) { 832fa9e4066Sahrens mutex_init(&buf_hash_table.ht_locks[i].ht_lock, 833fa9e4066Sahrens NULL, MUTEX_DEFAULT, NULL); 834fa9e4066Sahrens } 835fa9e4066Sahrens } 836fa9e4066Sahrens 837fa9e4066Sahrens #define ARC_MINTIME (hz>>4) /* 62 ms */ 838fa9e4066Sahrens 8396b4acc8bSahrens static void 8406b4acc8bSahrens arc_cksum_verify(arc_buf_t *buf) 8416b4acc8bSahrens { 8426b4acc8bSahrens zio_cksum_t zc; 8436b4acc8bSahrens 844cc60fd72Sahrens if (!(zfs_flags & ZFS_DEBUG_MODIFY)) 8456b4acc8bSahrens return; 8466b4acc8bSahrens 8476b4acc8bSahrens mutex_enter(&buf->b_hdr->b_freeze_lock); 8483ccfa83cSahrens if (buf->b_hdr->b_freeze_cksum == NULL || 8493ccfa83cSahrens (buf->b_hdr->b_flags & ARC_IO_ERROR)) { 8506b4acc8bSahrens mutex_exit(&buf->b_hdr->b_freeze_lock); 8516b4acc8bSahrens return; 8526b4acc8bSahrens } 8536b4acc8bSahrens fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc); 
8546b4acc8bSahrens if (!ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc)) 8556b4acc8bSahrens panic("buffer modified while frozen!"); 8566b4acc8bSahrens mutex_exit(&buf->b_hdr->b_freeze_lock); 8576b4acc8bSahrens } 8586b4acc8bSahrens 859fa94a07fSbrendan static int 860fa94a07fSbrendan arc_cksum_equal(arc_buf_t *buf) 861fa94a07fSbrendan { 862fa94a07fSbrendan zio_cksum_t zc; 863fa94a07fSbrendan int equal; 864fa94a07fSbrendan 865fa94a07fSbrendan mutex_enter(&buf->b_hdr->b_freeze_lock); 866fa94a07fSbrendan fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc); 867fa94a07fSbrendan equal = ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc); 868fa94a07fSbrendan mutex_exit(&buf->b_hdr->b_freeze_lock); 869fa94a07fSbrendan 870fa94a07fSbrendan return (equal); 871fa94a07fSbrendan } 872fa94a07fSbrendan 8736b4acc8bSahrens static void 874fa94a07fSbrendan arc_cksum_compute(arc_buf_t *buf, boolean_t force) 8756b4acc8bSahrens { 876fa94a07fSbrendan if (!force && !(zfs_flags & ZFS_DEBUG_MODIFY)) 8776b4acc8bSahrens return; 8786b4acc8bSahrens 8796b4acc8bSahrens mutex_enter(&buf->b_hdr->b_freeze_lock); 8806b4acc8bSahrens if (buf->b_hdr->b_freeze_cksum != NULL) { 8816b4acc8bSahrens mutex_exit(&buf->b_hdr->b_freeze_lock); 8826b4acc8bSahrens return; 8836b4acc8bSahrens } 8846b4acc8bSahrens buf->b_hdr->b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t), KM_SLEEP); 8856b4acc8bSahrens fletcher_2_native(buf->b_data, buf->b_hdr->b_size, 8866b4acc8bSahrens buf->b_hdr->b_freeze_cksum); 8876b4acc8bSahrens mutex_exit(&buf->b_hdr->b_freeze_lock); 8886b4acc8bSahrens } 8896b4acc8bSahrens 8906b4acc8bSahrens void 8916b4acc8bSahrens arc_buf_thaw(arc_buf_t *buf) 8926b4acc8bSahrens { 893fa94a07fSbrendan if (zfs_flags & ZFS_DEBUG_MODIFY) { 894fa94a07fSbrendan if (buf->b_hdr->b_state != arc_anon) 895fa94a07fSbrendan panic("modifying non-anon buffer!"); 896fa94a07fSbrendan if (buf->b_hdr->b_flags & ARC_IO_IN_PROGRESS) 897fa94a07fSbrendan panic("modifying buffer while i/o in progress!"); 898fa94a07fSbrendan 
arc_cksum_verify(buf); 899fa94a07fSbrendan } 9006b4acc8bSahrens 9016b4acc8bSahrens mutex_enter(&buf->b_hdr->b_freeze_lock); 9026b4acc8bSahrens if (buf->b_hdr->b_freeze_cksum != NULL) { 9036b4acc8bSahrens kmem_free(buf->b_hdr->b_freeze_cksum, sizeof (zio_cksum_t)); 9046b4acc8bSahrens buf->b_hdr->b_freeze_cksum = NULL; 9056b4acc8bSahrens } 9066b4acc8bSahrens mutex_exit(&buf->b_hdr->b_freeze_lock); 9076b4acc8bSahrens } 9086b4acc8bSahrens 9096b4acc8bSahrens void 9106b4acc8bSahrens arc_buf_freeze(arc_buf_t *buf) 9116b4acc8bSahrens { 912cc60fd72Sahrens if (!(zfs_flags & ZFS_DEBUG_MODIFY)) 913cc60fd72Sahrens return; 914cc60fd72Sahrens 9156b4acc8bSahrens ASSERT(buf->b_hdr->b_freeze_cksum != NULL || 91644cb6abcSbmc buf->b_hdr->b_state == arc_anon); 917fa94a07fSbrendan arc_cksum_compute(buf, B_FALSE); 9186b4acc8bSahrens } 9196b4acc8bSahrens 920fa9e4066Sahrens static void 921fa9e4066Sahrens add_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag) 922fa9e4066Sahrens { 923fa9e4066Sahrens ASSERT(MUTEX_HELD(hash_lock)); 924fa9e4066Sahrens 925fa9e4066Sahrens if ((refcount_add(&ab->b_refcnt, tag) == 1) && 92644cb6abcSbmc (ab->b_state != arc_anon)) { 927c0a81264Sek uint64_t delta = ab->b_size * ab->b_datacnt; 9280e8c6158Smaybee list_t *list = &ab->b_state->arcs_list[ab->b_type]; 9290e8c6158Smaybee uint64_t *size = &ab->b_state->arcs_lsize[ab->b_type]; 930fa9e4066Sahrens 93144cb6abcSbmc ASSERT(!MUTEX_HELD(&ab->b_state->arcs_mtx)); 93244cb6abcSbmc mutex_enter(&ab->b_state->arcs_mtx); 933fa9e4066Sahrens ASSERT(list_link_active(&ab->b_arc_node)); 9340e8c6158Smaybee list_remove(list, ab); 935ea8dc4b6Seschrock if (GHOST_STATE(ab->b_state)) { 936ea8dc4b6Seschrock ASSERT3U(ab->b_datacnt, ==, 0); 937ea8dc4b6Seschrock ASSERT3P(ab->b_buf, ==, NULL); 938ea8dc4b6Seschrock delta = ab->b_size; 939ea8dc4b6Seschrock } 940ea8dc4b6Seschrock ASSERT(delta > 0); 9410e8c6158Smaybee ASSERT3U(*size, >=, delta); 9420e8c6158Smaybee atomic_add_64(size, -delta); 94344cb6abcSbmc 
mutex_exit(&ab->b_state->arcs_mtx); 94413506d1eSmaybee /* remove the prefetch flag is we get a reference */ 94513506d1eSmaybee if (ab->b_flags & ARC_PREFETCH) 94613506d1eSmaybee ab->b_flags &= ~ARC_PREFETCH; 947fa9e4066Sahrens } 948fa9e4066Sahrens } 949fa9e4066Sahrens 950fa9e4066Sahrens static int 951fa9e4066Sahrens remove_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag) 952fa9e4066Sahrens { 953fa9e4066Sahrens int cnt; 95444cb6abcSbmc arc_state_t *state = ab->b_state; 955fa9e4066Sahrens 95644cb6abcSbmc ASSERT(state == arc_anon || MUTEX_HELD(hash_lock)); 95744cb6abcSbmc ASSERT(!GHOST_STATE(state)); 958fa9e4066Sahrens 959fa9e4066Sahrens if (((cnt = refcount_remove(&ab->b_refcnt, tag)) == 0) && 96044cb6abcSbmc (state != arc_anon)) { 9610e8c6158Smaybee uint64_t *size = &state->arcs_lsize[ab->b_type]; 9620e8c6158Smaybee 96344cb6abcSbmc ASSERT(!MUTEX_HELD(&state->arcs_mtx)); 96444cb6abcSbmc mutex_enter(&state->arcs_mtx); 965fa9e4066Sahrens ASSERT(!list_link_active(&ab->b_arc_node)); 9660e8c6158Smaybee list_insert_head(&state->arcs_list[ab->b_type], ab); 967ea8dc4b6Seschrock ASSERT(ab->b_datacnt > 0); 9680e8c6158Smaybee atomic_add_64(size, ab->b_size * ab->b_datacnt); 96944cb6abcSbmc mutex_exit(&state->arcs_mtx); 970fa9e4066Sahrens } 971fa9e4066Sahrens return (cnt); 972fa9e4066Sahrens } 973fa9e4066Sahrens 974fa9e4066Sahrens /* 975fa9e4066Sahrens * Move the supplied buffer to the indicated state. The mutex 976fa9e4066Sahrens * for the buffer must be held by the caller. 
977fa9e4066Sahrens */ 978fa9e4066Sahrens static void 979ea8dc4b6Seschrock arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock) 980fa9e4066Sahrens { 981ea8dc4b6Seschrock arc_state_t *old_state = ab->b_state; 982c0a81264Sek int64_t refcnt = refcount_count(&ab->b_refcnt); 983c0a81264Sek uint64_t from_delta, to_delta; 984fa9e4066Sahrens 985fa9e4066Sahrens ASSERT(MUTEX_HELD(hash_lock)); 986ea8dc4b6Seschrock ASSERT(new_state != old_state); 987ea8dc4b6Seschrock ASSERT(refcnt == 0 || ab->b_datacnt > 0); 988ea8dc4b6Seschrock ASSERT(ab->b_datacnt == 0 || !GHOST_STATE(new_state)); 989ea8dc4b6Seschrock 990ea8dc4b6Seschrock from_delta = to_delta = ab->b_datacnt * ab->b_size; 991fa9e4066Sahrens 992fa9e4066Sahrens /* 993fa9e4066Sahrens * If this buffer is evictable, transfer it from the 994fa9e4066Sahrens * old state list to the new state list. 995fa9e4066Sahrens */ 996ea8dc4b6Seschrock if (refcnt == 0) { 99744cb6abcSbmc if (old_state != arc_anon) { 99844cb6abcSbmc int use_mutex = !MUTEX_HELD(&old_state->arcs_mtx); 9990e8c6158Smaybee uint64_t *size = &old_state->arcs_lsize[ab->b_type]; 1000ea8dc4b6Seschrock 1001ea8dc4b6Seschrock if (use_mutex) 100244cb6abcSbmc mutex_enter(&old_state->arcs_mtx); 1003fa9e4066Sahrens 1004fa9e4066Sahrens ASSERT(list_link_active(&ab->b_arc_node)); 10050e8c6158Smaybee list_remove(&old_state->arcs_list[ab->b_type], ab); 1006ea8dc4b6Seschrock 100713506d1eSmaybee /* 100813506d1eSmaybee * If prefetching out of the ghost cache, 100913506d1eSmaybee * we will have a non-null datacnt. 
101013506d1eSmaybee */ 101113506d1eSmaybee if (GHOST_STATE(old_state) && ab->b_datacnt == 0) { 101213506d1eSmaybee /* ghost elements have a ghost size */ 1013ea8dc4b6Seschrock ASSERT(ab->b_buf == NULL); 1014ea8dc4b6Seschrock from_delta = ab->b_size; 1015ea8dc4b6Seschrock } 10160e8c6158Smaybee ASSERT3U(*size, >=, from_delta); 10170e8c6158Smaybee atomic_add_64(size, -from_delta); 1018ea8dc4b6Seschrock 1019ea8dc4b6Seschrock if (use_mutex) 102044cb6abcSbmc mutex_exit(&old_state->arcs_mtx); 1021fa9e4066Sahrens } 102244cb6abcSbmc if (new_state != arc_anon) { 102344cb6abcSbmc int use_mutex = !MUTEX_HELD(&new_state->arcs_mtx); 10240e8c6158Smaybee uint64_t *size = &new_state->arcs_lsize[ab->b_type]; 1025fa9e4066Sahrens 1026ea8dc4b6Seschrock if (use_mutex) 102744cb6abcSbmc mutex_enter(&new_state->arcs_mtx); 1028ea8dc4b6Seschrock 10290e8c6158Smaybee list_insert_head(&new_state->arcs_list[ab->b_type], ab); 1030ea8dc4b6Seschrock 1031ea8dc4b6Seschrock /* ghost elements have a ghost size */ 1032ea8dc4b6Seschrock if (GHOST_STATE(new_state)) { 1033ea8dc4b6Seschrock ASSERT(ab->b_datacnt == 0); 1034ea8dc4b6Seschrock ASSERT(ab->b_buf == NULL); 1035ea8dc4b6Seschrock to_delta = ab->b_size; 1036ea8dc4b6Seschrock } 10370e8c6158Smaybee atomic_add_64(size, to_delta); 1038ea8dc4b6Seschrock 1039ea8dc4b6Seschrock if (use_mutex) 104044cb6abcSbmc mutex_exit(&new_state->arcs_mtx); 1041fa9e4066Sahrens } 1042fa9e4066Sahrens } 1043fa9e4066Sahrens 1044fa9e4066Sahrens ASSERT(!BUF_EMPTY(ab)); 1045fa94a07fSbrendan if (new_state == arc_anon) { 1046fa9e4066Sahrens buf_hash_remove(ab); 1047fa9e4066Sahrens } 1048fa9e4066Sahrens 1049ea8dc4b6Seschrock /* adjust state sizes */ 1050ea8dc4b6Seschrock if (to_delta) 105144cb6abcSbmc atomic_add_64(&new_state->arcs_size, to_delta); 1052ea8dc4b6Seschrock if (from_delta) { 105344cb6abcSbmc ASSERT3U(old_state->arcs_size, >=, from_delta); 105444cb6abcSbmc atomic_add_64(&old_state->arcs_size, -from_delta); 1055fa9e4066Sahrens } 1056fa9e4066Sahrens ab->b_state = 
new_state; 1057fa94a07fSbrendan 1058fa94a07fSbrendan /* adjust l2arc hdr stats */ 1059fa94a07fSbrendan if (new_state == arc_l2c_only) 1060fa94a07fSbrendan l2arc_hdr_stat_add(); 1061fa94a07fSbrendan else if (old_state == arc_l2c_only) 1062fa94a07fSbrendan l2arc_hdr_stat_remove(); 1063fa9e4066Sahrens } 1064fa9e4066Sahrens 10650e8c6158Smaybee void 10660e8c6158Smaybee arc_space_consume(uint64_t space) 10670e8c6158Smaybee { 10680e8c6158Smaybee atomic_add_64(&arc_meta_used, space); 10690e8c6158Smaybee atomic_add_64(&arc_size, space); 10700e8c6158Smaybee } 10710e8c6158Smaybee 10720e8c6158Smaybee void 10730e8c6158Smaybee arc_space_return(uint64_t space) 10740e8c6158Smaybee { 10750e8c6158Smaybee ASSERT(arc_meta_used >= space); 10760e8c6158Smaybee if (arc_meta_max < arc_meta_used) 10770e8c6158Smaybee arc_meta_max = arc_meta_used; 10780e8c6158Smaybee atomic_add_64(&arc_meta_used, -space); 10790e8c6158Smaybee ASSERT(arc_size >= space); 10800e8c6158Smaybee atomic_add_64(&arc_size, -space); 10810e8c6158Smaybee } 10820e8c6158Smaybee 10830e8c6158Smaybee void * 10840e8c6158Smaybee arc_data_buf_alloc(uint64_t size) 10850e8c6158Smaybee { 10860e8c6158Smaybee if (arc_evict_needed(ARC_BUFC_DATA)) 10870e8c6158Smaybee cv_signal(&arc_reclaim_thr_cv); 10880e8c6158Smaybee atomic_add_64(&arc_size, size); 10890e8c6158Smaybee return (zio_data_buf_alloc(size)); 10900e8c6158Smaybee } 10910e8c6158Smaybee 10920e8c6158Smaybee void 10930e8c6158Smaybee arc_data_buf_free(void *buf, uint64_t size) 10940e8c6158Smaybee { 10950e8c6158Smaybee zio_data_buf_free(buf, size); 10960e8c6158Smaybee ASSERT(arc_size >= size); 10970e8c6158Smaybee atomic_add_64(&arc_size, -size); 10980e8c6158Smaybee } 10990e8c6158Smaybee 1100fa9e4066Sahrens arc_buf_t * 1101ad23a2dbSjohansen arc_buf_alloc(spa_t *spa, int size, void *tag, arc_buf_contents_t type) 1102fa9e4066Sahrens { 1103fa9e4066Sahrens arc_buf_hdr_t *hdr; 1104fa9e4066Sahrens arc_buf_t *buf; 1105fa9e4066Sahrens 1106fa9e4066Sahrens ASSERT3U(size, >, 0); 
11071ab7f2deSmaybee hdr = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE); 1108fa9e4066Sahrens ASSERT(BUF_EMPTY(hdr)); 1109fa9e4066Sahrens hdr->b_size = size; 1110ad23a2dbSjohansen hdr->b_type = type; 1111fa9e4066Sahrens hdr->b_spa = spa; 111244cb6abcSbmc hdr->b_state = arc_anon; 1113fa9e4066Sahrens hdr->b_arc_access = 0; 11141ab7f2deSmaybee buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE); 1115fa9e4066Sahrens buf->b_hdr = hdr; 111644eda4d7Smaybee buf->b_data = NULL; 1117ea8dc4b6Seschrock buf->b_efunc = NULL; 1118ea8dc4b6Seschrock buf->b_private = NULL; 1119fa9e4066Sahrens buf->b_next = NULL; 1120fa9e4066Sahrens hdr->b_buf = buf; 112144eda4d7Smaybee arc_get_data_buf(buf); 1122ea8dc4b6Seschrock hdr->b_datacnt = 1; 1123fa9e4066Sahrens hdr->b_flags = 0; 1124fa9e4066Sahrens ASSERT(refcount_is_zero(&hdr->b_refcnt)); 1125fa9e4066Sahrens (void) refcount_add(&hdr->b_refcnt, tag); 1126fa9e4066Sahrens 1127fa9e4066Sahrens return (buf); 1128fa9e4066Sahrens } 1129fa9e4066Sahrens 113044eda4d7Smaybee static arc_buf_t * 113144eda4d7Smaybee arc_buf_clone(arc_buf_t *from) 1132ea8dc4b6Seschrock { 113344eda4d7Smaybee arc_buf_t *buf; 113444eda4d7Smaybee arc_buf_hdr_t *hdr = from->b_hdr; 113544eda4d7Smaybee uint64_t size = hdr->b_size; 1136ea8dc4b6Seschrock 11371ab7f2deSmaybee buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE); 113844eda4d7Smaybee buf->b_hdr = hdr; 113944eda4d7Smaybee buf->b_data = NULL; 114044eda4d7Smaybee buf->b_efunc = NULL; 114144eda4d7Smaybee buf->b_private = NULL; 114244eda4d7Smaybee buf->b_next = hdr->b_buf; 114344eda4d7Smaybee hdr->b_buf = buf; 114444eda4d7Smaybee arc_get_data_buf(buf); 114544eda4d7Smaybee bcopy(from->b_data, buf->b_data, size); 114644eda4d7Smaybee hdr->b_datacnt += 1; 114744eda4d7Smaybee return (buf); 1148ea8dc4b6Seschrock } 1149ea8dc4b6Seschrock 1150ea8dc4b6Seschrock void 1151ea8dc4b6Seschrock arc_buf_add_ref(arc_buf_t *buf, void* tag) 1152ea8dc4b6Seschrock { 115340d7d650Smaybee arc_buf_hdr_t *hdr; 1154ea8dc4b6Seschrock kmutex_t *hash_lock; 
1155ea8dc4b6Seschrock 11569b23f181Smaybee /* 11579b23f181Smaybee * Check to see if this buffer is currently being evicted via 115840d7d650Smaybee * arc_do_user_evicts(). 11599b23f181Smaybee */ 116040d7d650Smaybee mutex_enter(&arc_eviction_mtx); 116140d7d650Smaybee hdr = buf->b_hdr; 116240d7d650Smaybee if (hdr == NULL) { 116340d7d650Smaybee mutex_exit(&arc_eviction_mtx); 11649b23f181Smaybee return; 116540d7d650Smaybee } 11669b23f181Smaybee hash_lock = HDR_LOCK(hdr); 116740d7d650Smaybee mutex_exit(&arc_eviction_mtx); 116840d7d650Smaybee 11699b23f181Smaybee mutex_enter(hash_lock); 1170ea8dc4b6Seschrock if (buf->b_data == NULL) { 1171ea8dc4b6Seschrock /* 1172ea8dc4b6Seschrock * This buffer is evicted. 1173ea8dc4b6Seschrock */ 11749b23f181Smaybee mutex_exit(hash_lock); 1175ea8dc4b6Seschrock return; 1176ea8dc4b6Seschrock } 1177ea8dc4b6Seschrock 11789b23f181Smaybee ASSERT(buf->b_hdr == hdr); 117944cb6abcSbmc ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu); 1180ea8dc4b6Seschrock add_reference(hdr, hash_lock, tag); 118144eda4d7Smaybee arc_access(hdr, hash_lock); 118244eda4d7Smaybee mutex_exit(hash_lock); 118344cb6abcSbmc ARCSTAT_BUMP(arcstat_hits); 118444cb6abcSbmc ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH), 118544cb6abcSbmc demand, prefetch, hdr->b_type != ARC_BUFC_METADATA, 118644cb6abcSbmc data, metadata, hits); 1187ea8dc4b6Seschrock } 1188ea8dc4b6Seschrock 1189fa94a07fSbrendan /* 1190fa94a07fSbrendan * Free the arc data buffer. If it is an l2arc write in progress, 1191fa94a07fSbrendan * the buffer is placed on l2arc_free_on_write to be freed later. 
 */
static void
arc_buf_data_free(arc_buf_hdr_t *hdr, void (*free_func)(void *, size_t),
    void *data, size_t size)
{
	if (HDR_L2_WRITING(hdr)) {
		/*
		 * An L2ARC write is in flight for this header: defer the
		 * free by queueing the block on l2arc_free_on_write so the
		 * data remains valid until that write completes.
		 */
		l2arc_data_free_t *df;
		df = kmem_alloc(sizeof (l2arc_data_free_t), KM_SLEEP);
		df->l2df_data = data;
		df->l2df_size = size;
		df->l2df_func = free_func;
		mutex_enter(&l2arc_free_on_write_mtx);
		list_insert_head(l2arc_free_on_write, df);
		mutex_exit(&l2arc_free_on_write_mtx);
		ARCSTAT_BUMP(arcstat_l2_free_on_write);
	} else {
		free_func(data, size);
	}
}

/*
 * Release the data block held by `buf' and adjust the owning state's
 * size accounting.  When `recycle' is set the data block is being
 * stolen by the caller (see arc_evict()), so it is unaccounted here
 * but not freed.  When `all' is set, also unlink the buf from its
 * header's buf list and free the arc_buf_t itself.
 */
static void
arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t all)
{
	arc_buf_t **bufp;

	/* free up data associated with the buf */
	if (buf->b_data) {
		arc_state_t *state = buf->b_hdr->b_state;
		uint64_t size = buf->b_hdr->b_size;
		arc_buf_contents_t type = buf->b_hdr->b_type;

		arc_cksum_verify(buf);
		if (!recycle) {
			if (type == ARC_BUFC_METADATA) {
				arc_buf_data_free(buf->b_hdr, zio_buf_free,
				    buf->b_data, size);
				arc_space_return(size);
			} else {
				ASSERT(type == ARC_BUFC_DATA);
				arc_buf_data_free(buf->b_hdr,
				    zio_data_buf_free, buf->b_data, size);
				atomic_add_64(&arc_size, -size);
			}
		}
		if (list_link_active(&buf->b_hdr->b_arc_node)) {
			uint64_t *cnt = &state->arcs_lsize[type];

			/* on an arc list => evictable, so lsize is tracked */
			ASSERT(refcount_is_zero(&buf->b_hdr->b_refcnt));
			ASSERT(state != arc_anon);

			ASSERT3U(*cnt, >=, size);
			atomic_add_64(cnt, -size);
		}
		ASSERT3U(state->arcs_size, >=, size);
		atomic_add_64(&state->arcs_size, -size);
		buf->b_data = NULL;
		ASSERT(buf->b_hdr->b_datacnt > 0);
		buf->b_hdr->b_datacnt -= 1;
	}

	/* only remove the buf if requested */
	if (!all)
		return;

	/* remove the buf from the hdr list */
	for (bufp = &buf->b_hdr->b_buf; *bufp != buf; bufp = &(*bufp)->b_next)
		continue;
	*bufp = buf->b_next;

	ASSERT(buf->b_efunc == NULL);

	/* clean up the buf */
	buf->b_hdr = NULL;
	kmem_cache_free(buf_cache, buf);
}

/*
 * Destroy an unreferenced, anonymous arc_buf_hdr_t: tear down any
 * L2ARC linkage, destroy (or hand off to the user-eviction list)
 * each attached buf, free the freeze checksum, then free the header.
 */
static void
arc_hdr_destroy(arc_buf_hdr_t *hdr)
{
	ASSERT(refcount_is_zero(&hdr->b_refcnt));
	ASSERT3P(hdr->b_state, ==, arc_anon);
	ASSERT(!HDR_IO_IN_PROGRESS(hdr));

	if (hdr->b_l2hdr != NULL) {
		if (!MUTEX_HELD(&l2arc_buflist_mtx)) {
			/*
			 * To prevent arc_free() and l2arc_evict() from
			 * attempting to free the same buffer at the same time,
			 * a FREE_IN_PROGRESS flag is given to arc_free() to
			 * give it priority.  l2arc_evict() can't destroy this
			 * header while we are waiting on l2arc_buflist_mtx.
			 */
			mutex_enter(&l2arc_buflist_mtx);
			ASSERT(hdr->b_l2hdr != NULL);

			list_remove(hdr->b_l2hdr->b_dev->l2ad_buflist, hdr);
			mutex_exit(&l2arc_buflist_mtx);
		} else {
			list_remove(hdr->b_l2hdr->b_dev->l2ad_buflist, hdr);
		}
		ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size);
		kmem_free(hdr->b_l2hdr, sizeof (l2arc_buf_hdr_t));
		if (hdr->b_state == arc_l2c_only)
			l2arc_hdr_stat_remove();
		hdr->b_l2hdr = NULL;
	}

	if (!BUF_EMPTY(hdr)) {
		ASSERT(!HDR_IN_HASH_TABLE(hdr));
		bzero(&hdr->b_dva, sizeof (dva_t));
		hdr->b_birth = 0;
		hdr->b_cksum0 = 0;
	}
	while (hdr->b_buf) {
		arc_buf_t *buf = hdr->b_buf;

		if (buf->b_efunc) {
			/*
			 * This buf has an eviction callback: release its
			 * data now but queue the buf itself on
			 * arc_eviction_list so arc_do_user_evicts() can run
			 * the callback outside of our locks.  The buf is
			 * reparented to the global arc_eviction_hdr.
			 */
			mutex_enter(&arc_eviction_mtx);
			ASSERT(buf->b_hdr != NULL);
			arc_buf_destroy(hdr->b_buf, FALSE, FALSE);
			hdr->b_buf = buf->b_next;
			buf->b_hdr = &arc_eviction_hdr;
			buf->b_next = arc_eviction_list;
			arc_eviction_list = buf;
			mutex_exit(&arc_eviction_mtx);
		} else {
			arc_buf_destroy(hdr->b_buf, FALSE, TRUE);
		}
	}
	if (hdr->b_freeze_cksum != NULL) {
		kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t));
		hdr->b_freeze_cksum = NULL;
	}

	ASSERT(!list_link_active(&hdr->b_arc_node));
	ASSERT3P(hdr->b_hash_next, ==, NULL);
	ASSERT3P(hdr->b_acb, ==, NULL);
	kmem_cache_free(hdr_cache, hdr);
}

/*
 * Drop the reference held by `tag' on `buf' and free it.  The buf
 * must have no eviction callback.  Three cases: the header is hashed
 * (in a non-anonymous state), an async write is in progress, or the
 * header is anonymous and may be destroyed outright.
 */
void
arc_buf_free(arc_buf_t *buf, void *tag)
{
	arc_buf_hdr_t *hdr = buf->b_hdr;
	int hashed = hdr->b_state != arc_anon;

	ASSERT(buf->b_efunc == NULL);
	ASSERT(buf->b_data != NULL);

	if (hashed) {
		kmutex_t *hash_lock = HDR_LOCK(hdr);

		mutex_enter(hash_lock);
		(void) remove_reference(hdr, hash_lock, tag);
		if (hdr->b_datacnt > 1)
			arc_buf_destroy(buf, FALSE, TRUE);
		else
			hdr->b_flags |= ARC_BUF_AVAILABLE;
		mutex_exit(hash_lock);
	} else if (HDR_IO_IN_PROGRESS(hdr)) {
		int destroy_hdr;
		/*
		 * We are in the middle of an async write.  Don't destroy
		 * this buffer unless the write completes before we finish
		 * decrementing the reference count.
		 */
		mutex_enter(&arc_eviction_mtx);
		(void) remove_reference(hdr, NULL, tag);
		ASSERT(refcount_is_zero(&hdr->b_refcnt));
		destroy_hdr = !HDR_IO_IN_PROGRESS(hdr);
		mutex_exit(&arc_eviction_mtx);
		if (destroy_hdr)
			arc_hdr_destroy(hdr);
	} else {
		if (remove_reference(hdr, NULL, tag) > 0) {
			/* other refs remain; only an I/O error path here */
			ASSERT(HDR_IO_ERROR(hdr));
			arc_buf_destroy(buf, FALSE, TRUE);
		} else {
			arc_hdr_destroy(hdr);
		}
	}
}

/*
 * Drop a reference on `buf'.  Returns nonzero when the buf had no
 * eviction callback (i.e. the buf was fully released by this call).
 */
int
arc_buf_remove_ref(arc_buf_t *buf, void* tag)
{
	arc_buf_hdr_t *hdr = buf->b_hdr;
	kmutex_t *hash_lock = HDR_LOCK(hdr);
	int no_callback = (buf->b_efunc == NULL);

	if (hdr->b_state == arc_anon) {
		arc_buf_free(buf, tag);
		return (no_callback);
	}

	mutex_enter(hash_lock);
	ASSERT(hdr->b_state != arc_anon);
	ASSERT(buf->b_data != NULL);

	(void) remove_reference(hdr, hash_lock, tag);
	if (hdr->b_datacnt > 1) {
		if (no_callback)
			arc_buf_destroy(buf, FALSE, TRUE);
	} else if (no_callback) {
		/* last data copy: keep it cached but mark it reusable */
		ASSERT(hdr->b_buf == buf && buf->b_next == NULL);
		hdr->b_flags |= ARC_BUF_AVAILABLE;
	}
	ASSERT(no_callback || hdr->b_datacnt > 1 ||
	    refcount_is_zero(&hdr->b_refcnt));
	mutex_exit(hash_lock);
	return (no_callback);
}

/* Return the size in bytes of the data block held by `buf'. */
int
arc_buf_size(arc_buf_t *buf)
{
	return (buf->b_hdr->b_size);
}

/*
 * Evict buffers from list until we've removed the specified number of
 * bytes.  Move the removed buffers to the appropriate evict state.
 * If the recycle flag is set, then attempt to "recycle" a buffer:
 * - look for a buffer to evict that is `bytes' long.
 * - return the data block from this buffer rather than freeing it.
 * This flag is used by callers that are trying to make space for a
 * new buffer in a full arc cache.
 *
 * This function makes a "best effort".  It skips over any buffers
 * it can't get a hash_lock on, and so may not catch all candidates.
 * It may also return without evicting as much space as requested.
 */
static void *
arc_evict(arc_state_t *state, spa_t *spa, int64_t bytes, boolean_t recycle,
    arc_buf_contents_t type)
{
	arc_state_t *evicted_state;
	uint64_t bytes_evicted = 0, skipped = 0, missed = 0;
	arc_buf_hdr_t *ab, *ab_prev = NULL;
	list_t *list = &state->arcs_list[type];
	kmutex_t *hash_lock;
	boolean_t have_lock;
	void *stolen = NULL;

	ASSERT(state == arc_mru || state == arc_mfu);

	evicted_state = (state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost;

	mutex_enter(&state->arcs_mtx);
	mutex_enter(&evicted_state->arcs_mtx);

	/* walk from the list tail: the "lowest" (least recent) entries */
	for (ab = list_tail(list); ab; ab = ab_prev) {
		ab_prev = list_prev(list, ab);
		/* prefetch buffers have a minimum lifespan */
		if (HDR_IO_IN_PROGRESS(ab) ||
		    (spa && ab->b_spa != spa) ||
		    (ab->b_flags & (ARC_PREFETCH|ARC_INDIRECT) &&
		    lbolt - ab->b_arc_access < arc_min_prefetch_lifespan)) {
			skipped++;
			continue;
		}
		/* "lookahead" for better eviction candidate */
		if (recycle && ab->b_size != bytes &&
		    ab_prev && ab_prev->b_size == bytes)
			continue;
		hash_lock = HDR_LOCK(ab);
		/* the caller may already hold this buffer's hash lock */
		have_lock = MUTEX_HELD(hash_lock);
		if (have_lock || mutex_tryenter(hash_lock)) {
			ASSERT3U(refcount_count(&ab->b_refcnt), ==, 0);
			ASSERT(ab->b_datacnt > 0);
			while (ab->b_buf) {
				arc_buf_t *buf = ab->b_buf;
				if (buf->b_data) {
					bytes_evicted += ab->b_size;
					/*
					 * Steal (at most) one matching data
					 * block for the caller instead of
					 * freeing it.
					 */
					if (recycle && ab->b_type == type &&
					    ab->b_size == bytes &&
					    !HDR_L2_WRITING(ab)) {
						stolen = buf->b_data;
						recycle = FALSE;
					}
				}
				if (buf->b_efunc) {
					/* defer user callback; see
					 * arc_do_user_evicts() */
					mutex_enter(&arc_eviction_mtx);
					arc_buf_destroy(buf,
					    buf->b_data == stolen, FALSE);
					ab->b_buf = buf->b_next;
					buf->b_hdr = &arc_eviction_hdr;
					buf->b_next = arc_eviction_list;
					arc_eviction_list = buf;
					mutex_exit(&arc_eviction_mtx);
				} else {
					arc_buf_destroy(buf,
					    buf->b_data == stolen, TRUE);
				}
			}
			ASSERT(ab->b_datacnt == 0);
			arc_change_state(evicted_state, ab, hash_lock);
			ASSERT(HDR_IN_HASH_TABLE(ab));
			ab->b_flags |= ARC_IN_HASH_TABLE;
			ab->b_flags &= ~ARC_BUF_AVAILABLE;
			DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, ab);
			if (!have_lock)
				mutex_exit(hash_lock);
			if (bytes >= 0 && bytes_evicted >= bytes)
				break;
		} else {
			missed += 1;
		}
	}

	mutex_exit(&evicted_state->arcs_mtx);
	mutex_exit(&state->arcs_mtx);

	if (bytes_evicted < bytes)
		dprintf("only evicted %lld bytes from %x",
		    (longlong_t)bytes_evicted, state);

	if (skipped)
		ARCSTAT_INCR(arcstat_evict_skip, skipped);

	if (missed)
		ARCSTAT_INCR(arcstat_mutex_miss, missed);

	/*
	 * We have just evicted some data into the ghost state, make
	 * sure we also adjust the ghost state size if necessary.
	 */
	if (arc_no_grow &&
	    arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size > arc_c) {
		int64_t mru_over = arc_anon->arcs_size + arc_mru->arcs_size +
		    arc_mru_ghost->arcs_size - arc_c;

		if (mru_over > 0 && arc_mru_ghost->arcs_lsize[type] > 0) {
			int64_t todelete =
			    MIN(arc_mru_ghost->arcs_lsize[type], mru_over);
			arc_evict_ghost(arc_mru_ghost, NULL, todelete);
		} else if (arc_mfu_ghost->arcs_lsize[type] > 0) {
			int64_t todelete = MIN(arc_mfu_ghost->arcs_lsize[type],
			    arc_mru_ghost->arcs_size +
			    arc_mfu_ghost->arcs_size - arc_c);
			arc_evict_ghost(arc_mfu_ghost, NULL, todelete);
		}
	}

	return (stolen);
}

/*
 * Remove buffers from list until we've removed the specified number of
 * bytes.  Destroy the buffers that are removed.
 */
static void
arc_evict_ghost(arc_state_t *state, spa_t *spa, int64_t bytes)
{
	arc_buf_hdr_t *ab, *ab_prev;
	list_t *list = &state->arcs_list[ARC_BUFC_DATA];
	kmutex_t *hash_lock;
	uint64_t bytes_deleted = 0;
	uint64_t bufs_skipped = 0;

	ASSERT(GHOST_STATE(state));
top:
	mutex_enter(&state->arcs_mtx);
	for (ab = list_tail(list); ab; ab = ab_prev) {
		ab_prev = list_prev(list, ab);
		if (spa && ab->b_spa != spa)
			continue;
		hash_lock = HDR_LOCK(ab);
		if (mutex_tryenter(hash_lock)) {
			ASSERT(!HDR_IO_IN_PROGRESS(ab));
			ASSERT(ab->b_buf == NULL);
			ARCSTAT_BUMP(arcstat_deleted);
			bytes_deleted += ab->b_size;

			if (ab->b_l2hdr != NULL) {
				/*
				 * This buffer is cached on the 2nd Level ARC;
				 * don't destroy the header.
				 */
				arc_change_state(arc_l2c_only, ab, hash_lock);
				mutex_exit(hash_lock);
			} else {
				arc_change_state(arc_anon, ab, hash_lock);
				mutex_exit(hash_lock);
				arc_hdr_destroy(ab);
			}

			DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, ab);
			if (bytes >= 0 && bytes_deleted >= bytes)
				break;
		} else {
			if (bytes < 0) {
				/*
				 * Evict-everything mode: drop the list lock,
				 * wait for the contended hash lock to become
				 * free, then restart the scan.
				 */
				mutex_exit(&state->arcs_mtx);
				mutex_enter(hash_lock);
				mutex_exit(hash_lock);
				goto top;
			}
			bufs_skipped += 1;
		}
	}
	mutex_exit(&state->arcs_mtx);

	/* after the DATA list, fall through to the METADATA list */
	if (list == &state->arcs_list[ARC_BUFC_DATA] &&
	    (bytes < 0 || bytes_deleted < bytes)) {
		list = &state->arcs_list[ARC_BUFC_METADATA];
		goto top;
	}

	if (bufs_skipped) {
		ARCSTAT_INCR(arcstat_mutex_miss, bufs_skipped);
		ASSERT(bytes >= 0);
	}

	if (bytes_deleted < bytes)
		dprintf("only deleted %lld bytes from %p",
		    (longlong_t)bytes_deleted, state);
}

/*
 * Evict from the regular and ghost lists until the cache is back
 * within its targets: arc_p for the anon+MRU side, arc_c overall,
 * and 2 * arc_c for the sum including the ghost lists.
 */
static void
arc_adjust(void)
{
	int64_t top_sz, mru_over, arc_over, todelete;

	top_sz = arc_anon->arcs_size + arc_mru->arcs_size + arc_meta_used;

	if (top_sz > arc_p && arc_mru->arcs_lsize[ARC_BUFC_DATA] > 0) {
		int64_t toevict =
		    MIN(arc_mru->arcs_lsize[ARC_BUFC_DATA], top_sz - arc_p);
		(void) arc_evict(arc_mru, NULL, toevict, FALSE, ARC_BUFC_DATA);
		top_sz = arc_anon->arcs_size + arc_mru->arcs_size;
	}

	if (top_sz > arc_p && arc_mru->arcs_lsize[ARC_BUFC_METADATA] > 0) {
		int64_t toevict =
		    MIN(arc_mru->arcs_lsize[ARC_BUFC_METADATA], top_sz - arc_p);
		(void) arc_evict(arc_mru, NULL, toevict, FALSE,
		    ARC_BUFC_METADATA);
		top_sz = arc_anon->arcs_size + arc_mru->arcs_size;
	}

	mru_over = top_sz + arc_mru_ghost->arcs_size - arc_c;

	if (mru_over > 0) {
		if (arc_mru_ghost->arcs_size > 0) {
			todelete = MIN(arc_mru_ghost->arcs_size, mru_over);
			arc_evict_ghost(arc_mru_ghost, NULL, todelete);
		}
	}

	if ((arc_over = arc_size - arc_c) > 0) {
		int64_t tbl_over;

		if (arc_mfu->arcs_lsize[ARC_BUFC_DATA] > 0) {
			int64_t toevict =
			    MIN(arc_mfu->arcs_lsize[ARC_BUFC_DATA], arc_over);
			(void) arc_evict(arc_mfu, NULL, toevict, FALSE,
			    ARC_BUFC_DATA);
			arc_over = arc_size - arc_c;
		}

		if (arc_over > 0 &&
		    arc_mfu->arcs_lsize[ARC_BUFC_METADATA] > 0) {
			int64_t toevict =
			    MIN(arc_mfu->arcs_lsize[ARC_BUFC_METADATA],
			    arc_over);
			(void) arc_evict(arc_mfu, NULL, toevict, FALSE,
			    ARC_BUFC_METADATA);
		}

		/* keep regular + ghost total under 2 * arc_c */
		tbl_over = arc_size + arc_mru_ghost->arcs_size +
		    arc_mfu_ghost->arcs_size - arc_c * 2;

		if (tbl_over > 0 && arc_mfu_ghost->arcs_size > 0) {
			todelete = MIN(arc_mfu_ghost->arcs_size, tbl_over);
			arc_evict_ghost(arc_mfu_ghost, NULL, todelete);
		}
	}
}

/*
 * Run the pending eviction callback (b_efunc) for every buf queued on
 * arc_eviction_list, then free the bufs.  arc_eviction_mtx is dropped
 * around each callback.
 */
static void
arc_do_user_evicts(void)
{
	mutex_enter(&arc_eviction_mtx);
	while (arc_eviction_list != NULL) {
		arc_buf_t *buf = arc_eviction_list;
		arc_eviction_list = buf->b_next;
		buf->b_hdr = NULL;
		mutex_exit(&arc_eviction_mtx);

		if (buf->b_efunc != NULL)
			VERIFY(buf->b_efunc(buf) == 0);

		buf->b_efunc = NULL;
		buf->b_private = NULL;
		kmem_cache_free(buf_cache, buf);
		mutex_enter(&arc_eviction_mtx);
	}
	mutex_exit(&arc_eviction_mtx);
}

/*
 * Flush all *evictable* data from the cache for the given spa.
 * NOTE: this will not touch "active" (i.e. referenced) data.
 */
void
arc_flush(spa_t *spa)
{
	/*
	 * Drain all four non-ghost lists.  When spa is non-NULL a single
	 * arc_evict() pass is made per list: arc_evict() skips buffers
	 * belonging to other pools, so looping until the list is empty
	 * could spin forever.
	 */
	while (list_head(&arc_mru->arcs_list[ARC_BUFC_DATA])) {
		(void) arc_evict(arc_mru, spa, -1, FALSE, ARC_BUFC_DATA);
		if (spa)
			break;
	}
	while (list_head(&arc_mru->arcs_list[ARC_BUFC_METADATA])) {
		(void) arc_evict(arc_mru, spa, -1, FALSE, ARC_BUFC_METADATA);
		if (spa)
			break;
	}
	while (list_head(&arc_mfu->arcs_list[ARC_BUFC_DATA])) {
		(void) arc_evict(arc_mfu, spa, -1, FALSE, ARC_BUFC_DATA);
		if (spa)
			break;
	}
	while (list_head(&arc_mfu->arcs_list[ARC_BUFC_METADATA])) {
		(void) arc_evict(arc_mfu, spa, -1, FALSE, ARC_BUFC_METADATA);
		if (spa)
			break;
	}

	arc_evict_ghost(arc_mru_ghost, spa, -1);
	arc_evict_ghost(arc_mfu_ghost, spa, -1);

	mutex_enter(&arc_reclaim_thr_lock);
	arc_do_user_evicts();
	mutex_exit(&arc_reclaim_thr_lock);
	ASSERT(spa || arc_eviction_list == NULL);
}

int arc_shrink_shift = 5;		/* log2(fraction of arc to reclaim) */

/*
 * Reduce the target cache size arc_c (and arc_p along with it) by
 * 1/2^arc_shrink_shift, bounded below by arc_c_min, then call
 * arc_adjust() if the current size now exceeds the target.
 */
void
arc_shrink(void)
{
	if (arc_c > arc_c_min) {
		uint64_t to_free;

#ifdef _KERNEL
		/* in-kernel: free at least what the pageout scanner needs */
		to_free = MAX(arc_c >> arc_shrink_shift, ptob(needfree));
#else
		to_free = arc_c >> arc_shrink_shift;
#endif
		if (arc_c > arc_c_min + to_free)
			atomic_add_64(&arc_c, -to_free);
		else
			arc_c = arc_c_min;

		atomic_add_64(&arc_p, -(arc_p >> arc_shrink_shift));
		if (arc_c > arc_size)
			arc_c = MAX(arc_size, arc_c_min);
		if (arc_p > arc_c)
			arc_p = (arc_c >> 1);
		ASSERT(arc_c >= arc_c_min);
		ASSERT((int64_t)arc_p >= 0);
	}

	if (arc_size > arc_c)
		arc_adjust();
}

/*
 * Decide whether the ARC should give memory back to the system.
 * Returns 1 when reclamation is needed, 0 otherwise.
 */
static int
arc_reclaim_needed(void)
{
	uint64_t extra;

#ifdef _KERNEL

	if (needfree)
		return (1);

	/*
	 * take 'desfree' extra pages, so we reclaim sooner, rather than later
	 */
	extra = desfree;

	/*
	 * check that we're out of range of the pageout scanner.  It starts to
	 * schedule paging if freemem is less than lotsfree and needfree.
	 * lotsfree is the high-water mark for pageout, and needfree is the
	 * number of needed free pages.  We add extra pages here to make sure
	 * the scanner doesn't start up while we're freeing memory.
	 */
	if (freemem < lotsfree + needfree + extra)
		return (1);

	/*
	 * check to make sure that swapfs has enough space so that anon
	 * reservations can still succeed.  anon_resvmem() checks that the
	 * availrmem is greater than swapfs_minfree, and the number of reserved
	 * swap pages.  We also add a bit of extra here just to prevent
	 * circumstances from getting really dire.
	 */
	if (availrmem < swapfs_minfree + swapfs_reserve + extra)
		return (1);

#if defined(__i386)
	/*
	 * If we're on an i386 platform, it's possible that we'll exhaust the
	 * kernel heap space before we ever run out of available physical
	 * memory.  Most checks of the size of the heap_area compare against
	 * tune.t_minarmem, which is the minimum available real memory that we
	 * can have in the system.  However, this is generally fixed at 25 pages
	 * which is so low that it's useless.  In this comparison, we seek to
	 * calculate the total heap-size, and reclaim if more than 3/4ths of the
	 * heap is allocated.  (Or, in the calculation, if less than 1/4th is
	 * free)
	 */
	if (btop(vmem_size(heap_arena, VMEM_FREE)) <
	    (btop(vmem_size(heap_arena, VMEM_FREE | VMEM_ALLOC)) >> 2))
		return (1);
#endif

#else
	/* userland build: randomly reclaim ~1% of the time for testing */
	if (spa_get_random(100) == 0)
		return (1);
#endif
	return (0);
}

/*
 * Reap free memory from the kmem caches backing the ARC.  An
 * aggressive pass (ARC_RECLAIM_AGGR) also shrinks the target cache
 * size via arc_shrink().
 */
static void
arc_kmem_reap_now(arc_reclaim_strategy_t strat)
{
	size_t i;
	kmem_cache_t *prev_cache = NULL;
	kmem_cache_t *prev_data_cache = NULL;
	extern kmem_cache_t *zio_buf_cache[];
	extern kmem_cache_t *zio_data_buf_cache[];

#ifdef _KERNEL
	if (arc_meta_used >= arc_meta_limit) {
		/*
		 * We are exceeding our meta-data cache limit.
		 * Purge some DNLC entries to release holds on meta-data.
		 */
		dnlc_reduce_cache((void *)(uintptr_t)arc_reduce_dnlc_percent);
	}
#if defined(__i386)
	/*
	 * Reclaim unused memory from all kmem caches.
	 */
	kmem_reap();
#endif
#endif

	/*
	 * An aggressive reclamation will shrink the cache size as well as
	 * reap free buffers from the arc kmem caches.
	 */
	if (strat == ARC_RECLAIM_AGGR)
		arc_shrink();

	/* adjacent cache slots may alias the same cache; reap each once */
	for (i = 0; i < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; i++) {
		if (zio_buf_cache[i] != prev_cache) {
			prev_cache = zio_buf_cache[i];
			kmem_cache_reap_now(zio_buf_cache[i]);
		}
		if (zio_data_buf_cache[i] != prev_data_cache) {
			prev_data_cache = zio_data_buf_cache[i];
			kmem_cache_reap_now(zio_data_buf_cache[i]);
		}
	}
	kmem_cache_reap_now(buf_cache);
	kmem_cache_reap_now(hdr_cache);
}

/*
 * ARC reclaim thread.  Wakes at least once a second (or when
 * signalled on arc_reclaim_thr_cv): reaps caches when memory is
 * tight, re-enables cache growth arc_grow_retry seconds after the
 * last reclaim, runs arc_adjust() when the ghost lists grow too
 * large, and services the user-eviction list.
 */
static void
arc_reclaim_thread(void)
{
	clock_t growtime = 0;
	arc_reclaim_strategy_t last_reclaim = ARC_RECLAIM_CONS;
	callb_cpr_t cpr;

	CALLB_CPR_INIT(&cpr, &arc_reclaim_thr_lock, callb_generic_cpr, FTAG);

	mutex_enter(&arc_reclaim_thr_lock);
	while (arc_thread_exit == 0) {
		if (arc_reclaim_needed()) {

			if (arc_no_grow) {
				/* alternate aggressive/conservative passes */
				if (last_reclaim == ARC_RECLAIM_CONS) {
					last_reclaim = ARC_RECLAIM_AGGR;
				} else {
					last_reclaim = ARC_RECLAIM_CONS;
				}
			} else {
				arc_no_grow = TRUE;
				last_reclaim = ARC_RECLAIM_AGGR;
				membar_producer();
			}

			/* reset the growth delay for every reclaim */
			growtime = lbolt + (arc_grow_retry * hz);

			arc_kmem_reap_now(last_reclaim);
			arc_warm = B_TRUE;

		} else if (arc_no_grow && lbolt >= growtime) {
			arc_no_grow = FALSE;
		}

		if (2 * arc_c < arc_size +
		    arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size)
			arc_adjust();

		if (arc_eviction_list != NULL)
			arc_do_user_evicts();

		/* block until needed, or one second, whichever is shorter */
		CALLB_CPR_SAFE_BEGIN(&cpr);
		(void) cv_timedwait(&arc_reclaim_thr_cv,
		    &arc_reclaim_thr_lock, (lbolt + hz));
		CALLB_CPR_SAFE_END(&cpr, &arc_reclaim_thr_lock);
	}

	arc_thread_exit = 0;
	cv_broadcast(&arc_reclaim_thr_cv);
	CALLB_CPR_EXIT(&cpr);		/* drops arc_reclaim_thr_lock */
	thread_exit();
}

/*
 * Adapt arc info given the number of bytes we are trying to add and
 * the state that we are coming from.  This function is only called
 * when we are adding new content to the cache.
 */
static void
arc_adapt(int bytes, arc_state_t *state)
{
	int mult;

	if (state == arc_l2c_only)
		return;

	ASSERT(bytes > 0);
	/*
	 * Adapt the target size of the MRU list:
	 *	- if we just hit in the MRU ghost list, then increase
	 *	  the target size of the MRU list.
	 *	- if we just hit in the MFU ghost list, then increase
	 *	  the target size of the MFU list by decreasing the
	 *	  target size of the MRU list.
	 */
	if (state == arc_mru_ghost) {
		mult = ((arc_mru_ghost->arcs_size >= arc_mfu_ghost->arcs_size) ?
		    1 : (arc_mfu_ghost->arcs_size/arc_mru_ghost->arcs_size));

		arc_p = MIN(arc_c, arc_p + bytes * mult);
	} else if (state == arc_mfu_ghost) {
		mult = ((arc_mfu_ghost->arcs_size >= arc_mru_ghost->arcs_size) ?
		    1 : (arc_mru_ghost->arcs_size/arc_mfu_ghost->arcs_size));

		arc_p = MAX(0, (int64_t)arc_p - bytes * mult);
	}
	ASSERT((int64_t)arc_p >= 0);

	if (arc_reclaim_needed()) {
		cv_signal(&arc_reclaim_thr_cv);
		return;
	}

	if (arc_no_grow)
		return;

	if (arc_c >= arc_c_max)
		return;

	/*
	 * If we're within (2 * maxblocksize) bytes of the target
	 * cache size, increment the target cache size
	 */
	if (arc_size > arc_c - (2ULL << SPA_MAXBLOCKSHIFT)) {
		atomic_add_64(&arc_c, (int64_t)bytes);
		if (arc_c > arc_c_max)
			arc_c = arc_c_max;
		else if (state == arc_anon)
			atomic_add_64(&arc_p, (int64_t)bytes);
		if (arc_p > arc_c)
			arc_p = arc_c;
	}
	ASSERT((int64_t)arc_p >= 0);
}

/*
 * Check if the cache has reached
its limits and eviction is required 1984ea8dc4b6Seschrock * prior to insert. 1985fa9e4066Sahrens */ 1986fa9e4066Sahrens static int 19870e8c6158Smaybee arc_evict_needed(arc_buf_contents_t type) 1988fa9e4066Sahrens { 19890e8c6158Smaybee if (type == ARC_BUFC_METADATA && arc_meta_used >= arc_meta_limit) 19900e8c6158Smaybee return (1); 19910e8c6158Smaybee 19920e8c6158Smaybee #ifdef _KERNEL 19930e8c6158Smaybee /* 19940e8c6158Smaybee * If zio data pages are being allocated out of a separate heap segment, 19950e8c6158Smaybee * then enforce that the size of available vmem for this area remains 19960e8c6158Smaybee * above about 1/32nd free. 19970e8c6158Smaybee */ 19980e8c6158Smaybee if (type == ARC_BUFC_DATA && zio_arena != NULL && 19990e8c6158Smaybee vmem_size(zio_arena, VMEM_FREE) < 20000e8c6158Smaybee (vmem_size(zio_arena, VMEM_ALLOC) >> 5)) 20010e8c6158Smaybee return (1); 20020e8c6158Smaybee #endif 20030e8c6158Smaybee 2004fa9e4066Sahrens if (arc_reclaim_needed()) 2005fa9e4066Sahrens return (1); 2006fa9e4066Sahrens 200744cb6abcSbmc return (arc_size > arc_c); 2008fa9e4066Sahrens } 2009fa9e4066Sahrens 2010fa9e4066Sahrens /* 201144eda4d7Smaybee * The buffer, supplied as the first argument, needs a data block. 201244eda4d7Smaybee * So, if we are at cache max, determine which cache should be victimized. 201344eda4d7Smaybee * We have the following cases: 2014fa9e4066Sahrens * 201544cb6abcSbmc * 1. Insert for MRU, p > sizeof(arc_anon + arc_mru) -> 2016fa9e4066Sahrens * In this situation if we're out of space, but the resident size of the MFU is 2017fa9e4066Sahrens * under the limit, victimize the MFU cache to satisfy this insertion request. 2018fa9e4066Sahrens * 201944cb6abcSbmc * 2. Insert for MRU, p <= sizeof(arc_anon + arc_mru) -> 2020fa9e4066Sahrens * Here, we've used up all of the available space for the MRU, so we need to 2021fa9e4066Sahrens * evict from our own cache instead. Evict from the set of resident MRU 2022fa9e4066Sahrens * entries. 
2023fa9e4066Sahrens * 202444cb6abcSbmc * 3. Insert for MFU (c - p) > sizeof(arc_mfu) -> 2025fa9e4066Sahrens * c minus p represents the MFU space in the cache, since p is the size of the 2026fa9e4066Sahrens * cache that is dedicated to the MRU. In this situation there's still space on 2027fa9e4066Sahrens * the MFU side, so the MRU side needs to be victimized. 2028fa9e4066Sahrens * 202944cb6abcSbmc * 4. Insert for MFU (c - p) < sizeof(arc_mfu) -> 2030fa9e4066Sahrens * MFU's resident set is consuming more space than it has been allotted. In 2031fa9e4066Sahrens * this situation, we must victimize our own cache, the MFU, for this insertion. 2032fa9e4066Sahrens */ 2033fa9e4066Sahrens static void 203444eda4d7Smaybee arc_get_data_buf(arc_buf_t *buf) 2035fa9e4066Sahrens { 2036ad23a2dbSjohansen arc_state_t *state = buf->b_hdr->b_state; 2037ad23a2dbSjohansen uint64_t size = buf->b_hdr->b_size; 2038ad23a2dbSjohansen arc_buf_contents_t type = buf->b_hdr->b_type; 2039fa9e4066Sahrens 204044eda4d7Smaybee arc_adapt(size, state); 2041fa9e4066Sahrens 204244eda4d7Smaybee /* 204344eda4d7Smaybee * We have not yet reached cache maximum size, 204444eda4d7Smaybee * just allocate a new buffer. 204544eda4d7Smaybee */ 20460e8c6158Smaybee if (!arc_evict_needed(type)) { 2047ad23a2dbSjohansen if (type == ARC_BUFC_METADATA) { 2048ad23a2dbSjohansen buf->b_data = zio_buf_alloc(size); 20490e8c6158Smaybee arc_space_consume(size); 2050ad23a2dbSjohansen } else { 2051ad23a2dbSjohansen ASSERT(type == ARC_BUFC_DATA); 2052ad23a2dbSjohansen buf->b_data = zio_data_buf_alloc(size); 20530e8c6158Smaybee atomic_add_64(&arc_size, size); 2054ad23a2dbSjohansen } 205544eda4d7Smaybee goto out; 205644eda4d7Smaybee } 205744eda4d7Smaybee 205844eda4d7Smaybee /* 205944eda4d7Smaybee * If we are prefetching from the mfu ghost list, this buffer 206044eda4d7Smaybee * will end up on the mru list; so steal space from there. 
206144eda4d7Smaybee */ 206244cb6abcSbmc if (state == arc_mfu_ghost) 206344cb6abcSbmc state = buf->b_hdr->b_flags & ARC_PREFETCH ? arc_mru : arc_mfu; 206444cb6abcSbmc else if (state == arc_mru_ghost) 206544cb6abcSbmc state = arc_mru; 206644cb6abcSbmc 206744cb6abcSbmc if (state == arc_mru || state == arc_anon) { 206844cb6abcSbmc uint64_t mru_used = arc_anon->arcs_size + arc_mru->arcs_size; 20690e8c6158Smaybee state = (arc_mfu->arcs_lsize[type] > 0 && 20700e8c6158Smaybee arc_p > mru_used) ? arc_mfu : arc_mru; 2071fa9e4066Sahrens } else { 207244eda4d7Smaybee /* MFU cases */ 207344cb6abcSbmc uint64_t mfu_space = arc_c - arc_p; 20740e8c6158Smaybee state = (arc_mru->arcs_lsize[type] > 0 && 20750e8c6158Smaybee mfu_space > arc_mfu->arcs_size) ? arc_mru : arc_mfu; 207644eda4d7Smaybee } 2077874395d5Smaybee if ((buf->b_data = arc_evict(state, NULL, size, TRUE, type)) == NULL) { 2078ad23a2dbSjohansen if (type == ARC_BUFC_METADATA) { 2079ad23a2dbSjohansen buf->b_data = zio_buf_alloc(size); 20800e8c6158Smaybee arc_space_consume(size); 2081ad23a2dbSjohansen } else { 2082ad23a2dbSjohansen ASSERT(type == ARC_BUFC_DATA); 2083ad23a2dbSjohansen buf->b_data = zio_data_buf_alloc(size); 20840e8c6158Smaybee atomic_add_64(&arc_size, size); 2085ad23a2dbSjohansen } 208644cb6abcSbmc ARCSTAT_BUMP(arcstat_recycle_miss); 208744eda4d7Smaybee } 208844eda4d7Smaybee ASSERT(buf->b_data != NULL); 208944eda4d7Smaybee out: 209044eda4d7Smaybee /* 209144eda4d7Smaybee * Update the state size. Note that ghost states have a 209244eda4d7Smaybee * "ghost size" and so don't need to be updated. 
209344eda4d7Smaybee */ 209444eda4d7Smaybee if (!GHOST_STATE(buf->b_hdr->b_state)) { 209544eda4d7Smaybee arc_buf_hdr_t *hdr = buf->b_hdr; 209644eda4d7Smaybee 209744cb6abcSbmc atomic_add_64(&hdr->b_state->arcs_size, size); 209844eda4d7Smaybee if (list_link_active(&hdr->b_arc_node)) { 209944eda4d7Smaybee ASSERT(refcount_is_zero(&hdr->b_refcnt)); 21000e8c6158Smaybee atomic_add_64(&hdr->b_state->arcs_lsize[type], size); 2101fa9e4066Sahrens } 2102641fbdaeSmaybee /* 2103641fbdaeSmaybee * If we are growing the cache, and we are adding anonymous 210444cb6abcSbmc * data, and we have outgrown arc_p, update arc_p 2105641fbdaeSmaybee */ 210644cb6abcSbmc if (arc_size < arc_c && hdr->b_state == arc_anon && 210744cb6abcSbmc arc_anon->arcs_size + arc_mru->arcs_size > arc_p) 210844cb6abcSbmc arc_p = MIN(arc_c, arc_p + size); 2109fa9e4066Sahrens } 2110fa9e4066Sahrens } 2111fa9e4066Sahrens 2112fa9e4066Sahrens /* 2113fa9e4066Sahrens * This routine is called whenever a buffer is accessed. 2114ea8dc4b6Seschrock * NOTE: the hash lock is dropped in this function. 2115fa9e4066Sahrens */ 2116fa9e4066Sahrens static void 211744eda4d7Smaybee arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock) 2118fa9e4066Sahrens { 2119fa9e4066Sahrens ASSERT(MUTEX_HELD(hash_lock)); 2120fa9e4066Sahrens 212144cb6abcSbmc if (buf->b_state == arc_anon) { 2122fa9e4066Sahrens /* 2123fa9e4066Sahrens * This buffer is not in the cache, and does not 2124fa9e4066Sahrens * appear in our "ghost" list. Add the new buffer 2125fa9e4066Sahrens * to the MRU state. 
2126fa9e4066Sahrens */ 2127fa9e4066Sahrens 2128fa9e4066Sahrens ASSERT(buf->b_arc_access == 0); 2129fa9e4066Sahrens buf->b_arc_access = lbolt; 2130ea8dc4b6Seschrock DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, buf); 213144cb6abcSbmc arc_change_state(arc_mru, buf, hash_lock); 2132fa9e4066Sahrens 213344cb6abcSbmc } else if (buf->b_state == arc_mru) { 2134fa9e4066Sahrens /* 213513506d1eSmaybee * If this buffer is here because of a prefetch, then either: 213613506d1eSmaybee * - clear the flag if this is a "referencing" read 213713506d1eSmaybee * (any subsequent access will bump this into the MFU state). 213813506d1eSmaybee * or 213913506d1eSmaybee * - move the buffer to the head of the list if this is 214013506d1eSmaybee * another prefetch (to make it less likely to be evicted). 2141fa9e4066Sahrens */ 2142fa9e4066Sahrens if ((buf->b_flags & ARC_PREFETCH) != 0) { 214313506d1eSmaybee if (refcount_count(&buf->b_refcnt) == 0) { 214413506d1eSmaybee ASSERT(list_link_active(&buf->b_arc_node)); 214513506d1eSmaybee } else { 214613506d1eSmaybee buf->b_flags &= ~ARC_PREFETCH; 214744cb6abcSbmc ARCSTAT_BUMP(arcstat_mru_hits); 214813506d1eSmaybee } 214913506d1eSmaybee buf->b_arc_access = lbolt; 2150fa9e4066Sahrens return; 2151fa9e4066Sahrens } 2152fa9e4066Sahrens 2153fa9e4066Sahrens /* 2154fa9e4066Sahrens * This buffer has been "accessed" only once so far, 2155fa9e4066Sahrens * but it is still in the cache. Move it to the MFU 2156fa9e4066Sahrens * state. 2157fa9e4066Sahrens */ 2158fa9e4066Sahrens if (lbolt > buf->b_arc_access + ARC_MINTIME) { 2159fa9e4066Sahrens /* 2160fa9e4066Sahrens * More than 125ms have passed since we 2161fa9e4066Sahrens * instantiated this buffer. Move it to the 2162fa9e4066Sahrens * most frequently used state. 
2163fa9e4066Sahrens */ 2164fa9e4066Sahrens buf->b_arc_access = lbolt; 2165ea8dc4b6Seschrock DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf); 216644cb6abcSbmc arc_change_state(arc_mfu, buf, hash_lock); 2167fa9e4066Sahrens } 216844cb6abcSbmc ARCSTAT_BUMP(arcstat_mru_hits); 216944cb6abcSbmc } else if (buf->b_state == arc_mru_ghost) { 2170fa9e4066Sahrens arc_state_t *new_state; 2171fa9e4066Sahrens /* 2172fa9e4066Sahrens * This buffer has been "accessed" recently, but 2173fa9e4066Sahrens * was evicted from the cache. Move it to the 2174fa9e4066Sahrens * MFU state. 2175fa9e4066Sahrens */ 2176fa9e4066Sahrens 2177fa9e4066Sahrens if (buf->b_flags & ARC_PREFETCH) { 217844cb6abcSbmc new_state = arc_mru; 217913506d1eSmaybee if (refcount_count(&buf->b_refcnt) > 0) 218013506d1eSmaybee buf->b_flags &= ~ARC_PREFETCH; 2181ea8dc4b6Seschrock DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, buf); 2182fa9e4066Sahrens } else { 218344cb6abcSbmc new_state = arc_mfu; 2184ea8dc4b6Seschrock DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf); 2185fa9e4066Sahrens } 2186fa9e4066Sahrens 2187fa9e4066Sahrens buf->b_arc_access = lbolt; 2188fa9e4066Sahrens arc_change_state(new_state, buf, hash_lock); 2189fa9e4066Sahrens 219044cb6abcSbmc ARCSTAT_BUMP(arcstat_mru_ghost_hits); 219144cb6abcSbmc } else if (buf->b_state == arc_mfu) { 2192fa9e4066Sahrens /* 2193fa9e4066Sahrens * This buffer has been accessed more than once and is 2194fa9e4066Sahrens * still in the cache. Keep it in the MFU state. 2195fa9e4066Sahrens * 219613506d1eSmaybee * NOTE: an add_reference() that occurred when we did 219713506d1eSmaybee * the arc_read() will have kicked this off the list. 219813506d1eSmaybee * If it was a prefetch, we will explicitly move it to 219913506d1eSmaybee * the head of the list now. 
2200fa9e4066Sahrens */ 220113506d1eSmaybee if ((buf->b_flags & ARC_PREFETCH) != 0) { 220213506d1eSmaybee ASSERT(refcount_count(&buf->b_refcnt) == 0); 220313506d1eSmaybee ASSERT(list_link_active(&buf->b_arc_node)); 220413506d1eSmaybee } 220544cb6abcSbmc ARCSTAT_BUMP(arcstat_mfu_hits); 220613506d1eSmaybee buf->b_arc_access = lbolt; 220744cb6abcSbmc } else if (buf->b_state == arc_mfu_ghost) { 220844cb6abcSbmc arc_state_t *new_state = arc_mfu; 2209fa9e4066Sahrens /* 2210fa9e4066Sahrens * This buffer has been accessed more than once but has 2211fa9e4066Sahrens * been evicted from the cache. Move it back to the 2212fa9e4066Sahrens * MFU state. 2213fa9e4066Sahrens */ 2214fa9e4066Sahrens 221513506d1eSmaybee if (buf->b_flags & ARC_PREFETCH) { 221613506d1eSmaybee /* 221713506d1eSmaybee * This is a prefetch access... 221813506d1eSmaybee * move this block back to the MRU state. 221913506d1eSmaybee */ 222013506d1eSmaybee ASSERT3U(refcount_count(&buf->b_refcnt), ==, 0); 222144cb6abcSbmc new_state = arc_mru; 222213506d1eSmaybee } 222313506d1eSmaybee 2224fa9e4066Sahrens buf->b_arc_access = lbolt; 2225ea8dc4b6Seschrock DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf); 222613506d1eSmaybee arc_change_state(new_state, buf, hash_lock); 2227fa9e4066Sahrens 222844cb6abcSbmc ARCSTAT_BUMP(arcstat_mfu_ghost_hits); 2229fa94a07fSbrendan } else if (buf->b_state == arc_l2c_only) { 2230fa94a07fSbrendan /* 2231fa94a07fSbrendan * This buffer is on the 2nd Level ARC. 
2232fa94a07fSbrendan */ 2233fa94a07fSbrendan 2234fa94a07fSbrendan buf->b_arc_access = lbolt; 2235fa94a07fSbrendan DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf); 2236fa94a07fSbrendan arc_change_state(arc_mfu, buf, hash_lock); 2237fa9e4066Sahrens } else { 2238fa9e4066Sahrens ASSERT(!"invalid arc state"); 2239fa9e4066Sahrens } 2240fa9e4066Sahrens } 2241fa9e4066Sahrens 2242fa9e4066Sahrens /* a generic arc_done_func_t which you can use */ 2243fa9e4066Sahrens /* ARGSUSED */ 2244fa9e4066Sahrens void 2245fa9e4066Sahrens arc_bcopy_func(zio_t *zio, arc_buf_t *buf, void *arg) 2246fa9e4066Sahrens { 2247fa9e4066Sahrens bcopy(buf->b_data, arg, buf->b_hdr->b_size); 2248ea8dc4b6Seschrock VERIFY(arc_buf_remove_ref(buf, arg) == 1); 2249fa9e4066Sahrens } 2250fa9e4066Sahrens 22510e8c6158Smaybee /* a generic arc_done_func_t */ 2252fa9e4066Sahrens void 2253fa9e4066Sahrens arc_getbuf_func(zio_t *zio, arc_buf_t *buf, void *arg) 2254fa9e4066Sahrens { 2255fa9e4066Sahrens arc_buf_t **bufp = arg; 2256fa9e4066Sahrens if (zio && zio->io_error) { 2257ea8dc4b6Seschrock VERIFY(arc_buf_remove_ref(buf, arg) == 1); 2258fa9e4066Sahrens *bufp = NULL; 2259fa9e4066Sahrens } else { 2260fa9e4066Sahrens *bufp = buf; 2261fa9e4066Sahrens } 2262fa9e4066Sahrens } 2263fa9e4066Sahrens 2264fa9e4066Sahrens static void 2265fa9e4066Sahrens arc_read_done(zio_t *zio) 2266fa9e4066Sahrens { 2267bbf4a8dfSmaybee arc_buf_hdr_t *hdr, *found; 2268fa9e4066Sahrens arc_buf_t *buf; 2269fa9e4066Sahrens arc_buf_t *abuf; /* buffer we're assigning to callback */ 2270fa9e4066Sahrens kmutex_t *hash_lock; 2271fa9e4066Sahrens arc_callback_t *callback_list, *acb; 2272fa9e4066Sahrens int freeable = FALSE; 2273fa9e4066Sahrens 2274fa9e4066Sahrens buf = zio->io_private; 2275fa9e4066Sahrens hdr = buf->b_hdr; 2276fa9e4066Sahrens 2277bbf4a8dfSmaybee /* 2278bbf4a8dfSmaybee * The hdr was inserted into hash-table and removed from lists 2279bbf4a8dfSmaybee * prior to starting I/O. 
We should find this header, since 2280bbf4a8dfSmaybee * it's in the hash table, and it should be legit since it's 2281bbf4a8dfSmaybee * not possible to evict it during the I/O. The only possible 2282bbf4a8dfSmaybee * reason for it not to be found is if we were freed during the 2283bbf4a8dfSmaybee * read. 2284bbf4a8dfSmaybee */ 2285bbf4a8dfSmaybee found = buf_hash_find(zio->io_spa, &hdr->b_dva, hdr->b_birth, 22866b4acc8bSahrens &hash_lock); 2287fa9e4066Sahrens 2288bbf4a8dfSmaybee ASSERT((found == NULL && HDR_FREED_IN_READ(hdr) && hash_lock == NULL) || 2289fa94a07fSbrendan (found == hdr && DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp))) || 2290fa94a07fSbrendan (found == hdr && HDR_L2_READING(hdr))); 2291fa94a07fSbrendan 2292*3a737e0dSbrendan hdr->b_flags &= ~ARC_L2_EVICTED; 2293fa94a07fSbrendan if (l2arc_noprefetch && (hdr->b_flags & ARC_PREFETCH)) 2294fa94a07fSbrendan hdr->b_flags |= ARC_DONT_L2CACHE; 2295fa9e4066Sahrens 2296fa9e4066Sahrens /* byteswap if necessary */ 2297fa9e4066Sahrens callback_list = hdr->b_acb; 2298fa9e4066Sahrens ASSERT(callback_list != NULL); 2299fa9e4066Sahrens if (BP_SHOULD_BYTESWAP(zio->io_bp) && callback_list->acb_byteswap) 2300fa9e4066Sahrens callback_list->acb_byteswap(buf->b_data, hdr->b_size); 2301fa9e4066Sahrens 2302fa94a07fSbrendan arc_cksum_compute(buf, B_FALSE); 23036b4acc8bSahrens 2304fa9e4066Sahrens /* create copies of the data buffer for the callers */ 2305fa9e4066Sahrens abuf = buf; 2306fa9e4066Sahrens for (acb = callback_list; acb; acb = acb->acb_next) { 2307fa9e4066Sahrens if (acb->acb_done) { 230844eda4d7Smaybee if (abuf == NULL) 230944eda4d7Smaybee abuf = arc_buf_clone(buf); 2310fa9e4066Sahrens acb->acb_buf = abuf; 2311fa9e4066Sahrens abuf = NULL; 2312fa9e4066Sahrens } 2313fa9e4066Sahrens } 2314fa9e4066Sahrens hdr->b_acb = NULL; 2315fa9e4066Sahrens hdr->b_flags &= ~ARC_IO_IN_PROGRESS; 2316ea8dc4b6Seschrock ASSERT(!HDR_BUF_AVAILABLE(hdr)); 2317ea8dc4b6Seschrock if (abuf == buf) 2318ea8dc4b6Seschrock hdr->b_flags |= 
ARC_BUF_AVAILABLE; 2319fa9e4066Sahrens 2320fa9e4066Sahrens ASSERT(refcount_is_zero(&hdr->b_refcnt) || callback_list != NULL); 2321fa9e4066Sahrens 2322fa9e4066Sahrens if (zio->io_error != 0) { 2323fa9e4066Sahrens hdr->b_flags |= ARC_IO_ERROR; 232444cb6abcSbmc if (hdr->b_state != arc_anon) 232544cb6abcSbmc arc_change_state(arc_anon, hdr, hash_lock); 2326ea8dc4b6Seschrock if (HDR_IN_HASH_TABLE(hdr)) 2327ea8dc4b6Seschrock buf_hash_remove(hdr); 2328fa9e4066Sahrens freeable = refcount_is_zero(&hdr->b_refcnt); 232913506d1eSmaybee /* convert checksum errors into IO errors */ 2330ea8dc4b6Seschrock if (zio->io_error == ECKSUM) 2331ea8dc4b6Seschrock zio->io_error = EIO; 2332fa9e4066Sahrens } 2333fa9e4066Sahrens 2334ea8dc4b6Seschrock /* 233513506d1eSmaybee * Broadcast before we drop the hash_lock to avoid the possibility 233613506d1eSmaybee * that the hdr (and hence the cv) might be freed before we get to 233713506d1eSmaybee * the cv_broadcast(). 2338ea8dc4b6Seschrock */ 2339ea8dc4b6Seschrock cv_broadcast(&hdr->b_cv); 2340ea8dc4b6Seschrock 2341bbf4a8dfSmaybee if (hash_lock) { 2342fa9e4066Sahrens /* 2343fa9e4066Sahrens * Only call arc_access on anonymous buffers. This is because 2344fa9e4066Sahrens * if we've issued an I/O for an evicted buffer, we've already 2345fa9e4066Sahrens * called arc_access (to prevent any simultaneous readers from 2346fa9e4066Sahrens * getting confused). 2347fa9e4066Sahrens */ 234844cb6abcSbmc if (zio->io_error == 0 && hdr->b_state == arc_anon) 234944eda4d7Smaybee arc_access(hdr, hash_lock); 235044eda4d7Smaybee mutex_exit(hash_lock); 2351fa9e4066Sahrens } else { 2352fa9e4066Sahrens /* 2353fa9e4066Sahrens * This block was freed while we waited for the read to 2354fa9e4066Sahrens * complete. It has been removed from the hash table and 2355fa9e4066Sahrens * moved to the anonymous state (so that it won't show up 2356fa9e4066Sahrens * in the cache). 
2357fa9e4066Sahrens */ 235844cb6abcSbmc ASSERT3P(hdr->b_state, ==, arc_anon); 2359fa9e4066Sahrens freeable = refcount_is_zero(&hdr->b_refcnt); 2360fa9e4066Sahrens } 2361fa9e4066Sahrens 2362fa9e4066Sahrens /* execute each callback and free its structure */ 2363fa9e4066Sahrens while ((acb = callback_list) != NULL) { 2364fa9e4066Sahrens if (acb->acb_done) 2365fa9e4066Sahrens acb->acb_done(zio, acb->acb_buf, acb->acb_private); 2366fa9e4066Sahrens 2367fa9e4066Sahrens if (acb->acb_zio_dummy != NULL) { 2368fa9e4066Sahrens acb->acb_zio_dummy->io_error = zio->io_error; 2369fa9e4066Sahrens zio_nowait(acb->acb_zio_dummy); 2370fa9e4066Sahrens } 2371fa9e4066Sahrens 2372fa9e4066Sahrens callback_list = acb->acb_next; 2373fa9e4066Sahrens kmem_free(acb, sizeof (arc_callback_t)); 2374fa9e4066Sahrens } 2375fa9e4066Sahrens 2376fa9e4066Sahrens if (freeable) 2377ea8dc4b6Seschrock arc_hdr_destroy(hdr); 2378fa9e4066Sahrens } 2379fa9e4066Sahrens 2380fa9e4066Sahrens /* 2381fa9e4066Sahrens * "Read" the block block at the specified DVA (in bp) via the 2382fa9e4066Sahrens * cache. If the block is found in the cache, invoke the provided 2383fa9e4066Sahrens * callback immediately and return. Note that the `zio' parameter 2384fa9e4066Sahrens * in the callback will be NULL in this case, since no IO was 2385fa9e4066Sahrens * required. If the block is not in the cache pass the read request 2386fa9e4066Sahrens * on to the spa with a substitute callback function, so that the 2387fa9e4066Sahrens * requested block will be added to the cache. 2388fa9e4066Sahrens * 2389fa9e4066Sahrens * If a read request arrives for a block that has a read in-progress, 2390fa9e4066Sahrens * either wait for the in-progress read to complete (and return the 2391fa9e4066Sahrens * results); or, if this is a read with a "done" func, add a record 2392fa9e4066Sahrens * to the read to invoke the "done" func when the read completes, 2393fa9e4066Sahrens * and return; or just return. 
2394fa9e4066Sahrens * 2395fa9e4066Sahrens * arc_read_done() will invoke all the requested "done" functions 2396fa9e4066Sahrens * for readers of this block. 2397fa9e4066Sahrens */ 2398fa9e4066Sahrens int 2399fa9e4066Sahrens arc_read(zio_t *pio, spa_t *spa, blkptr_t *bp, arc_byteswap_func_t *swap, 2400fa9e4066Sahrens arc_done_func_t *done, void *private, int priority, int flags, 240113506d1eSmaybee uint32_t *arc_flags, zbookmark_t *zb) 2402fa9e4066Sahrens { 2403fa9e4066Sahrens arc_buf_hdr_t *hdr; 2404fa9e4066Sahrens arc_buf_t *buf; 2405fa9e4066Sahrens kmutex_t *hash_lock; 2406fa94a07fSbrendan zio_t *rzio; 2407fa9e4066Sahrens 2408fa9e4066Sahrens top: 2409fa9e4066Sahrens hdr = buf_hash_find(spa, BP_IDENTITY(bp), bp->blk_birth, &hash_lock); 2410ea8dc4b6Seschrock if (hdr && hdr->b_datacnt > 0) { 2411fa9e4066Sahrens 241213506d1eSmaybee *arc_flags |= ARC_CACHED; 241313506d1eSmaybee 2414fa9e4066Sahrens if (HDR_IO_IN_PROGRESS(hdr)) { 241513506d1eSmaybee 241613506d1eSmaybee if (*arc_flags & ARC_WAIT) { 241713506d1eSmaybee cv_wait(&hdr->b_cv, hash_lock); 241813506d1eSmaybee mutex_exit(hash_lock); 241913506d1eSmaybee goto top; 242013506d1eSmaybee } 242113506d1eSmaybee ASSERT(*arc_flags & ARC_NOWAIT); 242213506d1eSmaybee 242313506d1eSmaybee if (done) { 2424fa9e4066Sahrens arc_callback_t *acb = NULL; 2425fa9e4066Sahrens 2426fa9e4066Sahrens acb = kmem_zalloc(sizeof (arc_callback_t), 2427fa9e4066Sahrens KM_SLEEP); 2428fa9e4066Sahrens acb->acb_done = done; 2429fa9e4066Sahrens acb->acb_private = private; 2430fa9e4066Sahrens acb->acb_byteswap = swap; 2431fa9e4066Sahrens if (pio != NULL) 2432fa9e4066Sahrens acb->acb_zio_dummy = zio_null(pio, 2433fa9e4066Sahrens spa, NULL, NULL, flags); 2434fa9e4066Sahrens 2435fa9e4066Sahrens ASSERT(acb->acb_done != NULL); 2436fa9e4066Sahrens acb->acb_next = hdr->b_acb; 2437fa9e4066Sahrens hdr->b_acb = acb; 2438fa9e4066Sahrens add_reference(hdr, hash_lock, private); 2439fa9e4066Sahrens mutex_exit(hash_lock); 2440fa9e4066Sahrens return (0); 
2441fa9e4066Sahrens } 2442fa9e4066Sahrens mutex_exit(hash_lock); 2443fa9e4066Sahrens return (0); 2444fa9e4066Sahrens } 2445fa9e4066Sahrens 244644cb6abcSbmc ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu); 2447fa9e4066Sahrens 2448ea8dc4b6Seschrock if (done) { 244944eda4d7Smaybee add_reference(hdr, hash_lock, private); 2450ea8dc4b6Seschrock /* 2451ea8dc4b6Seschrock * If this block is already in use, create a new 2452ea8dc4b6Seschrock * copy of the data so that we will be guaranteed 2453ea8dc4b6Seschrock * that arc_release() will always succeed. 2454ea8dc4b6Seschrock */ 2455fa9e4066Sahrens buf = hdr->b_buf; 2456ea8dc4b6Seschrock ASSERT(buf); 2457ea8dc4b6Seschrock ASSERT(buf->b_data); 245844eda4d7Smaybee if (HDR_BUF_AVAILABLE(hdr)) { 2459ea8dc4b6Seschrock ASSERT(buf->b_efunc == NULL); 2460ea8dc4b6Seschrock hdr->b_flags &= ~ARC_BUF_AVAILABLE; 246144eda4d7Smaybee } else { 246244eda4d7Smaybee buf = arc_buf_clone(buf); 2463ea8dc4b6Seschrock } 246413506d1eSmaybee } else if (*arc_flags & ARC_PREFETCH && 246513506d1eSmaybee refcount_count(&hdr->b_refcnt) == 0) { 246613506d1eSmaybee hdr->b_flags |= ARC_PREFETCH; 2467fa9e4066Sahrens } 2468fa9e4066Sahrens DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr); 246944eda4d7Smaybee arc_access(hdr, hash_lock); 247044eda4d7Smaybee mutex_exit(hash_lock); 247144cb6abcSbmc ARCSTAT_BUMP(arcstat_hits); 247244cb6abcSbmc ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH), 247344cb6abcSbmc demand, prefetch, hdr->b_type != ARC_BUFC_METADATA, 247444cb6abcSbmc data, metadata, hits); 247544cb6abcSbmc 2476fa9e4066Sahrens if (done) 2477fa9e4066Sahrens done(NULL, buf, private); 2478fa9e4066Sahrens } else { 2479fa9e4066Sahrens uint64_t size = BP_GET_LSIZE(bp); 2480fa9e4066Sahrens arc_callback_t *acb; 2481*3a737e0dSbrendan vdev_t *vd = NULL; 2482*3a737e0dSbrendan daddr_t addr; 2483fa9e4066Sahrens 2484fa9e4066Sahrens if (hdr == NULL) { 2485fa9e4066Sahrens /* this block is not in the cache */ 2486fa9e4066Sahrens arc_buf_hdr_t *exists; 
2487ad23a2dbSjohansen arc_buf_contents_t type = BP_GET_BUFC_TYPE(bp); 2488ad23a2dbSjohansen buf = arc_buf_alloc(spa, size, private, type); 2489fa9e4066Sahrens hdr = buf->b_hdr; 2490fa9e4066Sahrens hdr->b_dva = *BP_IDENTITY(bp); 2491fa9e4066Sahrens hdr->b_birth = bp->blk_birth; 2492fa9e4066Sahrens hdr->b_cksum0 = bp->blk_cksum.zc_word[0]; 2493fa9e4066Sahrens exists = buf_hash_insert(hdr, &hash_lock); 2494fa9e4066Sahrens if (exists) { 2495fa9e4066Sahrens /* somebody beat us to the hash insert */ 2496fa9e4066Sahrens mutex_exit(hash_lock); 2497fa9e4066Sahrens bzero(&hdr->b_dva, sizeof (dva_t)); 2498fa9e4066Sahrens hdr->b_birth = 0; 2499fa9e4066Sahrens hdr->b_cksum0 = 0; 2500ea8dc4b6Seschrock (void) arc_buf_remove_ref(buf, private); 2501fa9e4066Sahrens goto top; /* restart the IO request */ 2502fa9e4066Sahrens } 250313506d1eSmaybee /* if this is a prefetch, we don't have a reference */ 250413506d1eSmaybee if (*arc_flags & ARC_PREFETCH) { 250513506d1eSmaybee (void) remove_reference(hdr, hash_lock, 250613506d1eSmaybee private); 250713506d1eSmaybee hdr->b_flags |= ARC_PREFETCH; 250813506d1eSmaybee } 250913506d1eSmaybee if (BP_GET_LEVEL(bp) > 0) 251013506d1eSmaybee hdr->b_flags |= ARC_INDIRECT; 2511fa9e4066Sahrens } else { 2512fa9e4066Sahrens /* this block is in the ghost cache */ 2513ea8dc4b6Seschrock ASSERT(GHOST_STATE(hdr->b_state)); 2514ea8dc4b6Seschrock ASSERT(!HDR_IO_IN_PROGRESS(hdr)); 251513506d1eSmaybee ASSERT3U(refcount_count(&hdr->b_refcnt), ==, 0); 2516ea8dc4b6Seschrock ASSERT(hdr->b_buf == NULL); 251713506d1eSmaybee 251813506d1eSmaybee /* if this is a prefetch, we don't have a reference */ 251913506d1eSmaybee if (*arc_flags & ARC_PREFETCH) 252013506d1eSmaybee hdr->b_flags |= ARC_PREFETCH; 252113506d1eSmaybee else 252213506d1eSmaybee add_reference(hdr, hash_lock, private); 25231ab7f2deSmaybee buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE); 2524fa9e4066Sahrens buf->b_hdr = hdr; 252544eda4d7Smaybee buf->b_data = NULL; 2526ea8dc4b6Seschrock buf->b_efunc = NULL; 
2527ea8dc4b6Seschrock buf->b_private = NULL; 2528fa9e4066Sahrens buf->b_next = NULL; 2529fa9e4066Sahrens hdr->b_buf = buf; 253044eda4d7Smaybee arc_get_data_buf(buf); 2531ea8dc4b6Seschrock ASSERT(hdr->b_datacnt == 0); 2532ea8dc4b6Seschrock hdr->b_datacnt = 1; 253313506d1eSmaybee 2534fa9e4066Sahrens } 2535fa9e4066Sahrens 2536fa9e4066Sahrens acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP); 2537fa9e4066Sahrens acb->acb_done = done; 2538fa9e4066Sahrens acb->acb_private = private; 2539fa9e4066Sahrens acb->acb_byteswap = swap; 2540fa9e4066Sahrens 2541fa9e4066Sahrens ASSERT(hdr->b_acb == NULL); 2542fa9e4066Sahrens hdr->b_acb = acb; 2543fa9e4066Sahrens hdr->b_flags |= ARC_IO_IN_PROGRESS; 2544fa9e4066Sahrens 2545fa9e4066Sahrens /* 2546fa9e4066Sahrens * If the buffer has been evicted, migrate it to a present state 2547fa9e4066Sahrens * before issuing the I/O. Once we drop the hash-table lock, 2548fa9e4066Sahrens * the header will be marked as I/O in progress and have an 2549fa9e4066Sahrens * attached buffer. At this point, anybody who finds this 2550fa9e4066Sahrens * buffer ought to notice that it's legit but has a pending I/O. 
2551fa9e4066Sahrens */ 2552fa9e4066Sahrens 2553ea8dc4b6Seschrock if (GHOST_STATE(hdr->b_state)) 255444eda4d7Smaybee arc_access(hdr, hash_lock); 2555fa9e4066Sahrens 2556*3a737e0dSbrendan if (hdr->b_l2hdr != NULL) { 2557*3a737e0dSbrendan vd = hdr->b_l2hdr->b_dev->l2ad_vdev; 2558*3a737e0dSbrendan addr = hdr->b_l2hdr->b_daddr; 2559*3a737e0dSbrendan } 2560*3a737e0dSbrendan 2561*3a737e0dSbrendan mutex_exit(hash_lock); 2562*3a737e0dSbrendan 2563fa9e4066Sahrens ASSERT3U(hdr->b_size, ==, size); 2564c543ec06Sahrens DTRACE_PROBE3(arc__miss, blkptr_t *, bp, uint64_t, size, 2565c543ec06Sahrens zbookmark_t *, zb); 256644cb6abcSbmc ARCSTAT_BUMP(arcstat_misses); 256744cb6abcSbmc ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH), 256844cb6abcSbmc demand, prefetch, hdr->b_type != ARC_BUFC_METADATA, 256944cb6abcSbmc data, metadata, misses); 2570ea8dc4b6Seschrock 2571fa94a07fSbrendan if (l2arc_ndev != 0) { 2572*3a737e0dSbrendan /* 2573*3a737e0dSbrendan * Lock out device removal. 2574*3a737e0dSbrendan */ 2575*3a737e0dSbrendan spa_config_enter(spa, RW_READER, FTAG); 2576*3a737e0dSbrendan 2577fa94a07fSbrendan /* 2578fa94a07fSbrendan * Read from the L2ARC if the following are true: 2579*3a737e0dSbrendan * 1. The L2ARC vdev was previously cached. 2580*3a737e0dSbrendan * 2. This buffer still has L2ARC metadata. 2581*3a737e0dSbrendan * 3. This buffer isn't currently writing to the L2ARC. 2582*3a737e0dSbrendan * 4. The L2ARC entry wasn't evicted, which may 2583*3a737e0dSbrendan * also have invalidated the vdev. 
2584fa94a07fSbrendan */ 2585*3a737e0dSbrendan if (vd != NULL && hdr->b_l2hdr != NULL && 2586*3a737e0dSbrendan !HDR_L2_WRITING(hdr) && !HDR_L2_EVICTED(hdr)) { 2587fa94a07fSbrendan l2arc_read_callback_t *cb; 2588fa94a07fSbrendan 2589c5904d13Seschrock if (vdev_is_dead(vd)) 2590*3a737e0dSbrendan goto l2skip; 2591fa94a07fSbrendan 2592c5904d13Seschrock DTRACE_PROBE1(l2arc__hit, arc_buf_hdr_t *, hdr); 2593c5904d13Seschrock ARCSTAT_BUMP(arcstat_l2_hits); 2594c5904d13Seschrock 2595fa94a07fSbrendan cb = kmem_zalloc(sizeof (l2arc_read_callback_t), 2596fa94a07fSbrendan KM_SLEEP); 2597fa94a07fSbrendan cb->l2rcb_buf = buf; 2598fa94a07fSbrendan cb->l2rcb_spa = spa; 2599fa94a07fSbrendan cb->l2rcb_bp = *bp; 2600fa94a07fSbrendan cb->l2rcb_zb = *zb; 2601fa94a07fSbrendan cb->l2rcb_flags = flags; 2602fa94a07fSbrendan 2603fa94a07fSbrendan /* 2604fa94a07fSbrendan * l2arc read. 2605fa94a07fSbrendan */ 2606fa94a07fSbrendan rzio = zio_read_phys(pio, vd, addr, size, 2607fa94a07fSbrendan buf->b_data, ZIO_CHECKSUM_OFF, 2608*3a737e0dSbrendan l2arc_read_done, cb, priority, flags | 2609*3a737e0dSbrendan ZIO_FLAG_DONT_CACHE | ZIO_FLAG_CANFAIL, 2610*3a737e0dSbrendan B_FALSE); 2611fa94a07fSbrendan DTRACE_PROBE2(l2arc__read, vdev_t *, vd, 2612fa94a07fSbrendan zio_t *, rzio); 2613*3a737e0dSbrendan spa_config_exit(spa, FTAG); 2614fa94a07fSbrendan 2615*3a737e0dSbrendan if (*arc_flags & ARC_NOWAIT) { 2616*3a737e0dSbrendan zio_nowait(rzio); 2617*3a737e0dSbrendan return (0); 2618*3a737e0dSbrendan } 2619fa94a07fSbrendan 2620*3a737e0dSbrendan ASSERT(*arc_flags & ARC_WAIT); 2621*3a737e0dSbrendan if (zio_wait(rzio) == 0) 2622*3a737e0dSbrendan return (0); 2623*3a737e0dSbrendan 2624*3a737e0dSbrendan /* l2arc read error; goto zio_read() */ 2625fa94a07fSbrendan } else { 2626fa94a07fSbrendan DTRACE_PROBE1(l2arc__miss, 2627fa94a07fSbrendan arc_buf_hdr_t *, hdr); 2628fa94a07fSbrendan ARCSTAT_BUMP(arcstat_l2_misses); 2629fa94a07fSbrendan if (HDR_L2_WRITING(hdr)) 2630fa94a07fSbrendan ARCSTAT_BUMP(arcstat_l2_rw_clash); 
2631*3a737e0dSbrendan l2skip: 2632*3a737e0dSbrendan spa_config_exit(spa, FTAG); 2633fa94a07fSbrendan } 2634fa94a07fSbrendan } 2635c5904d13Seschrock 2636fa9e4066Sahrens rzio = zio_read(pio, spa, bp, buf->b_data, size, 2637ea8dc4b6Seschrock arc_read_done, buf, priority, flags, zb); 2638fa9e4066Sahrens 263913506d1eSmaybee if (*arc_flags & ARC_WAIT) 2640fa9e4066Sahrens return (zio_wait(rzio)); 2641fa9e4066Sahrens 264213506d1eSmaybee ASSERT(*arc_flags & ARC_NOWAIT); 2643fa9e4066Sahrens zio_nowait(rzio); 2644fa9e4066Sahrens } 2645fa9e4066Sahrens return (0); 2646fa9e4066Sahrens } 2647fa9e4066Sahrens 2648fa9e4066Sahrens /* 2649fa9e4066Sahrens * arc_read() variant to support pool traversal. If the block is already 2650fa9e4066Sahrens * in the ARC, make a copy of it; otherwise, the caller will do the I/O. 2651fa9e4066Sahrens * The idea is that we don't want pool traversal filling up memory, but 2652fa9e4066Sahrens * if the ARC already has the data anyway, we shouldn't pay for the I/O. 2653fa9e4066Sahrens */ 2654fa9e4066Sahrens int 2655fa9e4066Sahrens arc_tryread(spa_t *spa, blkptr_t *bp, void *data) 2656fa9e4066Sahrens { 2657fa9e4066Sahrens arc_buf_hdr_t *hdr; 2658fa9e4066Sahrens kmutex_t *hash_mtx; 2659fa9e4066Sahrens int rc = 0; 2660fa9e4066Sahrens 2661fa9e4066Sahrens hdr = buf_hash_find(spa, BP_IDENTITY(bp), bp->blk_birth, &hash_mtx); 2662fa9e4066Sahrens 2663ea8dc4b6Seschrock if (hdr && hdr->b_datacnt > 0 && !HDR_IO_IN_PROGRESS(hdr)) { 2664ea8dc4b6Seschrock arc_buf_t *buf = hdr->b_buf; 2665ea8dc4b6Seschrock 2666ea8dc4b6Seschrock ASSERT(buf); 2667ea8dc4b6Seschrock while (buf->b_data == NULL) { 2668ea8dc4b6Seschrock buf = buf->b_next; 2669ea8dc4b6Seschrock ASSERT(buf); 2670ea8dc4b6Seschrock } 2671ea8dc4b6Seschrock bcopy(buf->b_data, data, hdr->b_size); 2672ea8dc4b6Seschrock } else { 2673fa9e4066Sahrens rc = ENOENT; 2674ea8dc4b6Seschrock } 2675fa9e4066Sahrens 2676fa9e4066Sahrens if (hash_mtx) 2677fa9e4066Sahrens mutex_exit(hash_mtx); 2678fa9e4066Sahrens 2679fa9e4066Sahrens 
return (rc); 2680fa9e4066Sahrens } 2681fa9e4066Sahrens 2682ea8dc4b6Seschrock void 2683ea8dc4b6Seschrock arc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *private) 2684ea8dc4b6Seschrock { 2685ea8dc4b6Seschrock ASSERT(buf->b_hdr != NULL); 268644cb6abcSbmc ASSERT(buf->b_hdr->b_state != arc_anon); 2687ea8dc4b6Seschrock ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt) || func == NULL); 2688ea8dc4b6Seschrock buf->b_efunc = func; 2689ea8dc4b6Seschrock buf->b_private = private; 2690ea8dc4b6Seschrock } 2691ea8dc4b6Seschrock 2692ea8dc4b6Seschrock /* 2693ea8dc4b6Seschrock * This is used by the DMU to let the ARC know that a buffer is 2694ea8dc4b6Seschrock * being evicted, so the ARC should clean up. If this arc buf 2695ea8dc4b6Seschrock * is not yet in the evicted state, it will be put there. 2696ea8dc4b6Seschrock */ 2697ea8dc4b6Seschrock int 2698ea8dc4b6Seschrock arc_buf_evict(arc_buf_t *buf) 2699ea8dc4b6Seschrock { 270040d7d650Smaybee arc_buf_hdr_t *hdr; 2701ea8dc4b6Seschrock kmutex_t *hash_lock; 2702ea8dc4b6Seschrock arc_buf_t **bufp; 2703ea8dc4b6Seschrock 270440d7d650Smaybee mutex_enter(&arc_eviction_mtx); 270540d7d650Smaybee hdr = buf->b_hdr; 2706ea8dc4b6Seschrock if (hdr == NULL) { 2707ea8dc4b6Seschrock /* 2708ea8dc4b6Seschrock * We are in arc_do_user_evicts(). 2709ea8dc4b6Seschrock */ 2710ea8dc4b6Seschrock ASSERT(buf->b_data == NULL); 271140d7d650Smaybee mutex_exit(&arc_eviction_mtx); 2712ea8dc4b6Seschrock return (0); 2713ea8dc4b6Seschrock } 2714ea8dc4b6Seschrock hash_lock = HDR_LOCK(hdr); 271540d7d650Smaybee mutex_exit(&arc_eviction_mtx); 271640d7d650Smaybee 2717ea8dc4b6Seschrock mutex_enter(hash_lock); 2718ea8dc4b6Seschrock 27199b23f181Smaybee if (buf->b_data == NULL) { 27209b23f181Smaybee /* 27219b23f181Smaybee * We are on the eviction list. 
27229b23f181Smaybee */ 27239b23f181Smaybee mutex_exit(hash_lock); 27249b23f181Smaybee mutex_enter(&arc_eviction_mtx); 27259b23f181Smaybee if (buf->b_hdr == NULL) { 27269b23f181Smaybee /* 27279b23f181Smaybee * We are already in arc_do_user_evicts(). 27289b23f181Smaybee */ 27299b23f181Smaybee mutex_exit(&arc_eviction_mtx); 27309b23f181Smaybee return (0); 27319b23f181Smaybee } else { 27329b23f181Smaybee arc_buf_t copy = *buf; /* structure assignment */ 27339b23f181Smaybee /* 27349b23f181Smaybee * Process this buffer now 27359b23f181Smaybee * but let arc_do_user_evicts() do the reaping. 27369b23f181Smaybee */ 27379b23f181Smaybee buf->b_efunc = NULL; 27389b23f181Smaybee mutex_exit(&arc_eviction_mtx); 27399b23f181Smaybee VERIFY(copy.b_efunc(&copy) == 0); 27409b23f181Smaybee return (1); 27419b23f181Smaybee } 27429b23f181Smaybee } 27439b23f181Smaybee 27449b23f181Smaybee ASSERT(buf->b_hdr == hdr); 27459b23f181Smaybee ASSERT3U(refcount_count(&hdr->b_refcnt), <, hdr->b_datacnt); 274644cb6abcSbmc ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu); 2747ea8dc4b6Seschrock 2748ea8dc4b6Seschrock /* 2749ea8dc4b6Seschrock * Pull this buffer off of the hdr 2750ea8dc4b6Seschrock */ 2751ea8dc4b6Seschrock bufp = &hdr->b_buf; 2752ea8dc4b6Seschrock while (*bufp != buf) 2753ea8dc4b6Seschrock bufp = &(*bufp)->b_next; 2754ea8dc4b6Seschrock *bufp = buf->b_next; 2755ea8dc4b6Seschrock 2756ea8dc4b6Seschrock ASSERT(buf->b_data != NULL); 275744eda4d7Smaybee arc_buf_destroy(buf, FALSE, FALSE); 2758ea8dc4b6Seschrock 2759ea8dc4b6Seschrock if (hdr->b_datacnt == 0) { 2760ea8dc4b6Seschrock arc_state_t *old_state = hdr->b_state; 2761ea8dc4b6Seschrock arc_state_t *evicted_state; 2762ea8dc4b6Seschrock 2763ea8dc4b6Seschrock ASSERT(refcount_is_zero(&hdr->b_refcnt)); 2764ea8dc4b6Seschrock 2765ea8dc4b6Seschrock evicted_state = 276644cb6abcSbmc (old_state == arc_mru) ?
arc_mru_ghost : arc_mfu_ghost; 2767ea8dc4b6Seschrock 276844cb6abcSbmc mutex_enter(&old_state->arcs_mtx); 276944cb6abcSbmc mutex_enter(&evicted_state->arcs_mtx); 2770ea8dc4b6Seschrock 2771ea8dc4b6Seschrock arc_change_state(evicted_state, hdr, hash_lock); 2772ea8dc4b6Seschrock ASSERT(HDR_IN_HASH_TABLE(hdr)); 2773fa94a07fSbrendan hdr->b_flags |= ARC_IN_HASH_TABLE; 2774fa94a07fSbrendan hdr->b_flags &= ~ARC_BUF_AVAILABLE; 2775ea8dc4b6Seschrock 277644cb6abcSbmc mutex_exit(&evicted_state->arcs_mtx); 277744cb6abcSbmc mutex_exit(&old_state->arcs_mtx); 2778ea8dc4b6Seschrock } 2779ea8dc4b6Seschrock mutex_exit(hash_lock); 2780dd6ef538Smaybee 2781ea8dc4b6Seschrock VERIFY(buf->b_efunc(buf) == 0); 2782ea8dc4b6Seschrock buf->b_efunc = NULL; 2783ea8dc4b6Seschrock buf->b_private = NULL; 2784ea8dc4b6Seschrock buf->b_hdr = NULL; 2785ea8dc4b6Seschrock kmem_cache_free(buf_cache, buf); 2786ea8dc4b6Seschrock return (1); 2787ea8dc4b6Seschrock } 2788ea8dc4b6Seschrock 2789fa9e4066Sahrens /* 2790fa9e4066Sahrens * Release this buffer from the cache. This must be done 2791fa9e4066Sahrens * after a read and prior to modifying the buffer contents. 2792fa9e4066Sahrens * If the buffer has more than one reference, we must make 2793fa9e4066Sahrens * a new hdr for the buffer.
2794fa9e4066Sahrens */ 2795fa9e4066Sahrens void 2796fa9e4066Sahrens arc_release(arc_buf_t *buf, void *tag) 2797fa9e4066Sahrens { 2798fa9e4066Sahrens arc_buf_hdr_t *hdr = buf->b_hdr; 2799fa9e4066Sahrens kmutex_t *hash_lock = HDR_LOCK(hdr); 2800fa94a07fSbrendan l2arc_buf_hdr_t *l2hdr = NULL; 2801fa94a07fSbrendan uint64_t buf_size; 2802fa9e4066Sahrens 2803fa9e4066Sahrens /* this buffer is not on any list */ 2804fa9e4066Sahrens ASSERT(refcount_count(&hdr->b_refcnt) > 0); 2805fa9e4066Sahrens 280644cb6abcSbmc if (hdr->b_state == arc_anon) { 2807fa9e4066Sahrens /* this buffer is already released */ 2808fa9e4066Sahrens ASSERT3U(refcount_count(&hdr->b_refcnt), ==, 1); 2809fa9e4066Sahrens ASSERT(BUF_EMPTY(hdr)); 2810ea8dc4b6Seschrock ASSERT(buf->b_efunc == NULL); 28116b4acc8bSahrens arc_buf_thaw(buf); 2812fa9e4066Sahrens return; 2813fa9e4066Sahrens } 2814fa9e4066Sahrens 2815fa9e4066Sahrens mutex_enter(hash_lock); 2816fa9e4066Sahrens 2817ea8dc4b6Seschrock /* 2818ea8dc4b6Seschrock * Do we have more than one buf? 2819ea8dc4b6Seschrock */ 2820ea8dc4b6Seschrock if (hdr->b_buf != buf || buf->b_next != NULL) { 2821fa9e4066Sahrens arc_buf_hdr_t *nhdr; 2822fa9e4066Sahrens arc_buf_t **bufp; 2823fa9e4066Sahrens uint64_t blksz = hdr->b_size; 2824fa9e4066Sahrens spa_t *spa = hdr->b_spa; 2825ad23a2dbSjohansen arc_buf_contents_t type = hdr->b_type; 2826fa94a07fSbrendan uint32_t flags = hdr->b_flags; 2827fa9e4066Sahrens 2828ea8dc4b6Seschrock ASSERT(hdr->b_datacnt > 1); 2829fa9e4066Sahrens /* 2830fa9e4066Sahrens * Pull the data off of this buf and attach it to 2831fa9e4066Sahrens * a new anonymous buf. 
2832fa9e4066Sahrens */ 2833ea8dc4b6Seschrock (void) remove_reference(hdr, hash_lock, tag); 2834fa9e4066Sahrens bufp = &hdr->b_buf; 2835ea8dc4b6Seschrock while (*bufp != buf) 2836fa9e4066Sahrens bufp = &(*bufp)->b_next; 2837fa9e4066Sahrens *bufp = (*bufp)->b_next; 2838af2c4821Smaybee buf->b_next = NULL; 2839ea8dc4b6Seschrock 284044cb6abcSbmc ASSERT3U(hdr->b_state->arcs_size, >=, hdr->b_size); 284144cb6abcSbmc atomic_add_64(&hdr->b_state->arcs_size, -hdr->b_size); 2842ea8dc4b6Seschrock if (refcount_is_zero(&hdr->b_refcnt)) { 28430e8c6158Smaybee uint64_t *size = &hdr->b_state->arcs_lsize[hdr->b_type]; 28440e8c6158Smaybee ASSERT3U(*size, >=, hdr->b_size); 28450e8c6158Smaybee atomic_add_64(size, -hdr->b_size); 2846ea8dc4b6Seschrock } 2847ea8dc4b6Seschrock hdr->b_datacnt -= 1; 2848fa94a07fSbrendan if (hdr->b_l2hdr != NULL) { 2849fa94a07fSbrendan mutex_enter(&l2arc_buflist_mtx); 2850fa94a07fSbrendan l2hdr = hdr->b_l2hdr; 2851fa94a07fSbrendan hdr->b_l2hdr = NULL; 2852fa94a07fSbrendan buf_size = hdr->b_size; 2853fa94a07fSbrendan } 2854c717a561Smaybee arc_cksum_verify(buf); 2855ea8dc4b6Seschrock 2856fa9e4066Sahrens mutex_exit(hash_lock); 2857fa9e4066Sahrens 28581ab7f2deSmaybee nhdr = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE); 2859fa9e4066Sahrens nhdr->b_size = blksz; 2860fa9e4066Sahrens nhdr->b_spa = spa; 2861ad23a2dbSjohansen nhdr->b_type = type; 2862fa9e4066Sahrens nhdr->b_buf = buf; 286344cb6abcSbmc nhdr->b_state = arc_anon; 2864fa9e4066Sahrens nhdr->b_arc_access = 0; 2865fa94a07fSbrendan nhdr->b_flags = flags & ARC_L2_WRITING; 2866fa94a07fSbrendan nhdr->b_l2hdr = NULL; 2867ea8dc4b6Seschrock nhdr->b_datacnt = 1; 2868c717a561Smaybee nhdr->b_freeze_cksum = NULL; 2869fa9e4066Sahrens (void) refcount_add(&nhdr->b_refcnt, tag); 2870af2c4821Smaybee buf->b_hdr = nhdr; 287144cb6abcSbmc atomic_add_64(&arc_anon->arcs_size, blksz); 2872fa9e4066Sahrens } else { 2873ea8dc4b6Seschrock ASSERT(refcount_count(&hdr->b_refcnt) == 1); 2874fa9e4066Sahrens 
ASSERT(!list_link_active(&hdr->b_arc_node)); 2875fa9e4066Sahrens ASSERT(!HDR_IO_IN_PROGRESS(hdr)); 287644cb6abcSbmc arc_change_state(arc_anon, hdr, hash_lock); 2877fa9e4066Sahrens hdr->b_arc_access = 0; 2878fa94a07fSbrendan if (hdr->b_l2hdr != NULL) { 2879fa94a07fSbrendan mutex_enter(&l2arc_buflist_mtx); 2880fa94a07fSbrendan l2hdr = hdr->b_l2hdr; 2881fa94a07fSbrendan hdr->b_l2hdr = NULL; 2882fa94a07fSbrendan buf_size = hdr->b_size; 2883fa94a07fSbrendan } 2884fa9e4066Sahrens mutex_exit(hash_lock); 2885fa94a07fSbrendan 2886fa9e4066Sahrens bzero(&hdr->b_dva, sizeof (dva_t)); 2887fa9e4066Sahrens hdr->b_birth = 0; 2888fa9e4066Sahrens hdr->b_cksum0 = 0; 2889c717a561Smaybee arc_buf_thaw(buf); 2890fa9e4066Sahrens } 2891ea8dc4b6Seschrock buf->b_efunc = NULL; 2892ea8dc4b6Seschrock buf->b_private = NULL; 2893fa94a07fSbrendan 2894fa94a07fSbrendan if (l2hdr) { 2895fa94a07fSbrendan list_remove(l2hdr->b_dev->l2ad_buflist, hdr); 2896fa94a07fSbrendan kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t)); 2897fa94a07fSbrendan ARCSTAT_INCR(arcstat_l2_size, -buf_size); 2898fa94a07fSbrendan } 2899fa94a07fSbrendan if (MUTEX_HELD(&l2arc_buflist_mtx)) 2900fa94a07fSbrendan mutex_exit(&l2arc_buflist_mtx); 2901fa9e4066Sahrens } 2902fa9e4066Sahrens 2903fa9e4066Sahrens int 2904fa9e4066Sahrens arc_released(arc_buf_t *buf) 2905fa9e4066Sahrens { 290644cb6abcSbmc return (buf->b_data != NULL && buf->b_hdr->b_state == arc_anon); 2907ea8dc4b6Seschrock } 2908ea8dc4b6Seschrock 2909ea8dc4b6Seschrock int 2910ea8dc4b6Seschrock arc_has_callback(arc_buf_t *buf) 2911ea8dc4b6Seschrock { 2912ea8dc4b6Seschrock return (buf->b_efunc != NULL); 2913fa9e4066Sahrens } 2914fa9e4066Sahrens 2915ea8dc4b6Seschrock #ifdef ZFS_DEBUG 2916ea8dc4b6Seschrock int 2917ea8dc4b6Seschrock arc_referenced(arc_buf_t *buf) 2918ea8dc4b6Seschrock { 2919ea8dc4b6Seschrock return (refcount_count(&buf->b_hdr->b_refcnt)); 2920ea8dc4b6Seschrock } 2921ea8dc4b6Seschrock #endif 2922ea8dc4b6Seschrock 2923c717a561Smaybee static void 2924c717a561Smaybee 
arc_write_ready(zio_t *zio) 2925c717a561Smaybee { 2926c717a561Smaybee arc_write_callback_t *callback = zio->io_private; 2927c717a561Smaybee arc_buf_t *buf = callback->awcb_buf; 29280a4e9518Sgw arc_buf_hdr_t *hdr = buf->b_hdr; 2929c717a561Smaybee 29300a4e9518Sgw if (zio->io_error == 0 && callback->awcb_ready) { 2931c717a561Smaybee ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt)); 2932c717a561Smaybee callback->awcb_ready(zio, buf, callback->awcb_private); 2933c717a561Smaybee } 29340a4e9518Sgw /* 29350a4e9518Sgw * If the IO is already in progress, then this is a re-write 29360a4e9518Sgw * attempt, so we need to thaw and re-compute the cksum. It is 29370a4e9518Sgw * the responsibility of the callback to handle the freeing 29380a4e9518Sgw * and accounting for any re-write attempt. If we don't have a 29390a4e9518Sgw * callback registered then simply free the block here. 29400a4e9518Sgw */ 29410a4e9518Sgw if (HDR_IO_IN_PROGRESS(hdr)) { 29420a4e9518Sgw if (!BP_IS_HOLE(&zio->io_bp_orig) && 29430a4e9518Sgw callback->awcb_ready == NULL) { 29440a4e9518Sgw zio_nowait(zio_free(zio, zio->io_spa, zio->io_txg, 29450a4e9518Sgw &zio->io_bp_orig, NULL, NULL)); 29460a4e9518Sgw } 29470a4e9518Sgw mutex_enter(&hdr->b_freeze_lock); 29480a4e9518Sgw if (hdr->b_freeze_cksum != NULL) { 29490a4e9518Sgw kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t)); 29500a4e9518Sgw hdr->b_freeze_cksum = NULL; 29510a4e9518Sgw } 29520a4e9518Sgw mutex_exit(&hdr->b_freeze_lock); 29530a4e9518Sgw } 2954fa94a07fSbrendan arc_cksum_compute(buf, B_FALSE); 29550a4e9518Sgw hdr->b_flags |= ARC_IO_IN_PROGRESS; 2956c717a561Smaybee } 2957c717a561Smaybee 2958fa9e4066Sahrens static void 2959fa9e4066Sahrens arc_write_done(zio_t *zio) 2960fa9e4066Sahrens { 2961c717a561Smaybee arc_write_callback_t *callback = zio->io_private; 2962c717a561Smaybee arc_buf_t *buf = callback->awcb_buf; 2963c717a561Smaybee arc_buf_hdr_t *hdr = buf->b_hdr; 2964fa9e4066Sahrens 2965fa9e4066Sahrens hdr->b_acb = NULL; 2966fa9e4066Sahrens 
2967fa9e4066Sahrens /* this buffer is on no lists and is not in the hash table */ 296844cb6abcSbmc ASSERT3P(hdr->b_state, ==, arc_anon); 2969fa9e4066Sahrens 2970fa9e4066Sahrens hdr->b_dva = *BP_IDENTITY(zio->io_bp); 2971fa9e4066Sahrens hdr->b_birth = zio->io_bp->blk_birth; 2972fa9e4066Sahrens hdr->b_cksum0 = zio->io_bp->blk_cksum.zc_word[0]; 2973ea8dc4b6Seschrock /* 2974ea8dc4b6Seschrock * If the block to be written was all-zero, we may have 2975ea8dc4b6Seschrock * compressed it away. In this case no write was performed 2976ea8dc4b6Seschrock * so there will be no dva/birth-date/checksum. The buffer 2977ea8dc4b6Seschrock * must therefore remain anonymous (and uncached). 2978ea8dc4b6Seschrock */ 2979fa9e4066Sahrens if (!BUF_EMPTY(hdr)) { 2980fa9e4066Sahrens arc_buf_hdr_t *exists; 2981fa9e4066Sahrens kmutex_t *hash_lock; 2982fa9e4066Sahrens 29836b4acc8bSahrens arc_cksum_verify(buf); 29846b4acc8bSahrens 2985fa9e4066Sahrens exists = buf_hash_insert(hdr, &hash_lock); 2986fa9e4066Sahrens if (exists) { 2987fa9e4066Sahrens /* 2988fa9e4066Sahrens * This can only happen if we overwrite for 2989fa9e4066Sahrens * sync-to-convergence, because we remove 2990fa9e4066Sahrens * buffers from the hash table when we arc_free().
2991fa9e4066Sahrens */ 2992fa9e4066Sahrens ASSERT(DVA_EQUAL(BP_IDENTITY(&zio->io_bp_orig), 2993fa9e4066Sahrens BP_IDENTITY(zio->io_bp))); 2994fa9e4066Sahrens ASSERT3U(zio->io_bp_orig.blk_birth, ==, 2995fa9e4066Sahrens zio->io_bp->blk_birth); 2996fa9e4066Sahrens 2997fa9e4066Sahrens ASSERT(refcount_is_zero(&exists->b_refcnt)); 299844cb6abcSbmc arc_change_state(arc_anon, exists, hash_lock); 2999fa9e4066Sahrens mutex_exit(hash_lock); 3000ea8dc4b6Seschrock arc_hdr_destroy(exists); 3001fa9e4066Sahrens exists = buf_hash_insert(hdr, &hash_lock); 3002fa9e4066Sahrens ASSERT3P(exists, ==, NULL); 3003fa9e4066Sahrens } 3004ea8dc4b6Seschrock hdr->b_flags &= ~ARC_IO_IN_PROGRESS; 300544eda4d7Smaybee arc_access(hdr, hash_lock); 300644eda4d7Smaybee mutex_exit(hash_lock); 3007c717a561Smaybee } else if (callback->awcb_done == NULL) { 3008ea8dc4b6Seschrock int destroy_hdr; 3009ea8dc4b6Seschrock /* 3010ea8dc4b6Seschrock * This is an anonymous buffer with no user callback, 3011ea8dc4b6Seschrock * destroy it if there are no active references. 
3012ea8dc4b6Seschrock */ 3013ea8dc4b6Seschrock mutex_enter(&arc_eviction_mtx); 3014ea8dc4b6Seschrock destroy_hdr = refcount_is_zero(&hdr->b_refcnt); 3015ea8dc4b6Seschrock hdr->b_flags &= ~ARC_IO_IN_PROGRESS; 3016ea8dc4b6Seschrock mutex_exit(&arc_eviction_mtx); 3017ea8dc4b6Seschrock if (destroy_hdr) 3018ea8dc4b6Seschrock arc_hdr_destroy(hdr); 3019ea8dc4b6Seschrock } else { 3020ea8dc4b6Seschrock hdr->b_flags &= ~ARC_IO_IN_PROGRESS; 3021fa9e4066Sahrens } 3022ea8dc4b6Seschrock 3023c717a561Smaybee if (callback->awcb_done) { 3024fa9e4066Sahrens ASSERT(!refcount_is_zero(&hdr->b_refcnt)); 3025c717a561Smaybee callback->awcb_done(zio, buf, callback->awcb_private); 3026fa9e4066Sahrens } 3027fa9e4066Sahrens 3028c717a561Smaybee kmem_free(callback, sizeof (arc_write_callback_t)); 3029fa9e4066Sahrens } 3030fa9e4066Sahrens 3031c717a561Smaybee zio_t * 303244cd46caSbillm arc_write(zio_t *pio, spa_t *spa, int checksum, int compress, int ncopies, 3033fa9e4066Sahrens uint64_t txg, blkptr_t *bp, arc_buf_t *buf, 3034c717a561Smaybee arc_done_func_t *ready, arc_done_func_t *done, void *private, int priority, 3035c717a561Smaybee int flags, zbookmark_t *zb) 3036fa9e4066Sahrens { 3037fa9e4066Sahrens arc_buf_hdr_t *hdr = buf->b_hdr; 3038c717a561Smaybee arc_write_callback_t *callback; 3039c717a561Smaybee zio_t *zio; 3040fa9e4066Sahrens 3041fa9e4066Sahrens /* this is a private buffer - no locking required */ 304244cb6abcSbmc ASSERT3P(hdr->b_state, ==, arc_anon); 3043fa9e4066Sahrens ASSERT(BUF_EMPTY(hdr)); 3044fa9e4066Sahrens ASSERT(!HDR_IO_ERROR(hdr)); 3045c5c6ffa0Smaybee ASSERT((hdr->b_flags & ARC_IO_IN_PROGRESS) == 0); 3046c5c6ffa0Smaybee ASSERT(hdr->b_acb == 0); 3047c717a561Smaybee callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP); 3048c717a561Smaybee callback->awcb_ready = ready; 3049c717a561Smaybee callback->awcb_done = done; 3050c717a561Smaybee callback->awcb_private = private; 3051c717a561Smaybee callback->awcb_buf = buf; 3052c717a561Smaybee zio = zio_write(pio, spa, 
checksum, compress, ncopies, txg, bp, 3053c717a561Smaybee buf->b_data, hdr->b_size, arc_write_ready, arc_write_done, callback, 3054c717a561Smaybee priority, flags, zb); 3055fa9e4066Sahrens 3056c717a561Smaybee return (zio); 3057fa9e4066Sahrens } 3058fa9e4066Sahrens 3059fa9e4066Sahrens int 3060fa9e4066Sahrens arc_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, 3061fa9e4066Sahrens zio_done_func_t *done, void *private, uint32_t arc_flags) 3062fa9e4066Sahrens { 3063fa9e4066Sahrens arc_buf_hdr_t *ab; 3064fa9e4066Sahrens kmutex_t *hash_lock; 3065fa9e4066Sahrens zio_t *zio; 3066fa9e4066Sahrens 3067fa9e4066Sahrens /* 3068fa9e4066Sahrens * If this buffer is in the cache, release it, so it 3069fa9e4066Sahrens * can be re-used. 3070fa9e4066Sahrens */ 3071fa9e4066Sahrens ab = buf_hash_find(spa, BP_IDENTITY(bp), bp->blk_birth, &hash_lock); 3072fa9e4066Sahrens if (ab != NULL) { 3073fa9e4066Sahrens /* 3074fa9e4066Sahrens * The checksum of blocks to free is not always 3075fa9e4066Sahrens * preserved (eg. on the deadlist). However, if it is 3076fa9e4066Sahrens * nonzero, it should match what we have in the cache. 3077fa9e4066Sahrens */ 3078fa9e4066Sahrens ASSERT(bp->blk_cksum.zc_word[0] == 0 || 3079fa9e4066Sahrens ab->b_cksum0 == bp->blk_cksum.zc_word[0]); 308044cb6abcSbmc if (ab->b_state != arc_anon) 308144cb6abcSbmc arc_change_state(arc_anon, ab, hash_lock); 308213506d1eSmaybee if (HDR_IO_IN_PROGRESS(ab)) { 308313506d1eSmaybee /* 308413506d1eSmaybee * This should only happen when we prefetch. 
308513506d1eSmaybee */ 308613506d1eSmaybee ASSERT(ab->b_flags & ARC_PREFETCH); 308713506d1eSmaybee ASSERT3U(ab->b_datacnt, ==, 1); 308813506d1eSmaybee ab->b_flags |= ARC_FREED_IN_READ; 308913506d1eSmaybee if (HDR_IN_HASH_TABLE(ab)) 309013506d1eSmaybee buf_hash_remove(ab); 309113506d1eSmaybee ab->b_arc_access = 0; 309213506d1eSmaybee bzero(&ab->b_dva, sizeof (dva_t)); 309313506d1eSmaybee ab->b_birth = 0; 309413506d1eSmaybee ab->b_cksum0 = 0; 309513506d1eSmaybee ab->b_buf->b_efunc = NULL; 309613506d1eSmaybee ab->b_buf->b_private = NULL; 309713506d1eSmaybee mutex_exit(hash_lock); 309813506d1eSmaybee } else if (refcount_is_zero(&ab->b_refcnt)) { 3099fa94a07fSbrendan ab->b_flags |= ARC_FREE_IN_PROGRESS; 3100fa9e4066Sahrens mutex_exit(hash_lock); 3101ea8dc4b6Seschrock arc_hdr_destroy(ab); 310244cb6abcSbmc ARCSTAT_BUMP(arcstat_deleted); 3103fa9e4066Sahrens } else { 3104bbf4a8dfSmaybee /* 310513506d1eSmaybee * We still have an active reference on this 310613506d1eSmaybee * buffer. This can happen, e.g., from 310713506d1eSmaybee * dbuf_unoverride(). 
3108bbf4a8dfSmaybee */ 310913506d1eSmaybee ASSERT(!HDR_IN_HASH_TABLE(ab)); 3110fa9e4066Sahrens ab->b_arc_access = 0; 3111fa9e4066Sahrens bzero(&ab->b_dva, sizeof (dva_t)); 3112fa9e4066Sahrens ab->b_birth = 0; 3113fa9e4066Sahrens ab->b_cksum0 = 0; 3114ea8dc4b6Seschrock ab->b_buf->b_efunc = NULL; 3115ea8dc4b6Seschrock ab->b_buf->b_private = NULL; 3116fa9e4066Sahrens mutex_exit(hash_lock); 3117fa9e4066Sahrens } 3118fa9e4066Sahrens } 3119fa9e4066Sahrens 3120fa9e4066Sahrens zio = zio_free(pio, spa, txg, bp, done, private); 3121fa9e4066Sahrens 3122fa9e4066Sahrens if (arc_flags & ARC_WAIT) 3123fa9e4066Sahrens return (zio_wait(zio)); 3124fa9e4066Sahrens 3125fa9e4066Sahrens ASSERT(arc_flags & ARC_NOWAIT); 3126fa9e4066Sahrens zio_nowait(zio); 3127fa9e4066Sahrens 3128fa9e4066Sahrens return (0); 3129fa9e4066Sahrens } 3130fa9e4066Sahrens 31311ab7f2deSmaybee static int 31321ab7f2deSmaybee arc_memory_throttle(uint64_t reserve, uint64_t txg) 31331ab7f2deSmaybee { 31341ab7f2deSmaybee #ifdef _KERNEL 31351ab7f2deSmaybee uint64_t inflight_data = arc_anon->arcs_size; 31361ab7f2deSmaybee uint64_t available_memory = ptob(freemem); 31371ab7f2deSmaybee static uint64_t page_load = 0; 31381ab7f2deSmaybee static uint64_t last_txg = 0; 31391ab7f2deSmaybee 31401ab7f2deSmaybee #if defined(__i386) 31411ab7f2deSmaybee available_memory = 31421ab7f2deSmaybee MIN(available_memory, vmem_size(heap_arena, VMEM_FREE)); 31431ab7f2deSmaybee #endif 31441ab7f2deSmaybee if (available_memory >= zfs_write_limit_max) 31451ab7f2deSmaybee return (0); 31461ab7f2deSmaybee 31471ab7f2deSmaybee if (txg > last_txg) { 31481ab7f2deSmaybee last_txg = txg; 31491ab7f2deSmaybee page_load = 0; 31501ab7f2deSmaybee } 31511ab7f2deSmaybee /* 31521ab7f2deSmaybee * If we are in pageout, we know that memory is already tight, 31531ab7f2deSmaybee * the arc is already going to be evicting, so we just want to 31541ab7f2deSmaybee * continue to let page writes occur as quickly as possible. 
31551ab7f2deSmaybee */ 31561ab7f2deSmaybee if (curproc == proc_pageout) { 31571ab7f2deSmaybee if (page_load > MAX(ptob(minfree), available_memory) / 4) 31581ab7f2deSmaybee return (ERESTART); 31591ab7f2deSmaybee /* Note: reserve is inflated, so we deflate */ 31601ab7f2deSmaybee page_load += reserve / 8; 31611ab7f2deSmaybee return (0); 31621ab7f2deSmaybee } else if (page_load > 0 && arc_reclaim_needed()) { 31631ab7f2deSmaybee /* memory is low, delay before restarting */ 31641ab7f2deSmaybee ARCSTAT_INCR(arcstat_memory_throttle_count, 1); 31651ab7f2deSmaybee return (EAGAIN); 31661ab7f2deSmaybee } 31671ab7f2deSmaybee page_load = 0; 31681ab7f2deSmaybee 31691ab7f2deSmaybee if (arc_size > arc_c_min) { 31701ab7f2deSmaybee uint64_t evictable_memory = 31711ab7f2deSmaybee arc_mru->arcs_lsize[ARC_BUFC_DATA] + 31721ab7f2deSmaybee arc_mru->arcs_lsize[ARC_BUFC_METADATA] + 31731ab7f2deSmaybee arc_mfu->arcs_lsize[ARC_BUFC_DATA] + 31741ab7f2deSmaybee arc_mfu->arcs_lsize[ARC_BUFC_METADATA]; 31751ab7f2deSmaybee available_memory += MIN(evictable_memory, arc_size - arc_c_min); 31761ab7f2deSmaybee } 31771ab7f2deSmaybee 31781ab7f2deSmaybee if (inflight_data > available_memory / 4) { 31791ab7f2deSmaybee ARCSTAT_INCR(arcstat_memory_throttle_count, 1); 31801ab7f2deSmaybee return (ERESTART); 31811ab7f2deSmaybee } 31821ab7f2deSmaybee #endif 31831ab7f2deSmaybee return (0); 31841ab7f2deSmaybee } 31851ab7f2deSmaybee 3186fa9e4066Sahrens void 31871ab7f2deSmaybee arc_tempreserve_clear(uint64_t reserve) 3188fa9e4066Sahrens { 31891ab7f2deSmaybee atomic_add_64(&arc_tempreserve, -reserve); 3190fa9e4066Sahrens ASSERT((int64_t)arc_tempreserve >= 0); 3191fa9e4066Sahrens } 3192fa9e4066Sahrens 3193fa9e4066Sahrens int 31941ab7f2deSmaybee arc_tempreserve_space(uint64_t reserve, uint64_t txg) 3195fa9e4066Sahrens { 31961ab7f2deSmaybee int error; 31971ab7f2deSmaybee 3198fa9e4066Sahrens #ifdef ZFS_DEBUG 3199fa9e4066Sahrens /* 3200fa9e4066Sahrens * Once in a while, fail for no reason. Everything should cope. 
3201fa9e4066Sahrens */ 3202fa9e4066Sahrens if (spa_get_random(10000) == 0) { 3203fa9e4066Sahrens dprintf("forcing random failure\n"); 3204fa9e4066Sahrens return (ERESTART); 3205fa9e4066Sahrens } 3206fa9e4066Sahrens #endif 32071ab7f2deSmaybee if (reserve > arc_c/4 && !arc_no_grow) 32081ab7f2deSmaybee arc_c = MIN(arc_c_max, reserve * 4); 32091ab7f2deSmaybee if (reserve > arc_c) 3210112fe045Smaybee return (ENOMEM); 3211112fe045Smaybee 32121ab7f2deSmaybee /* 32131ab7f2deSmaybee * Writes will, almost always, require additional memory allocations 32141ab7f2deSmaybee * in order to compress/encrypt/etc the data. We therefore need to 32151ab7f2deSmaybee * make sure that there is sufficient available memory for this. 32161ab7f2deSmaybee */ 32171ab7f2deSmaybee if (error = arc_memory_throttle(reserve, txg)) 32181ab7f2deSmaybee return (error); 32191ab7f2deSmaybee 3220fa9e4066Sahrens /* 3221112fe045Smaybee * Throttle writes when the amount of dirty data in the cache 3222112fe045Smaybee * gets too large. We try to keep the cache less than half full 3223112fe045Smaybee * of dirty blocks so that our sync times don't grow too large. 3224112fe045Smaybee * Note: if two requests come in concurrently, we might let them 3225112fe045Smaybee * both succeed, when one of them should fail. Not a huge deal.
3226fa9e4066Sahrens */ 32271ab7f2deSmaybee if (reserve + arc_tempreserve + arc_anon->arcs_size > arc_c / 2 && 32281ab7f2deSmaybee arc_anon->arcs_size > arc_c / 4) { 32290e8c6158Smaybee dprintf("failing, arc_tempreserve=%lluK anon_meta=%lluK " 32300e8c6158Smaybee "anon_data=%lluK tempreserve=%lluK arc_c=%lluK\n", 32310e8c6158Smaybee arc_tempreserve>>10, 32320e8c6158Smaybee arc_anon->arcs_lsize[ARC_BUFC_METADATA]>>10, 32330e8c6158Smaybee arc_anon->arcs_lsize[ARC_BUFC_DATA]>>10, 32341ab7f2deSmaybee reserve>>10, arc_c>>10); 3235fa9e4066Sahrens return (ERESTART); 3236fa9e4066Sahrens } 32371ab7f2deSmaybee atomic_add_64(&arc_tempreserve, reserve); 3238fa9e4066Sahrens return (0); 3239fa9e4066Sahrens } 3240fa9e4066Sahrens 3241fa9e4066Sahrens void 3242fa9e4066Sahrens arc_init(void) 3243fa9e4066Sahrens { 3244fa9e4066Sahrens mutex_init(&arc_reclaim_thr_lock, NULL, MUTEX_DEFAULT, NULL); 3245fa9e4066Sahrens cv_init(&arc_reclaim_thr_cv, NULL, CV_DEFAULT, NULL); 3246fa9e4066Sahrens 324713506d1eSmaybee /* Convert seconds to clock ticks */ 3248b19a79ecSperrin arc_min_prefetch_lifespan = 1 * hz; 324913506d1eSmaybee 3250fa9e4066Sahrens /* Start out with 1/8 of all memory */ 325144cb6abcSbmc arc_c = physmem * PAGESIZE / 8; 3252fa9e4066Sahrens 3253fa9e4066Sahrens #ifdef _KERNEL 3254fa9e4066Sahrens /* 3255fa9e4066Sahrens * On architectures where the physical memory can be larger 3256fa9e4066Sahrens * than the addressable space (intel in 32-bit mode), we may 3257fa9e4066Sahrens * need to limit the cache to 1/8 of VM size. 
3258fa9e4066Sahrens */ 325944cb6abcSbmc arc_c = MIN(arc_c, vmem_size(heap_arena, VMEM_ALLOC | VMEM_FREE) / 8); 3260fa9e4066Sahrens #endif 3261fa9e4066Sahrens 3262112fe045Smaybee /* set min cache to 1/32 of all memory, or 64MB, whichever is more */ 326344cb6abcSbmc arc_c_min = MAX(arc_c / 4, 64<<20); 3264112fe045Smaybee /* set max to 3/4 of all memory, or all but 1GB, whichever is more */ 326544cb6abcSbmc if (arc_c * 8 >= 1<<30) 326644cb6abcSbmc arc_c_max = (arc_c * 8) - (1<<30); 3267fa9e4066Sahrens else 326844cb6abcSbmc arc_c_max = arc_c_min; 326944cb6abcSbmc arc_c_max = MAX(arc_c * 6, arc_c_max); 3270a2eea2e1Sahrens 3271a2eea2e1Sahrens /* 3272a2eea2e1Sahrens * Allow the tunables to override our calculations if they are 3273a2eea2e1Sahrens * reasonable (ie. over 64MB) 3274a2eea2e1Sahrens */ 3275a2eea2e1Sahrens if (zfs_arc_max > 64<<20 && zfs_arc_max < physmem * PAGESIZE) 327644cb6abcSbmc arc_c_max = zfs_arc_max; 327744cb6abcSbmc if (zfs_arc_min > 64<<20 && zfs_arc_min <= arc_c_max) 327844cb6abcSbmc arc_c_min = zfs_arc_min; 3279a2eea2e1Sahrens 328044cb6abcSbmc arc_c = arc_c_max; 328144cb6abcSbmc arc_p = (arc_c >> 1); 3282fa9e4066Sahrens 32830e8c6158Smaybee /* limit meta-data to 1/4 of the arc capacity */ 32840e8c6158Smaybee arc_meta_limit = arc_c_max / 4; 32851116048bSek 32861116048bSek /* Allow the tunable to override if it is reasonable */ 32871116048bSek if (zfs_arc_meta_limit > 0 && zfs_arc_meta_limit <= arc_c_max) 32881116048bSek arc_meta_limit = zfs_arc_meta_limit; 32891116048bSek 32900e8c6158Smaybee if (arc_c_min < arc_meta_limit / 2 && zfs_arc_min == 0) 32910e8c6158Smaybee arc_c_min = arc_meta_limit / 2; 32920e8c6158Smaybee 3293fa9e4066Sahrens /* if kmem_flags are set, lets try to use less memory */ 3294fa9e4066Sahrens if (kmem_debugging()) 329544cb6abcSbmc arc_c = arc_c / 2; 329644cb6abcSbmc if (arc_c < arc_c_min) 329744cb6abcSbmc arc_c = arc_c_min; 329844cb6abcSbmc 329944cb6abcSbmc arc_anon = &ARC_anon; 330044cb6abcSbmc arc_mru = &ARC_mru; 330144cb6abcSbmc 
arc_mru_ghost = &ARC_mru_ghost; 330244cb6abcSbmc arc_mfu = &ARC_mfu; 330344cb6abcSbmc arc_mfu_ghost = &ARC_mfu_ghost; 3304fa94a07fSbrendan arc_l2c_only = &ARC_l2c_only; 330544cb6abcSbmc arc_size = 0; 330644cb6abcSbmc 330744cb6abcSbmc mutex_init(&arc_anon->arcs_mtx, NULL, MUTEX_DEFAULT, NULL); 330844cb6abcSbmc mutex_init(&arc_mru->arcs_mtx, NULL, MUTEX_DEFAULT, NULL); 330944cb6abcSbmc mutex_init(&arc_mru_ghost->arcs_mtx, NULL, MUTEX_DEFAULT, NULL); 331044cb6abcSbmc mutex_init(&arc_mfu->arcs_mtx, NULL, MUTEX_DEFAULT, NULL); 331144cb6abcSbmc mutex_init(&arc_mfu_ghost->arcs_mtx, NULL, MUTEX_DEFAULT, NULL); 3312fa94a07fSbrendan mutex_init(&arc_l2c_only->arcs_mtx, NULL, MUTEX_DEFAULT, NULL); 331344cb6abcSbmc 33140e8c6158Smaybee list_create(&arc_mru->arcs_list[ARC_BUFC_METADATA], 33150e8c6158Smaybee sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); 33160e8c6158Smaybee list_create(&arc_mru->arcs_list[ARC_BUFC_DATA], 33170e8c6158Smaybee sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); 33180e8c6158Smaybee list_create(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA], 33190e8c6158Smaybee sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); 33200e8c6158Smaybee list_create(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA], 33210e8c6158Smaybee sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); 33220e8c6158Smaybee list_create(&arc_mfu->arcs_list[ARC_BUFC_METADATA], 33230e8c6158Smaybee sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); 33240e8c6158Smaybee list_create(&arc_mfu->arcs_list[ARC_BUFC_DATA], 33250e8c6158Smaybee sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); 33260e8c6158Smaybee list_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA], 33270e8c6158Smaybee sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); 33280e8c6158Smaybee list_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA], 33290e8c6158Smaybee sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); 3330fa94a07fSbrendan 
list_create(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA], 3331fa94a07fSbrendan sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); 3332fa94a07fSbrendan list_create(&arc_l2c_only->arcs_list[ARC_BUFC_DATA], 3333fa94a07fSbrendan sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); 3334fa9e4066Sahrens 3335fa9e4066Sahrens buf_init(); 3336fa9e4066Sahrens 3337fa9e4066Sahrens arc_thread_exit = 0; 3338ea8dc4b6Seschrock arc_eviction_list = NULL; 3339ea8dc4b6Seschrock mutex_init(&arc_eviction_mtx, NULL, MUTEX_DEFAULT, NULL); 334040d7d650Smaybee bzero(&arc_eviction_hdr, sizeof (arc_buf_hdr_t)); 3341fa9e4066Sahrens 334244cb6abcSbmc arc_ksp = kstat_create("zfs", 0, "arcstats", "misc", KSTAT_TYPE_NAMED, 334344cb6abcSbmc sizeof (arc_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL); 334444cb6abcSbmc 334544cb6abcSbmc if (arc_ksp != NULL) { 334644cb6abcSbmc arc_ksp->ks_data = &arc_stats; 334744cb6abcSbmc kstat_install(arc_ksp); 334844cb6abcSbmc } 334944cb6abcSbmc 3350fa9e4066Sahrens (void) thread_create(NULL, 0, arc_reclaim_thread, NULL, 0, &p0, 3351fa9e4066Sahrens TS_RUN, minclsyspri); 335249e3519aSmaybee 335349e3519aSmaybee arc_dead = FALSE; 3354*3a737e0dSbrendan arc_warm = B_FALSE; 33551ab7f2deSmaybee 33561ab7f2deSmaybee if (zfs_write_limit_max == 0) 33571ab7f2deSmaybee zfs_write_limit_max = physmem * PAGESIZE >> 33581ab7f2deSmaybee zfs_write_limit_shift; 33591ab7f2deSmaybee else 33601ab7f2deSmaybee zfs_write_limit_shift = 0; 3361fa9e4066Sahrens } 3362fa9e4066Sahrens 3363fa9e4066Sahrens void 3364fa9e4066Sahrens arc_fini(void) 3365fa9e4066Sahrens { 3366fa9e4066Sahrens mutex_enter(&arc_reclaim_thr_lock); 3367fa9e4066Sahrens arc_thread_exit = 1; 3368fa9e4066Sahrens while (arc_thread_exit != 0) 3369fa9e4066Sahrens cv_wait(&arc_reclaim_thr_cv, &arc_reclaim_thr_lock); 3370fa9e4066Sahrens mutex_exit(&arc_reclaim_thr_lock); 3371fa9e4066Sahrens 3372874395d5Smaybee arc_flush(NULL); 3373fa9e4066Sahrens 3374fa9e4066Sahrens arc_dead = TRUE; 3375fa9e4066Sahrens 
337644cb6abcSbmc if (arc_ksp != NULL) { 337744cb6abcSbmc kstat_delete(arc_ksp); 337844cb6abcSbmc arc_ksp = NULL; 337944cb6abcSbmc } 338044cb6abcSbmc 3381ea8dc4b6Seschrock mutex_destroy(&arc_eviction_mtx); 3382fa9e4066Sahrens mutex_destroy(&arc_reclaim_thr_lock); 3383fa9e4066Sahrens cv_destroy(&arc_reclaim_thr_cv); 3384fa9e4066Sahrens 33850e8c6158Smaybee list_destroy(&arc_mru->arcs_list[ARC_BUFC_METADATA]); 33860e8c6158Smaybee list_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA]); 33870e8c6158Smaybee list_destroy(&arc_mfu->arcs_list[ARC_BUFC_METADATA]); 33880e8c6158Smaybee list_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA]); 33890e8c6158Smaybee list_destroy(&arc_mru->arcs_list[ARC_BUFC_DATA]); 33900e8c6158Smaybee list_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA]); 33910e8c6158Smaybee list_destroy(&arc_mfu->arcs_list[ARC_BUFC_DATA]); 33920e8c6158Smaybee list_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA]); 3393fa9e4066Sahrens 339444cb6abcSbmc mutex_destroy(&arc_anon->arcs_mtx); 339544cb6abcSbmc mutex_destroy(&arc_mru->arcs_mtx); 339644cb6abcSbmc mutex_destroy(&arc_mru_ghost->arcs_mtx); 339744cb6abcSbmc mutex_destroy(&arc_mfu->arcs_mtx); 339844cb6abcSbmc mutex_destroy(&arc_mfu_ghost->arcs_mtx); 33995ad82045Snd 3400fa9e4066Sahrens buf_fini(); 3401fa9e4066Sahrens } 3402fa94a07fSbrendan 3403fa94a07fSbrendan /* 3404fa94a07fSbrendan * Level 2 ARC 3405fa94a07fSbrendan * 3406fa94a07fSbrendan * The level 2 ARC (L2ARC) is a cache layer in-between main memory and disk. 3407fa94a07fSbrendan * It uses dedicated storage devices to hold cached data, which are populated 3408fa94a07fSbrendan * using large infrequent writes. The main role of this cache is to boost 3409fa94a07fSbrendan * the performance of random read workloads. The intended L2ARC devices 3410fa94a07fSbrendan * include short-stroked disks, solid state disks, and other media with 3411fa94a07fSbrendan * substantially faster read latency than disk. 
 *
 *                 +-----------------------+
 *                 |         ARC           |
 *                 +-----------------------+
 *                    |         ^     ^
 *                    |         |     |
 *      l2arc_feed_thread()    arc_read()
 *                    |         |     |
 *                    |  l2arc read   |
 *                    V         |     |
 *               +---------------+    |
 *               |     L2ARC     |    |
 *               +---------------+    |
 *                   |    ^           |
 *          l2arc_write() |           |
 *                   |    |           |
 *                   V    |           |
 *                 +-------+      +-------+
 *                 | vdev  |      | vdev  |
 *                 | cache |      | cache |
 *                 +-------+      +-------+
 *                 +=========+     .-----.
 *                 :  L2ARC  :    |-_____-|
 *                 : devices :    | Disks |
 *                 +=========+    `-_____-'
 *
 * Read requests are satisfied from the following sources, in order:
 *
 *	1) ARC
 *	2) vdev cache of L2ARC devices
 *	3) L2ARC devices
 *	4) vdev cache of disks
 *	5) disks
 *
 * Some L2ARC device types exhibit extremely slow write performance.
 * To accommodate for this there are some significant differences between
 * the L2ARC and traditional cache design:
 *
 * 1. There is no eviction path from the ARC to the L2ARC.  Evictions from
 * the ARC behave as usual, freeing buffers and placing headers on ghost
 * lists.  The ARC does not send buffers to the L2ARC during eviction as
 * this would add inflated write latencies for all ARC memory pressure.
 *
 * 2. The L2ARC attempts to cache data from the ARC before it is evicted.
 * It does this by periodically scanning buffers from the eviction-end of
 * the MFU and MRU ARC lists, copying them to the L2ARC devices if they are
 * not already there.  It scans until a headroom of buffers is satisfied,
 * which itself is a buffer for ARC eviction.  The thread that does this is
 * l2arc_feed_thread(), illustrated below; example sizes are included to
 * provide a better sense of ratio than this diagram:
 *
 *	       head -->                        tail
 *	        +---------------------+----------+
 *	ARC_mfu |:::::#:::::::::::::::|o#o###o###|-->.   # already on L2ARC
 *	        +---------------------+----------+   |   o L2ARC eligible
 *	ARC_mru |:#:::::::::::::::::::|#o#ooo####|-->|   : ARC buffer
 *	        +---------------------+----------+   |
 *	             15.9 Gbytes      ^ 32 Mbytes    |
 *	                           headroom          |
 *	                                      l2arc_feed_thread()
 *	                                             |
 *	                 l2arc write hand <--[oooo]--'
 *	                         |           8 Mbyte
 *	                         |          write max
 *	                         V
 *		  +==============================+
 *	L2ARC dev |####|#|###|###|    |####| ... |
 *	          +==============================+
 *	                     32 Gbytes
 *
 * 3. If an ARC buffer is copied to the L2ARC but then hit instead of
 * evicted, then the L2ARC has cached a buffer much sooner than it probably
 * needed to, potentially wasting L2ARC device bandwidth and storage.  It is
 * safe to say that this is an uncommon case, since buffers at the end of
 * the ARC lists have moved there due to inactivity.
 *
 * 4. If the ARC evicts faster than the L2ARC can maintain a headroom,
 * then the L2ARC simply misses copying some buffers.  This serves as a
 * pressure valve to prevent heavy read workloads from both stalling the ARC
 * with waits and clogging the L2ARC with writes.  This also helps prevent
 * the potential for the L2ARC to churn if it attempts to cache content too
 * quickly, such as during backups of the entire pool.
 *
 * 5. After system boot and before the ARC has filled main memory, there are
 * no evictions from the ARC and so the tails of the ARC_mfu and ARC_mru
 * lists can remain mostly static.  Instead of searching from tail of these
 * lists as pictured, the l2arc_feed_thread() will search from the list heads
 * for eligible buffers, greatly increasing its chance of finding them.
 *
 * The L2ARC device write speed is also boosted during this time so that
 * the L2ARC warms up faster.  Since there have been no ARC evictions yet,
 * there are no L2ARC reads, and no fear of degrading read performance
 * through increased writes.
 *
 * 6. Writes to the L2ARC devices are grouped and sent in-sequence, so that
 * the vdev queue can aggregate them into larger and fewer writes.  Each
 * device is written to in a rotor fashion, sweeping writes through
 * available space then repeating.
3510fa94a07fSbrendan * 3511*3a737e0dSbrendan * 7. The L2ARC does not store dirty content. It never needs to flush 3512fa94a07fSbrendan * write buffers back to disk based storage. 3513fa94a07fSbrendan * 3514*3a737e0dSbrendan * 8. If an ARC buffer is written (and dirtied) which also exists in the 3515fa94a07fSbrendan * L2ARC, the now stale L2ARC buffer is immediately dropped. 3516fa94a07fSbrendan * 3517fa94a07fSbrendan * The performance of the L2ARC can be tweaked by a number of tunables, which 3518fa94a07fSbrendan * may be necessary for different workloads: 3519fa94a07fSbrendan * 3520fa94a07fSbrendan * l2arc_write_max max write bytes per interval 3521*3a737e0dSbrendan * l2arc_write_boost extra write bytes during device warmup 3522fa94a07fSbrendan * l2arc_noprefetch skip caching prefetched buffers 3523fa94a07fSbrendan * l2arc_headroom number of max device writes to precache 3524fa94a07fSbrendan * l2arc_feed_secs seconds between L2ARC writing 3525fa94a07fSbrendan * 3526fa94a07fSbrendan * Tunables may be removed or added as future performance improvements are 3527fa94a07fSbrendan * integrated, and also may become zpool properties. 3528fa94a07fSbrendan */ 3529fa94a07fSbrendan 3530fa94a07fSbrendan static void 3531fa94a07fSbrendan l2arc_hdr_stat_add(void) 3532fa94a07fSbrendan { 3533e6c728e1Sbrendan ARCSTAT_INCR(arcstat_l2_hdr_size, HDR_SIZE + L2HDR_SIZE); 3534e6c728e1Sbrendan ARCSTAT_INCR(arcstat_hdr_size, -HDR_SIZE); 3535fa94a07fSbrendan } 3536fa94a07fSbrendan 3537fa94a07fSbrendan static void 3538fa94a07fSbrendan l2arc_hdr_stat_remove(void) 3539fa94a07fSbrendan { 3540e6c728e1Sbrendan ARCSTAT_INCR(arcstat_l2_hdr_size, -(HDR_SIZE + L2HDR_SIZE)); 3541e6c728e1Sbrendan ARCSTAT_INCR(arcstat_hdr_size, HDR_SIZE); 3542fa94a07fSbrendan } 3543fa94a07fSbrendan 3544fa94a07fSbrendan /* 3545fa94a07fSbrendan * Cycle through L2ARC devices. This is how L2ARC load balances. 3546*3a737e0dSbrendan * If a device is returned, this also returns holding the spa config lock. 
3547fa94a07fSbrendan */ 3548fa94a07fSbrendan static l2arc_dev_t * 3549fa94a07fSbrendan l2arc_dev_get_next(void) 3550fa94a07fSbrendan { 3551*3a737e0dSbrendan l2arc_dev_t *first, *next = NULL; 3552*3a737e0dSbrendan 3553*3a737e0dSbrendan /* 3554*3a737e0dSbrendan * Lock out the removal of spas (spa_namespace_lock), then removal 3555*3a737e0dSbrendan * of cache devices (l2arc_dev_mtx). Once a device has been selected, 3556*3a737e0dSbrendan * both locks will be dropped and a spa config lock held instead. 3557*3a737e0dSbrendan */ 3558*3a737e0dSbrendan mutex_enter(&spa_namespace_lock); 3559*3a737e0dSbrendan mutex_enter(&l2arc_dev_mtx); 3560fa94a07fSbrendan 3561c5904d13Seschrock /* if there are no vdevs, there is nothing to do */ 3562c5904d13Seschrock if (l2arc_ndev == 0) 3563*3a737e0dSbrendan goto out; 3564c5904d13Seschrock 3565c5904d13Seschrock first = NULL; 3566c5904d13Seschrock next = l2arc_dev_last; 3567c5904d13Seschrock do { 3568c5904d13Seschrock /* loop around the list looking for a non-faulted vdev */ 3569c5904d13Seschrock if (next == NULL) { 3570fa94a07fSbrendan next = list_head(l2arc_dev_list); 3571c5904d13Seschrock } else { 3572c5904d13Seschrock next = list_next(l2arc_dev_list, next); 3573c5904d13Seschrock if (next == NULL) 3574c5904d13Seschrock next = list_head(l2arc_dev_list); 3575c5904d13Seschrock } 3576c5904d13Seschrock 3577c5904d13Seschrock /* if we have come back to the start, bail out */ 3578c5904d13Seschrock if (first == NULL) 3579c5904d13Seschrock first = next; 3580c5904d13Seschrock else if (next == first) 3581c5904d13Seschrock break; 3582c5904d13Seschrock 3583c5904d13Seschrock } while (vdev_is_dead(next->l2ad_vdev)); 3584c5904d13Seschrock 3585c5904d13Seschrock /* if we were unable to find any usable vdevs, return NULL */ 3586c5904d13Seschrock if (vdev_is_dead(next->l2ad_vdev)) 3587*3a737e0dSbrendan next = NULL; 3588fa94a07fSbrendan 3589fa94a07fSbrendan l2arc_dev_last = next; 3590fa94a07fSbrendan 3591*3a737e0dSbrendan out: 3592*3a737e0dSbrendan 
mutex_exit(&l2arc_dev_mtx); 3593*3a737e0dSbrendan 3594*3a737e0dSbrendan /* 3595*3a737e0dSbrendan * Grab the config lock to prevent the 'next' device from being 3596*3a737e0dSbrendan * removed while we are writing to it. 3597*3a737e0dSbrendan */ 3598*3a737e0dSbrendan if (next != NULL) 3599*3a737e0dSbrendan spa_config_enter(next->l2ad_spa, RW_READER, next); 3600*3a737e0dSbrendan mutex_exit(&spa_namespace_lock); 3601*3a737e0dSbrendan 3602fa94a07fSbrendan return (next); 3603fa94a07fSbrendan } 3604fa94a07fSbrendan 3605*3a737e0dSbrendan /* 3606*3a737e0dSbrendan * Free buffers that were tagged for destruction. 3607*3a737e0dSbrendan */ 3608*3a737e0dSbrendan static void 3609*3a737e0dSbrendan l2arc_do_free_on_write() 3610*3a737e0dSbrendan { 3611*3a737e0dSbrendan list_t *buflist; 3612*3a737e0dSbrendan l2arc_data_free_t *df, *df_prev; 3613*3a737e0dSbrendan 3614*3a737e0dSbrendan mutex_enter(&l2arc_free_on_write_mtx); 3615*3a737e0dSbrendan buflist = l2arc_free_on_write; 3616*3a737e0dSbrendan 3617*3a737e0dSbrendan for (df = list_tail(buflist); df; df = df_prev) { 3618*3a737e0dSbrendan df_prev = list_prev(buflist, df); 3619*3a737e0dSbrendan ASSERT(df->l2df_data != NULL); 3620*3a737e0dSbrendan ASSERT(df->l2df_func != NULL); 3621*3a737e0dSbrendan df->l2df_func(df->l2df_data, df->l2df_size); 3622*3a737e0dSbrendan list_remove(buflist, df); 3623*3a737e0dSbrendan kmem_free(df, sizeof (l2arc_data_free_t)); 3624*3a737e0dSbrendan } 3625*3a737e0dSbrendan 3626*3a737e0dSbrendan mutex_exit(&l2arc_free_on_write_mtx); 3627*3a737e0dSbrendan } 3628*3a737e0dSbrendan 3629fa94a07fSbrendan /* 3630fa94a07fSbrendan * A write to a cache device has completed. Update all headers to allow 3631fa94a07fSbrendan * reads from these buffers to begin. 
3632fa94a07fSbrendan */ 3633fa94a07fSbrendan static void 3634fa94a07fSbrendan l2arc_write_done(zio_t *zio) 3635fa94a07fSbrendan { 3636fa94a07fSbrendan l2arc_write_callback_t *cb; 3637fa94a07fSbrendan l2arc_dev_t *dev; 3638fa94a07fSbrendan list_t *buflist; 3639fa94a07fSbrendan arc_buf_hdr_t *head, *ab, *ab_prev; 3640*3a737e0dSbrendan l2arc_buf_hdr_t *abl2; 3641fa94a07fSbrendan kmutex_t *hash_lock; 3642fa94a07fSbrendan 3643fa94a07fSbrendan cb = zio->io_private; 3644fa94a07fSbrendan ASSERT(cb != NULL); 3645fa94a07fSbrendan dev = cb->l2wcb_dev; 3646fa94a07fSbrendan ASSERT(dev != NULL); 3647fa94a07fSbrendan head = cb->l2wcb_head; 3648fa94a07fSbrendan ASSERT(head != NULL); 3649fa94a07fSbrendan buflist = dev->l2ad_buflist; 3650fa94a07fSbrendan ASSERT(buflist != NULL); 3651fa94a07fSbrendan DTRACE_PROBE2(l2arc__iodone, zio_t *, zio, 3652fa94a07fSbrendan l2arc_write_callback_t *, cb); 3653fa94a07fSbrendan 3654fa94a07fSbrendan if (zio->io_error != 0) 3655fa94a07fSbrendan ARCSTAT_BUMP(arcstat_l2_writes_error); 3656fa94a07fSbrendan 3657fa94a07fSbrendan mutex_enter(&l2arc_buflist_mtx); 3658fa94a07fSbrendan 3659fa94a07fSbrendan /* 3660fa94a07fSbrendan * All writes completed, or an error was hit. 3661fa94a07fSbrendan */ 3662fa94a07fSbrendan for (ab = list_prev(buflist, head); ab; ab = ab_prev) { 3663fa94a07fSbrendan ab_prev = list_prev(buflist, ab); 3664fa94a07fSbrendan 3665fa94a07fSbrendan hash_lock = HDR_LOCK(ab); 3666fa94a07fSbrendan if (!mutex_tryenter(hash_lock)) { 3667fa94a07fSbrendan /* 3668fa94a07fSbrendan * This buffer misses out. It may be in a stage 3669fa94a07fSbrendan * of eviction. Its ARC_L2_WRITING flag will be 3670fa94a07fSbrendan * left set, denying reads to this buffer. 3671fa94a07fSbrendan */ 3672fa94a07fSbrendan ARCSTAT_BUMP(arcstat_l2_writes_hdr_miss); 3673fa94a07fSbrendan continue; 3674fa94a07fSbrendan } 3675fa94a07fSbrendan 3676fa94a07fSbrendan if (zio->io_error != 0) { 3677fa94a07fSbrendan /* 3678*3a737e0dSbrendan * Error - drop L2ARC entry. 
3679fa94a07fSbrendan */ 3680*3a737e0dSbrendan list_remove(buflist, ab); 3681*3a737e0dSbrendan abl2 = ab->b_l2hdr; 3682fa94a07fSbrendan ab->b_l2hdr = NULL; 3683*3a737e0dSbrendan kmem_free(abl2, sizeof (l2arc_buf_hdr_t)); 3684*3a737e0dSbrendan ARCSTAT_INCR(arcstat_l2_size, -ab->b_size); 3685fa94a07fSbrendan } 3686fa94a07fSbrendan 3687fa94a07fSbrendan /* 3688fa94a07fSbrendan * Allow ARC to begin reads to this L2ARC entry. 3689fa94a07fSbrendan */ 3690fa94a07fSbrendan ab->b_flags &= ~ARC_L2_WRITING; 3691fa94a07fSbrendan 3692fa94a07fSbrendan mutex_exit(hash_lock); 3693fa94a07fSbrendan } 3694fa94a07fSbrendan 3695fa94a07fSbrendan atomic_inc_64(&l2arc_writes_done); 3696fa94a07fSbrendan list_remove(buflist, head); 3697fa94a07fSbrendan kmem_cache_free(hdr_cache, head); 3698fa94a07fSbrendan mutex_exit(&l2arc_buflist_mtx); 3699fa94a07fSbrendan 3700*3a737e0dSbrendan l2arc_do_free_on_write(); 3701fa94a07fSbrendan 3702fa94a07fSbrendan kmem_free(cb, sizeof (l2arc_write_callback_t)); 3703fa94a07fSbrendan } 3704fa94a07fSbrendan 3705fa94a07fSbrendan /* 3706fa94a07fSbrendan * A read to a cache device completed. Validate buffer contents before 3707fa94a07fSbrendan * handing over to the regular ARC routines. 
3708fa94a07fSbrendan */ 3709fa94a07fSbrendan static void 3710fa94a07fSbrendan l2arc_read_done(zio_t *zio) 3711fa94a07fSbrendan { 3712fa94a07fSbrendan l2arc_read_callback_t *cb; 3713fa94a07fSbrendan arc_buf_hdr_t *hdr; 3714fa94a07fSbrendan arc_buf_t *buf; 3715fa94a07fSbrendan zio_t *rzio; 3716fa94a07fSbrendan kmutex_t *hash_lock; 3717*3a737e0dSbrendan int equal; 3718fa94a07fSbrendan 3719fa94a07fSbrendan cb = zio->io_private; 3720fa94a07fSbrendan ASSERT(cb != NULL); 3721fa94a07fSbrendan buf = cb->l2rcb_buf; 3722fa94a07fSbrendan ASSERT(buf != NULL); 3723fa94a07fSbrendan hdr = buf->b_hdr; 3724fa94a07fSbrendan ASSERT(hdr != NULL); 3725fa94a07fSbrendan 3726fa94a07fSbrendan hash_lock = HDR_LOCK(hdr); 3727fa94a07fSbrendan mutex_enter(hash_lock); 3728fa94a07fSbrendan 3729fa94a07fSbrendan /* 3730fa94a07fSbrendan * Check this survived the L2ARC journey. 3731fa94a07fSbrendan */ 3732fa94a07fSbrendan equal = arc_cksum_equal(buf); 3733fa94a07fSbrendan if (equal && zio->io_error == 0 && !HDR_L2_EVICTED(hdr)) { 3734fa94a07fSbrendan mutex_exit(hash_lock); 3735fa94a07fSbrendan zio->io_private = buf; 3736fa94a07fSbrendan arc_read_done(zio); 3737fa94a07fSbrendan } else { 3738fa94a07fSbrendan mutex_exit(hash_lock); 3739fa94a07fSbrendan /* 3740fa94a07fSbrendan * Buffer didn't survive caching. Increment stats and 3741fa94a07fSbrendan * reissue to the original storage device. 3742fa94a07fSbrendan */ 3743*3a737e0dSbrendan if (zio->io_error != 0) { 3744fa94a07fSbrendan ARCSTAT_BUMP(arcstat_l2_io_error); 3745*3a737e0dSbrendan } else { 3746*3a737e0dSbrendan zio->io_error = EIO; 3747*3a737e0dSbrendan } 3748fa94a07fSbrendan if (!equal) 3749fa94a07fSbrendan ARCSTAT_BUMP(arcstat_l2_cksum_bad); 3750fa94a07fSbrendan 3751*3a737e0dSbrendan if (zio->io_waiter == NULL) { 3752*3a737e0dSbrendan /* 3753*3a737e0dSbrendan * Let the resent I/O call arc_read_done() instead. 
3754*3a737e0dSbrendan */ 3755*3a737e0dSbrendan zio->io_done = NULL; 3756*3a737e0dSbrendan zio->io_flags &= ~ZIO_FLAG_DONT_CACHE; 3757fa94a07fSbrendan 3758*3a737e0dSbrendan rzio = zio_read(NULL, cb->l2rcb_spa, &cb->l2rcb_bp, 3759*3a737e0dSbrendan buf->b_data, zio->io_size, arc_read_done, buf, 3760*3a737e0dSbrendan zio->io_priority, cb->l2rcb_flags, &cb->l2rcb_zb); 3761fa94a07fSbrendan 3762*3a737e0dSbrendan (void) zio_nowait(rzio); 3763*3a737e0dSbrendan } 3764fa94a07fSbrendan } 3765fa94a07fSbrendan 3766fa94a07fSbrendan kmem_free(cb, sizeof (l2arc_read_callback_t)); 3767fa94a07fSbrendan } 3768fa94a07fSbrendan 3769fa94a07fSbrendan /* 3770fa94a07fSbrendan * This is the list priority from which the L2ARC will search for pages to 3771fa94a07fSbrendan * cache. This is used within loops (0..3) to cycle through lists in the 3772fa94a07fSbrendan * desired order. This order can have a significant effect on cache 3773fa94a07fSbrendan * performance. 3774fa94a07fSbrendan * 3775fa94a07fSbrendan * Currently the metadata lists are hit first, MFU then MRU, followed by 3776fa94a07fSbrendan * the data lists. This function returns a locked list, and also returns 3777fa94a07fSbrendan * the lock pointer. 
3778fa94a07fSbrendan */ 3779fa94a07fSbrendan static list_t * 3780fa94a07fSbrendan l2arc_list_locked(int list_num, kmutex_t **lock) 3781fa94a07fSbrendan { 3782fa94a07fSbrendan list_t *list; 3783fa94a07fSbrendan 3784fa94a07fSbrendan ASSERT(list_num >= 0 && list_num <= 3); 3785fa94a07fSbrendan 3786fa94a07fSbrendan switch (list_num) { 3787fa94a07fSbrendan case 0: 3788fa94a07fSbrendan list = &arc_mfu->arcs_list[ARC_BUFC_METADATA]; 3789fa94a07fSbrendan *lock = &arc_mfu->arcs_mtx; 3790fa94a07fSbrendan break; 3791fa94a07fSbrendan case 1: 3792fa94a07fSbrendan list = &arc_mru->arcs_list[ARC_BUFC_METADATA]; 3793fa94a07fSbrendan *lock = &arc_mru->arcs_mtx; 3794fa94a07fSbrendan break; 3795fa94a07fSbrendan case 2: 3796fa94a07fSbrendan list = &arc_mfu->arcs_list[ARC_BUFC_DATA]; 3797fa94a07fSbrendan *lock = &arc_mfu->arcs_mtx; 3798fa94a07fSbrendan break; 3799fa94a07fSbrendan case 3: 3800fa94a07fSbrendan list = &arc_mru->arcs_list[ARC_BUFC_DATA]; 3801fa94a07fSbrendan *lock = &arc_mru->arcs_mtx; 3802fa94a07fSbrendan break; 3803fa94a07fSbrendan } 3804fa94a07fSbrendan 3805fa94a07fSbrendan ASSERT(!(MUTEX_HELD(*lock))); 3806fa94a07fSbrendan mutex_enter(*lock); 3807fa94a07fSbrendan return (list); 3808fa94a07fSbrendan } 3809fa94a07fSbrendan 3810fa94a07fSbrendan /* 3811fa94a07fSbrendan * Evict buffers from the device write hand to the distance specified in 3812fa94a07fSbrendan * bytes. This distance may span populated buffers, it may span nothing. 3813fa94a07fSbrendan * This is clearing a region on the L2ARC device ready for writing. 3814fa94a07fSbrendan * If the 'all' boolean is set, every buffer is evicted. 
3815fa94a07fSbrendan */ 3816fa94a07fSbrendan static void 3817fa94a07fSbrendan l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all) 3818fa94a07fSbrendan { 3819fa94a07fSbrendan list_t *buflist; 3820fa94a07fSbrendan l2arc_buf_hdr_t *abl2; 3821fa94a07fSbrendan arc_buf_hdr_t *ab, *ab_prev; 3822fa94a07fSbrendan kmutex_t *hash_lock; 3823fa94a07fSbrendan uint64_t taddr; 3824fa94a07fSbrendan 3825fa94a07fSbrendan buflist = dev->l2ad_buflist; 3826fa94a07fSbrendan 3827fa94a07fSbrendan if (buflist == NULL) 3828fa94a07fSbrendan return; 3829fa94a07fSbrendan 3830fa94a07fSbrendan if (!all && dev->l2ad_first) { 3831fa94a07fSbrendan /* 3832fa94a07fSbrendan * This is the first sweep through the device. There is 3833fa94a07fSbrendan * nothing to evict. 3834fa94a07fSbrendan */ 3835fa94a07fSbrendan return; 3836fa94a07fSbrendan } 3837fa94a07fSbrendan 3838*3a737e0dSbrendan if (dev->l2ad_hand >= (dev->l2ad_end - (2 * distance))) { 3839fa94a07fSbrendan /* 3840fa94a07fSbrendan * When nearing the end of the device, evict to the end 3841fa94a07fSbrendan * before the device write hand jumps to the start. 3842fa94a07fSbrendan */ 3843fa94a07fSbrendan taddr = dev->l2ad_end; 3844fa94a07fSbrendan } else { 3845fa94a07fSbrendan taddr = dev->l2ad_hand + distance; 3846fa94a07fSbrendan } 3847fa94a07fSbrendan DTRACE_PROBE4(l2arc__evict, l2arc_dev_t *, dev, list_t *, buflist, 3848fa94a07fSbrendan uint64_t, taddr, boolean_t, all); 3849fa94a07fSbrendan 3850fa94a07fSbrendan top: 3851fa94a07fSbrendan mutex_enter(&l2arc_buflist_mtx); 3852fa94a07fSbrendan for (ab = list_tail(buflist); ab; ab = ab_prev) { 3853fa94a07fSbrendan ab_prev = list_prev(buflist, ab); 3854fa94a07fSbrendan 3855fa94a07fSbrendan hash_lock = HDR_LOCK(ab); 3856fa94a07fSbrendan if (!mutex_tryenter(hash_lock)) { 3857fa94a07fSbrendan /* 3858fa94a07fSbrendan * Missed the hash lock. Retry. 
3859fa94a07fSbrendan */ 3860fa94a07fSbrendan ARCSTAT_BUMP(arcstat_l2_evict_lock_retry); 3861fa94a07fSbrendan mutex_exit(&l2arc_buflist_mtx); 3862fa94a07fSbrendan mutex_enter(hash_lock); 3863fa94a07fSbrendan mutex_exit(hash_lock); 3864fa94a07fSbrendan goto top; 3865fa94a07fSbrendan } 3866fa94a07fSbrendan 3867fa94a07fSbrendan if (HDR_L2_WRITE_HEAD(ab)) { 3868fa94a07fSbrendan /* 3869fa94a07fSbrendan * We hit a write head node. Leave it for 3870fa94a07fSbrendan * l2arc_write_done(). 3871fa94a07fSbrendan */ 3872fa94a07fSbrendan list_remove(buflist, ab); 3873fa94a07fSbrendan mutex_exit(hash_lock); 3874fa94a07fSbrendan continue; 3875fa94a07fSbrendan } 3876fa94a07fSbrendan 3877fa94a07fSbrendan if (!all && ab->b_l2hdr != NULL && 3878fa94a07fSbrendan (ab->b_l2hdr->b_daddr > taddr || 3879fa94a07fSbrendan ab->b_l2hdr->b_daddr < dev->l2ad_hand)) { 3880fa94a07fSbrendan /* 3881fa94a07fSbrendan * We've evicted to the target address, 3882fa94a07fSbrendan * or the end of the device. 3883fa94a07fSbrendan */ 3884fa94a07fSbrendan mutex_exit(hash_lock); 3885fa94a07fSbrendan break; 3886fa94a07fSbrendan } 3887fa94a07fSbrendan 3888fa94a07fSbrendan if (HDR_FREE_IN_PROGRESS(ab)) { 3889fa94a07fSbrendan /* 3890fa94a07fSbrendan * Already on the path to destruction. 3891fa94a07fSbrendan */ 3892fa94a07fSbrendan mutex_exit(hash_lock); 3893fa94a07fSbrendan continue; 3894fa94a07fSbrendan } 3895fa94a07fSbrendan 3896fa94a07fSbrendan if (ab->b_state == arc_l2c_only) { 3897fa94a07fSbrendan ASSERT(!HDR_L2_READING(ab)); 3898fa94a07fSbrendan /* 3899fa94a07fSbrendan * This doesn't exist in the ARC. Destroy. 3900fa94a07fSbrendan * arc_hdr_destroy() will call list_remove() 3901fa94a07fSbrendan * and decrement arcstat_l2_size. 
3902fa94a07fSbrendan */ 3903fa94a07fSbrendan arc_change_state(arc_anon, ab, hash_lock); 3904fa94a07fSbrendan arc_hdr_destroy(ab); 3905fa94a07fSbrendan } else { 3906*3a737e0dSbrendan /* 3907*3a737e0dSbrendan * Invalidate issued or about to be issued 3908*3a737e0dSbrendan * reads, since we may be about to write 3909*3a737e0dSbrendan * over this location. 3910*3a737e0dSbrendan */ 3911*3a737e0dSbrendan if (HDR_L2_READING(ab)) { 3912*3a737e0dSbrendan ARCSTAT_BUMP(arcstat_l2_evict_reading); 3913*3a737e0dSbrendan ab->b_flags |= ARC_L2_EVICTED; 3914*3a737e0dSbrendan } 3915*3a737e0dSbrendan 3916fa94a07fSbrendan /* 3917fa94a07fSbrendan * Tell ARC this no longer exists in L2ARC. 3918fa94a07fSbrendan */ 3919fa94a07fSbrendan if (ab->b_l2hdr != NULL) { 3920fa94a07fSbrendan abl2 = ab->b_l2hdr; 3921fa94a07fSbrendan ab->b_l2hdr = NULL; 3922fa94a07fSbrendan kmem_free(abl2, sizeof (l2arc_buf_hdr_t)); 3923fa94a07fSbrendan ARCSTAT_INCR(arcstat_l2_size, -ab->b_size); 3924fa94a07fSbrendan } 3925fa94a07fSbrendan list_remove(buflist, ab); 3926fa94a07fSbrendan 3927fa94a07fSbrendan /* 3928fa94a07fSbrendan * This may have been leftover after a 3929fa94a07fSbrendan * failed write. 3930fa94a07fSbrendan */ 3931fa94a07fSbrendan ab->b_flags &= ~ARC_L2_WRITING; 3932fa94a07fSbrendan } 3933fa94a07fSbrendan mutex_exit(hash_lock); 3934fa94a07fSbrendan } 3935fa94a07fSbrendan mutex_exit(&l2arc_buflist_mtx); 3936fa94a07fSbrendan 3937fa94a07fSbrendan spa_l2cache_space_update(dev->l2ad_vdev, 0, -(taddr - dev->l2ad_evict)); 3938fa94a07fSbrendan dev->l2ad_evict = taddr; 3939fa94a07fSbrendan } 3940fa94a07fSbrendan 3941fa94a07fSbrendan /* 3942fa94a07fSbrendan * Find and write ARC buffers to the L2ARC device. 3943fa94a07fSbrendan * 3944fa94a07fSbrendan * An ARC_L2_WRITING flag is set so that the L2ARC buffers are not valid 3945fa94a07fSbrendan * for reading until they have completed writing. 
3946fa94a07fSbrendan */ 3947fa94a07fSbrendan static void 3948*3a737e0dSbrendan l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) 3949fa94a07fSbrendan { 3950fa94a07fSbrendan arc_buf_hdr_t *ab, *ab_prev, *head; 3951fa94a07fSbrendan l2arc_buf_hdr_t *hdrl2; 3952fa94a07fSbrendan list_t *list; 3953*3a737e0dSbrendan uint64_t passed_sz, write_sz, buf_sz, headroom; 3954fa94a07fSbrendan void *buf_data; 3955fa94a07fSbrendan kmutex_t *hash_lock, *list_lock; 3956fa94a07fSbrendan boolean_t have_lock, full; 3957fa94a07fSbrendan l2arc_write_callback_t *cb; 3958fa94a07fSbrendan zio_t *pio, *wzio; 3959fa94a07fSbrendan 3960fa94a07fSbrendan ASSERT(dev->l2ad_vdev != NULL); 3961fa94a07fSbrendan 3962fa94a07fSbrendan pio = NULL; 3963fa94a07fSbrendan write_sz = 0; 3964fa94a07fSbrendan full = B_FALSE; 39651ab7f2deSmaybee head = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE); 3966fa94a07fSbrendan head->b_flags |= ARC_L2_WRITE_HEAD; 3967fa94a07fSbrendan 3968fa94a07fSbrendan /* 3969fa94a07fSbrendan * Copy buffers for L2ARC writing. 3970fa94a07fSbrendan */ 3971fa94a07fSbrendan mutex_enter(&l2arc_buflist_mtx); 3972fa94a07fSbrendan for (int try = 0; try <= 3; try++) { 3973fa94a07fSbrendan list = l2arc_list_locked(try, &list_lock); 3974fa94a07fSbrendan passed_sz = 0; 3975fa94a07fSbrendan 3976*3a737e0dSbrendan /* 3977*3a737e0dSbrendan * L2ARC fast warmup. 3978*3a737e0dSbrendan * 3979*3a737e0dSbrendan * Until the ARC is warm and starts to evict, read from the 3980*3a737e0dSbrendan * head of the ARC lists rather than the tail. 
3981*3a737e0dSbrendan */ 3982*3a737e0dSbrendan headroom = target_sz * l2arc_headroom; 3983*3a737e0dSbrendan if (arc_warm == B_FALSE) 3984*3a737e0dSbrendan ab = list_head(list); 3985*3a737e0dSbrendan else 3986*3a737e0dSbrendan ab = list_tail(list); 3987*3a737e0dSbrendan 3988*3a737e0dSbrendan for (; ab; ab = ab_prev) { 3989*3a737e0dSbrendan if (arc_warm == B_FALSE) 3990*3a737e0dSbrendan ab_prev = list_next(list, ab); 3991*3a737e0dSbrendan else 3992*3a737e0dSbrendan ab_prev = list_prev(list, ab); 3993fa94a07fSbrendan 3994fa94a07fSbrendan hash_lock = HDR_LOCK(ab); 3995fa94a07fSbrendan have_lock = MUTEX_HELD(hash_lock); 3996fa94a07fSbrendan if (!have_lock && !mutex_tryenter(hash_lock)) { 3997fa94a07fSbrendan /* 3998fa94a07fSbrendan * Skip this buffer rather than waiting. 3999fa94a07fSbrendan */ 4000fa94a07fSbrendan continue; 4001fa94a07fSbrendan } 4002fa94a07fSbrendan 4003fa94a07fSbrendan passed_sz += ab->b_size; 4004fa94a07fSbrendan if (passed_sz > headroom) { 4005fa94a07fSbrendan /* 4006fa94a07fSbrendan * Searched too far. 4007fa94a07fSbrendan */ 4008fa94a07fSbrendan mutex_exit(hash_lock); 4009fa94a07fSbrendan break; 4010fa94a07fSbrendan } 4011fa94a07fSbrendan 4012fa94a07fSbrendan if (ab->b_spa != spa) { 4013fa94a07fSbrendan mutex_exit(hash_lock); 4014fa94a07fSbrendan continue; 4015fa94a07fSbrendan } 4016fa94a07fSbrendan 4017fa94a07fSbrendan if (ab->b_l2hdr != NULL) { 4018fa94a07fSbrendan /* 4019fa94a07fSbrendan * Already in L2ARC. 
4020fa94a07fSbrendan */ 4021fa94a07fSbrendan mutex_exit(hash_lock); 4022fa94a07fSbrendan continue; 4023fa94a07fSbrendan } 4024fa94a07fSbrendan 4025fa94a07fSbrendan if (HDR_IO_IN_PROGRESS(ab) || HDR_DONT_L2CACHE(ab)) { 4026fa94a07fSbrendan mutex_exit(hash_lock); 4027fa94a07fSbrendan continue; 4028fa94a07fSbrendan } 4029fa94a07fSbrendan 4030fa94a07fSbrendan if ((write_sz + ab->b_size) > target_sz) { 4031fa94a07fSbrendan full = B_TRUE; 4032fa94a07fSbrendan mutex_exit(hash_lock); 4033fa94a07fSbrendan break; 4034fa94a07fSbrendan } 4035fa94a07fSbrendan 4036fa94a07fSbrendan if (ab->b_buf == NULL) { 4037fa94a07fSbrendan DTRACE_PROBE1(l2arc__buf__null, void *, ab); 4038fa94a07fSbrendan mutex_exit(hash_lock); 4039fa94a07fSbrendan continue; 4040fa94a07fSbrendan } 4041fa94a07fSbrendan 4042fa94a07fSbrendan if (pio == NULL) { 4043fa94a07fSbrendan /* 4044fa94a07fSbrendan * Insert a dummy header on the buflist so 4045fa94a07fSbrendan * l2arc_write_done() can find where the 4046fa94a07fSbrendan * write buffers begin without searching. 4047fa94a07fSbrendan */ 4048fa94a07fSbrendan list_insert_head(dev->l2ad_buflist, head); 4049fa94a07fSbrendan 4050fa94a07fSbrendan cb = kmem_alloc( 4051fa94a07fSbrendan sizeof (l2arc_write_callback_t), KM_SLEEP); 4052fa94a07fSbrendan cb->l2wcb_dev = dev; 4053fa94a07fSbrendan cb->l2wcb_head = head; 4054fa94a07fSbrendan pio = zio_root(spa, l2arc_write_done, cb, 4055fa94a07fSbrendan ZIO_FLAG_CANFAIL); 4056fa94a07fSbrendan } 4057fa94a07fSbrendan 4058fa94a07fSbrendan /* 4059fa94a07fSbrendan * Create and add a new L2ARC header. 
4060fa94a07fSbrendan */ 4061fa94a07fSbrendan hdrl2 = kmem_zalloc(sizeof (l2arc_buf_hdr_t), KM_SLEEP); 4062fa94a07fSbrendan hdrl2->b_dev = dev; 4063fa94a07fSbrendan hdrl2->b_daddr = dev->l2ad_hand; 4064fa94a07fSbrendan 4065fa94a07fSbrendan ab->b_flags |= ARC_L2_WRITING; 4066fa94a07fSbrendan ab->b_l2hdr = hdrl2; 4067fa94a07fSbrendan list_insert_head(dev->l2ad_buflist, ab); 4068fa94a07fSbrendan buf_data = ab->b_buf->b_data; 4069fa94a07fSbrendan buf_sz = ab->b_size; 4070fa94a07fSbrendan 4071fa94a07fSbrendan /* 4072fa94a07fSbrendan * Compute and store the buffer cksum before 4073fa94a07fSbrendan * writing. On debug the cksum is verified first. 4074fa94a07fSbrendan */ 4075fa94a07fSbrendan arc_cksum_verify(ab->b_buf); 4076fa94a07fSbrendan arc_cksum_compute(ab->b_buf, B_TRUE); 4077fa94a07fSbrendan 4078fa94a07fSbrendan mutex_exit(hash_lock); 4079fa94a07fSbrendan 4080fa94a07fSbrendan wzio = zio_write_phys(pio, dev->l2ad_vdev, 4081fa94a07fSbrendan dev->l2ad_hand, buf_sz, buf_data, ZIO_CHECKSUM_OFF, 4082fa94a07fSbrendan NULL, NULL, ZIO_PRIORITY_ASYNC_WRITE, 4083fa94a07fSbrendan ZIO_FLAG_CANFAIL, B_FALSE); 4084fa94a07fSbrendan 4085fa94a07fSbrendan DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev, 4086fa94a07fSbrendan zio_t *, wzio); 4087fa94a07fSbrendan (void) zio_nowait(wzio); 4088fa94a07fSbrendan 4089fa94a07fSbrendan write_sz += buf_sz; 4090fa94a07fSbrendan dev->l2ad_hand += buf_sz; 4091fa94a07fSbrendan } 4092fa94a07fSbrendan 4093fa94a07fSbrendan mutex_exit(list_lock); 4094fa94a07fSbrendan 4095fa94a07fSbrendan if (full == B_TRUE) 4096fa94a07fSbrendan break; 4097fa94a07fSbrendan } 4098fa94a07fSbrendan mutex_exit(&l2arc_buflist_mtx); 4099fa94a07fSbrendan 4100fa94a07fSbrendan if (pio == NULL) { 4101fa94a07fSbrendan ASSERT3U(write_sz, ==, 0); 4102fa94a07fSbrendan kmem_cache_free(hdr_cache, head); 4103fa94a07fSbrendan return; 4104fa94a07fSbrendan } 4105fa94a07fSbrendan 4106fa94a07fSbrendan ASSERT3U(write_sz, <=, target_sz); 4107fa94a07fSbrendan 
ARCSTAT_BUMP(arcstat_l2_writes_sent); 4108fa94a07fSbrendan ARCSTAT_INCR(arcstat_l2_size, write_sz); 4109fa94a07fSbrendan spa_l2cache_space_update(dev->l2ad_vdev, 0, write_sz); 4110fa94a07fSbrendan 4111fa94a07fSbrendan /* 4112fa94a07fSbrendan * Bump device hand to the device start if it is approaching the end. 4113fa94a07fSbrendan * l2arc_evict() will already have evicted ahead for this case. 4114fa94a07fSbrendan */ 4115*3a737e0dSbrendan if (dev->l2ad_hand >= (dev->l2ad_end - target_sz)) { 4116fa94a07fSbrendan spa_l2cache_space_update(dev->l2ad_vdev, 0, 4117fa94a07fSbrendan dev->l2ad_end - dev->l2ad_hand); 4118fa94a07fSbrendan dev->l2ad_hand = dev->l2ad_start; 4119fa94a07fSbrendan dev->l2ad_evict = dev->l2ad_start; 4120fa94a07fSbrendan dev->l2ad_first = B_FALSE; 4121fa94a07fSbrendan } 4122fa94a07fSbrendan 4123fa94a07fSbrendan (void) zio_wait(pio); 4124fa94a07fSbrendan } 4125fa94a07fSbrendan 4126fa94a07fSbrendan /* 4127fa94a07fSbrendan * This thread feeds the L2ARC at regular intervals. This is the beating 4128fa94a07fSbrendan * heart of the L2ARC. 4129fa94a07fSbrendan */ 4130fa94a07fSbrendan static void 4131fa94a07fSbrendan l2arc_feed_thread(void) 4132fa94a07fSbrendan { 4133fa94a07fSbrendan callb_cpr_t cpr; 4134fa94a07fSbrendan l2arc_dev_t *dev; 4135fa94a07fSbrendan spa_t *spa; 4136*3a737e0dSbrendan uint64_t size; 4137fa94a07fSbrendan 4138fa94a07fSbrendan CALLB_CPR_INIT(&cpr, &l2arc_feed_thr_lock, callb_generic_cpr, FTAG); 4139fa94a07fSbrendan 4140fa94a07fSbrendan mutex_enter(&l2arc_feed_thr_lock); 4141fa94a07fSbrendan 4142fa94a07fSbrendan while (l2arc_thread_exit == 0) { 4143fa94a07fSbrendan /* 4144*3a737e0dSbrendan * Pause for l2arc_feed_secs seconds between writes. 
4145fa94a07fSbrendan */ 4146fa94a07fSbrendan CALLB_CPR_SAFE_BEGIN(&cpr); 4147fa94a07fSbrendan (void) cv_timedwait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock, 4148*3a737e0dSbrendan lbolt + (hz * l2arc_feed_secs)); 4149fa94a07fSbrendan CALLB_CPR_SAFE_END(&cpr, &l2arc_feed_thr_lock); 4150fa94a07fSbrendan 4151*3a737e0dSbrendan /* 4152*3a737e0dSbrendan * Quick check for L2ARC devices. 4153*3a737e0dSbrendan */ 4154c5904d13Seschrock mutex_enter(&l2arc_dev_mtx); 4155*3a737e0dSbrendan if (l2arc_ndev == 0) { 4156*3a737e0dSbrendan mutex_exit(&l2arc_dev_mtx); 4157*3a737e0dSbrendan continue; 4158*3a737e0dSbrendan } 4159*3a737e0dSbrendan mutex_exit(&l2arc_dev_mtx); 4160c5904d13Seschrock 4161fa94a07fSbrendan /* 4162c5904d13Seschrock * This selects the next l2arc device to write to, and in 4163c5904d13Seschrock * doing so the next spa to feed from: dev->l2ad_spa. This 4164*3a737e0dSbrendan * will return NULL if there are now no l2arc devices or if 4165*3a737e0dSbrendan * they are all faulted. 4166*3a737e0dSbrendan * 4167*3a737e0dSbrendan * If a device is returned, its spa's config lock is also 4168*3a737e0dSbrendan * held to prevent device removal. l2arc_dev_get_next() 4169*3a737e0dSbrendan * will grab and release l2arc_dev_mtx. 4170fa94a07fSbrendan */ 4171*3a737e0dSbrendan if ((dev = l2arc_dev_get_next()) == NULL) 4172fa94a07fSbrendan continue; 4173*3a737e0dSbrendan 4174*3a737e0dSbrendan spa = dev->l2ad_spa; 4175*3a737e0dSbrendan ASSERT(spa != NULL); 4176fa94a07fSbrendan 4177fa94a07fSbrendan /* 4178fa94a07fSbrendan * Avoid contributing to memory pressure. 
4179fa94a07fSbrendan */ 4180fa94a07fSbrendan if (arc_reclaim_needed()) { 4181fa94a07fSbrendan ARCSTAT_BUMP(arcstat_l2_abort_lowmem); 4182*3a737e0dSbrendan spa_config_exit(spa, dev); 4183fa94a07fSbrendan continue; 4184fa94a07fSbrendan } 4185fa94a07fSbrendan 4186fa94a07fSbrendan ARCSTAT_BUMP(arcstat_l2_feeds); 4187fa94a07fSbrendan 4188*3a737e0dSbrendan size = dev->l2ad_write; 4189*3a737e0dSbrendan if (arc_warm == B_FALSE) 4190*3a737e0dSbrendan size += dev->l2ad_boost; 4191*3a737e0dSbrendan 4192fa94a07fSbrendan /* 4193fa94a07fSbrendan * Evict L2ARC buffers that will be overwritten. 4194fa94a07fSbrendan */ 4195*3a737e0dSbrendan l2arc_evict(dev, size, B_FALSE); 4196fa94a07fSbrendan 4197fa94a07fSbrendan /* 4198fa94a07fSbrendan * Write ARC buffers. 4199fa94a07fSbrendan */ 4200*3a737e0dSbrendan l2arc_write_buffers(spa, dev, size); 4201*3a737e0dSbrendan spa_config_exit(spa, dev); 4202fa94a07fSbrendan } 4203fa94a07fSbrendan 4204fa94a07fSbrendan l2arc_thread_exit = 0; 4205fa94a07fSbrendan cv_broadcast(&l2arc_feed_thr_cv); 4206fa94a07fSbrendan CALLB_CPR_EXIT(&cpr); /* drops l2arc_feed_thr_lock */ 4207fa94a07fSbrendan thread_exit(); 4208fa94a07fSbrendan } 4209fa94a07fSbrendan 4210c5904d13Seschrock boolean_t 4211c5904d13Seschrock l2arc_vdev_present(vdev_t *vd) 4212c5904d13Seschrock { 4213c5904d13Seschrock l2arc_dev_t *dev; 4214c5904d13Seschrock 4215c5904d13Seschrock mutex_enter(&l2arc_dev_mtx); 4216c5904d13Seschrock for (dev = list_head(l2arc_dev_list); dev != NULL; 4217c5904d13Seschrock dev = list_next(l2arc_dev_list, dev)) { 4218c5904d13Seschrock if (dev->l2ad_vdev == vd) 4219c5904d13Seschrock break; 4220c5904d13Seschrock } 4221c5904d13Seschrock mutex_exit(&l2arc_dev_mtx); 4222c5904d13Seschrock 4223c5904d13Seschrock return (dev != NULL); 4224c5904d13Seschrock } 4225c5904d13Seschrock 4226fa94a07fSbrendan /* 4227fa94a07fSbrendan * Add a vdev for use by the L2ARC. By this point the spa has already 4228fa94a07fSbrendan * validated the vdev and opened it. 
4229fa94a07fSbrendan */ 4230fa94a07fSbrendan void 4231fa94a07fSbrendan l2arc_add_vdev(spa_t *spa, vdev_t *vd, uint64_t start, uint64_t end) 4232fa94a07fSbrendan { 4233fa94a07fSbrendan l2arc_dev_t *adddev; 4234fa94a07fSbrendan 4235c5904d13Seschrock ASSERT(!l2arc_vdev_present(vd)); 4236c5904d13Seschrock 4237fa94a07fSbrendan /* 4238fa94a07fSbrendan * Create a new l2arc device entry. 4239fa94a07fSbrendan */ 4240fa94a07fSbrendan adddev = kmem_zalloc(sizeof (l2arc_dev_t), KM_SLEEP); 4241fa94a07fSbrendan adddev->l2ad_spa = spa; 4242fa94a07fSbrendan adddev->l2ad_vdev = vd; 4243fa94a07fSbrendan adddev->l2ad_write = l2arc_write_max; 4244*3a737e0dSbrendan adddev->l2ad_boost = l2arc_write_boost; 4245fa94a07fSbrendan adddev->l2ad_start = start; 4246fa94a07fSbrendan adddev->l2ad_end = end; 4247fa94a07fSbrendan adddev->l2ad_hand = adddev->l2ad_start; 4248fa94a07fSbrendan adddev->l2ad_evict = adddev->l2ad_start; 4249fa94a07fSbrendan adddev->l2ad_first = B_TRUE; 4250fa94a07fSbrendan ASSERT3U(adddev->l2ad_write, >, 0); 4251fa94a07fSbrendan 4252fa94a07fSbrendan /* 4253fa94a07fSbrendan * This is a list of all ARC buffers that are still valid on the 4254fa94a07fSbrendan * device. 4255fa94a07fSbrendan */ 4256fa94a07fSbrendan adddev->l2ad_buflist = kmem_zalloc(sizeof (list_t), KM_SLEEP); 4257fa94a07fSbrendan list_create(adddev->l2ad_buflist, sizeof (arc_buf_hdr_t), 4258fa94a07fSbrendan offsetof(arc_buf_hdr_t, b_l2node)); 4259fa94a07fSbrendan 4260fa94a07fSbrendan spa_l2cache_space_update(vd, adddev->l2ad_end - adddev->l2ad_hand, 0); 4261fa94a07fSbrendan 4262fa94a07fSbrendan /* 4263fa94a07fSbrendan * Add device to global list 4264fa94a07fSbrendan */ 4265fa94a07fSbrendan mutex_enter(&l2arc_dev_mtx); 4266fa94a07fSbrendan list_insert_head(l2arc_dev_list, adddev); 4267fa94a07fSbrendan atomic_inc_64(&l2arc_ndev); 4268fa94a07fSbrendan mutex_exit(&l2arc_dev_mtx); 4269fa94a07fSbrendan } 4270fa94a07fSbrendan 4271fa94a07fSbrendan /* 4272fa94a07fSbrendan * Remove a vdev from the L2ARC. 
4273fa94a07fSbrendan */ 4274fa94a07fSbrendan void 4275fa94a07fSbrendan l2arc_remove_vdev(vdev_t *vd) 4276fa94a07fSbrendan { 4277fa94a07fSbrendan l2arc_dev_t *dev, *nextdev, *remdev = NULL; 4278fa94a07fSbrendan 4279fa94a07fSbrendan /* 4280fa94a07fSbrendan * Find the device by vdev 4281fa94a07fSbrendan */ 4282fa94a07fSbrendan mutex_enter(&l2arc_dev_mtx); 4283fa94a07fSbrendan for (dev = list_head(l2arc_dev_list); dev; dev = nextdev) { 4284fa94a07fSbrendan nextdev = list_next(l2arc_dev_list, dev); 4285fa94a07fSbrendan if (vd == dev->l2ad_vdev) { 4286fa94a07fSbrendan remdev = dev; 4287fa94a07fSbrendan break; 4288fa94a07fSbrendan } 4289fa94a07fSbrendan } 4290fa94a07fSbrendan ASSERT(remdev != NULL); 4291fa94a07fSbrendan 4292fa94a07fSbrendan /* 4293fa94a07fSbrendan * Remove device from global list 4294fa94a07fSbrendan */ 4295fa94a07fSbrendan list_remove(l2arc_dev_list, remdev); 4296fa94a07fSbrendan l2arc_dev_last = NULL; /* may have been invalidated */ 4297*3a737e0dSbrendan atomic_dec_64(&l2arc_ndev); 4298*3a737e0dSbrendan mutex_exit(&l2arc_dev_mtx); 4299fa94a07fSbrendan 4300fa94a07fSbrendan /* 4301fa94a07fSbrendan * Clear all buflists and ARC references. L2ARC device flush. 
4302fa94a07fSbrendan */ 4303fa94a07fSbrendan l2arc_evict(remdev, 0, B_TRUE); 4304fa94a07fSbrendan list_destroy(remdev->l2ad_buflist); 4305fa94a07fSbrendan kmem_free(remdev->l2ad_buflist, sizeof (list_t)); 4306fa94a07fSbrendan kmem_free(remdev, sizeof (l2arc_dev_t)); 4307fa94a07fSbrendan } 4308fa94a07fSbrendan 4309fa94a07fSbrendan void 4310fa94a07fSbrendan l2arc_init() 4311fa94a07fSbrendan { 4312fa94a07fSbrendan l2arc_thread_exit = 0; 4313fa94a07fSbrendan l2arc_ndev = 0; 4314fa94a07fSbrendan l2arc_writes_sent = 0; 4315fa94a07fSbrendan l2arc_writes_done = 0; 4316fa94a07fSbrendan 4317fa94a07fSbrendan mutex_init(&l2arc_feed_thr_lock, NULL, MUTEX_DEFAULT, NULL); 4318fa94a07fSbrendan cv_init(&l2arc_feed_thr_cv, NULL, CV_DEFAULT, NULL); 4319fa94a07fSbrendan mutex_init(&l2arc_dev_mtx, NULL, MUTEX_DEFAULT, NULL); 4320fa94a07fSbrendan mutex_init(&l2arc_buflist_mtx, NULL, MUTEX_DEFAULT, NULL); 4321fa94a07fSbrendan mutex_init(&l2arc_free_on_write_mtx, NULL, MUTEX_DEFAULT, NULL); 4322fa94a07fSbrendan 4323fa94a07fSbrendan l2arc_dev_list = &L2ARC_dev_list; 4324fa94a07fSbrendan l2arc_free_on_write = &L2ARC_free_on_write; 4325fa94a07fSbrendan list_create(l2arc_dev_list, sizeof (l2arc_dev_t), 4326fa94a07fSbrendan offsetof(l2arc_dev_t, l2ad_node)); 4327fa94a07fSbrendan list_create(l2arc_free_on_write, sizeof (l2arc_data_free_t), 4328fa94a07fSbrendan offsetof(l2arc_data_free_t, l2df_list_node)); 4329fa94a07fSbrendan 4330fa94a07fSbrendan (void) thread_create(NULL, 0, l2arc_feed_thread, NULL, 0, &p0, 4331fa94a07fSbrendan TS_RUN, minclsyspri); 4332fa94a07fSbrendan } 4333fa94a07fSbrendan 4334fa94a07fSbrendan void 4335fa94a07fSbrendan l2arc_fini() 4336fa94a07fSbrendan { 4337*3a737e0dSbrendan /* 4338*3a737e0dSbrendan * This is called from dmu_fini(), which is called from spa_fini(); 4339*3a737e0dSbrendan * Because of this, we can assume that all l2arc devices have 4340*3a737e0dSbrendan * already been removed when the pools themselves were removed. 
4341*3a737e0dSbrendan */ 4342*3a737e0dSbrendan 4343fa94a07fSbrendan mutex_enter(&l2arc_feed_thr_lock); 4344fa94a07fSbrendan cv_signal(&l2arc_feed_thr_cv); /* kick thread out of startup */ 4345fa94a07fSbrendan l2arc_thread_exit = 1; 4346fa94a07fSbrendan while (l2arc_thread_exit != 0) 4347fa94a07fSbrendan cv_wait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock); 4348fa94a07fSbrendan mutex_exit(&l2arc_feed_thr_lock); 4349fa94a07fSbrendan 4350*3a737e0dSbrendan l2arc_do_free_on_write(); 4351*3a737e0dSbrendan 4352fa94a07fSbrendan mutex_destroy(&l2arc_feed_thr_lock); 4353fa94a07fSbrendan cv_destroy(&l2arc_feed_thr_cv); 4354fa94a07fSbrendan mutex_destroy(&l2arc_dev_mtx); 4355fa94a07fSbrendan mutex_destroy(&l2arc_buflist_mtx); 4356fa94a07fSbrendan mutex_destroy(&l2arc_free_on_write_mtx); 4357fa94a07fSbrendan 4358fa94a07fSbrendan list_destroy(l2arc_dev_list); 4359fa94a07fSbrendan list_destroy(l2arc_free_on_write); 4360fa94a07fSbrendan } 4361