1fa9e4066Sahrens /*
2fa9e4066Sahrens * CDDL HEADER START
3fa9e4066Sahrens *
4fa9e4066Sahrens * The contents of this file are subject to the terms of the
5033f9833Sek * Common Development and Distribution License (the "License").
6033f9833Sek * You may not use this file except in compliance with the License.
7fa9e4066Sahrens *
8fa9e4066Sahrens * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9fa9e4066Sahrens * or http://www.opensolaris.org/os/licensing.
10fa9e4066Sahrens * See the License for the specific language governing permissions
11fa9e4066Sahrens * and limitations under the License.
12fa9e4066Sahrens *
13fa9e4066Sahrens * When distributing Covered Code, include this CDDL HEADER in each
14fa9e4066Sahrens * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15fa9e4066Sahrens * If applicable, add the following below this CDDL HEADER, with the
16fa9e4066Sahrens * fields enclosed by brackets "[]" replaced with your own identifying
17fa9e4066Sahrens * information: Portions Copyright [yyyy] [name of copyright owner]
18fa9e4066Sahrens *
19fa9e4066Sahrens * CDDL HEADER END
20fa9e4066Sahrens */
21fa9e4066Sahrens /*
22033f9833Sek * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
23fa9e4066Sahrens * Use is subject to license terms.
24fa9e4066Sahrens */
25fa9e4066Sahrens 
26fa9e4066Sahrens #pragma ident "%Z%%M% %I% %E% SMI"
27fa9e4066Sahrens 
28fa9e4066Sahrens /*
29fa9e4066Sahrens * DVA-based Adjustable Replacement Cache
30fa9e4066Sahrens *
31ea8dc4b6Seschrock * While much of the theory of operation used here is
32ea8dc4b6Seschrock * based on the self-tuning, low overhead replacement cache
33fa9e4066Sahrens * presented by Megiddo and Modha at FAST 2003, there are some
34fa9e4066Sahrens * significant differences:
35fa9e4066Sahrens *
36fa9e4066Sahrens * 1. The Megiddo and Modha model assumes any page is evictable.
37fa9e4066Sahrens * Pages in its cache cannot be "locked" into memory. 
This makes 38fa9e4066Sahrens * the eviction algorithm simple: evict the last page in the list. 39fa9e4066Sahrens * This also makes the performance characteristics easy to reason 40fa9e4066Sahrens * about. Our cache is not so simple. At any given moment, some 41fa9e4066Sahrens * subset of the blocks in the cache are un-evictable because we 42fa9e4066Sahrens * have handed out a reference to them. Blocks are only evictable 43fa9e4066Sahrens * when there are no external references active. This makes 44fa9e4066Sahrens * eviction far more problematic: we choose to evict the evictable 45fa9e4066Sahrens * blocks that are the "lowest" in the list. 46fa9e4066Sahrens * 47fa9e4066Sahrens * There are times when it is not possible to evict the requested 48fa9e4066Sahrens * space. In these circumstances we are unable to adjust the cache 49fa9e4066Sahrens * size. To prevent the cache growing unbounded at these times we 50fa9e4066Sahrens * implement a "cache throttle" that slows the flow of new data 51fa9e4066Sahrens * into the cache until we can make space available. 52fa9e4066Sahrens * 53fa9e4066Sahrens * 2. The Megiddo and Modha model assumes a fixed cache size. 54fa9e4066Sahrens * Pages are evicted when the cache is full and there is a cache 55fa9e4066Sahrens * miss. Our model has a variable sized cache. It grows with 56fa9e4066Sahrens * high use, but also tries to react to memory pressure from the 57fa9e4066Sahrens * operating system: decreasing its size when system memory is 58fa9e4066Sahrens * tight. 59fa9e4066Sahrens * 60fa9e4066Sahrens * 3. The Megiddo and Modha model assumes a fixed page size. All 61fa9e4066Sahrens * elements of the cache are therefore exactly the same size. So 62fa9e4066Sahrens * when adjusting the cache size following a cache miss, it's simply 63fa9e4066Sahrens * a matter of choosing a single page to evict. In our model, we 64fa9e4066Sahrens * have variable sized cache blocks (ranging from 512 bytes to 65fa9e4066Sahrens * 128K bytes). 
We therefore choose a set of blocks to evict to make 66fa9e4066Sahrens * space for a cache miss that approximates as closely as possible 67fa9e4066Sahrens * the space used by the new block. 68fa9e4066Sahrens * 69fa9e4066Sahrens * See also: "ARC: A Self-Tuning, Low Overhead Replacement Cache" 70fa9e4066Sahrens * by N. Megiddo & D. Modha, FAST 2003 71fa9e4066Sahrens */ 72fa9e4066Sahrens 73fa9e4066Sahrens /* 74fa9e4066Sahrens * The locking model: 75fa9e4066Sahrens * 76fa9e4066Sahrens * A new reference to a cache buffer can be obtained in two 77fa9e4066Sahrens * ways: 1) via a hash table lookup using the DVA as a key, 78fa9e4066Sahrens * or 2) via one of the ARC lists. The arc_read() interface 79fa9e4066Sahrens * uses method 1, while the internal arc algorithms for 80fa9e4066Sahrens * adjusting the cache use method 2. We therefore provide two 81fa9e4066Sahrens * types of locks: 1) the hash table lock array, and 2) the 82fa9e4066Sahrens * arc list locks. 83fa9e4066Sahrens * 84fa9e4066Sahrens * Buffers do not have their own mutexes, rather they rely on the 85fa9e4066Sahrens * hash table mutexes for the bulk of their protection (i.e. most 86fa9e4066Sahrens * fields in the arc_buf_hdr_t are protected by these mutexes). 87fa9e4066Sahrens * 88fa9e4066Sahrens * buf_hash_find() returns the appropriate mutex (held) when it 89fa9e4066Sahrens * locates the requested buffer in the hash table. It returns 90fa9e4066Sahrens * NULL for the mutex if the buffer was not in the table. 91fa9e4066Sahrens * 92fa9e4066Sahrens * buf_hash_remove() expects the appropriate hash mutex to be 93fa9e4066Sahrens * already held before it is invoked. 94fa9e4066Sahrens * 95fa9e4066Sahrens * Each arc state also has a mutex which is used to protect the 96fa9e4066Sahrens * buffer list associated with the state. When attempting to 97fa9e4066Sahrens * obtain a hash table lock while holding an arc list lock you 98fa9e4066Sahrens * must use: mutex_tryenter() to avoid deadlock. 
Also note that 99*44eda4d7Smaybee * the active state mutex must be held before the ghost state mutex. 100fa9e4066Sahrens * 101ea8dc4b6Seschrock * Arc buffers may have an associated eviction callback function. 102ea8dc4b6Seschrock * This function will be invoked prior to removing the buffer (e.g. 103ea8dc4b6Seschrock * in arc_do_user_evicts()). Note however that the data associated 104ea8dc4b6Seschrock * with the buffer may be evicted prior to the callback. The callback 105ea8dc4b6Seschrock * must be made with *no locks held* (to prevent deadlock). Additionally, 106ea8dc4b6Seschrock * the users of callbacks must ensure that their private data is 107ea8dc4b6Seschrock * protected from simultaneous callbacks from arc_buf_evict() 108ea8dc4b6Seschrock * and arc_do_user_evicts(). 109ea8dc4b6Seschrock * 110fa9e4066Sahrens * Note that the majority of the performance stats are manipulated 111fa9e4066Sahrens * with atomic operations. 112fa9e4066Sahrens */ 113fa9e4066Sahrens 114fa9e4066Sahrens #include <sys/spa.h> 115fa9e4066Sahrens #include <sys/zio.h> 116fa9e4066Sahrens #include <sys/zfs_context.h> 117fa9e4066Sahrens #include <sys/arc.h> 118fa9e4066Sahrens #include <sys/refcount.h> 119fa9e4066Sahrens #ifdef _KERNEL 120fa9e4066Sahrens #include <sys/vmsystm.h> 121fa9e4066Sahrens #include <vm/anon.h> 122fa9e4066Sahrens #include <sys/fs/swapnode.h> 123033f9833Sek #include <sys/dnlc.h> 124fa9e4066Sahrens #endif 125fa9e4066Sahrens #include <sys/callb.h> 126fa9e4066Sahrens 127fa9e4066Sahrens static kmutex_t arc_reclaim_thr_lock; 128fa9e4066Sahrens static kcondvar_t arc_reclaim_thr_cv; /* used to signal reclaim thr */ 129fa9e4066Sahrens static uint8_t arc_thread_exit; 130fa9e4066Sahrens 131033f9833Sek #define ARC_REDUCE_DNLC_PERCENT 3 132033f9833Sek uint_t arc_reduce_dnlc_percent = ARC_REDUCE_DNLC_PERCENT; 133033f9833Sek 134fa9e4066Sahrens typedef enum arc_reclaim_strategy { 135fa9e4066Sahrens ARC_RECLAIM_AGGR, /* Aggressive reclaim strategy */ 136fa9e4066Sahrens ARC_RECLAIM_CONS /* 
Conservative reclaim strategy */ 137fa9e4066Sahrens } arc_reclaim_strategy_t; 138fa9e4066Sahrens 139fa9e4066Sahrens /* number of seconds before growing cache again */ 140fa9e4066Sahrens static int arc_grow_retry = 60; 141fa9e4066Sahrens 14213506d1eSmaybee /* 143b19a79ecSperrin * minimum lifespan of a prefetch block in clock ticks 144b19a79ecSperrin * (initialized in arc_init()) 14513506d1eSmaybee */ 146b19a79ecSperrin static int arc_min_prefetch_lifespan; 14713506d1eSmaybee 148fa9e4066Sahrens static kmutex_t arc_reclaim_lock; 149fa9e4066Sahrens static int arc_dead; 150fa9e4066Sahrens 151fa9e4066Sahrens /* 152fa9e4066Sahrens * Note that buffers can be in one of 5 states: 153fa9e4066Sahrens * ARC_anon - anonymous (discussed below) 154ea8dc4b6Seschrock * ARC_mru - recently used, currently cached 155ea8dc4b6Seschrock * ARC_mru_ghost - recently used, no longer in cache 156ea8dc4b6Seschrock * ARC_mfu - frequently used, currently cached 157ea8dc4b6Seschrock * ARC_mfu_ghost - frequently used, no longer in cache 158fa9e4066Sahrens * When there are no active references to the buffer, they 159fa9e4066Sahrens * are linked onto one of the lists in arc. These are the 160fa9e4066Sahrens * only buffers that can be evicted or deleted. 161fa9e4066Sahrens * 162fa9e4066Sahrens * Anonymous buffers are buffers that are not associated with 163fa9e4066Sahrens * a DVA. These are buffers that hold dirty block copies 164fa9e4066Sahrens * before they are written to stable storage. By definition, 165ea8dc4b6Seschrock * they are "ref'd" and are considered part of arc_mru 166fa9e4066Sahrens * that cannot be freed. Generally, they will acquire a DVA 167ea8dc4b6Seschrock * as they are written and migrate onto the arc_mru list. 
168fa9e4066Sahrens */ 169fa9e4066Sahrens 170fa9e4066Sahrens typedef struct arc_state { 171fa9e4066Sahrens list_t list; /* linked list of evictable buffer in state */ 172fa9e4066Sahrens uint64_t lsize; /* total size of buffers in the linked list */ 173fa9e4066Sahrens uint64_t size; /* total size of all buffers in this state */ 174fa9e4066Sahrens uint64_t hits; 175fa9e4066Sahrens kmutex_t mtx; 176fa9e4066Sahrens } arc_state_t; 177fa9e4066Sahrens 178fa9e4066Sahrens /* The 5 states: */ 179fa9e4066Sahrens static arc_state_t ARC_anon; 180ea8dc4b6Seschrock static arc_state_t ARC_mru; 181ea8dc4b6Seschrock static arc_state_t ARC_mru_ghost; 182ea8dc4b6Seschrock static arc_state_t ARC_mfu; 183ea8dc4b6Seschrock static arc_state_t ARC_mfu_ghost; 184fa9e4066Sahrens 185fa9e4066Sahrens static struct arc { 186fa9e4066Sahrens arc_state_t *anon; 187ea8dc4b6Seschrock arc_state_t *mru; 188ea8dc4b6Seschrock arc_state_t *mru_ghost; 189ea8dc4b6Seschrock arc_state_t *mfu; 190ea8dc4b6Seschrock arc_state_t *mfu_ghost; 191fa9e4066Sahrens uint64_t size; /* Actual total arc size */ 192ea8dc4b6Seschrock uint64_t p; /* Target size (in bytes) of mru */ 193fa9e4066Sahrens uint64_t c; /* Target size of cache (in bytes) */ 194fa9e4066Sahrens uint64_t c_min; /* Minimum target cache size */ 195fa9e4066Sahrens uint64_t c_max; /* Maximum target cache size */ 196fa9e4066Sahrens 197fa9e4066Sahrens /* performance stats */ 198fa9e4066Sahrens uint64_t hits; 199fa9e4066Sahrens uint64_t misses; 200fa9e4066Sahrens uint64_t deleted; 201*44eda4d7Smaybee uint64_t recycle_miss; 202*44eda4d7Smaybee uint64_t mutex_miss; 203*44eda4d7Smaybee uint64_t evict_skip; 204fa9e4066Sahrens uint64_t hash_elements; 205fa9e4066Sahrens uint64_t hash_elements_max; 206fa9e4066Sahrens uint64_t hash_collisions; 207fa9e4066Sahrens uint64_t hash_chains; 208fa9e4066Sahrens uint32_t hash_chain_max; 209fa9e4066Sahrens 210fa9e4066Sahrens int no_grow; /* Don't try to grow cache size */ 211fa9e4066Sahrens } arc; 212fa9e4066Sahrens 
213fa9e4066Sahrens static uint64_t arc_tempreserve; 214fa9e4066Sahrens 215fa9e4066Sahrens typedef struct arc_callback arc_callback_t; 216fa9e4066Sahrens 217fa9e4066Sahrens struct arc_callback { 218fa9e4066Sahrens arc_done_func_t *acb_done; 219fa9e4066Sahrens void *acb_private; 220fa9e4066Sahrens arc_byteswap_func_t *acb_byteswap; 221fa9e4066Sahrens arc_buf_t *acb_buf; 222fa9e4066Sahrens zio_t *acb_zio_dummy; 223fa9e4066Sahrens arc_callback_t *acb_next; 224fa9e4066Sahrens }; 225fa9e4066Sahrens 226fa9e4066Sahrens struct arc_buf_hdr { 227fa9e4066Sahrens /* immutable */ 228fa9e4066Sahrens uint64_t b_size; 229fa9e4066Sahrens spa_t *b_spa; 230fa9e4066Sahrens 231fa9e4066Sahrens /* protected by hash lock */ 232fa9e4066Sahrens dva_t b_dva; 233fa9e4066Sahrens uint64_t b_birth; 234fa9e4066Sahrens uint64_t b_cksum0; 235fa9e4066Sahrens 236fa9e4066Sahrens arc_buf_hdr_t *b_hash_next; 237fa9e4066Sahrens arc_buf_t *b_buf; 238fa9e4066Sahrens uint32_t b_flags; 239ea8dc4b6Seschrock uint32_t b_datacnt; 240fa9e4066Sahrens 241fa9e4066Sahrens kcondvar_t b_cv; 242fa9e4066Sahrens arc_callback_t *b_acb; 243fa9e4066Sahrens 244fa9e4066Sahrens /* protected by arc state mutex */ 245fa9e4066Sahrens arc_state_t *b_state; 246fa9e4066Sahrens list_node_t b_arc_node; 247fa9e4066Sahrens 248fa9e4066Sahrens /* updated atomically */ 249fa9e4066Sahrens clock_t b_arc_access; 250fa9e4066Sahrens 251fa9e4066Sahrens /* self protecting */ 252fa9e4066Sahrens refcount_t b_refcnt; 253fa9e4066Sahrens }; 254fa9e4066Sahrens 255ea8dc4b6Seschrock static arc_buf_t *arc_eviction_list; 256ea8dc4b6Seschrock static kmutex_t arc_eviction_mtx; 257*44eda4d7Smaybee static void arc_get_data_buf(arc_buf_t *buf); 258*44eda4d7Smaybee static void arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock); 259ea8dc4b6Seschrock 260ea8dc4b6Seschrock #define GHOST_STATE(state) \ 261ea8dc4b6Seschrock ((state) == arc.mru_ghost || (state) == arc.mfu_ghost) 262ea8dc4b6Seschrock 263fa9e4066Sahrens /* 264fa9e4066Sahrens * Private ARC flags. 
These flags are private ARC only flags that will show up 265fa9e4066Sahrens * in b_flags in the arc_hdr_buf_t. Some flags are publicly declared, and can 266fa9e4066Sahrens * be passed in as arc_flags in things like arc_read. However, these flags 267fa9e4066Sahrens * should never be passed and should only be set by ARC code. When adding new 268fa9e4066Sahrens * public flags, make sure not to smash the private ones. 269fa9e4066Sahrens */ 270fa9e4066Sahrens 271ea8dc4b6Seschrock #define ARC_IN_HASH_TABLE (1 << 9) /* this buffer is hashed */ 272fa9e4066Sahrens #define ARC_IO_IN_PROGRESS (1 << 10) /* I/O in progress for buf */ 273fa9e4066Sahrens #define ARC_IO_ERROR (1 << 11) /* I/O failed for buf */ 274fa9e4066Sahrens #define ARC_FREED_IN_READ (1 << 12) /* buf freed while in read */ 275ea8dc4b6Seschrock #define ARC_BUF_AVAILABLE (1 << 13) /* block not in active use */ 27613506d1eSmaybee #define ARC_INDIRECT (1 << 14) /* this is an indirect block */ 277fa9e4066Sahrens 278ea8dc4b6Seschrock #define HDR_IN_HASH_TABLE(hdr) ((hdr)->b_flags & ARC_IN_HASH_TABLE) 279fa9e4066Sahrens #define HDR_IO_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_IO_IN_PROGRESS) 280fa9e4066Sahrens #define HDR_IO_ERROR(hdr) ((hdr)->b_flags & ARC_IO_ERROR) 281fa9e4066Sahrens #define HDR_FREED_IN_READ(hdr) ((hdr)->b_flags & ARC_FREED_IN_READ) 282ea8dc4b6Seschrock #define HDR_BUF_AVAILABLE(hdr) ((hdr)->b_flags & ARC_BUF_AVAILABLE) 283fa9e4066Sahrens 284fa9e4066Sahrens /* 285fa9e4066Sahrens * Hash table routines 286fa9e4066Sahrens */ 287fa9e4066Sahrens 288fa9e4066Sahrens #define HT_LOCK_PAD 64 289fa9e4066Sahrens 290fa9e4066Sahrens struct ht_lock { 291fa9e4066Sahrens kmutex_t ht_lock; 292fa9e4066Sahrens #ifdef _KERNEL 293fa9e4066Sahrens unsigned char pad[(HT_LOCK_PAD - sizeof (kmutex_t))]; 294fa9e4066Sahrens #endif 295fa9e4066Sahrens }; 296fa9e4066Sahrens 297fa9e4066Sahrens #define BUF_LOCKS 256 298fa9e4066Sahrens typedef struct buf_hash_table { 299fa9e4066Sahrens uint64_t ht_mask; 300fa9e4066Sahrens 
arc_buf_hdr_t **ht_table; 301fa9e4066Sahrens struct ht_lock ht_locks[BUF_LOCKS]; 302fa9e4066Sahrens } buf_hash_table_t; 303fa9e4066Sahrens 304fa9e4066Sahrens static buf_hash_table_t buf_hash_table; 305fa9e4066Sahrens 306fa9e4066Sahrens #define BUF_HASH_INDEX(spa, dva, birth) \ 307fa9e4066Sahrens (buf_hash(spa, dva, birth) & buf_hash_table.ht_mask) 308fa9e4066Sahrens #define BUF_HASH_LOCK_NTRY(idx) (buf_hash_table.ht_locks[idx & (BUF_LOCKS-1)]) 309fa9e4066Sahrens #define BUF_HASH_LOCK(idx) (&(BUF_HASH_LOCK_NTRY(idx).ht_lock)) 310fa9e4066Sahrens #define HDR_LOCK(buf) \ 311fa9e4066Sahrens (BUF_HASH_LOCK(BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth))) 312fa9e4066Sahrens 313fa9e4066Sahrens uint64_t zfs_crc64_table[256]; 314fa9e4066Sahrens 315fa9e4066Sahrens static uint64_t 316fa9e4066Sahrens buf_hash(spa_t *spa, dva_t *dva, uint64_t birth) 317fa9e4066Sahrens { 318fa9e4066Sahrens uintptr_t spav = (uintptr_t)spa; 319fa9e4066Sahrens uint8_t *vdva = (uint8_t *)dva; 320fa9e4066Sahrens uint64_t crc = -1ULL; 321fa9e4066Sahrens int i; 322fa9e4066Sahrens 323fa9e4066Sahrens ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY); 324fa9e4066Sahrens 325fa9e4066Sahrens for (i = 0; i < sizeof (dva_t); i++) 326fa9e4066Sahrens crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ vdva[i]) & 0xFF]; 327fa9e4066Sahrens 328fa9e4066Sahrens crc ^= (spav>>8) ^ birth; 329fa9e4066Sahrens 330fa9e4066Sahrens return (crc); 331fa9e4066Sahrens } 332fa9e4066Sahrens 333fa9e4066Sahrens #define BUF_EMPTY(buf) \ 334fa9e4066Sahrens ((buf)->b_dva.dva_word[0] == 0 && \ 335fa9e4066Sahrens (buf)->b_dva.dva_word[1] == 0 && \ 336fa9e4066Sahrens (buf)->b_birth == 0) 337fa9e4066Sahrens 338fa9e4066Sahrens #define BUF_EQUAL(spa, dva, birth, buf) \ 339fa9e4066Sahrens ((buf)->b_dva.dva_word[0] == (dva)->dva_word[0]) && \ 340fa9e4066Sahrens ((buf)->b_dva.dva_word[1] == (dva)->dva_word[1]) && \ 341fa9e4066Sahrens ((buf)->b_birth == birth) && ((buf)->b_spa == spa) 342fa9e4066Sahrens 343fa9e4066Sahrens static arc_buf_hdr_t * 
344fa9e4066Sahrens buf_hash_find(spa_t *spa, dva_t *dva, uint64_t birth, kmutex_t **lockp) 345fa9e4066Sahrens { 346fa9e4066Sahrens uint64_t idx = BUF_HASH_INDEX(spa, dva, birth); 347fa9e4066Sahrens kmutex_t *hash_lock = BUF_HASH_LOCK(idx); 348fa9e4066Sahrens arc_buf_hdr_t *buf; 349fa9e4066Sahrens 350fa9e4066Sahrens mutex_enter(hash_lock); 351fa9e4066Sahrens for (buf = buf_hash_table.ht_table[idx]; buf != NULL; 352fa9e4066Sahrens buf = buf->b_hash_next) { 353fa9e4066Sahrens if (BUF_EQUAL(spa, dva, birth, buf)) { 354fa9e4066Sahrens *lockp = hash_lock; 355fa9e4066Sahrens return (buf); 356fa9e4066Sahrens } 357fa9e4066Sahrens } 358fa9e4066Sahrens mutex_exit(hash_lock); 359fa9e4066Sahrens *lockp = NULL; 360fa9e4066Sahrens return (NULL); 361fa9e4066Sahrens } 362fa9e4066Sahrens 363fa9e4066Sahrens /* 364fa9e4066Sahrens * Insert an entry into the hash table. If there is already an element 365fa9e4066Sahrens * equal to elem in the hash table, then the already existing element 366fa9e4066Sahrens * will be returned and the new element will not be inserted. 367fa9e4066Sahrens * Otherwise returns NULL. 
368fa9e4066Sahrens */ 369fa9e4066Sahrens static arc_buf_hdr_t * 370fa9e4066Sahrens buf_hash_insert(arc_buf_hdr_t *buf, kmutex_t **lockp) 371fa9e4066Sahrens { 372fa9e4066Sahrens uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth); 373fa9e4066Sahrens kmutex_t *hash_lock = BUF_HASH_LOCK(idx); 374fa9e4066Sahrens arc_buf_hdr_t *fbuf; 375fa9e4066Sahrens uint32_t max, i; 376fa9e4066Sahrens 377ea8dc4b6Seschrock ASSERT(!HDR_IN_HASH_TABLE(buf)); 378fa9e4066Sahrens *lockp = hash_lock; 379fa9e4066Sahrens mutex_enter(hash_lock); 380fa9e4066Sahrens for (fbuf = buf_hash_table.ht_table[idx], i = 0; fbuf != NULL; 381fa9e4066Sahrens fbuf = fbuf->b_hash_next, i++) { 382fa9e4066Sahrens if (BUF_EQUAL(buf->b_spa, &buf->b_dva, buf->b_birth, fbuf)) 383fa9e4066Sahrens return (fbuf); 384fa9e4066Sahrens } 385fa9e4066Sahrens 386fa9e4066Sahrens buf->b_hash_next = buf_hash_table.ht_table[idx]; 387fa9e4066Sahrens buf_hash_table.ht_table[idx] = buf; 388ea8dc4b6Seschrock buf->b_flags |= ARC_IN_HASH_TABLE; 389fa9e4066Sahrens 390fa9e4066Sahrens /* collect some hash table performance data */ 391fa9e4066Sahrens if (i > 0) { 392fa9e4066Sahrens atomic_add_64(&arc.hash_collisions, 1); 393fa9e4066Sahrens if (i == 1) 394fa9e4066Sahrens atomic_add_64(&arc.hash_chains, 1); 395fa9e4066Sahrens } 396fa9e4066Sahrens while (i > (max = arc.hash_chain_max) && 397fa9e4066Sahrens max != atomic_cas_32(&arc.hash_chain_max, max, i)) { 398fa9e4066Sahrens continue; 399fa9e4066Sahrens } 400fa9e4066Sahrens atomic_add_64(&arc.hash_elements, 1); 401fa9e4066Sahrens if (arc.hash_elements > arc.hash_elements_max) 402fa9e4066Sahrens atomic_add_64(&arc.hash_elements_max, 1); 403fa9e4066Sahrens 404fa9e4066Sahrens return (NULL); 405fa9e4066Sahrens } 406fa9e4066Sahrens 407fa9e4066Sahrens static void 408fa9e4066Sahrens buf_hash_remove(arc_buf_hdr_t *buf) 409fa9e4066Sahrens { 410fa9e4066Sahrens arc_buf_hdr_t *fbuf, **bufp; 411fa9e4066Sahrens uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth); 
412fa9e4066Sahrens 413fa9e4066Sahrens ASSERT(MUTEX_HELD(BUF_HASH_LOCK(idx))); 414ea8dc4b6Seschrock ASSERT(HDR_IN_HASH_TABLE(buf)); 415fa9e4066Sahrens 416fa9e4066Sahrens bufp = &buf_hash_table.ht_table[idx]; 417fa9e4066Sahrens while ((fbuf = *bufp) != buf) { 418fa9e4066Sahrens ASSERT(fbuf != NULL); 419fa9e4066Sahrens bufp = &fbuf->b_hash_next; 420fa9e4066Sahrens } 421fa9e4066Sahrens *bufp = buf->b_hash_next; 422fa9e4066Sahrens buf->b_hash_next = NULL; 423ea8dc4b6Seschrock buf->b_flags &= ~ARC_IN_HASH_TABLE; 424fa9e4066Sahrens 425fa9e4066Sahrens /* collect some hash table performance data */ 426fa9e4066Sahrens atomic_add_64(&arc.hash_elements, -1); 427fa9e4066Sahrens if (buf_hash_table.ht_table[idx] && 428fa9e4066Sahrens buf_hash_table.ht_table[idx]->b_hash_next == NULL) 429fa9e4066Sahrens atomic_add_64(&arc.hash_chains, -1); 430fa9e4066Sahrens } 431fa9e4066Sahrens 432fa9e4066Sahrens /* 433fa9e4066Sahrens * Global data structures and functions for the buf kmem cache. 434fa9e4066Sahrens */ 435fa9e4066Sahrens static kmem_cache_t *hdr_cache; 436fa9e4066Sahrens static kmem_cache_t *buf_cache; 437fa9e4066Sahrens 438fa9e4066Sahrens static void 439fa9e4066Sahrens buf_fini(void) 440fa9e4066Sahrens { 441fa9e4066Sahrens int i; 442fa9e4066Sahrens 443fa9e4066Sahrens kmem_free(buf_hash_table.ht_table, 444fa9e4066Sahrens (buf_hash_table.ht_mask + 1) * sizeof (void *)); 445fa9e4066Sahrens for (i = 0; i < BUF_LOCKS; i++) 446fa9e4066Sahrens mutex_destroy(&buf_hash_table.ht_locks[i].ht_lock); 447fa9e4066Sahrens kmem_cache_destroy(hdr_cache); 448fa9e4066Sahrens kmem_cache_destroy(buf_cache); 449fa9e4066Sahrens } 450fa9e4066Sahrens 451fa9e4066Sahrens /* 452fa9e4066Sahrens * Constructor callback - called when the cache is empty 453fa9e4066Sahrens * and a new buf is requested. 
454fa9e4066Sahrens */ 455fa9e4066Sahrens /* ARGSUSED */ 456fa9e4066Sahrens static int 457fa9e4066Sahrens hdr_cons(void *vbuf, void *unused, int kmflag) 458fa9e4066Sahrens { 459fa9e4066Sahrens arc_buf_hdr_t *buf = vbuf; 460fa9e4066Sahrens 461fa9e4066Sahrens bzero(buf, sizeof (arc_buf_hdr_t)); 462fa9e4066Sahrens refcount_create(&buf->b_refcnt); 463fa9e4066Sahrens cv_init(&buf->b_cv, NULL, CV_DEFAULT, NULL); 464fa9e4066Sahrens return (0); 465fa9e4066Sahrens } 466fa9e4066Sahrens 467fa9e4066Sahrens /* 468fa9e4066Sahrens * Destructor callback - called when a cached buf is 469fa9e4066Sahrens * no longer required. 470fa9e4066Sahrens */ 471fa9e4066Sahrens /* ARGSUSED */ 472fa9e4066Sahrens static void 473fa9e4066Sahrens hdr_dest(void *vbuf, void *unused) 474fa9e4066Sahrens { 475fa9e4066Sahrens arc_buf_hdr_t *buf = vbuf; 476fa9e4066Sahrens 477fa9e4066Sahrens refcount_destroy(&buf->b_refcnt); 478fa9e4066Sahrens cv_destroy(&buf->b_cv); 479fa9e4066Sahrens } 480fa9e4066Sahrens 481ea8dc4b6Seschrock static int arc_reclaim_needed(void); 482fa9e4066Sahrens void arc_kmem_reclaim(void); 483fa9e4066Sahrens 484fa9e4066Sahrens /* 485fa9e4066Sahrens * Reclaim callback -- invoked when memory is low. 486fa9e4066Sahrens */ 487fa9e4066Sahrens /* ARGSUSED */ 488fa9e4066Sahrens static void 489fa9e4066Sahrens hdr_recl(void *unused) 490fa9e4066Sahrens { 491fa9e4066Sahrens dprintf("hdr_recl called\n"); 492ea8dc4b6Seschrock if (arc_reclaim_needed()) 493ea8dc4b6Seschrock arc_kmem_reclaim(); 494fa9e4066Sahrens } 495fa9e4066Sahrens 496fa9e4066Sahrens static void 497fa9e4066Sahrens buf_init(void) 498fa9e4066Sahrens { 499fa9e4066Sahrens uint64_t *ct; 500ea8dc4b6Seschrock uint64_t hsize = 1ULL << 12; 501fa9e4066Sahrens int i, j; 502fa9e4066Sahrens 503fa9e4066Sahrens /* 504fa9e4066Sahrens * The hash table is big enough to fill all of physical memory 505ea8dc4b6Seschrock * with an average 64K block size. The table will take up 506ea8dc4b6Seschrock * totalmem*sizeof(void*)/64K (eg. 
128KB/GB with 8-byte pointers). 507fa9e4066Sahrens */ 508ea8dc4b6Seschrock while (hsize * 65536 < physmem * PAGESIZE) 509fa9e4066Sahrens hsize <<= 1; 510ea8dc4b6Seschrock retry: 511fa9e4066Sahrens buf_hash_table.ht_mask = hsize - 1; 512ea8dc4b6Seschrock buf_hash_table.ht_table = 513ea8dc4b6Seschrock kmem_zalloc(hsize * sizeof (void*), KM_NOSLEEP); 514ea8dc4b6Seschrock if (buf_hash_table.ht_table == NULL) { 515ea8dc4b6Seschrock ASSERT(hsize > (1ULL << 8)); 516ea8dc4b6Seschrock hsize >>= 1; 517ea8dc4b6Seschrock goto retry; 518ea8dc4b6Seschrock } 519fa9e4066Sahrens 520fa9e4066Sahrens hdr_cache = kmem_cache_create("arc_buf_hdr_t", sizeof (arc_buf_hdr_t), 521fa9e4066Sahrens 0, hdr_cons, hdr_dest, hdr_recl, NULL, NULL, 0); 522fa9e4066Sahrens buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t), 523fa9e4066Sahrens 0, NULL, NULL, NULL, NULL, NULL, 0); 524fa9e4066Sahrens 525fa9e4066Sahrens for (i = 0; i < 256; i++) 526fa9e4066Sahrens for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--) 527fa9e4066Sahrens *ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY); 528fa9e4066Sahrens 529fa9e4066Sahrens for (i = 0; i < BUF_LOCKS; i++) { 530fa9e4066Sahrens mutex_init(&buf_hash_table.ht_locks[i].ht_lock, 531fa9e4066Sahrens NULL, MUTEX_DEFAULT, NULL); 532fa9e4066Sahrens } 533fa9e4066Sahrens } 534fa9e4066Sahrens 535fa9e4066Sahrens #define ARC_MINTIME (hz>>4) /* 62 ms */ 536fa9e4066Sahrens 537fa9e4066Sahrens static void 538fa9e4066Sahrens add_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag) 539fa9e4066Sahrens { 540fa9e4066Sahrens ASSERT(MUTEX_HELD(hash_lock)); 541fa9e4066Sahrens 542fa9e4066Sahrens if ((refcount_add(&ab->b_refcnt, tag) == 1) && 543fa9e4066Sahrens (ab->b_state != arc.anon)) { 544ea8dc4b6Seschrock int delta = ab->b_size * ab->b_datacnt; 545fa9e4066Sahrens 546fa9e4066Sahrens ASSERT(!MUTEX_HELD(&ab->b_state->mtx)); 547fa9e4066Sahrens mutex_enter(&ab->b_state->mtx); 548fa9e4066Sahrens ASSERT(list_link_active(&ab->b_arc_node)); 549fa9e4066Sahrens 
list_remove(&ab->b_state->list, ab); 550ea8dc4b6Seschrock if (GHOST_STATE(ab->b_state)) { 551ea8dc4b6Seschrock ASSERT3U(ab->b_datacnt, ==, 0); 552ea8dc4b6Seschrock ASSERT3P(ab->b_buf, ==, NULL); 553ea8dc4b6Seschrock delta = ab->b_size; 554ea8dc4b6Seschrock } 555ea8dc4b6Seschrock ASSERT(delta > 0); 556ea8dc4b6Seschrock ASSERT3U(ab->b_state->lsize, >=, delta); 557ea8dc4b6Seschrock atomic_add_64(&ab->b_state->lsize, -delta); 558fa9e4066Sahrens mutex_exit(&ab->b_state->mtx); 55913506d1eSmaybee /* remove the prefetch flag is we get a reference */ 56013506d1eSmaybee if (ab->b_flags & ARC_PREFETCH) 56113506d1eSmaybee ab->b_flags &= ~ARC_PREFETCH; 562fa9e4066Sahrens } 563fa9e4066Sahrens } 564fa9e4066Sahrens 565fa9e4066Sahrens static int 566fa9e4066Sahrens remove_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag) 567fa9e4066Sahrens { 568fa9e4066Sahrens int cnt; 569fa9e4066Sahrens 570ea8dc4b6Seschrock ASSERT(ab->b_state == arc.anon || MUTEX_HELD(hash_lock)); 571ea8dc4b6Seschrock ASSERT(!GHOST_STATE(ab->b_state)); 572fa9e4066Sahrens 573fa9e4066Sahrens if (((cnt = refcount_remove(&ab->b_refcnt, tag)) == 0) && 574fa9e4066Sahrens (ab->b_state != arc.anon)) { 575fa9e4066Sahrens 576fa9e4066Sahrens ASSERT(!MUTEX_HELD(&ab->b_state->mtx)); 577fa9e4066Sahrens mutex_enter(&ab->b_state->mtx); 578fa9e4066Sahrens ASSERT(!list_link_active(&ab->b_arc_node)); 579fa9e4066Sahrens list_insert_head(&ab->b_state->list, ab); 580ea8dc4b6Seschrock ASSERT(ab->b_datacnt > 0); 581ea8dc4b6Seschrock atomic_add_64(&ab->b_state->lsize, ab->b_size * ab->b_datacnt); 582ea8dc4b6Seschrock ASSERT3U(ab->b_state->size, >=, ab->b_state->lsize); 583fa9e4066Sahrens mutex_exit(&ab->b_state->mtx); 584fa9e4066Sahrens } 585fa9e4066Sahrens return (cnt); 586fa9e4066Sahrens } 587fa9e4066Sahrens 588fa9e4066Sahrens /* 589fa9e4066Sahrens * Move the supplied buffer to the indicated state. The mutex 590fa9e4066Sahrens * for the buffer must be held by the caller. 
591fa9e4066Sahrens */ 592fa9e4066Sahrens static void 593ea8dc4b6Seschrock arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock) 594fa9e4066Sahrens { 595ea8dc4b6Seschrock arc_state_t *old_state = ab->b_state; 596ea8dc4b6Seschrock int refcnt = refcount_count(&ab->b_refcnt); 597ea8dc4b6Seschrock int from_delta, to_delta; 598fa9e4066Sahrens 599fa9e4066Sahrens ASSERT(MUTEX_HELD(hash_lock)); 600ea8dc4b6Seschrock ASSERT(new_state != old_state); 601ea8dc4b6Seschrock ASSERT(refcnt == 0 || ab->b_datacnt > 0); 602ea8dc4b6Seschrock ASSERT(ab->b_datacnt == 0 || !GHOST_STATE(new_state)); 603ea8dc4b6Seschrock 604ea8dc4b6Seschrock from_delta = to_delta = ab->b_datacnt * ab->b_size; 605fa9e4066Sahrens 606fa9e4066Sahrens /* 607fa9e4066Sahrens * If this buffer is evictable, transfer it from the 608fa9e4066Sahrens * old state list to the new state list. 609fa9e4066Sahrens */ 610ea8dc4b6Seschrock if (refcnt == 0) { 611ea8dc4b6Seschrock if (old_state != arc.anon) { 612ea8dc4b6Seschrock int use_mutex = !MUTEX_HELD(&old_state->mtx); 613ea8dc4b6Seschrock 614ea8dc4b6Seschrock if (use_mutex) 615ea8dc4b6Seschrock mutex_enter(&old_state->mtx); 616fa9e4066Sahrens 617fa9e4066Sahrens ASSERT(list_link_active(&ab->b_arc_node)); 618ea8dc4b6Seschrock list_remove(&old_state->list, ab); 619ea8dc4b6Seschrock 62013506d1eSmaybee /* 62113506d1eSmaybee * If prefetching out of the ghost cache, 62213506d1eSmaybee * we will have a non-null datacnt. 
62313506d1eSmaybee */ 62413506d1eSmaybee if (GHOST_STATE(old_state) && ab->b_datacnt == 0) { 62513506d1eSmaybee /* ghost elements have a ghost size */ 626ea8dc4b6Seschrock ASSERT(ab->b_buf == NULL); 627ea8dc4b6Seschrock from_delta = ab->b_size; 628ea8dc4b6Seschrock } 629ea8dc4b6Seschrock ASSERT3U(old_state->lsize, >=, from_delta); 630ea8dc4b6Seschrock atomic_add_64(&old_state->lsize, -from_delta); 631ea8dc4b6Seschrock 632ea8dc4b6Seschrock if (use_mutex) 633ea8dc4b6Seschrock mutex_exit(&old_state->mtx); 634fa9e4066Sahrens } 635fa9e4066Sahrens if (new_state != arc.anon) { 636ea8dc4b6Seschrock int use_mutex = !MUTEX_HELD(&new_state->mtx); 637fa9e4066Sahrens 638ea8dc4b6Seschrock if (use_mutex) 639fa9e4066Sahrens mutex_enter(&new_state->mtx); 640ea8dc4b6Seschrock 641fa9e4066Sahrens list_insert_head(&new_state->list, ab); 642ea8dc4b6Seschrock 643ea8dc4b6Seschrock /* ghost elements have a ghost size */ 644ea8dc4b6Seschrock if (GHOST_STATE(new_state)) { 645ea8dc4b6Seschrock ASSERT(ab->b_datacnt == 0); 646ea8dc4b6Seschrock ASSERT(ab->b_buf == NULL); 647ea8dc4b6Seschrock to_delta = ab->b_size; 648ea8dc4b6Seschrock } 649ea8dc4b6Seschrock atomic_add_64(&new_state->lsize, to_delta); 650ea8dc4b6Seschrock ASSERT3U(new_state->size + to_delta, >=, 651ea8dc4b6Seschrock new_state->lsize); 652ea8dc4b6Seschrock 653ea8dc4b6Seschrock if (use_mutex) 654fa9e4066Sahrens mutex_exit(&new_state->mtx); 655fa9e4066Sahrens } 656fa9e4066Sahrens } 657fa9e4066Sahrens 658fa9e4066Sahrens ASSERT(!BUF_EMPTY(ab)); 659ea8dc4b6Seschrock if (new_state == arc.anon && old_state != arc.anon) { 660fa9e4066Sahrens buf_hash_remove(ab); 661fa9e4066Sahrens } 662fa9e4066Sahrens 663ea8dc4b6Seschrock /* adjust state sizes */ 664ea8dc4b6Seschrock if (to_delta) 665ea8dc4b6Seschrock atomic_add_64(&new_state->size, to_delta); 666ea8dc4b6Seschrock if (from_delta) { 667ea8dc4b6Seschrock ASSERT3U(old_state->size, >=, from_delta); 668ea8dc4b6Seschrock atomic_add_64(&old_state->size, -from_delta); 669fa9e4066Sahrens } 
670fa9e4066Sahrens ab->b_state = new_state; 671fa9e4066Sahrens } 672fa9e4066Sahrens 673fa9e4066Sahrens arc_buf_t * 674fa9e4066Sahrens arc_buf_alloc(spa_t *spa, int size, void *tag) 675fa9e4066Sahrens { 676fa9e4066Sahrens arc_buf_hdr_t *hdr; 677fa9e4066Sahrens arc_buf_t *buf; 678fa9e4066Sahrens 679fa9e4066Sahrens ASSERT3U(size, >, 0); 680fa9e4066Sahrens hdr = kmem_cache_alloc(hdr_cache, KM_SLEEP); 681fa9e4066Sahrens ASSERT(BUF_EMPTY(hdr)); 682fa9e4066Sahrens hdr->b_size = size; 683fa9e4066Sahrens hdr->b_spa = spa; 684fa9e4066Sahrens hdr->b_state = arc.anon; 685fa9e4066Sahrens hdr->b_arc_access = 0; 686fa9e4066Sahrens buf = kmem_cache_alloc(buf_cache, KM_SLEEP); 687fa9e4066Sahrens buf->b_hdr = hdr; 688*44eda4d7Smaybee buf->b_data = NULL; 689ea8dc4b6Seschrock buf->b_efunc = NULL; 690ea8dc4b6Seschrock buf->b_private = NULL; 691fa9e4066Sahrens buf->b_next = NULL; 692fa9e4066Sahrens hdr->b_buf = buf; 693*44eda4d7Smaybee arc_get_data_buf(buf); 694ea8dc4b6Seschrock hdr->b_datacnt = 1; 695fa9e4066Sahrens hdr->b_flags = 0; 696fa9e4066Sahrens ASSERT(refcount_is_zero(&hdr->b_refcnt)); 697fa9e4066Sahrens (void) refcount_add(&hdr->b_refcnt, tag); 698fa9e4066Sahrens 699fa9e4066Sahrens return (buf); 700fa9e4066Sahrens } 701fa9e4066Sahrens 702*44eda4d7Smaybee static arc_buf_t * 703*44eda4d7Smaybee arc_buf_clone(arc_buf_t *from) 704ea8dc4b6Seschrock { 705*44eda4d7Smaybee arc_buf_t *buf; 706*44eda4d7Smaybee arc_buf_hdr_t *hdr = from->b_hdr; 707*44eda4d7Smaybee uint64_t size = hdr->b_size; 708ea8dc4b6Seschrock 709*44eda4d7Smaybee buf = kmem_cache_alloc(buf_cache, KM_SLEEP); 710*44eda4d7Smaybee buf->b_hdr = hdr; 711*44eda4d7Smaybee buf->b_data = NULL; 712*44eda4d7Smaybee buf->b_efunc = NULL; 713*44eda4d7Smaybee buf->b_private = NULL; 714*44eda4d7Smaybee buf->b_next = hdr->b_buf; 715*44eda4d7Smaybee hdr->b_buf = buf; 716*44eda4d7Smaybee arc_get_data_buf(buf); 717*44eda4d7Smaybee bcopy(from->b_data, buf->b_data, size); 718*44eda4d7Smaybee hdr->b_datacnt += 1; 719*44eda4d7Smaybee return 
(buf); 720ea8dc4b6Seschrock } 721ea8dc4b6Seschrock 722ea8dc4b6Seschrock void 723ea8dc4b6Seschrock arc_buf_add_ref(arc_buf_t *buf, void* tag) 724ea8dc4b6Seschrock { 725ea8dc4b6Seschrock arc_buf_hdr_t *hdr; 726ea8dc4b6Seschrock kmutex_t *hash_lock; 727ea8dc4b6Seschrock 728ea8dc4b6Seschrock mutex_enter(&arc_eviction_mtx); 729ea8dc4b6Seschrock hdr = buf->b_hdr; 730ea8dc4b6Seschrock if (buf->b_data == NULL) { 731ea8dc4b6Seschrock /* 732ea8dc4b6Seschrock * This buffer is evicted. 733ea8dc4b6Seschrock */ 734ea8dc4b6Seschrock mutex_exit(&arc_eviction_mtx); 735ea8dc4b6Seschrock return; 736ea8dc4b6Seschrock } else { 737ea8dc4b6Seschrock /* 738ea8dc4b6Seschrock * Prevent this buffer from being evicted 739ea8dc4b6Seschrock * while we add a reference. 740ea8dc4b6Seschrock */ 741ea8dc4b6Seschrock buf->b_hdr = NULL; 742ea8dc4b6Seschrock } 743ea8dc4b6Seschrock mutex_exit(&arc_eviction_mtx); 744ea8dc4b6Seschrock 745ea8dc4b6Seschrock ASSERT(hdr->b_state != arc.anon); 746ea8dc4b6Seschrock hash_lock = HDR_LOCK(hdr); 747ea8dc4b6Seschrock mutex_enter(hash_lock); 748ea8dc4b6Seschrock ASSERT(!GHOST_STATE(hdr->b_state)); 749ea8dc4b6Seschrock buf->b_hdr = hdr; 750ea8dc4b6Seschrock add_reference(hdr, hash_lock, tag); 751*44eda4d7Smaybee arc_access(hdr, hash_lock); 752*44eda4d7Smaybee mutex_exit(hash_lock); 753ea8dc4b6Seschrock atomic_add_64(&arc.hits, 1); 754ea8dc4b6Seschrock } 755ea8dc4b6Seschrock 756ea8dc4b6Seschrock static void 757*44eda4d7Smaybee arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t all) 758ea8dc4b6Seschrock { 759ea8dc4b6Seschrock arc_buf_t **bufp; 760ea8dc4b6Seschrock 761ea8dc4b6Seschrock /* free up data associated with the buf */ 762ea8dc4b6Seschrock if (buf->b_data) { 763ea8dc4b6Seschrock arc_state_t *state = buf->b_hdr->b_state; 764ea8dc4b6Seschrock uint64_t size = buf->b_hdr->b_size; 765ea8dc4b6Seschrock 766*44eda4d7Smaybee if (!recycle) { 767*44eda4d7Smaybee zio_buf_free(buf->b_data, size); 768*44eda4d7Smaybee atomic_add_64(&arc.size, -size); 
769*44eda4d7Smaybee } 770ea8dc4b6Seschrock if (list_link_active(&buf->b_hdr->b_arc_node)) { 771ea8dc4b6Seschrock ASSERT(refcount_is_zero(&buf->b_hdr->b_refcnt)); 772ea8dc4b6Seschrock ASSERT(state != arc.anon); 773ea8dc4b6Seschrock ASSERT3U(state->lsize, >=, size); 774ea8dc4b6Seschrock atomic_add_64(&state->lsize, -size); 775ea8dc4b6Seschrock } 776ea8dc4b6Seschrock ASSERT3U(state->size, >=, size); 777ea8dc4b6Seschrock atomic_add_64(&state->size, -size); 778ea8dc4b6Seschrock buf->b_data = NULL; 779ea8dc4b6Seschrock ASSERT(buf->b_hdr->b_datacnt > 0); 780ea8dc4b6Seschrock buf->b_hdr->b_datacnt -= 1; 781ea8dc4b6Seschrock } 782ea8dc4b6Seschrock 783ea8dc4b6Seschrock /* only remove the buf if requested */ 784ea8dc4b6Seschrock if (!all) 785ea8dc4b6Seschrock return; 786ea8dc4b6Seschrock 787ea8dc4b6Seschrock /* remove the buf from the hdr list */ 788ea8dc4b6Seschrock for (bufp = &buf->b_hdr->b_buf; *bufp != buf; bufp = &(*bufp)->b_next) 789ea8dc4b6Seschrock continue; 790ea8dc4b6Seschrock *bufp = buf->b_next; 791ea8dc4b6Seschrock 792ea8dc4b6Seschrock ASSERT(buf->b_efunc == NULL); 793ea8dc4b6Seschrock 794ea8dc4b6Seschrock /* clean up the buf */ 795ea8dc4b6Seschrock buf->b_hdr = NULL; 796ea8dc4b6Seschrock kmem_cache_free(buf_cache, buf); 797ea8dc4b6Seschrock } 798ea8dc4b6Seschrock 799fa9e4066Sahrens static void 800ea8dc4b6Seschrock arc_hdr_destroy(arc_buf_hdr_t *hdr) 801fa9e4066Sahrens { 802fa9e4066Sahrens ASSERT(refcount_is_zero(&hdr->b_refcnt)); 803fa9e4066Sahrens ASSERT3P(hdr->b_state, ==, arc.anon); 804ea8dc4b6Seschrock ASSERT(!HDR_IO_IN_PROGRESS(hdr)); 805fa9e4066Sahrens 806fa9e4066Sahrens if (!BUF_EMPTY(hdr)) { 807ea8dc4b6Seschrock ASSERT(!HDR_IN_HASH_TABLE(hdr)); 808fa9e4066Sahrens bzero(&hdr->b_dva, sizeof (dva_t)); 809fa9e4066Sahrens hdr->b_birth = 0; 810fa9e4066Sahrens hdr->b_cksum0 = 0; 811fa9e4066Sahrens } 812ea8dc4b6Seschrock while (hdr->b_buf) { 813fa9e4066Sahrens arc_buf_t *buf = hdr->b_buf; 814fa9e4066Sahrens 815ea8dc4b6Seschrock if (buf->b_efunc) { 
816ea8dc4b6Seschrock mutex_enter(&arc_eviction_mtx); 817ea8dc4b6Seschrock ASSERT(buf->b_hdr != NULL); 818*44eda4d7Smaybee arc_buf_destroy(hdr->b_buf, FALSE, FALSE); 819ea8dc4b6Seschrock hdr->b_buf = buf->b_next; 820ea8dc4b6Seschrock buf->b_next = arc_eviction_list; 821ea8dc4b6Seschrock arc_eviction_list = buf; 822ea8dc4b6Seschrock mutex_exit(&arc_eviction_mtx); 823ea8dc4b6Seschrock } else { 824*44eda4d7Smaybee arc_buf_destroy(hdr->b_buf, FALSE, TRUE); 825ea8dc4b6Seschrock } 826fa9e4066Sahrens } 827ea8dc4b6Seschrock 828fa9e4066Sahrens ASSERT(!list_link_active(&hdr->b_arc_node)); 829fa9e4066Sahrens ASSERT3P(hdr->b_hash_next, ==, NULL); 830fa9e4066Sahrens ASSERT3P(hdr->b_acb, ==, NULL); 831fa9e4066Sahrens kmem_cache_free(hdr_cache, hdr); 832fa9e4066Sahrens } 833fa9e4066Sahrens 834fa9e4066Sahrens void 835fa9e4066Sahrens arc_buf_free(arc_buf_t *buf, void *tag) 836fa9e4066Sahrens { 837fa9e4066Sahrens arc_buf_hdr_t *hdr = buf->b_hdr; 838ea8dc4b6Seschrock int hashed = hdr->b_state != arc.anon; 839fa9e4066Sahrens 840ea8dc4b6Seschrock ASSERT(buf->b_efunc == NULL); 841ea8dc4b6Seschrock ASSERT(buf->b_data != NULL); 842ea8dc4b6Seschrock 843ea8dc4b6Seschrock if (hashed) { 844ea8dc4b6Seschrock kmutex_t *hash_lock = HDR_LOCK(hdr); 845ea8dc4b6Seschrock 846ea8dc4b6Seschrock mutex_enter(hash_lock); 847ea8dc4b6Seschrock (void) remove_reference(hdr, hash_lock, tag); 848ea8dc4b6Seschrock if (hdr->b_datacnt > 1) 849*44eda4d7Smaybee arc_buf_destroy(buf, FALSE, TRUE); 850ea8dc4b6Seschrock else 851ea8dc4b6Seschrock hdr->b_flags |= ARC_BUF_AVAILABLE; 852fa9e4066Sahrens mutex_exit(hash_lock); 853ea8dc4b6Seschrock } else if (HDR_IO_IN_PROGRESS(hdr)) { 854ea8dc4b6Seschrock int destroy_hdr; 855ea8dc4b6Seschrock /* 856ea8dc4b6Seschrock * We are in the middle of an async write. Don't destroy 857ea8dc4b6Seschrock * this buffer unless the write completes before we finish 858ea8dc4b6Seschrock * decrementing the reference count. 
859ea8dc4b6Seschrock */ 860ea8dc4b6Seschrock mutex_enter(&arc_eviction_mtx); 861ea8dc4b6Seschrock (void) remove_reference(hdr, NULL, tag); 862ea8dc4b6Seschrock ASSERT(refcount_is_zero(&hdr->b_refcnt)); 863ea8dc4b6Seschrock destroy_hdr = !HDR_IO_IN_PROGRESS(hdr); 864ea8dc4b6Seschrock mutex_exit(&arc_eviction_mtx); 865ea8dc4b6Seschrock if (destroy_hdr) 866ea8dc4b6Seschrock arc_hdr_destroy(hdr); 867ea8dc4b6Seschrock } else { 868ea8dc4b6Seschrock if (remove_reference(hdr, NULL, tag) > 0) { 869ea8dc4b6Seschrock ASSERT(HDR_IO_ERROR(hdr)); 870*44eda4d7Smaybee arc_buf_destroy(buf, FALSE, TRUE); 871ea8dc4b6Seschrock } else { 872ea8dc4b6Seschrock arc_hdr_destroy(hdr); 873ea8dc4b6Seschrock } 874fa9e4066Sahrens } 875ea8dc4b6Seschrock } 876fa9e4066Sahrens 877ea8dc4b6Seschrock int 878ea8dc4b6Seschrock arc_buf_remove_ref(arc_buf_t *buf, void* tag) 879ea8dc4b6Seschrock { 880ea8dc4b6Seschrock arc_buf_hdr_t *hdr = buf->b_hdr; 881ea8dc4b6Seschrock kmutex_t *hash_lock = HDR_LOCK(hdr); 882ea8dc4b6Seschrock int no_callback = (buf->b_efunc == NULL); 883fa9e4066Sahrens 884ea8dc4b6Seschrock if (hdr->b_state == arc.anon) { 885ea8dc4b6Seschrock arc_buf_free(buf, tag); 886ea8dc4b6Seschrock return (no_callback); 887ea8dc4b6Seschrock } 888ea8dc4b6Seschrock 889ea8dc4b6Seschrock mutex_enter(hash_lock); 890ea8dc4b6Seschrock ASSERT(hdr->b_state != arc.anon); 891ea8dc4b6Seschrock ASSERT(buf->b_data != NULL); 892ea8dc4b6Seschrock 893ea8dc4b6Seschrock (void) remove_reference(hdr, hash_lock, tag); 894ea8dc4b6Seschrock if (hdr->b_datacnt > 1) { 895ea8dc4b6Seschrock if (no_callback) 896*44eda4d7Smaybee arc_buf_destroy(buf, FALSE, TRUE); 897ea8dc4b6Seschrock } else if (no_callback) { 898ea8dc4b6Seschrock ASSERT(hdr->b_buf == buf && buf->b_next == NULL); 899ea8dc4b6Seschrock hdr->b_flags |= ARC_BUF_AVAILABLE; 900ea8dc4b6Seschrock } 901ea8dc4b6Seschrock ASSERT(no_callback || hdr->b_datacnt > 1 || 902ea8dc4b6Seschrock refcount_is_zero(&hdr->b_refcnt)); 903ea8dc4b6Seschrock mutex_exit(hash_lock); 
904ea8dc4b6Seschrock return (no_callback); 905fa9e4066Sahrens } 906fa9e4066Sahrens 907fa9e4066Sahrens int 908fa9e4066Sahrens arc_buf_size(arc_buf_t *buf) 909fa9e4066Sahrens { 910fa9e4066Sahrens return (buf->b_hdr->b_size); 911fa9e4066Sahrens } 912fa9e4066Sahrens 913fa9e4066Sahrens /* 914fa9e4066Sahrens * Evict buffers from list until we've removed the specified number of 915fa9e4066Sahrens * bytes. Move the removed buffers to the appropriate evict state. 916*44eda4d7Smaybee * If the recycle flag is set, then attempt to "recycle" a buffer: 917*44eda4d7Smaybee * - look for a buffer to evict that is `bytes' long. 918*44eda4d7Smaybee * - return the data block from this buffer rather than freeing it. 919*44eda4d7Smaybee * This flag is used by callers that are trying to make space for a 920*44eda4d7Smaybee * new buffer in a full arc cache. 921fa9e4066Sahrens */ 922*44eda4d7Smaybee static void * 923*44eda4d7Smaybee arc_evict(arc_state_t *state, int64_t bytes, boolean_t recycle) 924fa9e4066Sahrens { 925fa9e4066Sahrens arc_state_t *evicted_state; 926*44eda4d7Smaybee uint64_t bytes_evicted = 0, skipped = 0, missed = 0; 927fa9e4066Sahrens arc_buf_hdr_t *ab, *ab_prev; 928fa9e4066Sahrens kmutex_t *hash_lock; 929*44eda4d7Smaybee boolean_t have_lock; 930*44eda4d7Smaybee void *steal = NULL; 931fa9e4066Sahrens 932ea8dc4b6Seschrock ASSERT(state == arc.mru || state == arc.mfu); 933fa9e4066Sahrens 934ea8dc4b6Seschrock evicted_state = (state == arc.mru) ? 
arc.mru_ghost : arc.mfu_ghost; 935fa9e4066Sahrens 936fa9e4066Sahrens mutex_enter(&state->mtx); 937fa9e4066Sahrens mutex_enter(&evicted_state->mtx); 938fa9e4066Sahrens 939fa9e4066Sahrens for (ab = list_tail(&state->list); ab; ab = ab_prev) { 940fa9e4066Sahrens ab_prev = list_prev(&state->list, ab); 94113506d1eSmaybee /* prefetch buffers have a minimum lifespan */ 942*44eda4d7Smaybee if (HDR_IO_IN_PROGRESS(ab) || 943*44eda4d7Smaybee (ab->b_flags & (ARC_PREFETCH|ARC_INDIRECT) && 944*44eda4d7Smaybee lbolt - ab->b_arc_access < arc_min_prefetch_lifespan)) { 94513506d1eSmaybee skipped++; 94613506d1eSmaybee continue; 94713506d1eSmaybee } 948*44eda4d7Smaybee if (recycle && (ab->b_size != bytes || ab->b_datacnt > 1)) 949*44eda4d7Smaybee continue; 950fa9e4066Sahrens hash_lock = HDR_LOCK(ab); 951*44eda4d7Smaybee have_lock = MUTEX_HELD(hash_lock); 952*44eda4d7Smaybee if (have_lock || mutex_tryenter(hash_lock)) { 953fa9e4066Sahrens ASSERT3U(refcount_count(&ab->b_refcnt), ==, 0); 954ea8dc4b6Seschrock ASSERT(ab->b_datacnt > 0); 955ea8dc4b6Seschrock while (ab->b_buf) { 956ea8dc4b6Seschrock arc_buf_t *buf = ab->b_buf; 957*44eda4d7Smaybee if (buf->b_data) { 958ea8dc4b6Seschrock bytes_evicted += ab->b_size; 959*44eda4d7Smaybee if (recycle) 960*44eda4d7Smaybee steal = buf->b_data; 961*44eda4d7Smaybee } 962ea8dc4b6Seschrock if (buf->b_efunc) { 963ea8dc4b6Seschrock mutex_enter(&arc_eviction_mtx); 964ea8dc4b6Seschrock /* 965ea8dc4b6Seschrock * arc_buf_add_ref() could derail 966ea8dc4b6Seschrock * this eviction. 
967ea8dc4b6Seschrock */ 968ea8dc4b6Seschrock if (buf->b_hdr == NULL) { 969ea8dc4b6Seschrock mutex_exit(&arc_eviction_mtx); 970*44eda4d7Smaybee bytes_evicted -= ab->b_size; 971*44eda4d7Smaybee if (recycle) 972*44eda4d7Smaybee steal = NULL; 973*44eda4d7Smaybee if (!have_lock) 974*44eda4d7Smaybee mutex_exit(hash_lock); 975*44eda4d7Smaybee goto derailed; 976ea8dc4b6Seschrock } 977*44eda4d7Smaybee arc_buf_destroy(buf, recycle, FALSE); 978ea8dc4b6Seschrock ab->b_buf = buf->b_next; 979ea8dc4b6Seschrock buf->b_next = arc_eviction_list; 980ea8dc4b6Seschrock arc_eviction_list = buf; 981ea8dc4b6Seschrock mutex_exit(&arc_eviction_mtx); 982ea8dc4b6Seschrock } else { 983*44eda4d7Smaybee arc_buf_destroy(buf, recycle, TRUE); 984ea8dc4b6Seschrock } 985ea8dc4b6Seschrock } 986ea8dc4b6Seschrock ASSERT(ab->b_datacnt == 0); 987fa9e4066Sahrens arc_change_state(evicted_state, ab, hash_lock); 988ea8dc4b6Seschrock ASSERT(HDR_IN_HASH_TABLE(ab)); 989ea8dc4b6Seschrock ab->b_flags = ARC_IN_HASH_TABLE; 990fa9e4066Sahrens DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, ab); 991*44eda4d7Smaybee if (!have_lock) 992*44eda4d7Smaybee mutex_exit(hash_lock); 993ea8dc4b6Seschrock if (bytes >= 0 && bytes_evicted >= bytes) 994fa9e4066Sahrens break; 995fa9e4066Sahrens } else { 996*44eda4d7Smaybee missed += 1; 997fa9e4066Sahrens } 998*44eda4d7Smaybee derailed: 999*44eda4d7Smaybee /* null statement */; 1000fa9e4066Sahrens } 1001fa9e4066Sahrens mutex_exit(&evicted_state->mtx); 1002fa9e4066Sahrens mutex_exit(&state->mtx); 1003fa9e4066Sahrens 1004fa9e4066Sahrens if (bytes_evicted < bytes) 1005fa9e4066Sahrens dprintf("only evicted %lld bytes from %x", 1006fa9e4066Sahrens (longlong_t)bytes_evicted, state); 1007fa9e4066Sahrens 1008*44eda4d7Smaybee if (skipped) 1009*44eda4d7Smaybee atomic_add_64(&arc.evict_skip, skipped); 1010*44eda4d7Smaybee if (missed) 1011*44eda4d7Smaybee atomic_add_64(&arc.mutex_miss, missed); 1012*44eda4d7Smaybee return (steal); 1013fa9e4066Sahrens } 1014fa9e4066Sahrens 1015fa9e4066Sahrens /* 
 * Remove buffers from list until we've removed the specified number of
 * bytes.  Destroy the buffers that are removed.
 */
static void
arc_evict_ghost(arc_state_t *state, int64_t bytes)
{
	arc_buf_hdr_t *ab, *ab_prev;
	kmutex_t *hash_lock;
	uint64_t bytes_deleted = 0;
	uint_t bufs_skipped = 0;

	ASSERT(GHOST_STATE(state));
top:
	mutex_enter(&state->mtx);
	for (ab = list_tail(&state->list); ab; ab = ab_prev) {
		ab_prev = list_prev(&state->list, ab);
		hash_lock = HDR_LOCK(ab);
		if (mutex_tryenter(hash_lock)) {
			/* ghost headers carry no data, only accounting */
			ASSERT(!HDR_IO_IN_PROGRESS(ab));
			ASSERT(ab->b_buf == NULL);
			arc_change_state(arc.anon, ab, hash_lock);
			mutex_exit(hash_lock);
			atomic_add_64(&arc.deleted, 1);
			bytes_deleted += ab->b_size;
			arc_hdr_destroy(ab);
			DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, ab);
			if (bytes >= 0 && bytes_deleted >= bytes)
				break;
		} else {
			if (bytes < 0) {
				/*
				 * A complete flush (bytes == -1) must not
				 * skip anything: drop the list lock, wait
				 * for the contended hash lock to become
				 * free, and restart the scan.
				 */
				mutex_exit(&state->mtx);
				mutex_enter(hash_lock);
				mutex_exit(hash_lock);
				goto top;
			}
			bufs_skipped += 1;
		}
	}
	mutex_exit(&state->mtx);

	if (bufs_skipped) {
		atomic_add_64(&arc.mutex_miss, bufs_skipped);
		ASSERT(bytes >= 0);
	}

	if (bytes_deleted < bytes)
		dprintf("only deleted %lld bytes from %p",
		    (longlong_t)bytes_deleted, state);
}

/*
 * Shrink the resident lists back toward their targets: keep
 * anon + mru within arc.p, mru + mru_ghost within arc.c, total
 * arc.size within arc.c, and the ghost lists within the overall
 * 2*arc.c bound.
 */
static void
arc_adjust(void)
{
	int64_t top_sz, mru_over, arc_over;

	top_sz = arc.anon->size + arc.mru->size;

	if (top_sz > arc.p && arc.mru->lsize > 0) {
		int64_t toevict = MIN(arc.mru->lsize, top_sz-arc.p);
		(void) arc_evict(arc.mru, toevict, FALSE);
		top_sz = arc.anon->size + arc.mru->size;
	}

	mru_over = top_sz + arc.mru_ghost->size - arc.c;

	if (mru_over > 0) {
		if (arc.mru_ghost->lsize > 0) {
			int64_t todelete = MIN(arc.mru_ghost->lsize, mru_over);
			arc_evict_ghost(arc.mru_ghost, todelete);
		}
	}

	if ((arc_over = arc.size - arc.c) > 0) {
		int64_t tbl_over;

		if (arc.mfu->lsize > 0) {
			int64_t toevict = MIN(arc.mfu->lsize, arc_over);
			(void) arc_evict(arc.mfu, toevict, FALSE);
		}

		/* total footprint (resident + ghosts) may not exceed 2c */
		tbl_over = arc.size + arc.mru_ghost->lsize +
		    arc.mfu_ghost->lsize - arc.c*2;

		if (tbl_over > 0 && arc.mfu_ghost->lsize > 0) {
			int64_t todelete = MIN(arc.mfu_ghost->lsize, tbl_over);
			arc_evict_ghost(arc.mfu_ghost, todelete);
		}
	}
}

/*
 * Invoke the registered eviction callback (b_efunc) for every buffer
 * queued on arc_eviction_list, then free the arc_buf_t.  The list
 * mutex is dropped around each callback invocation.
 */
static void
arc_do_user_evicts(void)
{
	mutex_enter(&arc_eviction_mtx);
	while (arc_eviction_list != NULL) {
		arc_buf_t *buf = arc_eviction_list;
		arc_eviction_list = buf->b_next;
		buf->b_hdr = NULL;
		mutex_exit(&arc_eviction_mtx);

		if (buf->b_efunc != NULL)
			VERIFY(buf->b_efunc(buf) == 0);

		buf->b_efunc = NULL;
		buf->b_private = NULL;
		kmem_cache_free(buf_cache, buf);
		mutex_enter(&arc_eviction_mtx);
	}
	mutex_exit(&arc_eviction_mtx);
}

/*
 * Flush all *evictable* data from the cache.
 * NOTE: this will not touch "active" (i.e. referenced) data.
 */
void
arc_flush(void)
{
	while (list_head(&arc.mru->list))
		(void) arc_evict(arc.mru, -1, FALSE);
	while (list_head(&arc.mfu->list))
		(void) arc_evict(arc.mfu, -1, FALSE);

	arc_evict_ghost(arc.mru_ghost, -1);
	arc_evict_ghost(arc.mfu_ghost, -1);

	mutex_enter(&arc_reclaim_thr_lock);
	arc_do_user_evicts();
	mutex_exit(&arc_reclaim_thr_lock);
	ASSERT(arc_eviction_list == NULL);
}

int arc_kmem_reclaim_shift = 5;		/* log2(fraction of arc to reclaim) */

/*
 * Shrink the target cache size arc.c (and target MRU size arc.p) by
 * 1/2^arc_kmem_reclaim_shift — or by at least `needfree' pages in the
 * kernel — and then evict down to the new targets via arc_adjust().
 */
void
arc_kmem_reclaim(void)
{
	uint64_t to_free;

	/*
	 * We need arc_reclaim_lock because we don't want multiple
	 * threads trying to reclaim concurrently.
	 */

	/*
	 * umem calls the reclaim func when we destroy the buf cache,
	 * which is after we do arc_fini().  So we set a flag to prevent
	 * accessing the destroyed mutexes and lists.
	 */
	if (arc_dead)
		return;

	/* already at the floor; nothing to give back */
	if (arc.c <= arc.c_min)
		return;

	mutex_enter(&arc_reclaim_lock);

#ifdef _KERNEL
	to_free = MAX(arc.c >> arc_kmem_reclaim_shift, ptob(needfree));
#else
	to_free = arc.c >> arc_kmem_reclaim_shift;
#endif
	if (arc.c > to_free)
		atomic_add_64(&arc.c, -to_free);
	else
		arc.c = arc.c_min;

	atomic_add_64(&arc.p, -(arc.p >> arc_kmem_reclaim_shift));
	/* clamp c to [c_min, current size] and keep p <= c */
	if (arc.c > arc.size)
		arc.c = arc.size;
	if (arc.c < arc.c_min)
		arc.c = arc.c_min;
	if (arc.p > arc.c)
		arc.p = (arc.c >> 1);
	ASSERT((int64_t)arc.p >= 0);

	arc_adjust();

	mutex_exit(&arc_reclaim_lock);
}

/*
 * Return nonzero if the system is short on memory and the cache
 * should shrink rather than grow.  (In userland builds this instead
 * fires pseudo-randomly about 1% of the time, presumably to exercise
 * the reclaim path in testing.)
 */
static int
arc_reclaim_needed(void)
{
	uint64_t extra;

#ifdef _KERNEL

	if (needfree)
		return (1);

	/*
	 * take 'desfree' extra pages, so we reclaim sooner, rather than later
	 */
	extra = desfree;

	/*
	 * check that we're out of range of the pageout scanner.  It starts to
	 * schedule paging if freemem is less than lotsfree and needfree.
	 * lotsfree is the high-water mark for pageout, and needfree is the
	 * number of needed free pages.  We add extra pages here to make sure
	 * the scanner doesn't start up while we're freeing memory.
	 */
	if (freemem < lotsfree + needfree + extra)
		return (1);

	/*
	 * check to make sure that swapfs has enough space so that anon
	 * reservations can still succeed. anon_resvmem() checks that the
	 * availrmem is greater than swapfs_minfree, and the number of reserved
	 * swap pages.  We also add a bit of extra here just to prevent
	 * circumstances from getting really dire.
	 */
	if (availrmem < swapfs_minfree + swapfs_reserve + extra)
		return (1);

#if defined(__i386)
	/*
	 * If we're on an i386 platform, it's possible that we'll exhaust the
	 * kernel heap space before we ever run out of available physical
	 * memory.  Most checks of the size of the heap_area compare against
	 * tune.t_minarmem, which is the minimum available real memory that we
	 * can have in the system.  However, this is generally fixed at 25 pages
	 * which is so low that it's useless.  In this comparison, we seek to
	 * calculate the total heap-size, and reclaim if more than 3/4ths of the
	 * heap is allocated.  (Or, in the calculation, if less than 1/4th is
	 * free)
	 */
	if (btop(vmem_size(heap_arena, VMEM_FREE)) <
	    (btop(vmem_size(heap_arena, VMEM_FREE | VMEM_ALLOC)) >> 2))
		return (1);
#endif

#else
	if (spa_get_random(100) == 0)
		return (1);
#endif
	return (0);
}

/*
 * Give memory back to the system: purge some DNLC entries, shrink the
 * cache targets when the strategy is aggressive, and reap the free
 * magazines of the zio, buf and hdr kmem caches.
 */
static void
arc_kmem_reap_now(arc_reclaim_strategy_t strat)
{
	size_t i;
	kmem_cache_t *prev_cache = NULL;
	extern kmem_cache_t *zio_buf_cache[];

#ifdef _KERNEL
	/*
	 * First purge some DNLC entries, in case the DNLC is using
	 * up too much memory.
	 */
	dnlc_reduce_cache((void *)(uintptr_t)arc_reduce_dnlc_percent);

#if defined(__i386)
	/*
	 * Reclaim unused memory from all kmem caches.
	 */
	kmem_reap();
#endif
#endif

	/*
	 * An aggressive reclamation will shrink the cache size as well as
	 * reap free buffers from the arc kmem caches.
	 */
	if (strat == ARC_RECLAIM_AGGR)
		arc_kmem_reclaim();

	/* zio_buf_cache[] may map several sizes to one cache; skip repeats */
	for (i = 0; i < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; i++) {
		if (zio_buf_cache[i] != prev_cache) {
			prev_cache = zio_buf_cache[i];
			kmem_cache_reap_now(zio_buf_cache[i]);
		}
	}
	kmem_cache_reap_now(buf_cache);
	kmem_cache_reap_now(hdr_cache);
}

/*
 * ARC reclaim thread.  Loops until arc_thread_exit is set: when
 * memory is tight it disables cache growth and reaps, alternating
 * between conservative and aggressive strategies on consecutive
 * passes; growth is re-enabled arc_grow_retry seconds after the last
 * reclaim.  Also drains the user-eviction callback list each pass.
 */
static void
arc_reclaim_thread(void)
{
	clock_t			growtime = 0;
	arc_reclaim_strategy_t	last_reclaim = ARC_RECLAIM_CONS;
	callb_cpr_t		cpr;

	CALLB_CPR_INIT(&cpr, &arc_reclaim_thr_lock, callb_generic_cpr, FTAG);

	mutex_enter(&arc_reclaim_thr_lock);
	while (arc_thread_exit == 0) {
		if (arc_reclaim_needed()) {

			if (arc.no_grow) {
				/* alternate strategies on repeat passes */
				if (last_reclaim == ARC_RECLAIM_CONS) {
					last_reclaim = ARC_RECLAIM_AGGR;
				} else {
					last_reclaim = ARC_RECLAIM_CONS;
				}
			} else {
				arc.no_grow = TRUE;
				last_reclaim = ARC_RECLAIM_AGGR;
				membar_producer();
			}

			/* reset the growth delay for every reclaim */
			growtime = lbolt + (arc_grow_retry * hz);

			arc_kmem_reap_now(last_reclaim);

		} else if ((growtime > 0) && ((growtime - lbolt) <= 0)) {
			arc.no_grow = FALSE;
		}

		if (arc_eviction_list != NULL)
			arc_do_user_evicts();

		/* block until needed, or one second, whichever is shorter */
		CALLB_CPR_SAFE_BEGIN(&cpr);
		(void) cv_timedwait(&arc_reclaim_thr_cv,
		    &arc_reclaim_thr_lock, (lbolt + hz));
		CALLB_CPR_SAFE_END(&cpr, &arc_reclaim_thr_lock);
	}

	arc_thread_exit = 0;
	cv_broadcast(&arc_reclaim_thr_cv);
	CALLB_CPR_EXIT(&cpr);		/* drops arc_reclaim_thr_lock */
	thread_exit();
}

/*
 * Adapt arc info given the number of bytes we are trying to add and
 * the state that we are coming from.  This function is only called
 * when we are adding new content to the cache.
 */
static void
arc_adapt(int bytes, arc_state_t *state)
{
	int mult;

	ASSERT(bytes > 0);
	/*
	 * Adapt the target size of the MRU list:
	 *	- if we just hit in the MRU ghost list, then increase
	 *	  the target size of the MRU list.
	 *	- if we just hit in the MFU ghost list, then increase
	 *	  the target size of the MFU list by decreasing the
	 *	  target size of the MRU list.
	 * The multiplier makes p move faster when the ghost list we
	 * hit in is the smaller of the two.
	 */
	if (state == arc.mru_ghost) {
		mult = ((arc.mru_ghost->size >= arc.mfu_ghost->size) ?
		    1 : (arc.mfu_ghost->size/arc.mru_ghost->size));

		arc.p = MIN(arc.c, arc.p + bytes * mult);
	} else if (state == arc.mfu_ghost) {
		mult = ((arc.mfu_ghost->size >= arc.mru_ghost->size) ?
		    1 : (arc.mru_ghost->size/arc.mfu_ghost->size));

		arc.p = MAX(0, (int64_t)arc.p - bytes * mult);
	}
	ASSERT((int64_t)arc.p >= 0);

	if (arc_reclaim_needed()) {
		cv_signal(&arc_reclaim_thr_cv);
		return;
	}

	if (arc.no_grow)
		return;

	if (arc.c >= arc.c_max)
		return;

	/*
	 * If we're within (2 * maxblocksize) bytes of the target
	 * cache size, increment the target cache size
	 */
	if (arc.size > arc.c - (2ULL << SPA_MAXBLOCKSHIFT)) {
		atomic_add_64(&arc.c, (int64_t)bytes);
		if (arc.c > arc.c_max)
			arc.c = arc.c_max;
		else if (state == arc.anon)
			atomic_add_64(&arc.p, (int64_t)bytes);
		if (arc.p > arc.c)
			arc.p = arc.c;
	}
	ASSERT((int64_t)arc.p >= 0);
}

/*
 * Check if the cache has reached its limits and eviction is required
 * prior to insert.
 */
static int
arc_evict_needed()
{
	/* a memory-pressure reclaim also forces eviction before insert */
	if (arc_reclaim_needed())
		return (1);

	return (arc.size > arc.c);
}

/*
 * The buffer, supplied as the first argument, needs a data block.
 * So, if we are at cache max, determine which cache should be victimized.
 * We have the following cases:
 *
 * 1. Insert for MRU, p > sizeof(arc.anon + arc.mru) ->
 * In this situation if we're out of space, but the resident size of the MFU is
 * under the limit, victimize the MFU cache to satisfy this insertion request.
 *
 * 2. Insert for MRU, p <= sizeof(arc.anon + arc.mru) ->
 * Here, we've used up all of the available space for the MRU, so we need to
 * evict from our own cache instead.  Evict from the set of resident MRU
 * entries.
 *
 * 3. Insert for MFU (c - p) > sizeof(arc.mfu) ->
 * c minus p represents the MFU space in the cache, since p is the size of the
 * cache that is dedicated to the MRU.  In this situation there's still space on
 * the MFU side, so the MRU side needs to be victimized.
 *
 * 4. Insert for MFU (c - p) < sizeof(arc.mfu) ->
 * MFU's resident set is consuming more space than it has been allotted.  In
 * this situation, we must victimize our own cache, the MFU, for this insertion.
 */
static void
arc_get_data_buf(arc_buf_t *buf)
{
	arc_state_t *state = buf->b_hdr->b_state;
	uint64_t size = buf->b_hdr->b_size;

	/* let the targets (arc.c / arc.p) react to this insertion first */
	arc_adapt(size, state);

	/*
	 * We have not yet reached cache maximum size,
	 * just allocate a new buffer.
	 */
	if (!arc_evict_needed()) {
		buf->b_data = zio_buf_alloc(size);
		atomic_add_64(&arc.size, size);
		goto out;
	}

	/*
	 * If we are prefetching from the mfu ghost list, this buffer
	 * will end up on the mru list; so steal space from there.
	 */
	if (state == arc.mfu_ghost)
		state = buf->b_hdr->b_flags & ARC_PREFETCH ? arc.mru : arc.mfu;
	else if (state == arc.mru_ghost)
		state = arc.mru;

	/* pick the victim list per the four cases documented above */
	if (state == arc.mru || state == arc.anon) {
		uint64_t mru_used = arc.anon->size + arc.mru->size;
		state = (arc.p > mru_used) ? arc.mfu : arc.mru;
	} else {
		/* MFU cases */
		uint64_t mfu_space = arc.c - arc.p;
		state = (mfu_space > arc.mfu->size) ? arc.mru : arc.mfu;
	}

	/*
	 * First try to recycle an evictable data block of exactly the
	 * right size; failing that, evict `size' bytes and allocate a
	 * fresh block.
	 */
	if ((buf->b_data = arc_evict(state, size, TRUE)) == NULL) {
		(void) arc_evict(state, size, FALSE);
		buf->b_data = zio_buf_alloc(size);
		atomic_add_64(&arc.size, size);
		atomic_add_64(&arc.recycle_miss, 1);
		if (arc.size > arc.c)
			arc_adjust();
	}
	ASSERT(buf->b_data != NULL);
out:
	/*
	 * Update the state size.  Note that ghost states have a
	 * "ghost size" and so don't need to be updated.
	 */
	if (!GHOST_STATE(buf->b_hdr->b_state)) {
		arc_buf_hdr_t *hdr = buf->b_hdr;

		atomic_add_64(&hdr->b_state->size, size);
		if (list_link_active(&hdr->b_arc_node)) {
			ASSERT(refcount_is_zero(&hdr->b_refcnt));
			atomic_add_64(&hdr->b_state->lsize, size);
		}
	}
}

/*
 * This routine is called whenever a buffer is accessed.
 * NOTE: the hash lock is dropped in this function.
 */
static void
arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock)
{
	ASSERT(MUTEX_HELD(hash_lock));

	if (buf->b_state == arc.anon) {
		/*
		 * This buffer is not in the cache, and does not
		 * appear in our "ghost" list.  Add the new buffer
		 * to the MRU state.
1514fa9e4066Sahrens */ 1515fa9e4066Sahrens 1516fa9e4066Sahrens ASSERT(buf->b_arc_access == 0); 1517fa9e4066Sahrens buf->b_arc_access = lbolt; 1518ea8dc4b6Seschrock DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, buf); 1519ea8dc4b6Seschrock arc_change_state(arc.mru, buf, hash_lock); 1520fa9e4066Sahrens 1521ea8dc4b6Seschrock } else if (buf->b_state == arc.mru) { 1522fa9e4066Sahrens /* 152313506d1eSmaybee * If this buffer is here because of a prefetch, then either: 152413506d1eSmaybee * - clear the flag if this is a "referencing" read 152513506d1eSmaybee * (any subsequent access will bump this into the MFU state). 152613506d1eSmaybee * or 152713506d1eSmaybee * - move the buffer to the head of the list if this is 152813506d1eSmaybee * another prefetch (to make it less likely to be evicted). 1529fa9e4066Sahrens */ 1530fa9e4066Sahrens if ((buf->b_flags & ARC_PREFETCH) != 0) { 153113506d1eSmaybee if (refcount_count(&buf->b_refcnt) == 0) { 153213506d1eSmaybee ASSERT(list_link_active(&buf->b_arc_node)); 153313506d1eSmaybee mutex_enter(&arc.mru->mtx); 153413506d1eSmaybee list_remove(&arc.mru->list, buf); 153513506d1eSmaybee list_insert_head(&arc.mru->list, buf); 153613506d1eSmaybee mutex_exit(&arc.mru->mtx); 153713506d1eSmaybee } else { 153813506d1eSmaybee buf->b_flags &= ~ARC_PREFETCH; 153913506d1eSmaybee atomic_add_64(&arc.mru->hits, 1); 154013506d1eSmaybee } 154113506d1eSmaybee buf->b_arc_access = lbolt; 1542fa9e4066Sahrens return; 1543fa9e4066Sahrens } 1544fa9e4066Sahrens 1545fa9e4066Sahrens /* 1546fa9e4066Sahrens * This buffer has been "accessed" only once so far, 1547fa9e4066Sahrens * but it is still in the cache. Move it to the MFU 1548fa9e4066Sahrens * state. 1549fa9e4066Sahrens */ 1550fa9e4066Sahrens if (lbolt > buf->b_arc_access + ARC_MINTIME) { 1551fa9e4066Sahrens /* 1552fa9e4066Sahrens * More than 125ms have passed since we 1553fa9e4066Sahrens * instantiated this buffer. Move it to the 1554fa9e4066Sahrens * most frequently used state. 
1555fa9e4066Sahrens */ 1556fa9e4066Sahrens buf->b_arc_access = lbolt; 1557ea8dc4b6Seschrock DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf); 1558ea8dc4b6Seschrock arc_change_state(arc.mfu, buf, hash_lock); 1559fa9e4066Sahrens } 1560ea8dc4b6Seschrock atomic_add_64(&arc.mru->hits, 1); 1561ea8dc4b6Seschrock } else if (buf->b_state == arc.mru_ghost) { 1562fa9e4066Sahrens arc_state_t *new_state; 1563fa9e4066Sahrens /* 1564fa9e4066Sahrens * This buffer has been "accessed" recently, but 1565fa9e4066Sahrens * was evicted from the cache. Move it to the 1566fa9e4066Sahrens * MFU state. 1567fa9e4066Sahrens */ 1568fa9e4066Sahrens 1569fa9e4066Sahrens if (buf->b_flags & ARC_PREFETCH) { 1570ea8dc4b6Seschrock new_state = arc.mru; 157113506d1eSmaybee if (refcount_count(&buf->b_refcnt) > 0) 157213506d1eSmaybee buf->b_flags &= ~ARC_PREFETCH; 1573ea8dc4b6Seschrock DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, buf); 1574fa9e4066Sahrens } else { 1575ea8dc4b6Seschrock new_state = arc.mfu; 1576ea8dc4b6Seschrock DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf); 1577fa9e4066Sahrens } 1578fa9e4066Sahrens 1579fa9e4066Sahrens buf->b_arc_access = lbolt; 1580fa9e4066Sahrens arc_change_state(new_state, buf, hash_lock); 1581fa9e4066Sahrens 1582ea8dc4b6Seschrock atomic_add_64(&arc.mru_ghost->hits, 1); 1583ea8dc4b6Seschrock } else if (buf->b_state == arc.mfu) { 1584fa9e4066Sahrens /* 1585fa9e4066Sahrens * This buffer has been accessed more than once and is 1586fa9e4066Sahrens * still in the cache. Keep it in the MFU state. 1587fa9e4066Sahrens * 158813506d1eSmaybee * NOTE: an add_reference() that occurred when we did 158913506d1eSmaybee * the arc_read() will have kicked this off the list. 159013506d1eSmaybee * If it was a prefetch, we will explicitly move it to 159113506d1eSmaybee * the head of the list now. 
1592fa9e4066Sahrens */ 159313506d1eSmaybee if ((buf->b_flags & ARC_PREFETCH) != 0) { 159413506d1eSmaybee ASSERT(refcount_count(&buf->b_refcnt) == 0); 159513506d1eSmaybee ASSERT(list_link_active(&buf->b_arc_node)); 159613506d1eSmaybee mutex_enter(&arc.mfu->mtx); 159713506d1eSmaybee list_remove(&arc.mfu->list, buf); 159813506d1eSmaybee list_insert_head(&arc.mfu->list, buf); 159913506d1eSmaybee mutex_exit(&arc.mfu->mtx); 160013506d1eSmaybee } 1601ea8dc4b6Seschrock atomic_add_64(&arc.mfu->hits, 1); 160213506d1eSmaybee buf->b_arc_access = lbolt; 1603ea8dc4b6Seschrock } else if (buf->b_state == arc.mfu_ghost) { 160413506d1eSmaybee arc_state_t *new_state = arc.mfu; 1605fa9e4066Sahrens /* 1606fa9e4066Sahrens * This buffer has been accessed more than once but has 1607fa9e4066Sahrens * been evicted from the cache. Move it back to the 1608fa9e4066Sahrens * MFU state. 1609fa9e4066Sahrens */ 1610fa9e4066Sahrens 161113506d1eSmaybee if (buf->b_flags & ARC_PREFETCH) { 161213506d1eSmaybee /* 161313506d1eSmaybee * This is a prefetch access... 161413506d1eSmaybee * move this block back to the MRU state. 
161513506d1eSmaybee */ 161613506d1eSmaybee ASSERT3U(refcount_count(&buf->b_refcnt), ==, 0); 161713506d1eSmaybee new_state = arc.mru; 161813506d1eSmaybee } 161913506d1eSmaybee 1620fa9e4066Sahrens buf->b_arc_access = lbolt; 1621ea8dc4b6Seschrock DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf); 162213506d1eSmaybee arc_change_state(new_state, buf, hash_lock); 1623fa9e4066Sahrens 1624ea8dc4b6Seschrock atomic_add_64(&arc.mfu_ghost->hits, 1); 1625fa9e4066Sahrens } else { 1626fa9e4066Sahrens ASSERT(!"invalid arc state"); 1627fa9e4066Sahrens } 1628fa9e4066Sahrens } 1629fa9e4066Sahrens 1630fa9e4066Sahrens /* a generic arc_done_func_t which you can use */ 1631fa9e4066Sahrens /* ARGSUSED */ 1632fa9e4066Sahrens void 1633fa9e4066Sahrens arc_bcopy_func(zio_t *zio, arc_buf_t *buf, void *arg) 1634fa9e4066Sahrens { 1635fa9e4066Sahrens bcopy(buf->b_data, arg, buf->b_hdr->b_size); 1636ea8dc4b6Seschrock VERIFY(arc_buf_remove_ref(buf, arg) == 1); 1637fa9e4066Sahrens } 1638fa9e4066Sahrens 1639fa9e4066Sahrens /* a generic arc_done_func_t which you can use */ 1640fa9e4066Sahrens void 1641fa9e4066Sahrens arc_getbuf_func(zio_t *zio, arc_buf_t *buf, void *arg) 1642fa9e4066Sahrens { 1643fa9e4066Sahrens arc_buf_t **bufp = arg; 1644fa9e4066Sahrens if (zio && zio->io_error) { 1645ea8dc4b6Seschrock VERIFY(arc_buf_remove_ref(buf, arg) == 1); 1646fa9e4066Sahrens *bufp = NULL; 1647fa9e4066Sahrens } else { 1648fa9e4066Sahrens *bufp = buf; 1649fa9e4066Sahrens } 1650fa9e4066Sahrens } 1651fa9e4066Sahrens 1652fa9e4066Sahrens static void 1653fa9e4066Sahrens arc_read_done(zio_t *zio) 1654fa9e4066Sahrens { 1655bbf4a8dfSmaybee arc_buf_hdr_t *hdr, *found; 1656fa9e4066Sahrens arc_buf_t *buf; 1657fa9e4066Sahrens arc_buf_t *abuf; /* buffer we're assigning to callback */ 1658fa9e4066Sahrens kmutex_t *hash_lock; 1659fa9e4066Sahrens arc_callback_t *callback_list, *acb; 1660fa9e4066Sahrens int freeable = FALSE; 1661fa9e4066Sahrens 1662fa9e4066Sahrens buf = zio->io_private; 1663fa9e4066Sahrens hdr = buf->b_hdr; 
1664fa9e4066Sahrens 1665bbf4a8dfSmaybee /* 1666bbf4a8dfSmaybee * The hdr was inserted into hash-table and removed from lists 1667bbf4a8dfSmaybee * prior to starting I/O. We should find this header, since 1668bbf4a8dfSmaybee * it's in the hash table, and it should be legit since it's 1669bbf4a8dfSmaybee * not possible to evict it during the I/O. The only possible 1670bbf4a8dfSmaybee * reason for it not to be found is if we were freed during the 1671bbf4a8dfSmaybee * read. 1672bbf4a8dfSmaybee */ 1673bbf4a8dfSmaybee found = buf_hash_find(zio->io_spa, &hdr->b_dva, hdr->b_birth, 1674fa9e4066Sahrens &hash_lock); 1675fa9e4066Sahrens 1676bbf4a8dfSmaybee ASSERT((found == NULL && HDR_FREED_IN_READ(hdr) && hash_lock == NULL) || 1677bbf4a8dfSmaybee (found == hdr && DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp)))); 1678fa9e4066Sahrens 1679fa9e4066Sahrens /* byteswap if necessary */ 1680fa9e4066Sahrens callback_list = hdr->b_acb; 1681fa9e4066Sahrens ASSERT(callback_list != NULL); 1682fa9e4066Sahrens if (BP_SHOULD_BYTESWAP(zio->io_bp) && callback_list->acb_byteswap) 1683fa9e4066Sahrens callback_list->acb_byteswap(buf->b_data, hdr->b_size); 1684fa9e4066Sahrens 1685fa9e4066Sahrens /* create copies of the data buffer for the callers */ 1686fa9e4066Sahrens abuf = buf; 1687fa9e4066Sahrens for (acb = callback_list; acb; acb = acb->acb_next) { 1688fa9e4066Sahrens if (acb->acb_done) { 1689*44eda4d7Smaybee if (abuf == NULL) 1690*44eda4d7Smaybee abuf = arc_buf_clone(buf); 1691fa9e4066Sahrens acb->acb_buf = abuf; 1692fa9e4066Sahrens abuf = NULL; 1693fa9e4066Sahrens } 1694fa9e4066Sahrens } 1695fa9e4066Sahrens hdr->b_acb = NULL; 1696fa9e4066Sahrens hdr->b_flags &= ~ARC_IO_IN_PROGRESS; 1697ea8dc4b6Seschrock ASSERT(!HDR_BUF_AVAILABLE(hdr)); 1698ea8dc4b6Seschrock if (abuf == buf) 1699ea8dc4b6Seschrock hdr->b_flags |= ARC_BUF_AVAILABLE; 1700fa9e4066Sahrens 1701fa9e4066Sahrens ASSERT(refcount_is_zero(&hdr->b_refcnt) || callback_list != NULL); 1702fa9e4066Sahrens 1703fa9e4066Sahrens if 
(zio->io_error != 0) { 1704fa9e4066Sahrens hdr->b_flags |= ARC_IO_ERROR; 1705fa9e4066Sahrens if (hdr->b_state != arc.anon) 1706fa9e4066Sahrens arc_change_state(arc.anon, hdr, hash_lock); 1707ea8dc4b6Seschrock if (HDR_IN_HASH_TABLE(hdr)) 1708ea8dc4b6Seschrock buf_hash_remove(hdr); 1709fa9e4066Sahrens freeable = refcount_is_zero(&hdr->b_refcnt); 171013506d1eSmaybee /* convert checksum errors into IO errors */ 1711ea8dc4b6Seschrock if (zio->io_error == ECKSUM) 1712ea8dc4b6Seschrock zio->io_error = EIO; 1713fa9e4066Sahrens } 1714fa9e4066Sahrens 1715ea8dc4b6Seschrock /* 171613506d1eSmaybee * Broadcast before we drop the hash_lock to avoid the possibility 171713506d1eSmaybee * that the hdr (and hence the cv) might be freed before we get to 171813506d1eSmaybee * the cv_broadcast(). 1719ea8dc4b6Seschrock */ 1720ea8dc4b6Seschrock cv_broadcast(&hdr->b_cv); 1721ea8dc4b6Seschrock 1722bbf4a8dfSmaybee if (hash_lock) { 1723fa9e4066Sahrens /* 1724fa9e4066Sahrens * Only call arc_access on anonymous buffers. This is because 1725fa9e4066Sahrens * if we've issued an I/O for an evicted buffer, we've already 1726fa9e4066Sahrens * called arc_access (to prevent any simultaneous readers from 1727fa9e4066Sahrens * getting confused). 1728fa9e4066Sahrens */ 1729fa9e4066Sahrens if (zio->io_error == 0 && hdr->b_state == arc.anon) 1730*44eda4d7Smaybee arc_access(hdr, hash_lock); 1731*44eda4d7Smaybee mutex_exit(hash_lock); 1732fa9e4066Sahrens } else { 1733fa9e4066Sahrens /* 1734fa9e4066Sahrens * This block was freed while we waited for the read to 1735fa9e4066Sahrens * complete. It has been removed from the hash table and 1736fa9e4066Sahrens * moved to the anonymous state (so that it won't show up 1737fa9e4066Sahrens * in the cache). 
1738fa9e4066Sahrens */ 1739fa9e4066Sahrens ASSERT3P(hdr->b_state, ==, arc.anon); 1740fa9e4066Sahrens freeable = refcount_is_zero(&hdr->b_refcnt); 1741fa9e4066Sahrens } 1742fa9e4066Sahrens 1743fa9e4066Sahrens /* execute each callback and free its structure */ 1744fa9e4066Sahrens while ((acb = callback_list) != NULL) { 1745fa9e4066Sahrens if (acb->acb_done) 1746fa9e4066Sahrens acb->acb_done(zio, acb->acb_buf, acb->acb_private); 1747fa9e4066Sahrens 1748fa9e4066Sahrens if (acb->acb_zio_dummy != NULL) { 1749fa9e4066Sahrens acb->acb_zio_dummy->io_error = zio->io_error; 1750fa9e4066Sahrens zio_nowait(acb->acb_zio_dummy); 1751fa9e4066Sahrens } 1752fa9e4066Sahrens 1753fa9e4066Sahrens callback_list = acb->acb_next; 1754fa9e4066Sahrens kmem_free(acb, sizeof (arc_callback_t)); 1755fa9e4066Sahrens } 1756fa9e4066Sahrens 1757fa9e4066Sahrens if (freeable) 1758ea8dc4b6Seschrock arc_hdr_destroy(hdr); 1759fa9e4066Sahrens } 1760fa9e4066Sahrens 1761fa9e4066Sahrens /* 1762fa9e4066Sahrens * "Read" the block block at the specified DVA (in bp) via the 1763fa9e4066Sahrens * cache. If the block is found in the cache, invoke the provided 1764fa9e4066Sahrens * callback immediately and return. Note that the `zio' parameter 1765fa9e4066Sahrens * in the callback will be NULL in this case, since no IO was 1766fa9e4066Sahrens * required. If the block is not in the cache pass the read request 1767fa9e4066Sahrens * on to the spa with a substitute callback function, so that the 1768fa9e4066Sahrens * requested block will be added to the cache. 1769fa9e4066Sahrens * 1770fa9e4066Sahrens * If a read request arrives for a block that has a read in-progress, 1771fa9e4066Sahrens * either wait for the in-progress read to complete (and return the 1772fa9e4066Sahrens * results); or, if this is a read with a "done" func, add a record 1773fa9e4066Sahrens * to the read to invoke the "done" func when the read completes, 1774fa9e4066Sahrens * and return; or just return. 
 *
 * arc_read_done() will invoke all the requested "done" functions
 * for readers of this block.
 */
int
arc_read(zio_t *pio, spa_t *spa, blkptr_t *bp, arc_byteswap_func_t *swap,
    arc_done_func_t *done, void *private, int priority, int flags,
    uint32_t *arc_flags, zbookmark_t *zb)
{
	arc_buf_hdr_t *hdr;
	arc_buf_t *buf;
	kmutex_t *hash_lock;
	zio_t	*rzio;

top:
	hdr = buf_hash_find(spa, BP_IDENTITY(bp), bp->blk_birth, &hash_lock);
	if (hdr && hdr->b_datacnt > 0) {
		/* cache hit (possibly with a read still in flight) */

		*arc_flags |= ARC_CACHED;

		if (HDR_IO_IN_PROGRESS(hdr)) {

			if (*arc_flags & ARC_WAIT) {
				/* block until the in-flight read finishes */
				cv_wait(&hdr->b_cv, hash_lock);
				mutex_exit(hash_lock);
				goto top;
			}
			ASSERT(*arc_flags & ARC_NOWAIT);

			if (done) {
				/*
				 * Queue our callback onto the in-flight
				 * read; arc_read_done() will invoke it.
				 */
				arc_callback_t	*acb = NULL;

				acb = kmem_zalloc(sizeof (arc_callback_t),
				    KM_SLEEP);
				acb->acb_done = done;
				acb->acb_private = private;
				acb->acb_byteswap = swap;
				if (pio != NULL)
					acb->acb_zio_dummy = zio_null(pio,
					    spa, NULL, NULL, flags);

				ASSERT(acb->acb_done != NULL);
				acb->acb_next = hdr->b_acb;
				hdr->b_acb = acb;
				add_reference(hdr, hash_lock, private);
				mutex_exit(hash_lock);
				return (0);
			}
			mutex_exit(hash_lock);
			return (0);
		}

		ASSERT(hdr->b_state == arc.mru || hdr->b_state == arc.mfu);

		if (done) {
			add_reference(hdr, hash_lock, private);
			/*
			 * If this block is already in use, create a new
			 * copy of the data so that we will be guaranteed
			 * that arc_release() will always succeed.
			 */
			buf = hdr->b_buf;
			ASSERT(buf);
			ASSERT(buf->b_data);
			if (HDR_BUF_AVAILABLE(hdr)) {
				ASSERT(buf->b_efunc == NULL);
				hdr->b_flags &= ~ARC_BUF_AVAILABLE;
			} else {
				buf = arc_buf_clone(buf);
			}
		} else if (*arc_flags & ARC_PREFETCH &&
		    refcount_count(&hdr->b_refcnt) == 0) {
			hdr->b_flags |= ARC_PREFETCH;
		}
		DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
		arc_access(hdr, hash_lock);
		mutex_exit(hash_lock);
		atomic_add_64(&arc.hits, 1);
		if (done)
			done(NULL, buf, private);
	} else {
		/* cache miss: set up a new or ghost hdr and issue the read */
		uint64_t size = BP_GET_LSIZE(bp);
		arc_callback_t	*acb;

		if (hdr == NULL) {
			/* this block is not in the cache */
			arc_buf_hdr_t	*exists;

			buf = arc_buf_alloc(spa, size, private);
			hdr = buf->b_hdr;
			hdr->b_dva = *BP_IDENTITY(bp);
			hdr->b_birth = bp->blk_birth;
			hdr->b_cksum0 = bp->blk_cksum.zc_word[0];
			exists = buf_hash_insert(hdr, &hash_lock);
			if (exists) {
				/* somebody beat us to the hash insert */
				mutex_exit(hash_lock);
				bzero(&hdr->b_dva, sizeof (dva_t));
				hdr->b_birth = 0;
				hdr->b_cksum0 = 0;
				(void) arc_buf_remove_ref(buf, private);
				goto top; /* restart the IO request */
			}
			/* if this is a prefetch, we don't have a reference */
			if (*arc_flags & ARC_PREFETCH) {
				(void) remove_reference(hdr, hash_lock,
				    private);
				hdr->b_flags |= ARC_PREFETCH;
			}
			if (BP_GET_LEVEL(bp) > 0)
				hdr->b_flags |= ARC_INDIRECT;
		} else {
			/* this block is in the ghost cache */
			ASSERT(GHOST_STATE(hdr->b_state));
			ASSERT(!HDR_IO_IN_PROGRESS(hdr));
			ASSERT3U(refcount_count(&hdr->b_refcnt), ==, 0);
			ASSERT(hdr->b_buf == NULL);

			/* if this is a prefetch, we don't have a reference */
			if (*arc_flags & ARC_PREFETCH)
				hdr->b_flags |= ARC_PREFETCH;
			else
				add_reference(hdr, hash_lock, private);
			buf = kmem_cache_alloc(buf_cache, KM_SLEEP);
			buf->b_hdr = hdr;
			buf->b_data = NULL;
			buf->b_efunc = NULL;
			buf->b_private = NULL;
			buf->b_next = NULL;
			hdr->b_buf = buf;
			arc_get_data_buf(buf);
			ASSERT(hdr->b_datacnt == 0);
			hdr->b_datacnt = 1;

		}

		acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP);
		acb->acb_done = done;
		acb->acb_private = private;
		acb->acb_byteswap = swap;

		ASSERT(hdr->b_acb == NULL);
		hdr->b_acb = acb;
		hdr->b_flags |= ARC_IO_IN_PROGRESS;

		/*
		 * If the buffer has been evicted, migrate it to a present state
		 * before issuing the I/O.  Once we drop the hash-table lock,
		 * the header will be marked as I/O in progress and have an
		 * attached buffer.  At this point, anybody who finds this
		 * buffer ought to notice that it's legit but has a pending I/O.
		 */

		if (GHOST_STATE(hdr->b_state))
			arc_access(hdr, hash_lock);
		mutex_exit(hash_lock);

		ASSERT3U(hdr->b_size, ==, size);
		DTRACE_PROBE3(arc__miss, blkptr_t *, bp, uint64_t, size,
		    zbookmark_t *, zb);
		atomic_add_64(&arc.misses, 1);

		rzio = zio_read(pio, spa, bp, buf->b_data, size,
		    arc_read_done, buf, priority, flags, zb);

		if (*arc_flags & ARC_WAIT)
			return (zio_wait(rzio));

		ASSERT(*arc_flags & ARC_NOWAIT);
		zio_nowait(rzio);
	}
	return (0);
}

/*
 * arc_read() variant to support pool traversal.  If the block is already
 * in the ARC, make a copy of it; otherwise, the caller will do the I/O.
 * The idea is that we don't want pool traversal filling up memory, but
 * if the ARC already has the data anyway, we shouldn't pay for the I/O.
 *
 * Returns 0 and copies the data into `data' on a usable cache hit;
 * returns ENOENT otherwise (miss, or read still in progress).
 */
int
arc_tryread(spa_t *spa, blkptr_t *bp, void *data)
{
	arc_buf_hdr_t *hdr;
	kmutex_t *hash_mtx;
	int rc = 0;

	hdr = buf_hash_find(spa, BP_IDENTITY(bp), bp->blk_birth, &hash_mtx);

	if (hdr && hdr->b_datacnt > 0 && !HDR_IO_IN_PROGRESS(hdr)) {
		arc_buf_t *buf = hdr->b_buf;

		ASSERT(buf);
		/* skip bufs whose data has been handed off/evicted */
		while (buf->b_data == NULL) {
			buf = buf->b_next;
			ASSERT(buf);
		}
		bcopy(buf->b_data, data, hdr->b_size);
	} else {
		rc = ENOENT;
	}

	if (hash_mtx)
		mutex_exit(hash_mtx);

	return (rc);
}

/*
 * Register an eviction callback (and its private argument) on an arc
 * buf; the func is invoked when the ARC wants the buffer back.  The
 * buffer must be referenced (or func must be NULL) and not anonymous.
 */
void
arc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *private)
{
	ASSERT(buf->b_hdr != NULL);
	ASSERT(buf->b_hdr->b_state != arc.anon);
	ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt) || func == NULL);
	buf->b_efunc = func;
	buf->b_private = private;
}

/*
 * This is used by the DMU to let the ARC know that a buffer is
 * being evicted, so the ARC should clean up.  If this arc buf
 * is not yet in the evicted state, it will be put there.
 */
int
arc_buf_evict(arc_buf_t *buf)
{
	arc_buf_hdr_t *hdr;
	kmutex_t *hash_lock;
	arc_buf_t **bufp;

	mutex_enter(&arc_eviction_mtx);
	hdr = buf->b_hdr;
	if (hdr == NULL) {
		/*
		 * We are in arc_do_user_evicts().
		 * NOTE: We can't be in arc_buf_add_ref() because
		 * that would violate the interface rules.
		 */
		ASSERT(buf->b_data == NULL);
		mutex_exit(&arc_eviction_mtx);
		return (0);
	} else if (buf->b_data == NULL) {
		arc_buf_t copy = *buf; /* structure assignment */
		/*
		 * We are on the eviction list.  Process this buffer
		 * now but let arc_do_user_evicts() do the reaping.
		 */
		buf->b_efunc = NULL;
		buf->b_hdr = NULL;
		mutex_exit(&arc_eviction_mtx);
		/*
		 * NOTE(review): the source dump had the HTML entity
		 * "&copy;" here; the intended code is &copy (address of
		 * the local copy), restored below.
		 */
		VERIFY(copy.b_efunc(&copy) == 0);
		return (1);
	} else {
		/*
		 * Prevent a race with arc_evict()
		 */
		ASSERT3U(refcount_count(&hdr->b_refcnt), <, hdr->b_datacnt);
		buf->b_hdr = NULL;
	}
	mutex_exit(&arc_eviction_mtx);

	hash_lock = HDR_LOCK(hdr);
	mutex_enter(hash_lock);

	ASSERT(hdr->b_state == arc.mru || hdr->b_state == arc.mfu);

	/*
	 * Pull this buffer off of the hdr
	 */
	bufp = &hdr->b_buf;
	while (*bufp != buf)
		bufp = &(*bufp)->b_next;
	*bufp = buf->b_next;

	ASSERT(buf->b_data != NULL);
	buf->b_hdr = hdr;
	arc_buf_destroy(buf, FALSE, FALSE);

	if (hdr->b_datacnt == 0) {
		/* last buf on this hdr: demote the hdr to a ghost state */
		arc_state_t *old_state = hdr->b_state;
		arc_state_t *evicted_state;

		ASSERT(refcount_is_zero(&hdr->b_refcnt));

		evicted_state =
		    (old_state == arc.mru) ? arc.mru_ghost : arc.mfu_ghost;

		mutex_enter(&old_state->mtx);
		mutex_enter(&evicted_state->mtx);

		arc_change_state(evicted_state, hdr, hash_lock);
		ASSERT(HDR_IN_HASH_TABLE(hdr));
		hdr->b_flags = ARC_IN_HASH_TABLE;

		mutex_exit(&evicted_state->mtx);
		mutex_exit(&old_state->mtx);
	}
	mutex_exit(hash_lock);

	VERIFY(buf->b_efunc(buf) == 0);
	buf->b_efunc = NULL;
	buf->b_private = NULL;
	buf->b_hdr = NULL;
	kmem_cache_free(buf_cache, buf);
	return (1);
}

/*
 * Release this buffer from the cache.  This must be done
 * after a read and prior to modifying the buffer contents.
 * If the buffer has more than one reference, we must
 * make a new hdr for the buffer.
 */
void
arc_release(arc_buf_t *buf, void *tag)
{
	arc_buf_hdr_t *hdr = buf->b_hdr;
	kmutex_t *hash_lock = HDR_LOCK(hdr);

	/* this buffer is not on any list */
	ASSERT(refcount_count(&hdr->b_refcnt) > 0);

	if (hdr->b_state == arc.anon) {
		/* this buffer is already released */
		ASSERT3U(refcount_count(&hdr->b_refcnt), ==, 1);
		ASSERT(BUF_EMPTY(hdr));
		ASSERT(buf->b_efunc == NULL);
		return;
	}

	mutex_enter(hash_lock);

	/*
	 * Do we have more than one buf?
	 */
	if (hdr->b_buf != buf || buf->b_next != NULL) {
		arc_buf_hdr_t *nhdr;
		arc_buf_t **bufp;
		uint64_t blksz = hdr->b_size;
		spa_t *spa = hdr->b_spa;

		ASSERT(hdr->b_datacnt > 1);
		/*
		 * Pull the data off of this buf and attach it to
		 * a new anonymous buf.
		 */
		(void) remove_reference(hdr, hash_lock, tag);
		bufp = &hdr->b_buf;
		while (*bufp != buf)
			bufp = &(*bufp)->b_next;
		*bufp = (*bufp)->b_next;

		/* charge the released data to the anonymous state instead */
		ASSERT3U(hdr->b_state->size, >=, hdr->b_size);
		atomic_add_64(&hdr->b_state->size, -hdr->b_size);
		if (refcount_is_zero(&hdr->b_refcnt)) {
			ASSERT3U(hdr->b_state->lsize, >=, hdr->b_size);
			atomic_add_64(&hdr->b_state->lsize, -hdr->b_size);
		}
		hdr->b_datacnt -= 1;

		mutex_exit(hash_lock);

		nhdr = kmem_cache_alloc(hdr_cache, KM_SLEEP);
		nhdr->b_size = blksz;
		nhdr->b_spa = spa;
		nhdr->b_buf = buf;
		nhdr->b_state = arc.anon;
		nhdr->b_arc_access = 0;
		nhdr->b_flags = 0;
		nhdr->b_datacnt = 1;
		buf->b_hdr = nhdr;
		buf->b_next = NULL;
		(void) refcount_add(&nhdr->b_refcnt, tag);
		atomic_add_64(&arc.anon->size, blksz);

		hdr = nhdr;
	} else {
		/* sole buf: just move the whole hdr to the anon state */
		ASSERT(refcount_count(&hdr->b_refcnt) == 1);
		ASSERT(!list_link_active(&hdr->b_arc_node));
		ASSERT(!HDR_IO_IN_PROGRESS(hdr));
		arc_change_state(arc.anon, hdr, hash_lock);
		hdr->b_arc_access = 0;
		mutex_exit(hash_lock);
		bzero(&hdr->b_dva, sizeof (dva_t));
		hdr->b_birth = 0;
		hdr->b_cksum0 = 0;
	}
	buf->b_efunc = NULL;
	buf->b_private = NULL;
}

/* Return non-zero if this buf has been released to the anonymous state. */
int
arc_released(arc_buf_t *buf)
{
	return (buf->b_data != NULL && buf->b_hdr->b_state == arc.anon);
}

/* Return non-zero if an eviction callback is registered on this buf. */
int
arc_has_callback(arc_buf_t *buf)
{
	return (buf->b_efunc != NULL);
}

#ifdef ZFS_DEBUG
/* Debug helper: current reference count on the buf's header. */
int
arc_referenced(arc_buf_t *buf)
{
	return (refcount_count(&buf->b_hdr->b_refcnt));
}
#endif

/*
 * zio completion callback for ARC writes: record the block's on-disk
 * identity in the hdr and insert it into the hash table (handling the
 * sync-to-convergence overwrite case), or keep/destroy it as an
 * anonymous buffer if nothing was written; then fire the user callback.
 */
static void
arc_write_done(zio_t *zio)
{
	arc_buf_t *buf;
	arc_buf_hdr_t *hdr;
	arc_callback_t *acb;

	buf = zio->io_private;
	hdr = buf->b_hdr;
	acb = hdr->b_acb;
	hdr->b_acb = NULL;
	ASSERT(acb != NULL);

	/* this buffer is on no lists and is not in the hash table */
	ASSERT3P(hdr->b_state, ==, arc.anon);

	hdr->b_dva = *BP_IDENTITY(zio->io_bp);
	hdr->b_birth = zio->io_bp->blk_birth;
	hdr->b_cksum0 = zio->io_bp->blk_cksum.zc_word[0];
	/*
	 * If the block to be written was all-zero, we may have
	 * compressed it away.  In this case no write was performed
	 * so there will be no dva/birth-date/checksum.  The buffer
	 * must therefore remain anonymous (and uncached).
	 */
	if (!BUF_EMPTY(hdr)) {
		arc_buf_hdr_t *exists;
		kmutex_t *hash_lock;

		exists = buf_hash_insert(hdr, &hash_lock);
		if (exists) {
			/*
			 * This can only happen if we overwrite for
			 * sync-to-convergence, because we remove
			 * buffers from the hash table when we arc_free().
			 */
			ASSERT(DVA_EQUAL(BP_IDENTITY(&zio->io_bp_orig),
			    BP_IDENTITY(zio->io_bp)));
			ASSERT3U(zio->io_bp_orig.blk_birth, ==,
			    zio->io_bp->blk_birth);

			ASSERT(refcount_is_zero(&exists->b_refcnt));
			arc_change_state(arc.anon, exists, hash_lock);
			mutex_exit(hash_lock);
			arc_hdr_destroy(exists);
			exists = buf_hash_insert(hdr, &hash_lock);
			ASSERT3P(exists, ==, NULL);
		}
		hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
		arc_access(hdr, hash_lock);
		mutex_exit(hash_lock);
	} else if (acb->acb_done == NULL) {
		int destroy_hdr;
		/*
		 * This is an anonymous buffer with no user callback,
		 * destroy it if there are no active references.
		 */
		mutex_enter(&arc_eviction_mtx);
		destroy_hdr = refcount_is_zero(&hdr->b_refcnt);
		hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
		mutex_exit(&arc_eviction_mtx);
		if (destroy_hdr)
			arc_hdr_destroy(hdr);
	} else {
		hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
	}

	if (acb->acb_done) {
		ASSERT(!refcount_is_zero(&hdr->b_refcnt));
		acb->acb_done(zio, buf, acb->acb_private);
	}

	kmem_free(acb, sizeof (arc_callback_t));
}

int
arc_write(zio_t *pio, spa_t *spa, int checksum, int compress, int ncopies,
    uint64_t txg, blkptr_t *bp, arc_buf_t *buf,
    arc_done_func_t *done, void *private, int priority, int flags,
    uint32_t arc_flags, zbookmark_t *zb)
{
	arc_buf_hdr_t *hdr = buf->b_hdr;
	arc_callback_t	*acb;
	zio_t	*rzio;

	/* this is a private buffer - no locking required */
	ASSERT3P(hdr->b_state, ==, arc.anon);
	ASSERT(BUF_EMPTY(hdr));
	ASSERT(!HDR_IO_ERROR(hdr));
	ASSERT((hdr->b_flags & ARC_IO_IN_PROGRESS) == 0);
	ASSERT(hdr->b_acb == 0);
	acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP);
	acb->acb_done = done;
	acb->acb_private = private;
	acb->acb_byteswap = (arc_byteswap_func_t *)-1;
	hdr->b_acb = acb;
	hdr->b_flags |= ARC_IO_IN_PROGRESS;
	rzio = zio_write(pio, spa,
checksum, compress, ncopies, txg, bp, 2284ea8dc4b6Seschrock buf->b_data, hdr->b_size, arc_write_done, buf, priority, flags, zb); 2285fa9e4066Sahrens 2286fa9e4066Sahrens if (arc_flags & ARC_WAIT) 2287fa9e4066Sahrens return (zio_wait(rzio)); 2288fa9e4066Sahrens 2289fa9e4066Sahrens ASSERT(arc_flags & ARC_NOWAIT); 2290fa9e4066Sahrens zio_nowait(rzio); 2291fa9e4066Sahrens 2292fa9e4066Sahrens return (0); 2293fa9e4066Sahrens } 2294fa9e4066Sahrens 2295fa9e4066Sahrens int 2296fa9e4066Sahrens arc_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, 2297fa9e4066Sahrens zio_done_func_t *done, void *private, uint32_t arc_flags) 2298fa9e4066Sahrens { 2299fa9e4066Sahrens arc_buf_hdr_t *ab; 2300fa9e4066Sahrens kmutex_t *hash_lock; 2301fa9e4066Sahrens zio_t *zio; 2302fa9e4066Sahrens 2303fa9e4066Sahrens /* 2304fa9e4066Sahrens * If this buffer is in the cache, release it, so it 2305fa9e4066Sahrens * can be re-used. 2306fa9e4066Sahrens */ 2307fa9e4066Sahrens ab = buf_hash_find(spa, BP_IDENTITY(bp), bp->blk_birth, &hash_lock); 2308fa9e4066Sahrens if (ab != NULL) { 2309fa9e4066Sahrens /* 2310fa9e4066Sahrens * The checksum of blocks to free is not always 2311fa9e4066Sahrens * preserved (eg. on the deadlist). However, if it is 2312fa9e4066Sahrens * nonzero, it should match what we have in the cache. 2313fa9e4066Sahrens */ 2314fa9e4066Sahrens ASSERT(bp->blk_cksum.zc_word[0] == 0 || 2315fa9e4066Sahrens ab->b_cksum0 == bp->blk_cksum.zc_word[0]); 231677ed8509Smaybee if (ab->b_state != arc.anon) 231777ed8509Smaybee arc_change_state(arc.anon, ab, hash_lock); 231813506d1eSmaybee if (HDR_IO_IN_PROGRESS(ab)) { 231913506d1eSmaybee /* 232013506d1eSmaybee * This should only happen when we prefetch. 
232113506d1eSmaybee */ 232213506d1eSmaybee ASSERT(ab->b_flags & ARC_PREFETCH); 232313506d1eSmaybee ASSERT3U(ab->b_datacnt, ==, 1); 232413506d1eSmaybee ab->b_flags |= ARC_FREED_IN_READ; 232513506d1eSmaybee if (HDR_IN_HASH_TABLE(ab)) 232613506d1eSmaybee buf_hash_remove(ab); 232713506d1eSmaybee ab->b_arc_access = 0; 232813506d1eSmaybee bzero(&ab->b_dva, sizeof (dva_t)); 232913506d1eSmaybee ab->b_birth = 0; 233013506d1eSmaybee ab->b_cksum0 = 0; 233113506d1eSmaybee ab->b_buf->b_efunc = NULL; 233213506d1eSmaybee ab->b_buf->b_private = NULL; 233313506d1eSmaybee mutex_exit(hash_lock); 233413506d1eSmaybee } else if (refcount_is_zero(&ab->b_refcnt)) { 2335fa9e4066Sahrens mutex_exit(hash_lock); 2336ea8dc4b6Seschrock arc_hdr_destroy(ab); 2337fa9e4066Sahrens atomic_add_64(&arc.deleted, 1); 2338fa9e4066Sahrens } else { 2339bbf4a8dfSmaybee /* 234013506d1eSmaybee * We still have an active reference on this 234113506d1eSmaybee * buffer. This can happen, e.g., from 234213506d1eSmaybee * dbuf_unoverride(). 
2343bbf4a8dfSmaybee */ 234413506d1eSmaybee ASSERT(!HDR_IN_HASH_TABLE(ab)); 2345fa9e4066Sahrens ab->b_arc_access = 0; 2346fa9e4066Sahrens bzero(&ab->b_dva, sizeof (dva_t)); 2347fa9e4066Sahrens ab->b_birth = 0; 2348fa9e4066Sahrens ab->b_cksum0 = 0; 2349ea8dc4b6Seschrock ab->b_buf->b_efunc = NULL; 2350ea8dc4b6Seschrock ab->b_buf->b_private = NULL; 2351fa9e4066Sahrens mutex_exit(hash_lock); 2352fa9e4066Sahrens } 2353fa9e4066Sahrens } 2354fa9e4066Sahrens 2355fa9e4066Sahrens zio = zio_free(pio, spa, txg, bp, done, private); 2356fa9e4066Sahrens 2357fa9e4066Sahrens if (arc_flags & ARC_WAIT) 2358fa9e4066Sahrens return (zio_wait(zio)); 2359fa9e4066Sahrens 2360fa9e4066Sahrens ASSERT(arc_flags & ARC_NOWAIT); 2361fa9e4066Sahrens zio_nowait(zio); 2362fa9e4066Sahrens 2363fa9e4066Sahrens return (0); 2364fa9e4066Sahrens } 2365fa9e4066Sahrens 2366fa9e4066Sahrens void 2367fa9e4066Sahrens arc_tempreserve_clear(uint64_t tempreserve) 2368fa9e4066Sahrens { 2369fa9e4066Sahrens atomic_add_64(&arc_tempreserve, -tempreserve); 2370fa9e4066Sahrens ASSERT((int64_t)arc_tempreserve >= 0); 2371fa9e4066Sahrens } 2372fa9e4066Sahrens 2373fa9e4066Sahrens int 2374fa9e4066Sahrens arc_tempreserve_space(uint64_t tempreserve) 2375fa9e4066Sahrens { 2376fa9e4066Sahrens #ifdef ZFS_DEBUG 2377fa9e4066Sahrens /* 2378fa9e4066Sahrens * Once in a while, fail for no reason. Everything should cope. 2379fa9e4066Sahrens */ 2380fa9e4066Sahrens if (spa_get_random(10000) == 0) { 2381fa9e4066Sahrens dprintf("forcing random failure\n"); 2382fa9e4066Sahrens return (ERESTART); 2383fa9e4066Sahrens } 2384fa9e4066Sahrens #endif 2385112fe045Smaybee if (tempreserve > arc.c/4 && !arc.no_grow) 2386112fe045Smaybee arc.c = MIN(arc.c_max, tempreserve * 4); 2387112fe045Smaybee if (tempreserve > arc.c) 2388112fe045Smaybee return (ENOMEM); 2389112fe045Smaybee 2390fa9e4066Sahrens /* 2391112fe045Smaybee * Throttle writes when the amount of dirty data in the cache 2392112fe045Smaybee * gets too large. 
We try to keep the cache less than half full 2393112fe045Smaybee * of dirty blocks so that our sync times don't grow too large. 2394112fe045Smaybee * Note: if two requests come in concurrently, we might let them 2395112fe045Smaybee * both succeed, when one of them should fail. Not a huge deal. 2396112fe045Smaybee * 2397112fe045Smaybee * XXX The limit should be adjusted dynamically to keep the time 2398112fe045Smaybee * to sync a dataset fixed (around 1-5 seconds?). 2399fa9e4066Sahrens */ 2400fa9e4066Sahrens 2401112fe045Smaybee if (tempreserve + arc_tempreserve + arc.anon->size > arc.c / 2 && 2402112fe045Smaybee arc_tempreserve + arc.anon->size > arc.c / 4) { 2403fa9e4066Sahrens dprintf("failing, arc_tempreserve=%lluK anon=%lluK " 2404fa9e4066Sahrens "tempreserve=%lluK arc.c=%lluK\n", 2405fa9e4066Sahrens arc_tempreserve>>10, arc.anon->lsize>>10, 2406fa9e4066Sahrens tempreserve>>10, arc.c>>10); 2407fa9e4066Sahrens return (ERESTART); 2408fa9e4066Sahrens } 2409fa9e4066Sahrens atomic_add_64(&arc_tempreserve, tempreserve); 2410fa9e4066Sahrens return (0); 2411fa9e4066Sahrens } 2412fa9e4066Sahrens 2413fa9e4066Sahrens void 2414fa9e4066Sahrens arc_init(void) 2415fa9e4066Sahrens { 2416fa9e4066Sahrens mutex_init(&arc_reclaim_lock, NULL, MUTEX_DEFAULT, NULL); 2417fa9e4066Sahrens mutex_init(&arc_reclaim_thr_lock, NULL, MUTEX_DEFAULT, NULL); 2418fa9e4066Sahrens cv_init(&arc_reclaim_thr_cv, NULL, CV_DEFAULT, NULL); 2419fa9e4066Sahrens 242013506d1eSmaybee /* Convert seconds to clock ticks */ 2421b19a79ecSperrin arc_min_prefetch_lifespan = 1 * hz; 242213506d1eSmaybee 2423fa9e4066Sahrens /* Start out with 1/8 of all memory */ 2424fa9e4066Sahrens arc.c = physmem * PAGESIZE / 8; 2425fa9e4066Sahrens 2426fa9e4066Sahrens #ifdef _KERNEL 2427fa9e4066Sahrens /* 2428fa9e4066Sahrens * On architectures where the physical memory can be larger 2429fa9e4066Sahrens * than the addressable space (intel in 32-bit mode), we may 2430fa9e4066Sahrens * need to limit the cache to 1/8 of VM size. 
2431fa9e4066Sahrens */ 2432fa9e4066Sahrens arc.c = MIN(arc.c, vmem_size(heap_arena, VMEM_ALLOC | VMEM_FREE) / 8); 2433fa9e4066Sahrens #endif 2434fa9e4066Sahrens 2435112fe045Smaybee /* set min cache to 1/32 of all memory, or 64MB, whichever is more */ 2436fa9e4066Sahrens arc.c_min = MAX(arc.c / 4, 64<<20); 2437112fe045Smaybee /* set max to 3/4 of all memory, or all but 1GB, whichever is more */ 2438fa9e4066Sahrens if (arc.c * 8 >= 1<<30) 2439fa9e4066Sahrens arc.c_max = (arc.c * 8) - (1<<30); 2440fa9e4066Sahrens else 2441fa9e4066Sahrens arc.c_max = arc.c_min; 2442fa9e4066Sahrens arc.c_max = MAX(arc.c * 6, arc.c_max); 2443fa9e4066Sahrens arc.c = arc.c_max; 2444fa9e4066Sahrens arc.p = (arc.c >> 1); 2445fa9e4066Sahrens 2446fa9e4066Sahrens /* if kmem_flags are set, lets try to use less memory */ 2447fa9e4066Sahrens if (kmem_debugging()) 2448fa9e4066Sahrens arc.c = arc.c / 2; 2449fa9e4066Sahrens if (arc.c < arc.c_min) 2450fa9e4066Sahrens arc.c = arc.c_min; 2451fa9e4066Sahrens 2452fa9e4066Sahrens arc.anon = &ARC_anon; 2453ea8dc4b6Seschrock arc.mru = &ARC_mru; 2454ea8dc4b6Seschrock arc.mru_ghost = &ARC_mru_ghost; 2455ea8dc4b6Seschrock arc.mfu = &ARC_mfu; 2456ea8dc4b6Seschrock arc.mfu_ghost = &ARC_mfu_ghost; 2457ea8dc4b6Seschrock arc.size = 0; 2458fa9e4066Sahrens 2459*44eda4d7Smaybee arc.hits = 0; 2460*44eda4d7Smaybee arc.recycle_miss = 0; 2461*44eda4d7Smaybee arc.evict_skip = 0; 2462*44eda4d7Smaybee arc.mutex_miss = 0; 2463*44eda4d7Smaybee 2464ea8dc4b6Seschrock list_create(&arc.mru->list, sizeof (arc_buf_hdr_t), 2465fa9e4066Sahrens offsetof(arc_buf_hdr_t, b_arc_node)); 2466ea8dc4b6Seschrock list_create(&arc.mru_ghost->list, sizeof (arc_buf_hdr_t), 2467fa9e4066Sahrens offsetof(arc_buf_hdr_t, b_arc_node)); 2468ea8dc4b6Seschrock list_create(&arc.mfu->list, sizeof (arc_buf_hdr_t), 2469fa9e4066Sahrens offsetof(arc_buf_hdr_t, b_arc_node)); 2470ea8dc4b6Seschrock list_create(&arc.mfu_ghost->list, sizeof (arc_buf_hdr_t), 2471fa9e4066Sahrens offsetof(arc_buf_hdr_t, b_arc_node)); 
2472fa9e4066Sahrens 2473fa9e4066Sahrens buf_init(); 2474fa9e4066Sahrens 2475fa9e4066Sahrens arc_thread_exit = 0; 2476ea8dc4b6Seschrock arc_eviction_list = NULL; 2477ea8dc4b6Seschrock mutex_init(&arc_eviction_mtx, NULL, MUTEX_DEFAULT, NULL); 2478fa9e4066Sahrens 2479fa9e4066Sahrens (void) thread_create(NULL, 0, arc_reclaim_thread, NULL, 0, &p0, 2480fa9e4066Sahrens TS_RUN, minclsyspri); 2481fa9e4066Sahrens } 2482fa9e4066Sahrens 2483fa9e4066Sahrens void 2484fa9e4066Sahrens arc_fini(void) 2485fa9e4066Sahrens { 2486fa9e4066Sahrens mutex_enter(&arc_reclaim_thr_lock); 2487fa9e4066Sahrens arc_thread_exit = 1; 2488fa9e4066Sahrens while (arc_thread_exit != 0) 2489fa9e4066Sahrens cv_wait(&arc_reclaim_thr_cv, &arc_reclaim_thr_lock); 2490fa9e4066Sahrens mutex_exit(&arc_reclaim_thr_lock); 2491fa9e4066Sahrens 2492fa9e4066Sahrens arc_flush(); 2493fa9e4066Sahrens 2494fa9e4066Sahrens arc_dead = TRUE; 2495fa9e4066Sahrens 2496ea8dc4b6Seschrock mutex_destroy(&arc_eviction_mtx); 2497fa9e4066Sahrens mutex_destroy(&arc_reclaim_lock); 2498fa9e4066Sahrens mutex_destroy(&arc_reclaim_thr_lock); 2499fa9e4066Sahrens cv_destroy(&arc_reclaim_thr_cv); 2500fa9e4066Sahrens 2501ea8dc4b6Seschrock list_destroy(&arc.mru->list); 2502ea8dc4b6Seschrock list_destroy(&arc.mru_ghost->list); 2503ea8dc4b6Seschrock list_destroy(&arc.mfu->list); 2504ea8dc4b6Seschrock list_destroy(&arc.mfu_ghost->list); 2505fa9e4066Sahrens 2506fa9e4066Sahrens buf_fini(); 2507fa9e4066Sahrens } 2508