1fa9e4066Sahrens /* 2fa9e4066Sahrens * CDDL HEADER START 3fa9e4066Sahrens * 4fa9e4066Sahrens * The contents of this file are subject to the terms of the 5033f9833Sek * Common Development and Distribution License (the "License"). 6033f9833Sek * You may not use this file except in compliance with the License. 7fa9e4066Sahrens * 8fa9e4066Sahrens * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9fa9e4066Sahrens * or http://www.opensolaris.org/os/licensing. 10fa9e4066Sahrens * See the License for the specific language governing permissions 11fa9e4066Sahrens * and limitations under the License. 12fa9e4066Sahrens * 13fa9e4066Sahrens * When distributing Covered Code, include this CDDL HEADER in each 14fa9e4066Sahrens * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15fa9e4066Sahrens * If applicable, add the following below this CDDL HEADER, with the 16fa9e4066Sahrens * fields enclosed by brackets "[]" replaced with your own identifying 17fa9e4066Sahrens * information: Portions Copyright [yyyy] [name of copyright owner] 18fa9e4066Sahrens * 19fa9e4066Sahrens * CDDL HEADER END 20fa9e4066Sahrens */ 21fa9e4066Sahrens /* 22033f9833Sek * Copyright 2006 Sun Microsystems, Inc. All rights reserved. 23fa9e4066Sahrens * Use is subject to license terms. 24fa9e4066Sahrens */ 25fa9e4066Sahrens 26fa9e4066Sahrens #pragma ident "%Z%%M% %I% %E% SMI" 27fa9e4066Sahrens 28fa9e4066Sahrens /* 29fa9e4066Sahrens * DVA-based Adjustable Relpacement Cache 30fa9e4066Sahrens * 31ea8dc4b6Seschrock * While much of the theory of operation used here is 32ea8dc4b6Seschrock * based on the self-tuning, low overhead replacement cache 33fa9e4066Sahrens * presented by Megiddo and Modha at FAST 2003, there are some 34fa9e4066Sahrens * significant differences: 35fa9e4066Sahrens * 36fa9e4066Sahrens * 1. The Megiddo and Modha model assumes any page is evictable. 37fa9e4066Sahrens * Pages in its cache cannot be "locked" into memory. 
This makes
 * the eviction algorithm simple: evict the last page in the list.
 * This also makes the performance characteristics easy to reason
 * about.  Our cache is not so simple.  At any given moment, some
 * subset of the blocks in the cache are un-evictable because we
 * have handed out a reference to them.  Blocks are only evictable
 * when there are no external references active.  This makes
 * eviction far more problematic: we choose to evict the evictable
 * blocks that are the "lowest" in the list.
 *
 * There are times when it is not possible to evict the requested
 * space.  In these circumstances we are unable to adjust the cache
 * size.  To prevent the cache growing unbounded at these times we
 * implement a "cache throttle" that slows the flow of new data
 * into the cache until we can make space available.
 *
 * 2. The Megiddo and Modha model assumes a fixed cache size.
 * Pages are evicted when the cache is full and there is a cache
 * miss.  Our model has a variable sized cache.  It grows with
 * high use, but also tries to react to memory pressure from the
 * operating system: decreasing its size when system memory is
 * tight.
 *
 * 3. The Megiddo and Modha model assumes a fixed page size. All
 * elements of the cache are therefore exactly the same size.  So
 * when adjusting the cache size following a cache miss, it's simply
 * a matter of choosing a single page to evict.  In our model, we
 * have variable sized cache blocks (ranging from 512 bytes to
 * 128K bytes).
We therefore choose a set of blocks to evict to make
 * space for a cache miss that approximates as closely as possible
 * the space used by the new block.
 *
 * See also:  "ARC: A Self-Tuning, Low Overhead Replacement Cache"
 * by N. Megiddo & D. Modha, FAST 2003
 */

/*
 * The locking model:
 *
 * A new reference to a cache buffer can be obtained in two
 * ways: 1) via a hash table lookup using the DVA as a key,
 * or 2) via one of the ARC lists.  The arc_read() interface
 * uses method 1, while the internal arc algorithms for
 * adjusting the cache use method 2.  We therefore provide two
 * types of locks: 1) the hash table lock array, and 2) the
 * arc list locks.
 *
 * Buffers do not have their own mutexes, rather they rely on the
 * hash table mutexes for the bulk of their protection (i.e. most
 * fields in the arc_buf_hdr_t are protected by these mutexes).
 *
 * buf_hash_find() returns the appropriate mutex (held) when it
 * locates the requested buffer in the hash table.  It returns
 * NULL for the mutex if the buffer was not in the table.
 *
 * buf_hash_remove() expects the appropriate hash mutex to be
 * already held before it is invoked.
 *
 * Each arc state also has a mutex which is used to protect the
 * buffer list associated with the state.  When attempting to
 * obtain a hash table lock while holding an arc list lock you
 * must use: mutex_tryenter() to avoid deadlock.
Also note that 99fa9e4066Sahrens * the "top" state mutex must be held before the "bot" state mutex. 100fa9e4066Sahrens * 101ea8dc4b6Seschrock * Arc buffers may have an associated eviction callback function. 102ea8dc4b6Seschrock * This function will be invoked prior to removing the buffer (e.g. 103ea8dc4b6Seschrock * in arc_do_user_evicts()). Note however that the data associated 104ea8dc4b6Seschrock * with the buffer may be evicted prior to the callback. The callback 105ea8dc4b6Seschrock * must be made with *no locks held* (to prevent deadlock). Additionally, 106ea8dc4b6Seschrock * the users of callbacks must ensure that their private data is 107ea8dc4b6Seschrock * protected from simultaneous callbacks from arc_buf_evict() 108ea8dc4b6Seschrock * and arc_do_user_evicts(). 109ea8dc4b6Seschrock * 110fa9e4066Sahrens * Note that the majority of the performance stats are manipulated 111fa9e4066Sahrens * with atomic operations. 112fa9e4066Sahrens */ 113fa9e4066Sahrens 114fa9e4066Sahrens #include <sys/spa.h> 115fa9e4066Sahrens #include <sys/zio.h> 116fa9e4066Sahrens #include <sys/zfs_context.h> 117fa9e4066Sahrens #include <sys/arc.h> 118fa9e4066Sahrens #include <sys/refcount.h> 119fa9e4066Sahrens #ifdef _KERNEL 120fa9e4066Sahrens #include <sys/vmsystm.h> 121fa9e4066Sahrens #include <vm/anon.h> 122fa9e4066Sahrens #include <sys/fs/swapnode.h> 123033f9833Sek #include <sys/dnlc.h> 124fa9e4066Sahrens #endif 125fa9e4066Sahrens #include <sys/callb.h> 126fa9e4066Sahrens 127fa9e4066Sahrens static kmutex_t arc_reclaim_thr_lock; 128fa9e4066Sahrens static kcondvar_t arc_reclaim_thr_cv; /* used to signal reclaim thr */ 129fa9e4066Sahrens static uint8_t arc_thread_exit; 130fa9e4066Sahrens 131033f9833Sek #define ARC_REDUCE_DNLC_PERCENT 3 132033f9833Sek uint_t arc_reduce_dnlc_percent = ARC_REDUCE_DNLC_PERCENT; 133033f9833Sek 134fa9e4066Sahrens typedef enum arc_reclaim_strategy { 135fa9e4066Sahrens ARC_RECLAIM_AGGR, /* Aggressive reclaim strategy */ 136fa9e4066Sahrens ARC_RECLAIM_CONS /* 
Conservative reclaim strategy */
} arc_reclaim_strategy_t;

/* number of seconds before growing cache again */
static int		arc_grow_retry = 60;

/*
 * minimum lifespan of a prefetched block in seconds
 * (this is converted to ticks during the arc initialization)
 */
static int		arc_min_prefetch_lifespan = 1;

/* serializes reclaim activity; see hdr_recl()/arc_kmem_reclaim() */
static kmutex_t		arc_reclaim_lock;
/* NOTE(review): presumably set when the ARC is torn down — confirm below */
static int		arc_dead;

/*
 * Note that buffers can be on one of 5 states:
 *	ARC_anon	- anonymous (discussed below)
 *	ARC_mru		- recently used, currently cached
 *	ARC_mru_ghost	- recently used, no longer in cache
 *	ARC_mfu		- frequently used, currently cached
 *	ARC_mfu_ghost	- frequently used, no longer in cache
 * When there are no active references to the buffer, they
 * are linked onto one of the lists in arc.  These are the
 * only buffers that can be evicted or deleted.
 *
 * Anonymous buffers are buffers that are not associated with
 * a DVA.  These are buffers that hold dirty block copies
 * before they are written to stable storage.  By definition,
 * they are "ref'd" and are considered part of arc_mru
 * that cannot be freed.  Generally, they will acquire a DVA
 * as they are written and migrate onto the arc_mru list.
 */

typedef struct arc_state {
	list_t		list;	/* linked list of evictable buffer in state */
	uint64_t	lsize;	/* total size of buffers in the linked list */
	uint64_t	size;	/* total size of all buffers in this state */
	uint64_t	hits;
	kmutex_t	mtx;	/* protects the buffer list (see locking model) */
} arc_state_t;

/* The 5 states: */
static arc_state_t ARC_anon;
static arc_state_t ARC_mru;
static arc_state_t ARC_mru_ghost;
static arc_state_t ARC_mfu;
static arc_state_t ARC_mfu_ghost;

static struct arc {
	arc_state_t	*anon;
	arc_state_t	*mru;
	arc_state_t	*mru_ghost;
	arc_state_t	*mfu;
	arc_state_t	*mfu_ghost;
	uint64_t	size;		/* Actual total arc size */
	uint64_t	p;		/* Target size (in bytes) of mru */
	uint64_t	c;		/* Target size of cache (in bytes) */
	uint64_t	c_min;		/* Minimum target cache size */
	uint64_t	c_max;		/* Maximum target cache size */

	/* performance stats (updated with atomic ops, see file comment) */
	uint64_t	hits;
	uint64_t	misses;
	uint64_t	deleted;
	uint64_t	skipped;
	uint64_t	hash_elements;
	uint64_t	hash_elements_max;
	uint64_t	hash_collisions;
	uint64_t	hash_chains;
	uint32_t	hash_chain_max;

	int		no_grow;	/* Don't try to grow cache size */
} arc;

static uint64_t arc_tempreserve;

typedef struct arc_callback arc_callback_t;

/* per-reader completion record chained off an in-flight read */
struct arc_callback {
	arc_done_func_t		*acb_done;	/* caller's completion func */
	void			*acb_private;	/* opaque arg for acb_done */
	arc_byteswap_func_t	*acb_byteswap;
	arc_buf_t		*acb_buf;
	zio_t			*acb_zio_dummy;
	arc_callback_t		*acb_next;
};

struct arc_buf_hdr {
	/* immutable */
	uint64_t		b_size;
	spa_t			*b_spa;

	/* protected by hash lock */
	dva_t			b_dva;
	uint64_t		b_birth;
	uint64_t		b_cksum0;

	arc_buf_hdr_t		*b_hash_next;	/* hash chain linkage */
	arc_buf_t		*b_buf;		/* list of data buffers */
	uint32_t		b_flags;	/* ARC_* flags below */
	uint32_t		b_datacnt;	/* # of b_buf entries with data */

	kcondvar_t		b_cv;
	arc_callback_t		*b_acb;

	/* protected by arc state mutex */
	arc_state_t		*b_state;
	list_node_t		b_arc_node;

	/* updated atomically */
	clock_t			b_arc_access;

	/* self protecting */
	refcount_t		b_refcnt;
};

static arc_buf_t *arc_eviction_list;
static kmutex_t arc_eviction_mtx;
static void arc_access_and_exit(arc_buf_hdr_t *buf, kmutex_t *hash_lock);

/* true iff state is one of the two data-less "ghost" states */
#define	GHOST_STATE(state)	\
	((state) == arc.mru_ghost || (state) == arc.mfu_ghost)

/*
 * Private ARC flags.  These flags are private ARC only flags that will show up
 * in b_flags in the arc_buf_hdr_t.  Some flags are publicly declared, and can
 * be passed in as arc_flags in things like arc_read.  However, these flags
 * should never be passed and should only be set by ARC code.  When adding new
 * public flags, make sure not to smash the private ones.
 */

#define	ARC_IN_HASH_TABLE	(1 << 9)	/* this buffer is hashed */
#define	ARC_IO_IN_PROGRESS	(1 << 10)	/* I/O in progress for buf */
#define	ARC_IO_ERROR		(1 << 11)	/* I/O failed for buf */
#define	ARC_FREED_IN_READ	(1 << 12)	/* buf freed while in read */
#define	ARC_BUF_AVAILABLE	(1 << 13)	/* block not in active use */
#define	ARC_INDIRECT		(1 << 14)	/* this is an indirect block */

#define	HDR_IN_HASH_TABLE(hdr)	((hdr)->b_flags & ARC_IN_HASH_TABLE)
#define	HDR_IO_IN_PROGRESS(hdr)	((hdr)->b_flags & ARC_IO_IN_PROGRESS)
#define	HDR_IO_ERROR(hdr)	((hdr)->b_flags & ARC_IO_ERROR)
#define	HDR_FREED_IN_READ(hdr)	((hdr)->b_flags & ARC_FREED_IN_READ)
#define	HDR_BUF_AVAILABLE(hdr)	((hdr)->b_flags & ARC_BUF_AVAILABLE)

/*
 * Hash table routines
 */

/* pad each lock out to HT_LOCK_PAD bytes (presumably one cache line) */
#define	HT_LOCK_PAD	64

struct ht_lock {
	kmutex_t	ht_lock;
#ifdef _KERNEL
	unsigned char	pad[(HT_LOCK_PAD - sizeof (kmutex_t))];
#endif
};

#define	BUF_LOCKS 256
typedef struct buf_hash_table {
	uint64_t ht_mask;		/* table size - 1 (power of two) */
	arc_buf_hdr_t **ht_table;	/* array of hash chain heads */
	struct ht_lock ht_locks[BUF_LOCKS];	/* striped per-chain locks */
} buf_hash_table_t;

static buf_hash_table_t buf_hash_table;

/* map (spa, dva, birth) to a bucket index in buf_hash_table */
#define	BUF_HASH_INDEX(spa, dva, birth) \
	(buf_hash(spa, dva, birth) & buf_hash_table.ht_mask)
/* striped lock covering the chain for bucket idx */
#define	BUF_HASH_LOCK_NTRY(idx) (buf_hash_table.ht_locks[idx & (BUF_LOCKS-1)])
#define	BUF_HASH_LOCK(idx)	(&(BUF_HASH_LOCK_NTRY(idx).ht_lock))
#define	HDR_LOCK(buf) \
	(BUF_HASH_LOCK(BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth)))

uint64_t zfs_crc64_table[256];

/*
 * Hash a buffer's identity (spa, dva, birth) to a 64-bit value.
 * Only the dva bytes go through the crc64; the spa pointer and
 * birth txg are folded in with xor at the end.
 */
static uint64_t
buf_hash(spa_t *spa, dva_t *dva, uint64_t birth)
{
	uintptr_t spav = (uintptr_t)spa;
	uint8_t *vdva = (uint8_t *)dva;
	uint64_t crc = -1ULL;
	int i;

	/* buf_init() must have filled in zfs_crc64_table already */
	ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);

	for (i = 0; i < sizeof (dva_t); i++)
		crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ vdva[i]) & 0xFF];

	crc ^= (spav>>8) ^ birth;

	return (crc);
}

/* a header with zeroed dva and birth has no on-disk identity yet */
#define	BUF_EMPTY(buf)	\
	((buf)->b_dva.dva_word[0] == 0 && \
	(buf)->b_dva.dva_word[1] == 0 && \
	(buf)->b_birth == 0)

#define	BUF_EQUAL(spa, dva, birth, buf)	\
	((buf)->b_dva.dva_word[0] == (dva)->dva_word[0]) && \
	((buf)->b_dva.dva_word[1] == (dva)->dva_word[1]) && \
	((buf)->b_birth == birth) && ((buf)->b_spa == spa)

/*
 * Look up the header identified by (spa, dva, birth).
 * On a hit, returns the header with its bucket's hash lock *held*
 * through *lockp.  On a miss, drops the lock, sets *lockp to NULL,
 * and returns NULL.
 */
static arc_buf_hdr_t *
buf_hash_find(spa_t *spa, dva_t *dva, uint64_t birth, kmutex_t **lockp)
{
	uint64_t idx = BUF_HASH_INDEX(spa, dva, birth);
	kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
	arc_buf_hdr_t *buf;

	mutex_enter(hash_lock);
	for (buf = buf_hash_table.ht_table[idx]; buf != NULL;
	    buf = buf->b_hash_next) {
		if (BUF_EQUAL(spa, dva, birth, buf)) {
			*lockp = hash_lock;
			return (buf);
		}
	}
	mutex_exit(hash_lock);
	*lockp = NULL;
	return (NULL);
}

/*
 * Insert an entry into the hash table.  If there is already an element
 * equal to elem in the hash table, then the already existing element
 * will be returned and the new element will not be inserted.
 * Otherwise returns NULL.
 * In either case the bucket's hash lock is returned held in *lockp.
 */
static arc_buf_hdr_t *
buf_hash_insert(arc_buf_hdr_t *buf, kmutex_t **lockp)
{
	uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth);
	kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
	arc_buf_hdr_t *fbuf;
	uint32_t max, i;

	ASSERT(!HDR_IN_HASH_TABLE(buf));
	*lockp = hash_lock;
	mutex_enter(hash_lock);
	for (fbuf = buf_hash_table.ht_table[idx], i = 0; fbuf != NULL;
	    fbuf = fbuf->b_hash_next, i++) {
		if (BUF_EQUAL(buf->b_spa, &buf->b_dva, buf->b_birth, fbuf))
			return (fbuf);
	}

	/* link new header at the head of the chain */
	buf->b_hash_next = buf_hash_table.ht_table[idx];
	buf_hash_table.ht_table[idx] = buf;
	buf->b_flags |= ARC_IN_HASH_TABLE;

	/* collect some hash table performance data */
	if (i > 0) {
		atomic_add_64(&arc.hash_collisions, 1);
		if (i == 1)	/* this chain just became multi-entry */
			atomic_add_64(&arc.hash_chains, 1);
	}
	/* cas loop so a racing insert cannot lower hash_chain_max */
	while (i > (max = arc.hash_chain_max) &&
	    max != atomic_cas_32(&arc.hash_chain_max, max, i)) {
		continue;
	}
	atomic_add_64(&arc.hash_elements, 1);
	/*
	 * NOTE(review): the max is bumped by 1 rather than set, and the
	 * check/update pair is not atomic, so hash_elements_max is only
	 * an approximation under concurrent inserts.
	 */
	if (arc.hash_elements > arc.hash_elements_max)
		atomic_add_64(&arc.hash_elements_max, 1);

	return (NULL);
}

/*
 * Unlink buf from its hash chain.  The bucket's hash lock must
 * already be held by the caller (see file comment).
 */
static void
buf_hash_remove(arc_buf_hdr_t *buf)
{
	arc_buf_hdr_t *fbuf, **bufp;
	uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth);

	ASSERT(MUTEX_HELD(BUF_HASH_LOCK(idx)));
	ASSERT(HDR_IN_HASH_TABLE(buf));

	/* walk the chain to find the link pointer that references buf */
	bufp = &buf_hash_table.ht_table[idx];
	while ((fbuf = *bufp) != buf) {
		ASSERT(fbuf != NULL);
		bufp = &fbuf->b_hash_next;
	}
	*bufp = buf->b_hash_next;
	buf->b_hash_next = NULL;
	buf->b_flags &= ~ARC_IN_HASH_TABLE;

	/* collect some hash table performance data */
	atomic_add_64(&arc.hash_elements, -1);
	/* chain shrank back to a single entry */
	if (buf_hash_table.ht_table[idx] &&
	    buf_hash_table.ht_table[idx]->b_hash_next == NULL)
		atomic_add_64(&arc.hash_chains, -1);
}

/*
 * Global data structures and functions for the buf kmem cache.
 */
static kmem_cache_t *hdr_cache;		/* kmem cache of arc_buf_hdr_t */
static kmem_cache_t *buf_cache;		/* kmem cache of arc_buf_t */

/*
 * Tear down the hash table, its locks, and the kmem caches.
 */
static void
buf_fini(void)
{
	int i;

	kmem_free(buf_hash_table.ht_table,
	    (buf_hash_table.ht_mask + 1) * sizeof (void *));
	for (i = 0; i < BUF_LOCKS; i++)
		mutex_destroy(&buf_hash_table.ht_locks[i].ht_lock);
	kmem_cache_destroy(hdr_cache);
	kmem_cache_destroy(buf_cache);
}

/*
 * Constructor callback - called when the cache is empty
 * and a new buf is requested.
 */
/* ARGSUSED */
static int
hdr_cons(void *vbuf, void *unused, int kmflag)
{
	arc_buf_hdr_t *buf = vbuf;

	bzero(buf, sizeof (arc_buf_hdr_t));
	refcount_create(&buf->b_refcnt);
	cv_init(&buf->b_cv, NULL, CV_DEFAULT, NULL);
	return (0);
}

/*
 * Destructor callback - called when a cached buf is
 * no longer required.
 */
/* ARGSUSED */
static void
hdr_dest(void *vbuf, void *unused)
{
	arc_buf_hdr_t *buf = vbuf;

	refcount_destroy(&buf->b_refcnt);
	cv_destroy(&buf->b_cv);
}

static int arc_reclaim_needed(void);
void arc_kmem_reclaim(void);

/*
 * Reclaim callback -- invoked when memory is low.
 */
/* ARGSUSED */
static void
hdr_recl(void *unused)
{
	dprintf("hdr_recl called\n");
	if (arc_reclaim_needed())
		arc_kmem_reclaim();
}

/*
 * Allocate the buffer hash table, fill in the crc64 table, and
 * create the header/buf kmem caches.
 */
static void
buf_init(void)
{
	uint64_t *ct;
	uint64_t hsize = 1ULL << 12;
	int i, j;

	/*
	 * The hash table is big enough to fill all of physical memory
	 * with an average 64K block size.  The table will take up
	 * totalmem*sizeof(void*)/64K (eg. 128KB/GB with 8-byte pointers).
	 */
	while (hsize * 65536 < physmem * PAGESIZE)
		hsize <<= 1;
retry:
	buf_hash_table.ht_mask = hsize - 1;
	buf_hash_table.ht_table =
	    kmem_zalloc(hsize * sizeof (void*), KM_NOSLEEP);
	if (buf_hash_table.ht_table == NULL) {
		/* allocation failed: halve the table size and retry */
		ASSERT(hsize > (1ULL << 8));
		hsize >>= 1;
		goto retry;
	}

	hdr_cache = kmem_cache_create("arc_buf_hdr_t", sizeof (arc_buf_hdr_t),
	    0, hdr_cons, hdr_dest, hdr_recl, NULL, NULL, 0);
	buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t),
	    0, NULL, NULL, NULL, NULL, NULL, 0);

	/* generate the crc64 table (buf_hash() asserts it is filled in) */
	for (i = 0; i < 256; i++)
		for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--)
			*ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY);

	for (i = 0; i < BUF_LOCKS; i++) {
		mutex_init(&buf_hash_table.ht_locks[i].ht_lock,
		    NULL, MUTEX_DEFAULT, NULL);
	}
}

#define	ARC_MINTIME	(hz>>4) /* 62 ms */

/*
 * Take a hold on ab for `tag'.  If this is the first hold and the
 * header is in a cached (non-anonymous) state, the header becomes
 * un-evictable: it is removed from its state's list and the state's
 * evictable size (lsize) is reduced.  Caller holds the hash lock.
 */
static void
add_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag)
{
	ASSERT(MUTEX_HELD(hash_lock));

	if ((refcount_add(&ab->b_refcnt, tag) == 1) &&
	    (ab->b_state != arc.anon)) {
		int delta = ab->b_size * ab->b_datacnt;

		ASSERT(!MUTEX_HELD(&ab->b_state->mtx));
		mutex_enter(&ab->b_state->mtx);
		ASSERT(list_link_active(&ab->b_arc_node));
		list_remove(&ab->b_state->list, ab);
		if (GHOST_STATE(ab->b_state)) {
			/* ghost headers carry no data; account b_size only */
			ASSERT3U(ab->b_datacnt, ==, 0);
			ASSERT3P(ab->b_buf, ==, NULL);
			delta = ab->b_size;
		}
		ASSERT(delta > 0);
		ASSERT3U(ab->b_state->lsize, >=, delta);
		atomic_add_64(&ab->b_state->lsize, -delta);
		mutex_exit(&ab->b_state->mtx);
		/* remove the prefetch flag if we get a reference */
		if (ab->b_flags & ARC_PREFETCH)
			ab->b_flags &= ~ARC_PREFETCH;
	}
}

/*
 * Drop the hold identified by `tag'.  If this was the last hold and
 * the header is in a cached state, the header becomes evictable
 * again: it goes back on its state's list and lsize is increased.
 * Returns the remaining reference count.
 */
static int
remove_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag)
{
	int cnt;

	ASSERT(ab->b_state == arc.anon || MUTEX_HELD(hash_lock));
	ASSERT(!GHOST_STATE(ab->b_state));

	if (((cnt = refcount_remove(&ab->b_refcnt, tag)) == 0) &&
	    (ab->b_state != arc.anon)) {

		ASSERT(!MUTEX_HELD(&ab->b_state->mtx));
		mutex_enter(&ab->b_state->mtx);
		ASSERT(!list_link_active(&ab->b_arc_node));
		list_insert_head(&ab->b_state->list, ab);
		ASSERT(ab->b_datacnt > 0);
		atomic_add_64(&ab->b_state->lsize, ab->b_size * ab->b_datacnt);
		ASSERT3U(ab->b_state->size, >=, ab->b_state->lsize);
		mutex_exit(&ab->b_state->mtx);
	}
	return (cnt);
}

/*
 * Move the supplied buffer to the indicated state.  The mutex
 * for the buffer must be held by the caller.
 */
static void
arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock)
{
	arc_state_t *old_state = ab->b_state;
	int refcnt = refcount_count(&ab->b_refcnt);
	int from_delta, to_delta;

	ASSERT(MUTEX_HELD(hash_lock));
	ASSERT(new_state != old_state);
	ASSERT(refcnt == 0 || ab->b_datacnt > 0);
	ASSERT(ab->b_datacnt == 0 || !GHOST_STATE(new_state));

	/* default accounting delta: the total data held by this header */
	from_delta = to_delta = ab->b_datacnt * ab->b_size;

	/*
	 * If this buffer is evictable, transfer it from the
	 * old state list to the new state list.
	 */
	if (refcnt == 0) {
		if (old_state != arc.anon) {
			/* state mtx may already be held (see locking model) */
			int use_mutex = !MUTEX_HELD(&old_state->mtx);

			if (use_mutex)
				mutex_enter(&old_state->mtx);

			ASSERT(list_link_active(&ab->b_arc_node));
			list_remove(&old_state->list, ab);

			/*
			 * If prefetching out of the ghost cache,
			 * we will have a non-null datacnt.
			 */
			if (GHOST_STATE(old_state) && ab->b_datacnt == 0) {
				/* ghost elements have a ghost size */
				ASSERT(ab->b_buf == NULL);
				from_delta = ab->b_size;
			}
			ASSERT3U(old_state->lsize, >=, from_delta);
			atomic_add_64(&old_state->lsize, -from_delta);

			if (use_mutex)
				mutex_exit(&old_state->mtx);
		}
		if (new_state != arc.anon) {
			int use_mutex = !MUTEX_HELD(&new_state->mtx);

			if (use_mutex)
				mutex_enter(&new_state->mtx);

			list_insert_head(&new_state->list, ab);

			/* ghost elements have a ghost size */
			if (GHOST_STATE(new_state)) {
				ASSERT(ab->b_datacnt == 0);
				ASSERT(ab->b_buf == NULL);
				to_delta = ab->b_size;
			}
			atomic_add_64(&new_state->lsize, to_delta);
			ASSERT3U(new_state->size + to_delta, >=,
			    new_state->lsize);

			if (use_mutex)
				mutex_exit(&new_state->mtx);
		}
	}

	ASSERT(!BUF_EMPTY(ab));
	/* a buffer returning to the anonymous state loses its hash identity */
	if (new_state == arc.anon && old_state != arc.anon) {
		buf_hash_remove(ab);
	}

	/* adjust state sizes */
	if (to_delta)
		atomic_add_64(&new_state->size, to_delta);
	if (from_delta) {
		ASSERT3U(old_state->size, >=, from_delta);
		atomic_add_64(&old_state->size, -from_delta);
	}
	ab->b_state = new_state;
}

/*
 * Allocate an anonymous arc buffer of `size' bytes with a hold for
 * `tag'.  The new buffer is charged to arc.size and the anonymous
 * state; it has no DVA until it is written (see file comment).
 */
arc_buf_t *
arc_buf_alloc(spa_t *spa, int size, void *tag)
{
	arc_buf_hdr_t *hdr;
	arc_buf_t *buf;

	ASSERT3U(size, >, 0);
	hdr = kmem_cache_alloc(hdr_cache, KM_SLEEP);
	ASSERT(BUF_EMPTY(hdr));
	hdr->b_size = size;
	hdr->b_spa = spa;
	hdr->b_state = arc.anon;
	hdr->b_arc_access = 0;
	buf = kmem_cache_alloc(buf_cache, KM_SLEEP);
	buf->b_hdr = hdr;
	buf->b_efunc = NULL;
	buf->b_private = NULL;
	buf->b_next = NULL;
	buf->b_data = zio_buf_alloc(size);
	hdr->b_buf = buf;
	hdr->b_datacnt = 1;
	hdr->b_flags = 0;
	ASSERT(refcount_is_zero(&hdr->b_refcnt));
	(void) refcount_add(&hdr->b_refcnt, tag);

	atomic_add_64(&arc.size, size);
	atomic_add_64(&arc.anon->size, size);

	return (buf);
}

static void *
arc_data_copy(arc_buf_hdr_t *hdr, void *old_data) 703ea8dc4b6Seschrock { 704ea8dc4b6Seschrock void *new_data = zio_buf_alloc(hdr->b_size); 705ea8dc4b6Seschrock 706ea8dc4b6Seschrock atomic_add_64(&arc.size, hdr->b_size); 707ea8dc4b6Seschrock bcopy(old_data, new_data, hdr->b_size); 708ea8dc4b6Seschrock atomic_add_64(&hdr->b_state->size, hdr->b_size); 709ea8dc4b6Seschrock if (list_link_active(&hdr->b_arc_node)) { 710ea8dc4b6Seschrock ASSERT(refcount_is_zero(&hdr->b_refcnt)); 711ea8dc4b6Seschrock atomic_add_64(&hdr->b_state->lsize, hdr->b_size); 712ea8dc4b6Seschrock } 713ea8dc4b6Seschrock return (new_data); 714ea8dc4b6Seschrock } 715ea8dc4b6Seschrock 716ea8dc4b6Seschrock void 717ea8dc4b6Seschrock arc_buf_add_ref(arc_buf_t *buf, void* tag) 718ea8dc4b6Seschrock { 719ea8dc4b6Seschrock arc_buf_hdr_t *hdr; 720ea8dc4b6Seschrock kmutex_t *hash_lock; 721ea8dc4b6Seschrock 722ea8dc4b6Seschrock mutex_enter(&arc_eviction_mtx); 723ea8dc4b6Seschrock hdr = buf->b_hdr; 724ea8dc4b6Seschrock if (buf->b_data == NULL) { 725ea8dc4b6Seschrock /* 726ea8dc4b6Seschrock * This buffer is evicted. 727ea8dc4b6Seschrock */ 728ea8dc4b6Seschrock mutex_exit(&arc_eviction_mtx); 729ea8dc4b6Seschrock return; 730ea8dc4b6Seschrock } else { 731ea8dc4b6Seschrock /* 732ea8dc4b6Seschrock * Prevent this buffer from being evicted 733ea8dc4b6Seschrock * while we add a reference. 
734ea8dc4b6Seschrock */ 735ea8dc4b6Seschrock buf->b_hdr = NULL; 736ea8dc4b6Seschrock } 737ea8dc4b6Seschrock mutex_exit(&arc_eviction_mtx); 738ea8dc4b6Seschrock 739ea8dc4b6Seschrock ASSERT(hdr->b_state != arc.anon); 740ea8dc4b6Seschrock hash_lock = HDR_LOCK(hdr); 741ea8dc4b6Seschrock mutex_enter(hash_lock); 742ea8dc4b6Seschrock ASSERT(!GHOST_STATE(hdr->b_state)); 743ea8dc4b6Seschrock buf->b_hdr = hdr; 744ea8dc4b6Seschrock add_reference(hdr, hash_lock, tag); 745ea8dc4b6Seschrock arc_access_and_exit(hdr, hash_lock); 746ea8dc4b6Seschrock atomic_add_64(&arc.hits, 1); 747ea8dc4b6Seschrock } 748ea8dc4b6Seschrock 749ea8dc4b6Seschrock static void 750ea8dc4b6Seschrock arc_buf_destroy(arc_buf_t *buf, boolean_t all) 751ea8dc4b6Seschrock { 752ea8dc4b6Seschrock arc_buf_t **bufp; 753ea8dc4b6Seschrock 754ea8dc4b6Seschrock /* free up data associated with the buf */ 755ea8dc4b6Seschrock if (buf->b_data) { 756ea8dc4b6Seschrock arc_state_t *state = buf->b_hdr->b_state; 757ea8dc4b6Seschrock uint64_t size = buf->b_hdr->b_size; 758ea8dc4b6Seschrock 759ea8dc4b6Seschrock zio_buf_free(buf->b_data, size); 760ea8dc4b6Seschrock atomic_add_64(&arc.size, -size); 761ea8dc4b6Seschrock if (list_link_active(&buf->b_hdr->b_arc_node)) { 762ea8dc4b6Seschrock ASSERT(refcount_is_zero(&buf->b_hdr->b_refcnt)); 763ea8dc4b6Seschrock ASSERT(state != arc.anon); 764ea8dc4b6Seschrock ASSERT3U(state->lsize, >=, size); 765ea8dc4b6Seschrock atomic_add_64(&state->lsize, -size); 766ea8dc4b6Seschrock } 767ea8dc4b6Seschrock ASSERT3U(state->size, >=, size); 768ea8dc4b6Seschrock atomic_add_64(&state->size, -size); 769ea8dc4b6Seschrock buf->b_data = NULL; 770ea8dc4b6Seschrock ASSERT(buf->b_hdr->b_datacnt > 0); 771ea8dc4b6Seschrock buf->b_hdr->b_datacnt -= 1; 772ea8dc4b6Seschrock } 773ea8dc4b6Seschrock 774ea8dc4b6Seschrock /* only remove the buf if requested */ 775ea8dc4b6Seschrock if (!all) 776ea8dc4b6Seschrock return; 777ea8dc4b6Seschrock 778ea8dc4b6Seschrock /* remove the buf from the hdr list */ 779ea8dc4b6Seschrock 
for (bufp = &buf->b_hdr->b_buf; *bufp != buf; bufp = &(*bufp)->b_next) 780ea8dc4b6Seschrock continue; 781ea8dc4b6Seschrock *bufp = buf->b_next; 782ea8dc4b6Seschrock 783ea8dc4b6Seschrock ASSERT(buf->b_efunc == NULL); 784ea8dc4b6Seschrock 785ea8dc4b6Seschrock /* clean up the buf */ 786ea8dc4b6Seschrock buf->b_hdr = NULL; 787ea8dc4b6Seschrock kmem_cache_free(buf_cache, buf); 788ea8dc4b6Seschrock } 789ea8dc4b6Seschrock 790fa9e4066Sahrens static void 791ea8dc4b6Seschrock arc_hdr_destroy(arc_buf_hdr_t *hdr) 792fa9e4066Sahrens { 793fa9e4066Sahrens ASSERT(refcount_is_zero(&hdr->b_refcnt)); 794fa9e4066Sahrens ASSERT3P(hdr->b_state, ==, arc.anon); 795ea8dc4b6Seschrock ASSERT(!HDR_IO_IN_PROGRESS(hdr)); 796fa9e4066Sahrens 797fa9e4066Sahrens if (!BUF_EMPTY(hdr)) { 798ea8dc4b6Seschrock ASSERT(!HDR_IN_HASH_TABLE(hdr)); 799fa9e4066Sahrens bzero(&hdr->b_dva, sizeof (dva_t)); 800fa9e4066Sahrens hdr->b_birth = 0; 801fa9e4066Sahrens hdr->b_cksum0 = 0; 802fa9e4066Sahrens } 803ea8dc4b6Seschrock while (hdr->b_buf) { 804fa9e4066Sahrens arc_buf_t *buf = hdr->b_buf; 805fa9e4066Sahrens 806ea8dc4b6Seschrock if (buf->b_efunc) { 807ea8dc4b6Seschrock mutex_enter(&arc_eviction_mtx); 808ea8dc4b6Seschrock ASSERT(buf->b_hdr != NULL); 809ea8dc4b6Seschrock arc_buf_destroy(hdr->b_buf, FALSE); 810ea8dc4b6Seschrock hdr->b_buf = buf->b_next; 811ea8dc4b6Seschrock buf->b_next = arc_eviction_list; 812ea8dc4b6Seschrock arc_eviction_list = buf; 813ea8dc4b6Seschrock mutex_exit(&arc_eviction_mtx); 814ea8dc4b6Seschrock } else { 815ea8dc4b6Seschrock arc_buf_destroy(hdr->b_buf, TRUE); 816ea8dc4b6Seschrock } 817fa9e4066Sahrens } 818ea8dc4b6Seschrock 819fa9e4066Sahrens ASSERT(!list_link_active(&hdr->b_arc_node)); 820fa9e4066Sahrens ASSERT3P(hdr->b_hash_next, ==, NULL); 821fa9e4066Sahrens ASSERT3P(hdr->b_acb, ==, NULL); 822fa9e4066Sahrens kmem_cache_free(hdr_cache, hdr); 823fa9e4066Sahrens } 824fa9e4066Sahrens 825fa9e4066Sahrens void 826fa9e4066Sahrens arc_buf_free(arc_buf_t *buf, void *tag) 827fa9e4066Sahrens { 
828fa9e4066Sahrens arc_buf_hdr_t *hdr = buf->b_hdr; 829ea8dc4b6Seschrock int hashed = hdr->b_state != arc.anon; 830fa9e4066Sahrens 831ea8dc4b6Seschrock ASSERT(buf->b_efunc == NULL); 832ea8dc4b6Seschrock ASSERT(buf->b_data != NULL); 833ea8dc4b6Seschrock 834ea8dc4b6Seschrock if (hashed) { 835ea8dc4b6Seschrock kmutex_t *hash_lock = HDR_LOCK(hdr); 836ea8dc4b6Seschrock 837ea8dc4b6Seschrock mutex_enter(hash_lock); 838ea8dc4b6Seschrock (void) remove_reference(hdr, hash_lock, tag); 839ea8dc4b6Seschrock if (hdr->b_datacnt > 1) 840ea8dc4b6Seschrock arc_buf_destroy(buf, TRUE); 841ea8dc4b6Seschrock else 842ea8dc4b6Seschrock hdr->b_flags |= ARC_BUF_AVAILABLE; 843fa9e4066Sahrens mutex_exit(hash_lock); 844ea8dc4b6Seschrock } else if (HDR_IO_IN_PROGRESS(hdr)) { 845ea8dc4b6Seschrock int destroy_hdr; 846ea8dc4b6Seschrock /* 847ea8dc4b6Seschrock * We are in the middle of an async write. Don't destroy 848ea8dc4b6Seschrock * this buffer unless the write completes before we finish 849ea8dc4b6Seschrock * decrementing the reference count. 
850ea8dc4b6Seschrock */ 851ea8dc4b6Seschrock mutex_enter(&arc_eviction_mtx); 852ea8dc4b6Seschrock (void) remove_reference(hdr, NULL, tag); 853ea8dc4b6Seschrock ASSERT(refcount_is_zero(&hdr->b_refcnt)); 854ea8dc4b6Seschrock destroy_hdr = !HDR_IO_IN_PROGRESS(hdr); 855ea8dc4b6Seschrock mutex_exit(&arc_eviction_mtx); 856ea8dc4b6Seschrock if (destroy_hdr) 857ea8dc4b6Seschrock arc_hdr_destroy(hdr); 858ea8dc4b6Seschrock } else { 859ea8dc4b6Seschrock if (remove_reference(hdr, NULL, tag) > 0) { 860ea8dc4b6Seschrock ASSERT(HDR_IO_ERROR(hdr)); 861ea8dc4b6Seschrock arc_buf_destroy(buf, TRUE); 862ea8dc4b6Seschrock } else { 863ea8dc4b6Seschrock arc_hdr_destroy(hdr); 864ea8dc4b6Seschrock } 865fa9e4066Sahrens } 866ea8dc4b6Seschrock } 867fa9e4066Sahrens 868ea8dc4b6Seschrock int 869ea8dc4b6Seschrock arc_buf_remove_ref(arc_buf_t *buf, void* tag) 870ea8dc4b6Seschrock { 871ea8dc4b6Seschrock arc_buf_hdr_t *hdr = buf->b_hdr; 872ea8dc4b6Seschrock kmutex_t *hash_lock = HDR_LOCK(hdr); 873ea8dc4b6Seschrock int no_callback = (buf->b_efunc == NULL); 874fa9e4066Sahrens 875ea8dc4b6Seschrock if (hdr->b_state == arc.anon) { 876ea8dc4b6Seschrock arc_buf_free(buf, tag); 877ea8dc4b6Seschrock return (no_callback); 878ea8dc4b6Seschrock } 879ea8dc4b6Seschrock 880ea8dc4b6Seschrock mutex_enter(hash_lock); 881ea8dc4b6Seschrock ASSERT(hdr->b_state != arc.anon); 882ea8dc4b6Seschrock ASSERT(buf->b_data != NULL); 883ea8dc4b6Seschrock 884ea8dc4b6Seschrock (void) remove_reference(hdr, hash_lock, tag); 885ea8dc4b6Seschrock if (hdr->b_datacnt > 1) { 886ea8dc4b6Seschrock if (no_callback) 887ea8dc4b6Seschrock arc_buf_destroy(buf, TRUE); 888ea8dc4b6Seschrock } else if (no_callback) { 889ea8dc4b6Seschrock ASSERT(hdr->b_buf == buf && buf->b_next == NULL); 890ea8dc4b6Seschrock hdr->b_flags |= ARC_BUF_AVAILABLE; 891ea8dc4b6Seschrock } 892ea8dc4b6Seschrock ASSERT(no_callback || hdr->b_datacnt > 1 || 893ea8dc4b6Seschrock refcount_is_zero(&hdr->b_refcnt)); 894ea8dc4b6Seschrock mutex_exit(hash_lock); 895ea8dc4b6Seschrock 
return (no_callback); 896fa9e4066Sahrens } 897fa9e4066Sahrens 898fa9e4066Sahrens int 899fa9e4066Sahrens arc_buf_size(arc_buf_t *buf) 900fa9e4066Sahrens { 901fa9e4066Sahrens return (buf->b_hdr->b_size); 902fa9e4066Sahrens } 903fa9e4066Sahrens 904fa9e4066Sahrens /* 905fa9e4066Sahrens * Evict buffers from list until we've removed the specified number of 906fa9e4066Sahrens * bytes. Move the removed buffers to the appropriate evict state. 907fa9e4066Sahrens */ 908fa9e4066Sahrens static uint64_t 909ea8dc4b6Seschrock arc_evict(arc_state_t *state, int64_t bytes) 910fa9e4066Sahrens { 911fa9e4066Sahrens arc_state_t *evicted_state; 912ea8dc4b6Seschrock uint64_t bytes_evicted = 0, skipped = 0; 913fa9e4066Sahrens arc_buf_hdr_t *ab, *ab_prev; 914fa9e4066Sahrens kmutex_t *hash_lock; 915fa9e4066Sahrens 916ea8dc4b6Seschrock ASSERT(state == arc.mru || state == arc.mfu); 917fa9e4066Sahrens 918ea8dc4b6Seschrock evicted_state = (state == arc.mru) ? arc.mru_ghost : arc.mfu_ghost; 919fa9e4066Sahrens 920fa9e4066Sahrens mutex_enter(&state->mtx); 921fa9e4066Sahrens mutex_enter(&evicted_state->mtx); 922fa9e4066Sahrens 923fa9e4066Sahrens for (ab = list_tail(&state->list); ab; ab = ab_prev) { 924fa9e4066Sahrens ab_prev = list_prev(&state->list, ab); 925*13506d1eSmaybee /* prefetch buffers have a minimum lifespan */ 926*13506d1eSmaybee if (ab->b_flags & (ARC_PREFETCH|ARC_INDIRECT) && 927*13506d1eSmaybee lbolt - ab->b_arc_access < arc_min_prefetch_lifespan) { 928*13506d1eSmaybee skipped++; 929*13506d1eSmaybee continue; 930*13506d1eSmaybee } 931fa9e4066Sahrens hash_lock = HDR_LOCK(ab); 932*13506d1eSmaybee if (!HDR_IO_IN_PROGRESS(ab) && mutex_tryenter(hash_lock)) { 933fa9e4066Sahrens ASSERT3U(refcount_count(&ab->b_refcnt), ==, 0); 934ea8dc4b6Seschrock ASSERT(ab->b_datacnt > 0); 935ea8dc4b6Seschrock while (ab->b_buf) { 936ea8dc4b6Seschrock arc_buf_t *buf = ab->b_buf; 937ea8dc4b6Seschrock if (buf->b_data) 938ea8dc4b6Seschrock bytes_evicted += ab->b_size; 939ea8dc4b6Seschrock if (buf->b_efunc) { 
940ea8dc4b6Seschrock mutex_enter(&arc_eviction_mtx); 941ea8dc4b6Seschrock /* 942ea8dc4b6Seschrock * arc_buf_add_ref() could derail 943ea8dc4b6Seschrock * this eviction. 944ea8dc4b6Seschrock */ 945ea8dc4b6Seschrock if (buf->b_hdr == NULL) { 946ea8dc4b6Seschrock mutex_exit(&arc_eviction_mtx); 947ea8dc4b6Seschrock mutex_exit(hash_lock); 948ea8dc4b6Seschrock goto skip; 949ea8dc4b6Seschrock } 950ea8dc4b6Seschrock arc_buf_destroy(buf, FALSE); 951ea8dc4b6Seschrock ab->b_buf = buf->b_next; 952ea8dc4b6Seschrock buf->b_next = arc_eviction_list; 953ea8dc4b6Seschrock arc_eviction_list = buf; 954ea8dc4b6Seschrock mutex_exit(&arc_eviction_mtx); 955ea8dc4b6Seschrock } else { 956ea8dc4b6Seschrock arc_buf_destroy(buf, TRUE); 957ea8dc4b6Seschrock } 958ea8dc4b6Seschrock } 959ea8dc4b6Seschrock ASSERT(ab->b_datacnt == 0); 960fa9e4066Sahrens arc_change_state(evicted_state, ab, hash_lock); 961ea8dc4b6Seschrock ASSERT(HDR_IN_HASH_TABLE(ab)); 962ea8dc4b6Seschrock ab->b_flags = ARC_IN_HASH_TABLE; 963fa9e4066Sahrens DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, ab); 964fa9e4066Sahrens mutex_exit(hash_lock); 965ea8dc4b6Seschrock if (bytes >= 0 && bytes_evicted >= bytes) 966fa9e4066Sahrens break; 967fa9e4066Sahrens } else { 968ea8dc4b6Seschrock skip: 969ea8dc4b6Seschrock skipped += 1; 970fa9e4066Sahrens } 971fa9e4066Sahrens } 972fa9e4066Sahrens mutex_exit(&evicted_state->mtx); 973fa9e4066Sahrens mutex_exit(&state->mtx); 974fa9e4066Sahrens 975fa9e4066Sahrens if (bytes_evicted < bytes) 976fa9e4066Sahrens dprintf("only evicted %lld bytes from %x", 977fa9e4066Sahrens (longlong_t)bytes_evicted, state); 978fa9e4066Sahrens 979ea8dc4b6Seschrock atomic_add_64(&arc.skipped, skipped); 980ea8dc4b6Seschrock if (bytes < 0) 981ea8dc4b6Seschrock return (skipped); 982fa9e4066Sahrens return (bytes_evicted); 983fa9e4066Sahrens } 984fa9e4066Sahrens 985fa9e4066Sahrens /* 986fa9e4066Sahrens * Remove buffers from list until we've removed the specified number of 987fa9e4066Sahrens * bytes. 
Destroy the buffers that are removed. 988fa9e4066Sahrens */ 989fa9e4066Sahrens static void 990ea8dc4b6Seschrock arc_evict_ghost(arc_state_t *state, int64_t bytes) 991fa9e4066Sahrens { 992fa9e4066Sahrens arc_buf_hdr_t *ab, *ab_prev; 993fa9e4066Sahrens kmutex_t *hash_lock; 994ea8dc4b6Seschrock uint64_t bytes_deleted = 0; 995ea8dc4b6Seschrock uint_t bufs_skipped = 0; 996fa9e4066Sahrens 997ea8dc4b6Seschrock ASSERT(GHOST_STATE(state)); 998fa9e4066Sahrens top: 999fa9e4066Sahrens mutex_enter(&state->mtx); 1000fa9e4066Sahrens for (ab = list_tail(&state->list); ab; ab = ab_prev) { 1001fa9e4066Sahrens ab_prev = list_prev(&state->list, ab); 1002fa9e4066Sahrens hash_lock = HDR_LOCK(ab); 1003fa9e4066Sahrens if (mutex_tryenter(hash_lock)) { 1004*13506d1eSmaybee ASSERT(!HDR_IO_IN_PROGRESS(ab)); 1005ea8dc4b6Seschrock ASSERT(ab->b_buf == NULL); 1006fa9e4066Sahrens arc_change_state(arc.anon, ab, hash_lock); 1007fa9e4066Sahrens mutex_exit(hash_lock); 1008fa9e4066Sahrens atomic_add_64(&arc.deleted, 1); 1009fa9e4066Sahrens bytes_deleted += ab->b_size; 1010ea8dc4b6Seschrock arc_hdr_destroy(ab); 1011ea8dc4b6Seschrock DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, ab); 1012fa9e4066Sahrens if (bytes >= 0 && bytes_deleted >= bytes) 1013fa9e4066Sahrens break; 1014fa9e4066Sahrens } else { 1015fa9e4066Sahrens if (bytes < 0) { 1016fa9e4066Sahrens mutex_exit(&state->mtx); 1017fa9e4066Sahrens mutex_enter(hash_lock); 1018fa9e4066Sahrens mutex_exit(hash_lock); 1019fa9e4066Sahrens goto top; 1020fa9e4066Sahrens } 1021fa9e4066Sahrens bufs_skipped += 1; 1022fa9e4066Sahrens } 1023fa9e4066Sahrens } 1024fa9e4066Sahrens mutex_exit(&state->mtx); 1025fa9e4066Sahrens 1026fa9e4066Sahrens if (bufs_skipped) { 1027fa9e4066Sahrens atomic_add_64(&arc.skipped, bufs_skipped); 1028fa9e4066Sahrens ASSERT(bytes >= 0); 1029fa9e4066Sahrens } 1030fa9e4066Sahrens 1031fa9e4066Sahrens if (bytes_deleted < bytes) 1032fa9e4066Sahrens dprintf("only deleted %lld bytes from %p", 1033fa9e4066Sahrens (longlong_t)bytes_deleted, state); 
1034fa9e4066Sahrens } 1035fa9e4066Sahrens 1036fa9e4066Sahrens static void 1037fa9e4066Sahrens arc_adjust(void) 1038fa9e4066Sahrens { 1039fa9e4066Sahrens int64_t top_sz, mru_over, arc_over; 1040fa9e4066Sahrens 1041ea8dc4b6Seschrock top_sz = arc.anon->size + arc.mru->size; 1042fa9e4066Sahrens 1043ea8dc4b6Seschrock if (top_sz > arc.p && arc.mru->lsize > 0) { 1044ea8dc4b6Seschrock int64_t toevict = MIN(arc.mru->lsize, top_sz-arc.p); 1045ea8dc4b6Seschrock (void) arc_evict(arc.mru, toevict); 1046ea8dc4b6Seschrock top_sz = arc.anon->size + arc.mru->size; 1047fa9e4066Sahrens } 1048fa9e4066Sahrens 1049ea8dc4b6Seschrock mru_over = top_sz + arc.mru_ghost->size - arc.c; 1050fa9e4066Sahrens 1051fa9e4066Sahrens if (mru_over > 0) { 1052ea8dc4b6Seschrock if (arc.mru_ghost->lsize > 0) { 1053ea8dc4b6Seschrock int64_t todelete = MIN(arc.mru_ghost->lsize, mru_over); 1054ea8dc4b6Seschrock arc_evict_ghost(arc.mru_ghost, todelete); 1055fa9e4066Sahrens } 1056fa9e4066Sahrens } 1057fa9e4066Sahrens 1058fa9e4066Sahrens if ((arc_over = arc.size - arc.c) > 0) { 1059ea8dc4b6Seschrock int64_t tbl_over; 1060fa9e4066Sahrens 1061ea8dc4b6Seschrock if (arc.mfu->lsize > 0) { 1062ea8dc4b6Seschrock int64_t toevict = MIN(arc.mfu->lsize, arc_over); 1063ea8dc4b6Seschrock (void) arc_evict(arc.mfu, toevict); 1064fa9e4066Sahrens } 1065fa9e4066Sahrens 1066ea8dc4b6Seschrock tbl_over = arc.size + arc.mru_ghost->lsize + 1067ea8dc4b6Seschrock arc.mfu_ghost->lsize - arc.c*2; 1068fa9e4066Sahrens 1069ea8dc4b6Seschrock if (tbl_over > 0 && arc.mfu_ghost->lsize > 0) { 1070ea8dc4b6Seschrock int64_t todelete = MIN(arc.mfu_ghost->lsize, tbl_over); 1071ea8dc4b6Seschrock arc_evict_ghost(arc.mfu_ghost, todelete); 1072fa9e4066Sahrens } 1073fa9e4066Sahrens } 1074fa9e4066Sahrens } 1075fa9e4066Sahrens 1076ea8dc4b6Seschrock static void 1077ea8dc4b6Seschrock arc_do_user_evicts(void) 1078ea8dc4b6Seschrock { 1079ea8dc4b6Seschrock mutex_enter(&arc_eviction_mtx); 1080ea8dc4b6Seschrock while (arc_eviction_list != NULL) { 
1081ea8dc4b6Seschrock arc_buf_t *buf = arc_eviction_list; 1082ea8dc4b6Seschrock arc_eviction_list = buf->b_next; 1083ea8dc4b6Seschrock buf->b_hdr = NULL; 1084ea8dc4b6Seschrock mutex_exit(&arc_eviction_mtx); 1085ea8dc4b6Seschrock 1086dd6ef538Smaybee if (buf->b_efunc != NULL) 1087dd6ef538Smaybee VERIFY(buf->b_efunc(buf) == 0); 1088ea8dc4b6Seschrock 1089ea8dc4b6Seschrock buf->b_efunc = NULL; 1090ea8dc4b6Seschrock buf->b_private = NULL; 1091ea8dc4b6Seschrock kmem_cache_free(buf_cache, buf); 1092ea8dc4b6Seschrock mutex_enter(&arc_eviction_mtx); 1093ea8dc4b6Seschrock } 1094ea8dc4b6Seschrock mutex_exit(&arc_eviction_mtx); 1095ea8dc4b6Seschrock } 1096ea8dc4b6Seschrock 1097fa9e4066Sahrens /* 1098fa9e4066Sahrens * Flush all *evictable* data from the cache. 1099fa9e4066Sahrens * NOTE: this will not touch "active" (i.e. referenced) data. 1100fa9e4066Sahrens */ 1101fa9e4066Sahrens void 1102fa9e4066Sahrens arc_flush(void) 1103fa9e4066Sahrens { 1104ea8dc4b6Seschrock while (arc_evict(arc.mru, -1)); 1105ea8dc4b6Seschrock while (arc_evict(arc.mfu, -1)); 1106fa9e4066Sahrens 1107ea8dc4b6Seschrock arc_evict_ghost(arc.mru_ghost, -1); 1108ea8dc4b6Seschrock arc_evict_ghost(arc.mfu_ghost, -1); 1109ea8dc4b6Seschrock 1110ea8dc4b6Seschrock mutex_enter(&arc_reclaim_thr_lock); 1111ea8dc4b6Seschrock arc_do_user_evicts(); 1112ea8dc4b6Seschrock mutex_exit(&arc_reclaim_thr_lock); 1113ea8dc4b6Seschrock ASSERT(arc_eviction_list == NULL); 1114fa9e4066Sahrens } 1115fa9e4066Sahrens 1116*13506d1eSmaybee int arc_kmem_reclaim_shift = 5; /* log2(fraction of arc to reclaim) */ 1117*13506d1eSmaybee 1118fa9e4066Sahrens void 1119fa9e4066Sahrens arc_kmem_reclaim(void) 1120fa9e4066Sahrens { 11213cff2f43Sstans uint64_t to_free; 11223cff2f43Sstans 1123fa9e4066Sahrens /* 1124fa9e4066Sahrens * We need arc_reclaim_lock because we don't want multiple 1125fa9e4066Sahrens * threads trying to reclaim concurrently. 
1126fa9e4066Sahrens */ 1127fa9e4066Sahrens 1128fa9e4066Sahrens /* 1129fa9e4066Sahrens * umem calls the reclaim func when we destroy the buf cache, 1130fa9e4066Sahrens * which is after we do arc_fini(). So we set a flag to prevent 1131fa9e4066Sahrens * accessing the destroyed mutexes and lists. 1132fa9e4066Sahrens */ 1133fa9e4066Sahrens if (arc_dead) 1134fa9e4066Sahrens return; 1135fa9e4066Sahrens 1136ea8dc4b6Seschrock if (arc.c <= arc.c_min) 1137ea8dc4b6Seschrock return; 1138ea8dc4b6Seschrock 1139fa9e4066Sahrens mutex_enter(&arc_reclaim_lock); 1140fa9e4066Sahrens 11413cff2f43Sstans #ifdef _KERNEL 1142*13506d1eSmaybee to_free = MAX(arc.c >> arc_kmem_reclaim_shift, ptob(needfree)); 11433cff2f43Sstans #else 1144*13506d1eSmaybee to_free = arc.c >> arc_kmem_reclaim_shift; 11453cff2f43Sstans #endif 11463cff2f43Sstans if (arc.c > to_free) 11473cff2f43Sstans atomic_add_64(&arc.c, -to_free); 11483cff2f43Sstans else 11493cff2f43Sstans arc.c = arc.c_min; 11503cff2f43Sstans 1151*13506d1eSmaybee atomic_add_64(&arc.p, -(arc.p >> arc_kmem_reclaim_shift)); 1152ea8dc4b6Seschrock if (arc.c > arc.size) 1153ea8dc4b6Seschrock arc.c = arc.size; 1154fa9e4066Sahrens if (arc.c < arc.c_min) 1155fa9e4066Sahrens arc.c = arc.c_min; 1156ea8dc4b6Seschrock if (arc.p > arc.c) 1157ea8dc4b6Seschrock arc.p = (arc.c >> 1); 1158ea8dc4b6Seschrock ASSERT((int64_t)arc.p >= 0); 1159fa9e4066Sahrens 1160fa9e4066Sahrens arc_adjust(); 1161fa9e4066Sahrens 1162fa9e4066Sahrens mutex_exit(&arc_reclaim_lock); 1163fa9e4066Sahrens } 1164fa9e4066Sahrens 1165fa9e4066Sahrens static int 1166fa9e4066Sahrens arc_reclaim_needed(void) 1167fa9e4066Sahrens { 1168fa9e4066Sahrens uint64_t extra; 1169fa9e4066Sahrens 1170fa9e4066Sahrens #ifdef _KERNEL 11713cff2f43Sstans 11723cff2f43Sstans if (needfree) 11733cff2f43Sstans return (1); 11743cff2f43Sstans 1175fa9e4066Sahrens /* 1176fa9e4066Sahrens * take 'desfree' extra pages, so we reclaim sooner, rather than later 1177fa9e4066Sahrens */ 1178fa9e4066Sahrens extra = desfree; 
1179fa9e4066Sahrens 1180fa9e4066Sahrens /* 1181fa9e4066Sahrens * check that we're out of range of the pageout scanner. It starts to 1182fa9e4066Sahrens * schedule paging if freemem is less than lotsfree and needfree. 1183fa9e4066Sahrens * lotsfree is the high-water mark for pageout, and needfree is the 1184fa9e4066Sahrens * number of needed free pages. We add extra pages here to make sure 1185fa9e4066Sahrens * the scanner doesn't start up while we're freeing memory. 1186fa9e4066Sahrens */ 1187fa9e4066Sahrens if (freemem < lotsfree + needfree + extra) 1188fa9e4066Sahrens return (1); 1189fa9e4066Sahrens 1190fa9e4066Sahrens /* 1191fa9e4066Sahrens * check to make sure that swapfs has enough space so that anon 1192fa9e4066Sahrens * reservations can still succeeed. anon_resvmem() checks that the 1193fa9e4066Sahrens * availrmem is greater than swapfs_minfree, and the number of reserved 1194fa9e4066Sahrens * swap pages. We also add a bit of extra here just to prevent 1195fa9e4066Sahrens * circumstances from getting really dire. 1196fa9e4066Sahrens */ 1197fa9e4066Sahrens if (availrmem < swapfs_minfree + swapfs_reserve + extra) 1198fa9e4066Sahrens return (1); 1199fa9e4066Sahrens 12005dc8af33Smaybee #if defined(__i386) 1201fa9e4066Sahrens /* 1202fa9e4066Sahrens * If we're on an i386 platform, it's possible that we'll exhaust the 1203fa9e4066Sahrens * kernel heap space before we ever run out of available physical 1204fa9e4066Sahrens * memory. Most checks of the size of the heap_area compare against 1205fa9e4066Sahrens * tune.t_minarmem, which is the minimum available real memory that we 1206fa9e4066Sahrens * can have in the system. However, this is generally fixed at 25 pages 1207fa9e4066Sahrens * which is so low that it's useless. In this comparison, we seek to 1208fa9e4066Sahrens * calculate the total heap-size, and reclaim if more than 3/4ths of the 1209fa9e4066Sahrens * heap is allocated. 
(Or, in the caclulation, if less than 1/4th is 1210fa9e4066Sahrens * free) 1211fa9e4066Sahrens */ 1212fa9e4066Sahrens if (btop(vmem_size(heap_arena, VMEM_FREE)) < 1213fa9e4066Sahrens (btop(vmem_size(heap_arena, VMEM_FREE | VMEM_ALLOC)) >> 2)) 1214fa9e4066Sahrens return (1); 1215fa9e4066Sahrens #endif 1216fa9e4066Sahrens 1217fa9e4066Sahrens #else 1218fa9e4066Sahrens if (spa_get_random(100) == 0) 1219fa9e4066Sahrens return (1); 1220fa9e4066Sahrens #endif 1221fa9e4066Sahrens return (0); 1222fa9e4066Sahrens } 1223fa9e4066Sahrens 1224fa9e4066Sahrens static void 1225fa9e4066Sahrens arc_kmem_reap_now(arc_reclaim_strategy_t strat) 1226fa9e4066Sahrens { 1227fa9e4066Sahrens size_t i; 1228fa9e4066Sahrens kmem_cache_t *prev_cache = NULL; 1229fa9e4066Sahrens extern kmem_cache_t *zio_buf_cache[]; 1230fa9e4066Sahrens 1231033f9833Sek #ifdef _KERNEL 1232033f9833Sek /* 1233033f9833Sek * First purge some DNLC entries, in case the DNLC is using 1234033f9833Sek * up too much memory. 1235033f9833Sek */ 1236cee972f8Sek dnlc_reduce_cache((void *)(uintptr_t)arc_reduce_dnlc_percent); 12375dc8af33Smaybee 12385dc8af33Smaybee #if defined(__i386) 12395dc8af33Smaybee /* 12405dc8af33Smaybee * Reclaim unused memory from all kmem caches. 12415dc8af33Smaybee */ 12425dc8af33Smaybee kmem_reap(); 12435dc8af33Smaybee #endif 1244033f9833Sek #endif 1245033f9833Sek 1246fa9e4066Sahrens /* 1247ea8dc4b6Seschrock * An agressive reclamation will shrink the cache size as well as 1248ea8dc4b6Seschrock * reap free buffers from the arc kmem caches. 
1249fa9e4066Sahrens */ 1250fa9e4066Sahrens if (strat == ARC_RECLAIM_AGGR) 1251ea8dc4b6Seschrock arc_kmem_reclaim(); 1252fa9e4066Sahrens 1253fa9e4066Sahrens for (i = 0; i < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; i++) { 1254fa9e4066Sahrens if (zio_buf_cache[i] != prev_cache) { 1255fa9e4066Sahrens prev_cache = zio_buf_cache[i]; 1256fa9e4066Sahrens kmem_cache_reap_now(zio_buf_cache[i]); 1257fa9e4066Sahrens } 1258fa9e4066Sahrens } 1259ea8dc4b6Seschrock kmem_cache_reap_now(buf_cache); 1260ea8dc4b6Seschrock kmem_cache_reap_now(hdr_cache); 1261fa9e4066Sahrens } 1262fa9e4066Sahrens 1263fa9e4066Sahrens static void 1264fa9e4066Sahrens arc_reclaim_thread(void) 1265fa9e4066Sahrens { 1266fa9e4066Sahrens clock_t growtime = 0; 1267fa9e4066Sahrens arc_reclaim_strategy_t last_reclaim = ARC_RECLAIM_CONS; 1268fa9e4066Sahrens callb_cpr_t cpr; 1269fa9e4066Sahrens 1270fa9e4066Sahrens CALLB_CPR_INIT(&cpr, &arc_reclaim_thr_lock, callb_generic_cpr, FTAG); 1271fa9e4066Sahrens 1272fa9e4066Sahrens mutex_enter(&arc_reclaim_thr_lock); 1273fa9e4066Sahrens while (arc_thread_exit == 0) { 1274fa9e4066Sahrens if (arc_reclaim_needed()) { 1275fa9e4066Sahrens 1276fa9e4066Sahrens if (arc.no_grow) { 1277fa9e4066Sahrens if (last_reclaim == ARC_RECLAIM_CONS) { 1278fa9e4066Sahrens last_reclaim = ARC_RECLAIM_AGGR; 1279fa9e4066Sahrens } else { 1280fa9e4066Sahrens last_reclaim = ARC_RECLAIM_CONS; 1281fa9e4066Sahrens } 1282fa9e4066Sahrens } else { 1283fa9e4066Sahrens arc.no_grow = TRUE; 1284fa9e4066Sahrens last_reclaim = ARC_RECLAIM_AGGR; 1285fa9e4066Sahrens membar_producer(); 1286fa9e4066Sahrens } 1287fa9e4066Sahrens 1288fa9e4066Sahrens /* reset the growth delay for every reclaim */ 1289fa9e4066Sahrens growtime = lbolt + (arc_grow_retry * hz); 1290fa9e4066Sahrens 1291fa9e4066Sahrens arc_kmem_reap_now(last_reclaim); 1292fa9e4066Sahrens 1293fa9e4066Sahrens } else if ((growtime > 0) && ((growtime - lbolt) <= 0)) { 1294fa9e4066Sahrens arc.no_grow = FALSE; 1295fa9e4066Sahrens } 1296fa9e4066Sahrens 
1297ea8dc4b6Seschrock if (arc_eviction_list != NULL) 1298ea8dc4b6Seschrock arc_do_user_evicts(); 1299ea8dc4b6Seschrock 1300fa9e4066Sahrens /* block until needed, or one second, whichever is shorter */ 1301fa9e4066Sahrens CALLB_CPR_SAFE_BEGIN(&cpr); 1302fa9e4066Sahrens (void) cv_timedwait(&arc_reclaim_thr_cv, 1303fa9e4066Sahrens &arc_reclaim_thr_lock, (lbolt + hz)); 1304fa9e4066Sahrens CALLB_CPR_SAFE_END(&cpr, &arc_reclaim_thr_lock); 1305fa9e4066Sahrens } 1306fa9e4066Sahrens 1307fa9e4066Sahrens arc_thread_exit = 0; 1308fa9e4066Sahrens cv_broadcast(&arc_reclaim_thr_cv); 1309fa9e4066Sahrens CALLB_CPR_EXIT(&cpr); /* drops arc_reclaim_thr_lock */ 1310fa9e4066Sahrens thread_exit(); 1311fa9e4066Sahrens } 1312fa9e4066Sahrens 1313ea8dc4b6Seschrock /* 1314ea8dc4b6Seschrock * Adapt arc info given the number of bytes we are trying to add and 1315ea8dc4b6Seschrock * the state that we are comming from. This function is only called 1316ea8dc4b6Seschrock * when we are adding new content to the cache. 1317ea8dc4b6Seschrock */ 1318fa9e4066Sahrens static void 1319ea8dc4b6Seschrock arc_adapt(int bytes, arc_state_t *state) 1320fa9e4066Sahrens { 1321ea8dc4b6Seschrock int mult; 1322ea8dc4b6Seschrock 1323ea8dc4b6Seschrock ASSERT(bytes > 0); 1324fa9e4066Sahrens /* 1325ea8dc4b6Seschrock * Adapt the target size of the MRU list: 1326ea8dc4b6Seschrock * - if we just hit in the MRU ghost list, then increase 1327ea8dc4b6Seschrock * the target size of the MRU list. 1328ea8dc4b6Seschrock * - if we just hit in the MFU ghost list, then increase 1329ea8dc4b6Seschrock * the target size of the MFU list by decreasing the 1330ea8dc4b6Seschrock * target size of the MRU list. 1331fa9e4066Sahrens */ 1332ea8dc4b6Seschrock if (state == arc.mru_ghost) { 1333ea8dc4b6Seschrock mult = ((arc.mru_ghost->size >= arc.mfu_ghost->size) ? 
1334ea8dc4b6Seschrock 1 : (arc.mfu_ghost->size/arc.mru_ghost->size)); 1335ea8dc4b6Seschrock 1336ea8dc4b6Seschrock arc.p = MIN(arc.c, arc.p + bytes * mult); 1337ea8dc4b6Seschrock } else if (state == arc.mfu_ghost) { 1338ea8dc4b6Seschrock mult = ((arc.mfu_ghost->size >= arc.mru_ghost->size) ? 1339ea8dc4b6Seschrock 1 : (arc.mru_ghost->size/arc.mfu_ghost->size)); 1340ea8dc4b6Seschrock 1341ea8dc4b6Seschrock arc.p = MAX(0, (int64_t)arc.p - bytes * mult); 1342ea8dc4b6Seschrock } 1343ea8dc4b6Seschrock ASSERT((int64_t)arc.p >= 0); 1344fa9e4066Sahrens 1345fa9e4066Sahrens if (arc_reclaim_needed()) { 1346fa9e4066Sahrens cv_signal(&arc_reclaim_thr_cv); 1347fa9e4066Sahrens return; 1348fa9e4066Sahrens } 1349fa9e4066Sahrens 1350fa9e4066Sahrens if (arc.no_grow) 1351fa9e4066Sahrens return; 1352fa9e4066Sahrens 1353ea8dc4b6Seschrock if (arc.c >= arc.c_max) 1354ea8dc4b6Seschrock return; 1355ea8dc4b6Seschrock 1356fa9e4066Sahrens /* 1357ea8dc4b6Seschrock * If we're within (2 * maxblocksize) bytes of the target 1358ea8dc4b6Seschrock * cache size, increment the target cache size 1359fa9e4066Sahrens */ 1360ea8dc4b6Seschrock if (arc.size > arc.c - (2ULL << SPA_MAXBLOCKSHIFT)) { 1361ea8dc4b6Seschrock atomic_add_64(&arc.c, (int64_t)bytes); 1362fa9e4066Sahrens if (arc.c > arc.c_max) 1363fa9e4066Sahrens arc.c = arc.c_max; 1364ea8dc4b6Seschrock else if (state == arc.anon) 1365ea8dc4b6Seschrock atomic_add_64(&arc.p, (int64_t)bytes); 1366ea8dc4b6Seschrock if (arc.p > arc.c) 1367ea8dc4b6Seschrock arc.p = arc.c; 1368fa9e4066Sahrens } 1369ea8dc4b6Seschrock ASSERT((int64_t)arc.p >= 0); 1370fa9e4066Sahrens } 1371fa9e4066Sahrens 1372fa9e4066Sahrens /* 1373ea8dc4b6Seschrock * Check if the cache has reached its limits and eviction is required 1374ea8dc4b6Seschrock * prior to insert. 
1375fa9e4066Sahrens */ 1376fa9e4066Sahrens static int 1377fa9e4066Sahrens arc_evict_needed() 1378fa9e4066Sahrens { 1379fa9e4066Sahrens if (arc_reclaim_needed()) 1380fa9e4066Sahrens return (1); 1381fa9e4066Sahrens 1382ea8dc4b6Seschrock return (arc.size > arc.c); 1383fa9e4066Sahrens } 1384fa9e4066Sahrens 1385fa9e4066Sahrens /* 1386fa9e4066Sahrens * The state, supplied as the first argument, is going to have something 1387fa9e4066Sahrens * inserted on its behalf. So, determine which cache must be victimized to 1388fa9e4066Sahrens * satisfy an insertion for this state. We have the following cases: 1389fa9e4066Sahrens * 1390ea8dc4b6Seschrock * 1. Insert for MRU, p > sizeof(arc.anon + arc.mru) -> 1391fa9e4066Sahrens * In this situation if we're out of space, but the resident size of the MFU is 1392fa9e4066Sahrens * under the limit, victimize the MFU cache to satisfy this insertion request. 1393fa9e4066Sahrens * 1394ea8dc4b6Seschrock * 2. Insert for MRU, p <= sizeof(arc.anon + arc.mru) -> 1395fa9e4066Sahrens * Here, we've used up all of the available space for the MRU, so we need to 1396fa9e4066Sahrens * evict from our own cache instead. Evict from the set of resident MRU 1397fa9e4066Sahrens * entries. 1398fa9e4066Sahrens * 1399ea8dc4b6Seschrock * 3. Insert for MFU (c - p) > sizeof(arc.mfu) -> 1400fa9e4066Sahrens * c minus p represents the MFU space in the cache, since p is the size of the 1401fa9e4066Sahrens * cache that is dedicated to the MRU. In this situation there's still space on 1402fa9e4066Sahrens * the MFU side, so the MRU side needs to be victimized. 1403fa9e4066Sahrens * 1404ea8dc4b6Seschrock * 4. Insert for MFU (c - p) < sizeof(arc.mfu) -> 1405fa9e4066Sahrens * MFU's resident set is consuming more space than it has been allotted. In 1406fa9e4066Sahrens * this situation, we must victimize our own cache, the MFU, for this insertion. 
1407fa9e4066Sahrens */ 1408fa9e4066Sahrens static void 1409fa9e4066Sahrens arc_evict_for_state(arc_state_t *state, uint64_t bytes) 1410fa9e4066Sahrens { 1411fa9e4066Sahrens uint64_t mru_used; 1412fa9e4066Sahrens uint64_t mfu_space; 1413fa9e4066Sahrens uint64_t evicted; 1414fa9e4066Sahrens 1415ea8dc4b6Seschrock ASSERT(state == arc.mru || state == arc.mfu); 1416fa9e4066Sahrens 1417ea8dc4b6Seschrock if (state == arc.mru) { 1418ea8dc4b6Seschrock mru_used = arc.anon->size + arc.mru->size; 1419fa9e4066Sahrens if (arc.p > mru_used) { 1420fa9e4066Sahrens /* case 1 */ 1421ea8dc4b6Seschrock evicted = arc_evict(arc.mfu, bytes); 1422fa9e4066Sahrens if (evicted < bytes) { 1423fa9e4066Sahrens arc_adjust(); 1424fa9e4066Sahrens } 1425fa9e4066Sahrens } else { 1426fa9e4066Sahrens /* case 2 */ 1427ea8dc4b6Seschrock evicted = arc_evict(arc.mru, bytes); 1428fa9e4066Sahrens if (evicted < bytes) { 1429fa9e4066Sahrens arc_adjust(); 1430fa9e4066Sahrens } 1431fa9e4066Sahrens } 1432fa9e4066Sahrens } else { 1433ea8dc4b6Seschrock /* MFU case */ 1434fa9e4066Sahrens mfu_space = arc.c - arc.p; 1435ea8dc4b6Seschrock if (mfu_space > arc.mfu->size) { 1436fa9e4066Sahrens /* case 3 */ 1437ea8dc4b6Seschrock evicted = arc_evict(arc.mru, bytes); 1438fa9e4066Sahrens if (evicted < bytes) { 1439fa9e4066Sahrens arc_adjust(); 1440fa9e4066Sahrens } 1441fa9e4066Sahrens } else { 1442fa9e4066Sahrens /* case 4 */ 1443ea8dc4b6Seschrock evicted = arc_evict(arc.mfu, bytes); 1444fa9e4066Sahrens if (evicted < bytes) { 1445fa9e4066Sahrens arc_adjust(); 1446fa9e4066Sahrens } 1447fa9e4066Sahrens } 1448fa9e4066Sahrens } 1449fa9e4066Sahrens } 1450fa9e4066Sahrens 1451fa9e4066Sahrens /* 1452fa9e4066Sahrens * This routine is called whenever a buffer is accessed. 1453ea8dc4b6Seschrock * NOTE: the hash lock is dropped in this function. 
 */
static void
arc_access_and_exit(arc_buf_hdr_t *buf, kmutex_t *hash_lock)
{
	arc_state_t *evict_state = NULL;	/* eviction is deferred until the hash lock is dropped */
	int blksz;

	ASSERT(MUTEX_HELD(hash_lock));

	blksz = buf->b_size;

	if (buf->b_state == arc.anon) {
		/*
		 * This buffer is not in the cache, and does not
		 * appear in our "ghost" list. Add the new buffer
		 * to the MRU state.
		 */

		arc_adapt(blksz, arc.anon);
		if (arc_evict_needed())
			evict_state = arc.mru;

		/* an anonymous buffer has never been accessed before */
		ASSERT(buf->b_arc_access == 0);
		buf->b_arc_access = lbolt;
		DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, buf);
		arc_change_state(arc.mru, buf, hash_lock);

	} else if (buf->b_state == arc.mru) {
		/*
		 * If this buffer is here because of a prefetch, then either:
		 * - clear the flag if this is a "referencing" read
		 *   (any subsequent access will bump this into the MFU state).
		 * or
		 * - move the buffer to the head of the list if this is
		 *   another prefetch (to make it less likely to be evicted).
		 */
		if ((buf->b_flags & ARC_PREFETCH) != 0) {
			if (refcount_count(&buf->b_refcnt) == 0) {
				/* another prefetch: re-queue at the head */
				ASSERT(list_link_active(&buf->b_arc_node));
				mutex_enter(&arc.mru->mtx);
				list_remove(&arc.mru->list, buf);
				list_insert_head(&arc.mru->list, buf);
				mutex_exit(&arc.mru->mtx);
			} else {
				/* a real reader: drop the prefetch flag */
				buf->b_flags &= ~ARC_PREFETCH;
				atomic_add_64(&arc.mru->hits, 1);
			}
			buf->b_arc_access = lbolt;
			mutex_exit(hash_lock);
			return;
		}

		/*
		 * This buffer has been "accessed" only once so far,
		 * but it is still in the cache. Move it to the MFU
		 * state.
		 */
		if (lbolt > buf->b_arc_access + ARC_MINTIME) {
			/*
			 * More than 125ms have passed since we
			 * instantiated this buffer. Move it to the
			 * most frequently used state.
			 */
			buf->b_arc_access = lbolt;
			DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
			arc_change_state(arc.mfu, buf, hash_lock);
		}
		atomic_add_64(&arc.mru->hits, 1);
	} else if (buf->b_state == arc.mru_ghost) {
		arc_state_t *new_state;
		/*
		 * This buffer has been "accessed" recently, but
		 * was evicted from the cache. Move it to the
		 * MFU state.
		 */

		if (buf->b_flags & ARC_PREFETCH) {
			/* prefetches of ghost hits go back to the MRU */
			new_state = arc.mru;
			if (refcount_count(&buf->b_refcnt) > 0)
				buf->b_flags &= ~ARC_PREFETCH;
			DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, buf);
		} else {
			new_state = arc.mfu;
			DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
		}

		arc_adapt(blksz, arc.mru_ghost);
		if (arc_evict_needed())
			evict_state = new_state;

		buf->b_arc_access = lbolt;
		arc_change_state(new_state, buf, hash_lock);

		atomic_add_64(&arc.mru_ghost->hits, 1);
	} else if (buf->b_state == arc.mfu) {
		/*
		 * This buffer has been accessed more than once and is
		 * still in the cache.  Keep it in the MFU state.
		 *
		 * NOTE: an add_reference() that occurred when we did
		 * the arc_read() will have kicked this off the list.
		 * If it was a prefetch, we will explicitly move it to
		 * the head of the list now.
		 */
		if ((buf->b_flags & ARC_PREFETCH) != 0) {
			ASSERT(refcount_count(&buf->b_refcnt) == 0);
			ASSERT(list_link_active(&buf->b_arc_node));
			mutex_enter(&arc.mfu->mtx);
			list_remove(&arc.mfu->list, buf);
			list_insert_head(&arc.mfu->list, buf);
			mutex_exit(&arc.mfu->mtx);
		}
		atomic_add_64(&arc.mfu->hits, 1);
		buf->b_arc_access = lbolt;
	} else if (buf->b_state == arc.mfu_ghost) {
		arc_state_t *new_state = arc.mfu;
		/*
		 * This buffer has been accessed more than once but has
		 * been evicted from the cache.  Move it back to the
		 * MFU state.
		 */

		if (buf->b_flags & ARC_PREFETCH) {
			/*
			 * This is a prefetch access...
			 * move this block back to the MRU state.
			 */
			ASSERT3U(refcount_count(&buf->b_refcnt), ==, 0);
			new_state = arc.mru;
		}

		arc_adapt(blksz, arc.mfu_ghost);
		if (arc_evict_needed())
			evict_state = new_state;

		buf->b_arc_access = lbolt;
		DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
		arc_change_state(new_state, buf, hash_lock);

		atomic_add_64(&arc.mfu_ghost->hits, 1);
	} else {
		ASSERT(!"invalid arc state");
	}

	/* drop the hash lock before doing any (potentially long) eviction */
	mutex_exit(hash_lock);
	if (evict_state)
		arc_evict_for_state(evict_state, blksz);
}

/* a generic arc_done_func_t which you can use */
/* ARGSUSED */
void
arc_bcopy_func(zio_t *zio, arc_buf_t *buf, void *arg)
{
	/* copy the data out to the caller's buffer and drop our hold */
	bcopy(buf->b_data, arg, buf->b_hdr->b_size);
	VERIFY(arc_buf_remove_ref(buf, arg) == 1);
}

/* a generic arc_done_func_t which you can use */
void
arc_getbuf_func(zio_t *zio, arc_buf_t *buf, void *arg)
{
	/* hand the buffer itself to the caller; NULL *bufp on I/O error */
	arc_buf_t **bufp = arg;
	if (zio && zio->io_error) {
		VERIFY(arc_buf_remove_ref(buf, arg) == 1);
		*bufp = NULL;
	} else {
		*bufp = buf;
	}
}

/* I/O completion callback used by arc_read() */
static void
arc_read_done(zio_t *zio)
{
	arc_buf_hdr_t *hdr, *found;
	arc_buf_t *buf;
	arc_buf_t *abuf;	/* buffer we're assigning to callback */
	kmutex_t *hash_lock;
	arc_callback_t *callback_list, *acb;
	int freeable = FALSE;

	buf = zio->io_private;
	hdr = buf->b_hdr;

	/*
	 * The hdr was inserted into hash-table and removed from lists
	 * prior to starting I/O.  We should find this header, since
	 * it's in the hash table, and it should be legit since it's
	 * not possible to evict it during the I/O.  The only possible
	 * reason for it not to be found is if we were freed during the
	 * read.
	 */
	found = buf_hash_find(zio->io_spa, &hdr->b_dva, hdr->b_birth,
	    &hash_lock);

	ASSERT((found == NULL && HDR_FREED_IN_READ(hdr) && hash_lock == NULL) ||
	    (found == hdr && DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp))));

	/* byteswap if necessary */
	callback_list = hdr->b_acb;
	ASSERT(callback_list != NULL);
	if (BP_SHOULD_BYTESWAP(zio->io_bp) && callback_list->acb_byteswap)
		callback_list->acb_byteswap(buf->b_data, hdr->b_size);

	/* create copies of the data buffer for the callers */
	abuf = buf;
	for (acb = callback_list; acb; acb = acb->acb_next) {
		if (acb->acb_done) {
			/* the first caller gets `buf' itself; later callers get copies */
			if (abuf == NULL) {
				abuf = kmem_cache_alloc(buf_cache, KM_SLEEP);
				abuf->b_data = arc_data_copy(hdr, buf->b_data);
				abuf->b_hdr = hdr;
				abuf->b_efunc = NULL;
				abuf->b_private = NULL;
				abuf->b_next = hdr->b_buf;
				hdr->b_buf = abuf;
				hdr->b_datacnt += 1;
			}
			acb->acb_buf = abuf;
			abuf = NULL;
		}
	}
	hdr->b_acb = NULL;
	hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
	ASSERT(!HDR_BUF_AVAILABLE(hdr));
	/* if no callback claimed the original buffer, it remains available */
	if (abuf == buf)
		hdr->b_flags |= ARC_BUF_AVAILABLE;

	ASSERT(refcount_is_zero(&hdr->b_refcnt) || callback_list != NULL);

	if (zio->io_error != 0) {
		/* on error, pull the hdr out of the cache entirely */
		hdr->b_flags |= ARC_IO_ERROR;
		if (hdr->b_state != arc.anon)
			arc_change_state(arc.anon, hdr, hash_lock);
		if (HDR_IN_HASH_TABLE(hdr))
			buf_hash_remove(hdr);
		freeable = refcount_is_zero(&hdr->b_refcnt);
		/* convert checksum errors into IO errors */
		if (zio->io_error == ECKSUM)
			zio->io_error = EIO;
	}

	/*
	 * Broadcast before we drop the hash_lock to avoid the possibility
	 * that the hdr (and hence the cv) might be freed before we get to
	 * the cv_broadcast().
	 */
	cv_broadcast(&hdr->b_cv);

	if (hash_lock) {
		/*
		 * Only call arc_access on anonymous buffers.  This is because
		 * if we've issued an I/O for an evicted buffer, we've already
		 * called arc_access (to prevent any simultaneous readers from
		 * getting confused).
		 */
		if (zio->io_error == 0 && hdr->b_state == arc.anon)
			arc_access_and_exit(hdr, hash_lock);
		else
			mutex_exit(hash_lock);
	} else {
		/*
		 * This block was freed while we waited for the read to
		 * complete.  It has been removed from the hash table and
		 * moved to the anonymous state (so that it won't show up
		 * in the cache).
		 */
		ASSERT3P(hdr->b_state, ==, arc.anon);
		freeable = refcount_is_zero(&hdr->b_refcnt);
	}

	/* execute each callback and free its structure */
	while ((acb = callback_list) != NULL) {
		if (acb->acb_done)
			acb->acb_done(zio, acb->acb_buf, acb->acb_private);

		if (acb->acb_zio_dummy != NULL) {
			/* propagate our error to the waiter's dummy zio */
			acb->acb_zio_dummy->io_error = zio->io_error;
			zio_nowait(acb->acb_zio_dummy);
		}

		callback_list = acb->acb_next;
		kmem_free(acb, sizeof (arc_callback_t));
	}

	if (freeable)
		arc_hdr_destroy(hdr);
}

/*
 * "Read" the block at the specified DVA (in bp) via the
 * cache.  If the block is found in the cache, invoke the provided
 * callback immediately and return.  Note that the `zio' parameter
 * in the callback will be NULL in this case, since no IO was
 * required.
 * If the block is not in the cache pass the read request
 * on to the spa with a substitute callback function, so that the
 * requested block will be added to the cache.
 *
 * If a read request arrives for a block that has a read in-progress,
 * either wait for the in-progress read to complete (and return the
 * results); or, if this is a read with a "done" func, add a record
 * to the read to invoke the "done" func when the read completes,
 * and return; or just return.
 *
 * arc_read_done() will invoke all the requested "done" functions
 * for readers of this block.
 */
int
arc_read(zio_t *pio, spa_t *spa, blkptr_t *bp, arc_byteswap_func_t *swap,
    arc_done_func_t *done, void *private, int priority, int flags,
    uint32_t *arc_flags, zbookmark_t *zb)
{
	arc_buf_hdr_t *hdr;
	arc_buf_t *buf;
	kmutex_t *hash_lock;
	zio_t *rzio;

top:
	hdr = buf_hash_find(spa, BP_IDENTITY(bp), bp->blk_birth, &hash_lock);
	if (hdr && hdr->b_datacnt > 0) {

		*arc_flags |= ARC_CACHED;

		if (HDR_IO_IN_PROGRESS(hdr)) {

			if (*arc_flags & ARC_WAIT) {
				/* block until the in-progress read completes, then retry */
				cv_wait(&hdr->b_cv, hash_lock);
				mutex_exit(hash_lock);
				goto top;
			}
			ASSERT(*arc_flags & ARC_NOWAIT);

			if (done) {
				arc_callback_t *acb = NULL;

				/* piggy-back our callback on the in-progress read */
				acb = kmem_zalloc(sizeof (arc_callback_t),
				    KM_SLEEP);
				acb->acb_done = done;
				acb->acb_private = private;
				acb->acb_byteswap = swap;
				if (pio != NULL)
					acb->acb_zio_dummy = zio_null(pio,
					    spa, NULL, NULL, flags);

				ASSERT(acb->acb_done != NULL);
				acb->acb_next = hdr->b_acb;
				hdr->b_acb = acb;
				add_reference(hdr, hash_lock, private);
				mutex_exit(hash_lock);
				return (0);
			}
			mutex_exit(hash_lock);
			return (0);
		}

		ASSERT(hdr->b_state == arc.mru || hdr->b_state == arc.mfu);

		if (done) {
			/*
			 * If this block is already in use, create a new
			 * copy of the data so that we will be guaranteed
			 * that arc_release() will always succeed.
			 */
			buf = hdr->b_buf;
			ASSERT(buf);
			ASSERT(buf->b_data);
			if (!HDR_BUF_AVAILABLE(hdr)) {
				void *data = arc_data_copy(hdr, buf->b_data);
				buf = kmem_cache_alloc(buf_cache, KM_SLEEP);
				buf->b_hdr = hdr;
				buf->b_data = data;
				buf->b_efunc = NULL;
				buf->b_private = NULL;
				buf->b_next = hdr->b_buf;
				hdr->b_buf = buf;
				hdr->b_datacnt += 1;
			} else {
				ASSERT(buf->b_efunc == NULL);
				hdr->b_flags &= ~ARC_BUF_AVAILABLE;
			}
			add_reference(hdr, hash_lock, private);
		} else if (*arc_flags & ARC_PREFETCH &&
		    refcount_count(&hdr->b_refcnt) == 0) {
			/* an unreferenced cache hit via prefetch: just tag it */
			hdr->b_flags |= ARC_PREFETCH;
		}
		DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
		arc_access_and_exit(hdr, hash_lock);
		atomic_add_64(&arc.hits, 1);
		if (done)
			done(NULL, buf, private);
	} else {
		uint64_t size = BP_GET_LSIZE(bp);
		arc_callback_t *acb;

		if (hdr == NULL) {
			/* this block is not in the cache */
			arc_buf_hdr_t *exists;

			buf = arc_buf_alloc(spa, size, private);
			hdr = buf->b_hdr;
			hdr->b_dva = *BP_IDENTITY(bp);
			hdr->b_birth = bp->blk_birth;
			hdr->b_cksum0 = bp->blk_cksum.zc_word[0];
			exists = buf_hash_insert(hdr, &hash_lock);
			if (exists) {
				/* somebody beat us to the hash insert */
				mutex_exit(hash_lock);
				bzero(&hdr->b_dva, sizeof (dva_t));
				hdr->b_birth = 0;
				hdr->b_cksum0 = 0;
				(void) arc_buf_remove_ref(buf, private);
				goto top; /* restart the IO request */
			}
			/* if this is a prefetch, we don't have a reference */
			if (*arc_flags & ARC_PREFETCH) {
				(void) remove_reference(hdr, hash_lock,
				    private);
				hdr->b_flags |= ARC_PREFETCH;
			}
			if (BP_GET_LEVEL(bp) > 0)
				hdr->b_flags |= ARC_INDIRECT;
		} else {
			/* this block is in the ghost cache */
			ASSERT(GHOST_STATE(hdr->b_state));
			ASSERT(!HDR_IO_IN_PROGRESS(hdr));
			ASSERT3U(refcount_count(&hdr->b_refcnt), ==, 0);
			ASSERT(hdr->b_buf == NULL);

			/* if this is a prefetch, we don't have a reference */
			if (*arc_flags & ARC_PREFETCH)
				hdr->b_flags |= ARC_PREFETCH;
			else
				add_reference(hdr, hash_lock, private);
			buf = kmem_cache_alloc(buf_cache, KM_SLEEP);
			buf->b_hdr = hdr;
			buf->b_efunc = NULL;
			buf->b_private = NULL;
			buf->b_next = NULL;
			hdr->b_buf = buf;
			buf->b_data = zio_buf_alloc(hdr->b_size);
			atomic_add_64(&arc.size, hdr->b_size);
			ASSERT(hdr->b_datacnt == 0);
			hdr->b_datacnt = 1;

		}

		acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP);
		acb->acb_done = done;
		acb->acb_private = private;
		acb->acb_byteswap = swap;

		ASSERT(hdr->b_acb == NULL);
		hdr->b_acb = acb;
		hdr->b_flags |= ARC_IO_IN_PROGRESS;

		/*
		 * If the buffer has been evicted, migrate it to a present state
		 * before issuing the I/O.  Once we drop the hash-table lock,
		 * the header will be marked as I/O in progress and have an
		 * attached buffer.  At this point, anybody who finds this
		 * buffer ought to notice that it's legit but has a pending I/O.
		 */

		if (GHOST_STATE(hdr->b_state))
			arc_access_and_exit(hdr, hash_lock);
		else
			mutex_exit(hash_lock);

		ASSERT3U(hdr->b_size, ==, size);
		DTRACE_PROBE3(arc__miss, blkptr_t *, bp, uint64_t, size,
		    zbookmark_t *, zb);
		atomic_add_64(&arc.misses, 1);

		rzio = zio_read(pio, spa, bp, buf->b_data, size,
		    arc_read_done, buf, priority, flags, zb);

		if (*arc_flags & ARC_WAIT)
			return (zio_wait(rzio));

		ASSERT(*arc_flags & ARC_NOWAIT);
		zio_nowait(rzio);
	}
	return (0);
}

/*
 * arc_read() variant to support pool traversal.  If the block is already
 * in the ARC, make a copy of it; otherwise, the caller will do the I/O.
1942fa9e4066Sahrens * The idea is that we don't want pool traversal filling up memory, but 1943fa9e4066Sahrens * if the ARC already has the data anyway, we shouldn't pay for the I/O. 1944fa9e4066Sahrens */ 1945fa9e4066Sahrens int 1946fa9e4066Sahrens arc_tryread(spa_t *spa, blkptr_t *bp, void *data) 1947fa9e4066Sahrens { 1948fa9e4066Sahrens arc_buf_hdr_t *hdr; 1949fa9e4066Sahrens kmutex_t *hash_mtx; 1950fa9e4066Sahrens int rc = 0; 1951fa9e4066Sahrens 1952fa9e4066Sahrens hdr = buf_hash_find(spa, BP_IDENTITY(bp), bp->blk_birth, &hash_mtx); 1953fa9e4066Sahrens 1954ea8dc4b6Seschrock if (hdr && hdr->b_datacnt > 0 && !HDR_IO_IN_PROGRESS(hdr)) { 1955ea8dc4b6Seschrock arc_buf_t *buf = hdr->b_buf; 1956ea8dc4b6Seschrock 1957ea8dc4b6Seschrock ASSERT(buf); 1958ea8dc4b6Seschrock while (buf->b_data == NULL) { 1959ea8dc4b6Seschrock buf = buf->b_next; 1960ea8dc4b6Seschrock ASSERT(buf); 1961ea8dc4b6Seschrock } 1962ea8dc4b6Seschrock bcopy(buf->b_data, data, hdr->b_size); 1963ea8dc4b6Seschrock } else { 1964fa9e4066Sahrens rc = ENOENT; 1965ea8dc4b6Seschrock } 1966fa9e4066Sahrens 1967fa9e4066Sahrens if (hash_mtx) 1968fa9e4066Sahrens mutex_exit(hash_mtx); 1969fa9e4066Sahrens 1970fa9e4066Sahrens return (rc); 1971fa9e4066Sahrens } 1972fa9e4066Sahrens 1973ea8dc4b6Seschrock void 1974ea8dc4b6Seschrock arc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *private) 1975ea8dc4b6Seschrock { 1976ea8dc4b6Seschrock ASSERT(buf->b_hdr != NULL); 1977ea8dc4b6Seschrock ASSERT(buf->b_hdr->b_state != arc.anon); 1978ea8dc4b6Seschrock ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt) || func == NULL); 1979ea8dc4b6Seschrock buf->b_efunc = func; 1980ea8dc4b6Seschrock buf->b_private = private; 1981ea8dc4b6Seschrock } 1982ea8dc4b6Seschrock 1983ea8dc4b6Seschrock /* 1984ea8dc4b6Seschrock * This is used by the DMU to let the ARC know that a buffer is 1985ea8dc4b6Seschrock * being evicted, so the ARC should clean up. 
If this arc buf 1986ea8dc4b6Seschrock * is not yet in the evicted state, it will be put there. 1987ea8dc4b6Seschrock */ 1988ea8dc4b6Seschrock int 1989ea8dc4b6Seschrock arc_buf_evict(arc_buf_t *buf) 1990ea8dc4b6Seschrock { 1991ea8dc4b6Seschrock arc_buf_hdr_t *hdr; 1992ea8dc4b6Seschrock kmutex_t *hash_lock; 1993ea8dc4b6Seschrock arc_buf_t **bufp; 1994ea8dc4b6Seschrock 1995ea8dc4b6Seschrock mutex_enter(&arc_eviction_mtx); 1996ea8dc4b6Seschrock hdr = buf->b_hdr; 1997ea8dc4b6Seschrock if (hdr == NULL) { 1998ea8dc4b6Seschrock /* 1999ea8dc4b6Seschrock * We are in arc_do_user_evicts(). 2000ea8dc4b6Seschrock * NOTE: We can't be in arc_buf_add_ref() because 2001ea8dc4b6Seschrock * that would violate the interface rules. 2002ea8dc4b6Seschrock */ 2003ea8dc4b6Seschrock ASSERT(buf->b_data == NULL); 2004ea8dc4b6Seschrock mutex_exit(&arc_eviction_mtx); 2005ea8dc4b6Seschrock return (0); 2006ea8dc4b6Seschrock } else if (buf->b_data == NULL) { 2007dd6ef538Smaybee arc_buf_t copy = *buf; /* structure assignment */ 2008ea8dc4b6Seschrock /* 2009dd6ef538Smaybee * We are on the eviction list. Process this buffer 2010dd6ef538Smaybee * now but let arc_do_user_evicts() do the reaping. 
2011ea8dc4b6Seschrock */ 2012dd6ef538Smaybee buf->b_efunc = NULL; 2013dd6ef538Smaybee buf->b_hdr = NULL; 2014ea8dc4b6Seschrock mutex_exit(&arc_eviction_mtx); 2015dd6ef538Smaybee VERIFY(copy.b_efunc(©) == 0); 2016dd6ef538Smaybee return (1); 2017ea8dc4b6Seschrock } else { 2018ea8dc4b6Seschrock /* 2019ea8dc4b6Seschrock * Prevent a race with arc_evict() 2020ea8dc4b6Seschrock */ 2021ea8dc4b6Seschrock ASSERT3U(refcount_count(&hdr->b_refcnt), <, hdr->b_datacnt); 2022ea8dc4b6Seschrock buf->b_hdr = NULL; 2023ea8dc4b6Seschrock } 2024ea8dc4b6Seschrock mutex_exit(&arc_eviction_mtx); 2025ea8dc4b6Seschrock 2026ea8dc4b6Seschrock hash_lock = HDR_LOCK(hdr); 2027ea8dc4b6Seschrock mutex_enter(hash_lock); 2028ea8dc4b6Seschrock 2029ea8dc4b6Seschrock ASSERT(hdr->b_state == arc.mru || hdr->b_state == arc.mfu); 2030ea8dc4b6Seschrock 2031ea8dc4b6Seschrock /* 2032ea8dc4b6Seschrock * Pull this buffer off of the hdr 2033ea8dc4b6Seschrock */ 2034ea8dc4b6Seschrock bufp = &hdr->b_buf; 2035ea8dc4b6Seschrock while (*bufp != buf) 2036ea8dc4b6Seschrock bufp = &(*bufp)->b_next; 2037ea8dc4b6Seschrock *bufp = buf->b_next; 2038ea8dc4b6Seschrock 2039ea8dc4b6Seschrock ASSERT(buf->b_data != NULL); 2040ea8dc4b6Seschrock buf->b_hdr = hdr; 2041ea8dc4b6Seschrock arc_buf_destroy(buf, FALSE); 2042ea8dc4b6Seschrock 2043ea8dc4b6Seschrock if (hdr->b_datacnt == 0) { 2044ea8dc4b6Seschrock arc_state_t *old_state = hdr->b_state; 2045ea8dc4b6Seschrock arc_state_t *evicted_state; 2046ea8dc4b6Seschrock 2047ea8dc4b6Seschrock ASSERT(refcount_is_zero(&hdr->b_refcnt)); 2048ea8dc4b6Seschrock 2049ea8dc4b6Seschrock evicted_state = 2050ea8dc4b6Seschrock (old_state == arc.mru) ? 
arc.mru_ghost : arc.mfu_ghost; 2051ea8dc4b6Seschrock 2052ea8dc4b6Seschrock mutex_enter(&old_state->mtx); 2053ea8dc4b6Seschrock mutex_enter(&evicted_state->mtx); 2054ea8dc4b6Seschrock 2055ea8dc4b6Seschrock arc_change_state(evicted_state, hdr, hash_lock); 2056ea8dc4b6Seschrock ASSERT(HDR_IN_HASH_TABLE(hdr)); 2057ea8dc4b6Seschrock hdr->b_flags = ARC_IN_HASH_TABLE; 2058ea8dc4b6Seschrock 2059ea8dc4b6Seschrock mutex_exit(&evicted_state->mtx); 2060ea8dc4b6Seschrock mutex_exit(&old_state->mtx); 2061ea8dc4b6Seschrock } 2062ea8dc4b6Seschrock mutex_exit(hash_lock); 2063dd6ef538Smaybee 2064ea8dc4b6Seschrock VERIFY(buf->b_efunc(buf) == 0); 2065ea8dc4b6Seschrock buf->b_efunc = NULL; 2066ea8dc4b6Seschrock buf->b_private = NULL; 2067ea8dc4b6Seschrock buf->b_hdr = NULL; 2068ea8dc4b6Seschrock kmem_cache_free(buf_cache, buf); 2069ea8dc4b6Seschrock return (1); 2070ea8dc4b6Seschrock } 2071ea8dc4b6Seschrock 2072fa9e4066Sahrens /* 2073fa9e4066Sahrens * Release this buffer from the cache. This must be done 2074fa9e4066Sahrens * after a read and prior to modifying the buffer contents. 2075fa9e4066Sahrens * If the buffer has more than one reference, we must make 2076fa9e4066Sahrens * make a new hdr for the buffer. 
 */
void
arc_release(arc_buf_t *buf, void *tag)
{
	arc_buf_hdr_t *hdr = buf->b_hdr;
	kmutex_t *hash_lock = HDR_LOCK(hdr);

	/* this buffer is not on any list */
	ASSERT(refcount_count(&hdr->b_refcnt) > 0);

	if (hdr->b_state == arc.anon) {
		/* this buffer is already released */
		ASSERT3U(refcount_count(&hdr->b_refcnt), ==, 1);
		ASSERT(BUF_EMPTY(hdr));
		ASSERT(buf->b_efunc == NULL);
		return;
	}

	mutex_enter(hash_lock);

	/*
	 * Do we have more than one buf?
	 */
	if (hdr->b_buf != buf || buf->b_next != NULL) {
		arc_buf_hdr_t *nhdr;
		arc_buf_t **bufp;
		uint64_t blksz = hdr->b_size;
		spa_t *spa = hdr->b_spa;

		ASSERT(hdr->b_datacnt > 1);
		/*
		 * Pull the data off of this buf and attach it to
		 * a new anonymous buf.
		 */
		(void) remove_reference(hdr, hash_lock, tag);
		bufp = &hdr->b_buf;
		while (*bufp != buf)
			bufp = &(*bufp)->b_next;
		*bufp = (*bufp)->b_next;

		/* account for the data leaving the old hdr's state */
		ASSERT3U(hdr->b_state->size, >=, hdr->b_size);
		atomic_add_64(&hdr->b_state->size, -hdr->b_size);
		if (refcount_is_zero(&hdr->b_refcnt)) {
			ASSERT3U(hdr->b_state->lsize, >=, hdr->b_size);
			atomic_add_64(&hdr->b_state->lsize, -hdr->b_size);
		}
		hdr->b_datacnt -= 1;

		mutex_exit(hash_lock);

		/* build a fresh anonymous hdr for this buf */
		nhdr = kmem_cache_alloc(hdr_cache, KM_SLEEP);
		nhdr->b_size = blksz;
		nhdr->b_spa = spa;
		nhdr->b_buf = buf;
		nhdr->b_state = arc.anon;
		nhdr->b_arc_access = 0;
		nhdr->b_flags = 0;
		nhdr->b_datacnt = 1;
		buf->b_hdr = nhdr;
		buf->b_next = NULL;
		(void) refcount_add(&nhdr->b_refcnt, tag);
		atomic_add_64(&arc.anon->size, blksz);

		hdr = nhdr;
	} else {
		/* sole buf: just move the existing hdr to the anon state */
		ASSERT(refcount_count(&hdr->b_refcnt) == 1);
		ASSERT(!list_link_active(&hdr->b_arc_node));
		ASSERT(!HDR_IO_IN_PROGRESS(hdr));
		arc_change_state(arc.anon, hdr, hash_lock);
		hdr->b_arc_access = 0;
		mutex_exit(hash_lock);
		bzero(&hdr->b_dva, sizeof (dva_t));
		hdr->b_birth = 0;
		hdr->b_cksum0 = 0;
	}
	buf->b_efunc = NULL;
	buf->b_private = NULL;
}

/* has this buf been released to the anon state (and kept its data)? */
int
arc_released(arc_buf_t *buf)
{
	return (buf->b_data != NULL && buf->b_hdr->b_state == arc.anon);
}

/* does this buf have an eviction callback registered? */
int
arc_has_callback(arc_buf_t *buf)
{
	return (buf->b_efunc != NULL);
}

#ifdef ZFS_DEBUG
/* debug-only: current reference count on this buf's hdr */
int
arc_referenced(arc_buf_t *buf)
{
	return (refcount_count(&buf->b_hdr->b_refcnt));
}
#endif

/* I/O completion callback for ARC writes */
static void
arc_write_done(zio_t *zio)
{
	arc_buf_t *buf;
	arc_buf_hdr_t *hdr;
	arc_callback_t *acb;

	buf = zio->io_private;
	hdr = buf->b_hdr;
	acb = hdr->b_acb;
	hdr->b_acb = NULL;
	ASSERT(acb != NULL);

	/* this buffer is on no lists and is not in the hash table */
	ASSERT3P(hdr->b_state, ==, arc.anon);

	hdr->b_dva = *BP_IDENTITY(zio->io_bp);
	hdr->b_birth = zio->io_bp->blk_birth;
	hdr->b_cksum0 = zio->io_bp->blk_cksum.zc_word[0];
	/*
	 * If the block to be written was all-zero, we may have
	 * compressed it away.  In this case no write was performed
	 * so there will be no dva/birth-date/checksum.  The buffer
	 * must therefore remain anonymous (and uncached).
2200ea8dc4b6Seschrock */ 2201fa9e4066Sahrens if (!BUF_EMPTY(hdr)) { 2202fa9e4066Sahrens arc_buf_hdr_t *exists; 2203fa9e4066Sahrens kmutex_t *hash_lock; 2204fa9e4066Sahrens 2205fa9e4066Sahrens exists = buf_hash_insert(hdr, &hash_lock); 2206fa9e4066Sahrens if (exists) { 2207fa9e4066Sahrens /* 2208fa9e4066Sahrens * This can only happen if we overwrite for 2209fa9e4066Sahrens * sync-to-convergence, because we remove 2210fa9e4066Sahrens * buffers from the hash table when we arc_free(). 2211fa9e4066Sahrens */ 2212fa9e4066Sahrens ASSERT(DVA_EQUAL(BP_IDENTITY(&zio->io_bp_orig), 2213fa9e4066Sahrens BP_IDENTITY(zio->io_bp))); 2214fa9e4066Sahrens ASSERT3U(zio->io_bp_orig.blk_birth, ==, 2215fa9e4066Sahrens zio->io_bp->blk_birth); 2216fa9e4066Sahrens 2217fa9e4066Sahrens ASSERT(refcount_is_zero(&exists->b_refcnt)); 2218fa9e4066Sahrens arc_change_state(arc.anon, exists, hash_lock); 2219fa9e4066Sahrens mutex_exit(hash_lock); 2220ea8dc4b6Seschrock arc_hdr_destroy(exists); 2221fa9e4066Sahrens exists = buf_hash_insert(hdr, &hash_lock); 2222fa9e4066Sahrens ASSERT3P(exists, ==, NULL); 2223fa9e4066Sahrens } 2224ea8dc4b6Seschrock hdr->b_flags &= ~ARC_IO_IN_PROGRESS; 2225ea8dc4b6Seschrock arc_access_and_exit(hdr, hash_lock); 2226ea8dc4b6Seschrock } else if (acb->acb_done == NULL) { 2227ea8dc4b6Seschrock int destroy_hdr; 2228ea8dc4b6Seschrock /* 2229ea8dc4b6Seschrock * This is an anonymous buffer with no user callback, 2230ea8dc4b6Seschrock * destroy it if there are no active references. 
2231ea8dc4b6Seschrock */ 2232ea8dc4b6Seschrock mutex_enter(&arc_eviction_mtx); 2233ea8dc4b6Seschrock destroy_hdr = refcount_is_zero(&hdr->b_refcnt); 2234ea8dc4b6Seschrock hdr->b_flags &= ~ARC_IO_IN_PROGRESS; 2235ea8dc4b6Seschrock mutex_exit(&arc_eviction_mtx); 2236ea8dc4b6Seschrock if (destroy_hdr) 2237ea8dc4b6Seschrock arc_hdr_destroy(hdr); 2238ea8dc4b6Seschrock } else { 2239ea8dc4b6Seschrock hdr->b_flags &= ~ARC_IO_IN_PROGRESS; 2240fa9e4066Sahrens } 2241ea8dc4b6Seschrock 2242ea8dc4b6Seschrock if (acb->acb_done) { 2243fa9e4066Sahrens ASSERT(!refcount_is_zero(&hdr->b_refcnt)); 2244fa9e4066Sahrens acb->acb_done(zio, buf, acb->acb_private); 2245fa9e4066Sahrens } 2246fa9e4066Sahrens 2247ea8dc4b6Seschrock kmem_free(acb, sizeof (arc_callback_t)); 2248fa9e4066Sahrens } 2249fa9e4066Sahrens 2250fa9e4066Sahrens int 225144cd46caSbillm arc_write(zio_t *pio, spa_t *spa, int checksum, int compress, int ncopies, 2252fa9e4066Sahrens uint64_t txg, blkptr_t *bp, arc_buf_t *buf, 2253fa9e4066Sahrens arc_done_func_t *done, void *private, int priority, int flags, 2254ea8dc4b6Seschrock uint32_t arc_flags, zbookmark_t *zb) 2255fa9e4066Sahrens { 2256fa9e4066Sahrens arc_buf_hdr_t *hdr = buf->b_hdr; 2257fa9e4066Sahrens arc_callback_t *acb; 2258fa9e4066Sahrens zio_t *rzio; 2259fa9e4066Sahrens 2260fa9e4066Sahrens /* this is a private buffer - no locking required */ 2261fa9e4066Sahrens ASSERT3P(hdr->b_state, ==, arc.anon); 2262fa9e4066Sahrens ASSERT(BUF_EMPTY(hdr)); 2263fa9e4066Sahrens ASSERT(!HDR_IO_ERROR(hdr)); 2264c5c6ffa0Smaybee ASSERT((hdr->b_flags & ARC_IO_IN_PROGRESS) == 0); 2265c5c6ffa0Smaybee ASSERT(hdr->b_acb == 0); 2266fa9e4066Sahrens acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP); 2267fa9e4066Sahrens acb->acb_done = done; 2268fa9e4066Sahrens acb->acb_private = private; 2269fa9e4066Sahrens acb->acb_byteswap = (arc_byteswap_func_t *)-1; 2270fa9e4066Sahrens hdr->b_acb = acb; 2271ea8dc4b6Seschrock hdr->b_flags |= ARC_IO_IN_PROGRESS; 227244cd46caSbillm rzio = zio_write(pio, spa, 
checksum, compress, ncopies, txg, bp, 2273ea8dc4b6Seschrock buf->b_data, hdr->b_size, arc_write_done, buf, priority, flags, zb); 2274fa9e4066Sahrens 2275fa9e4066Sahrens if (arc_flags & ARC_WAIT) 2276fa9e4066Sahrens return (zio_wait(rzio)); 2277fa9e4066Sahrens 2278fa9e4066Sahrens ASSERT(arc_flags & ARC_NOWAIT); 2279fa9e4066Sahrens zio_nowait(rzio); 2280fa9e4066Sahrens 2281fa9e4066Sahrens return (0); 2282fa9e4066Sahrens } 2283fa9e4066Sahrens 2284fa9e4066Sahrens int 2285fa9e4066Sahrens arc_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, 2286fa9e4066Sahrens zio_done_func_t *done, void *private, uint32_t arc_flags) 2287fa9e4066Sahrens { 2288fa9e4066Sahrens arc_buf_hdr_t *ab; 2289fa9e4066Sahrens kmutex_t *hash_lock; 2290fa9e4066Sahrens zio_t *zio; 2291fa9e4066Sahrens 2292fa9e4066Sahrens /* 2293fa9e4066Sahrens * If this buffer is in the cache, release it, so it 2294fa9e4066Sahrens * can be re-used. 2295fa9e4066Sahrens */ 2296fa9e4066Sahrens ab = buf_hash_find(spa, BP_IDENTITY(bp), bp->blk_birth, &hash_lock); 2297fa9e4066Sahrens if (ab != NULL) { 2298fa9e4066Sahrens /* 2299fa9e4066Sahrens * The checksum of blocks to free is not always 2300fa9e4066Sahrens * preserved (eg. on the deadlist). However, if it is 2301fa9e4066Sahrens * nonzero, it should match what we have in the cache. 2302fa9e4066Sahrens */ 2303fa9e4066Sahrens ASSERT(bp->blk_cksum.zc_word[0] == 0 || 2304fa9e4066Sahrens ab->b_cksum0 == bp->blk_cksum.zc_word[0]); 230577ed8509Smaybee if (ab->b_state != arc.anon) 230677ed8509Smaybee arc_change_state(arc.anon, ab, hash_lock); 2307*13506d1eSmaybee if (HDR_IO_IN_PROGRESS(ab)) { 2308*13506d1eSmaybee /* 2309*13506d1eSmaybee * This should only happen when we prefetch. 
2310*13506d1eSmaybee */ 2311*13506d1eSmaybee ASSERT(ab->b_flags & ARC_PREFETCH); 2312*13506d1eSmaybee ASSERT3U(ab->b_datacnt, ==, 1); 2313*13506d1eSmaybee ab->b_flags |= ARC_FREED_IN_READ; 2314*13506d1eSmaybee if (HDR_IN_HASH_TABLE(ab)) 2315*13506d1eSmaybee buf_hash_remove(ab); 2316*13506d1eSmaybee ab->b_arc_access = 0; 2317*13506d1eSmaybee bzero(&ab->b_dva, sizeof (dva_t)); 2318*13506d1eSmaybee ab->b_birth = 0; 2319*13506d1eSmaybee ab->b_cksum0 = 0; 2320*13506d1eSmaybee ab->b_buf->b_efunc = NULL; 2321*13506d1eSmaybee ab->b_buf->b_private = NULL; 2322*13506d1eSmaybee mutex_exit(hash_lock); 2323*13506d1eSmaybee } else if (refcount_is_zero(&ab->b_refcnt)) { 2324fa9e4066Sahrens mutex_exit(hash_lock); 2325ea8dc4b6Seschrock arc_hdr_destroy(ab); 2326fa9e4066Sahrens atomic_add_64(&arc.deleted, 1); 2327fa9e4066Sahrens } else { 2328bbf4a8dfSmaybee /* 2329*13506d1eSmaybee * We still have an active reference on this 2330*13506d1eSmaybee * buffer. This can happen, e.g., from 2331*13506d1eSmaybee * dbuf_unoverride(). 
2332bbf4a8dfSmaybee */ 2333*13506d1eSmaybee ASSERT(!HDR_IN_HASH_TABLE(ab)); 2334fa9e4066Sahrens ab->b_arc_access = 0; 2335fa9e4066Sahrens bzero(&ab->b_dva, sizeof (dva_t)); 2336fa9e4066Sahrens ab->b_birth = 0; 2337fa9e4066Sahrens ab->b_cksum0 = 0; 2338ea8dc4b6Seschrock ab->b_buf->b_efunc = NULL; 2339ea8dc4b6Seschrock ab->b_buf->b_private = NULL; 2340fa9e4066Sahrens mutex_exit(hash_lock); 2341fa9e4066Sahrens } 2342fa9e4066Sahrens } 2343fa9e4066Sahrens 2344fa9e4066Sahrens zio = zio_free(pio, spa, txg, bp, done, private); 2345fa9e4066Sahrens 2346fa9e4066Sahrens if (arc_flags & ARC_WAIT) 2347fa9e4066Sahrens return (zio_wait(zio)); 2348fa9e4066Sahrens 2349fa9e4066Sahrens ASSERT(arc_flags & ARC_NOWAIT); 2350fa9e4066Sahrens zio_nowait(zio); 2351fa9e4066Sahrens 2352fa9e4066Sahrens return (0); 2353fa9e4066Sahrens } 2354fa9e4066Sahrens 2355fa9e4066Sahrens void 2356fa9e4066Sahrens arc_tempreserve_clear(uint64_t tempreserve) 2357fa9e4066Sahrens { 2358fa9e4066Sahrens atomic_add_64(&arc_tempreserve, -tempreserve); 2359fa9e4066Sahrens ASSERT((int64_t)arc_tempreserve >= 0); 2360fa9e4066Sahrens } 2361fa9e4066Sahrens 2362fa9e4066Sahrens int 2363fa9e4066Sahrens arc_tempreserve_space(uint64_t tempreserve) 2364fa9e4066Sahrens { 2365fa9e4066Sahrens #ifdef ZFS_DEBUG 2366fa9e4066Sahrens /* 2367fa9e4066Sahrens * Once in a while, fail for no reason. Everything should cope. 2368fa9e4066Sahrens */ 2369fa9e4066Sahrens if (spa_get_random(10000) == 0) { 2370fa9e4066Sahrens dprintf("forcing random failure\n"); 2371fa9e4066Sahrens return (ERESTART); 2372fa9e4066Sahrens } 2373fa9e4066Sahrens #endif 2374112fe045Smaybee if (tempreserve > arc.c/4 && !arc.no_grow) 2375112fe045Smaybee arc.c = MIN(arc.c_max, tempreserve * 4); 2376112fe045Smaybee if (tempreserve > arc.c) 2377112fe045Smaybee return (ENOMEM); 2378112fe045Smaybee 2379fa9e4066Sahrens /* 2380112fe045Smaybee * Throttle writes when the amount of dirty data in the cache 2381112fe045Smaybee * gets too large. 
We try to keep the cache less than half full 2382112fe045Smaybee * of dirty blocks so that our sync times don't grow too large. 2383112fe045Smaybee * Note: if two requests come in concurrently, we might let them 2384112fe045Smaybee * both succeed, when one of them should fail. Not a huge deal. 2385112fe045Smaybee * 2386112fe045Smaybee * XXX The limit should be adjusted dynamically to keep the time 2387112fe045Smaybee * to sync a dataset fixed (around 1-5 seconds?). 2388fa9e4066Sahrens */ 2389fa9e4066Sahrens 2390112fe045Smaybee if (tempreserve + arc_tempreserve + arc.anon->size > arc.c / 2 && 2391112fe045Smaybee arc_tempreserve + arc.anon->size > arc.c / 4) { 2392fa9e4066Sahrens dprintf("failing, arc_tempreserve=%lluK anon=%lluK " 2393fa9e4066Sahrens "tempreserve=%lluK arc.c=%lluK\n", 2394fa9e4066Sahrens arc_tempreserve>>10, arc.anon->lsize>>10, 2395fa9e4066Sahrens tempreserve>>10, arc.c>>10); 2396fa9e4066Sahrens return (ERESTART); 2397fa9e4066Sahrens } 2398fa9e4066Sahrens atomic_add_64(&arc_tempreserve, tempreserve); 2399fa9e4066Sahrens return (0); 2400fa9e4066Sahrens } 2401fa9e4066Sahrens 2402fa9e4066Sahrens void 2403fa9e4066Sahrens arc_init(void) 2404fa9e4066Sahrens { 2405fa9e4066Sahrens mutex_init(&arc_reclaim_lock, NULL, MUTEX_DEFAULT, NULL); 2406fa9e4066Sahrens mutex_init(&arc_reclaim_thr_lock, NULL, MUTEX_DEFAULT, NULL); 2407fa9e4066Sahrens cv_init(&arc_reclaim_thr_cv, NULL, CV_DEFAULT, NULL); 2408fa9e4066Sahrens 2409*13506d1eSmaybee /* Convert seconds to clock ticks */ 2410*13506d1eSmaybee arc_min_prefetch_lifespan *= hz; 2411*13506d1eSmaybee 2412fa9e4066Sahrens /* Start out with 1/8 of all memory */ 2413fa9e4066Sahrens arc.c = physmem * PAGESIZE / 8; 2414fa9e4066Sahrens 2415fa9e4066Sahrens #ifdef _KERNEL 2416fa9e4066Sahrens /* 2417fa9e4066Sahrens * On architectures where the physical memory can be larger 2418fa9e4066Sahrens * than the addressable space (intel in 32-bit mode), we may 2419fa9e4066Sahrens * need to limit the cache to 1/8 of VM size. 
2420fa9e4066Sahrens */ 2421fa9e4066Sahrens arc.c = MIN(arc.c, vmem_size(heap_arena, VMEM_ALLOC | VMEM_FREE) / 8); 2422fa9e4066Sahrens #endif 2423fa9e4066Sahrens 2424112fe045Smaybee /* set min cache to 1/32 of all memory, or 64MB, whichever is more */ 2425fa9e4066Sahrens arc.c_min = MAX(arc.c / 4, 64<<20); 2426112fe045Smaybee /* set max to 3/4 of all memory, or all but 1GB, whichever is more */ 2427fa9e4066Sahrens if (arc.c * 8 >= 1<<30) 2428fa9e4066Sahrens arc.c_max = (arc.c * 8) - (1<<30); 2429fa9e4066Sahrens else 2430fa9e4066Sahrens arc.c_max = arc.c_min; 2431fa9e4066Sahrens arc.c_max = MAX(arc.c * 6, arc.c_max); 2432fa9e4066Sahrens arc.c = arc.c_max; 2433fa9e4066Sahrens arc.p = (arc.c >> 1); 2434fa9e4066Sahrens 2435fa9e4066Sahrens /* if kmem_flags are set, lets try to use less memory */ 2436fa9e4066Sahrens if (kmem_debugging()) 2437fa9e4066Sahrens arc.c = arc.c / 2; 2438fa9e4066Sahrens if (arc.c < arc.c_min) 2439fa9e4066Sahrens arc.c = arc.c_min; 2440fa9e4066Sahrens 2441fa9e4066Sahrens arc.anon = &ARC_anon; 2442ea8dc4b6Seschrock arc.mru = &ARC_mru; 2443ea8dc4b6Seschrock arc.mru_ghost = &ARC_mru_ghost; 2444ea8dc4b6Seschrock arc.mfu = &ARC_mfu; 2445ea8dc4b6Seschrock arc.mfu_ghost = &ARC_mfu_ghost; 2446ea8dc4b6Seschrock arc.size = 0; 2447fa9e4066Sahrens 2448ea8dc4b6Seschrock list_create(&arc.mru->list, sizeof (arc_buf_hdr_t), 2449fa9e4066Sahrens offsetof(arc_buf_hdr_t, b_arc_node)); 2450ea8dc4b6Seschrock list_create(&arc.mru_ghost->list, sizeof (arc_buf_hdr_t), 2451fa9e4066Sahrens offsetof(arc_buf_hdr_t, b_arc_node)); 2452ea8dc4b6Seschrock list_create(&arc.mfu->list, sizeof (arc_buf_hdr_t), 2453fa9e4066Sahrens offsetof(arc_buf_hdr_t, b_arc_node)); 2454ea8dc4b6Seschrock list_create(&arc.mfu_ghost->list, sizeof (arc_buf_hdr_t), 2455fa9e4066Sahrens offsetof(arc_buf_hdr_t, b_arc_node)); 2456fa9e4066Sahrens 2457fa9e4066Sahrens buf_init(); 2458fa9e4066Sahrens 2459fa9e4066Sahrens arc_thread_exit = 0; 2460ea8dc4b6Seschrock arc_eviction_list = NULL; 2461ea8dc4b6Seschrock 
mutex_init(&arc_eviction_mtx, NULL, MUTEX_DEFAULT, NULL); 2462fa9e4066Sahrens 2463fa9e4066Sahrens (void) thread_create(NULL, 0, arc_reclaim_thread, NULL, 0, &p0, 2464fa9e4066Sahrens TS_RUN, minclsyspri); 2465fa9e4066Sahrens } 2466fa9e4066Sahrens 2467fa9e4066Sahrens void 2468fa9e4066Sahrens arc_fini(void) 2469fa9e4066Sahrens { 2470fa9e4066Sahrens mutex_enter(&arc_reclaim_thr_lock); 2471fa9e4066Sahrens arc_thread_exit = 1; 2472fa9e4066Sahrens while (arc_thread_exit != 0) 2473fa9e4066Sahrens cv_wait(&arc_reclaim_thr_cv, &arc_reclaim_thr_lock); 2474fa9e4066Sahrens mutex_exit(&arc_reclaim_thr_lock); 2475fa9e4066Sahrens 2476fa9e4066Sahrens arc_flush(); 2477fa9e4066Sahrens 2478fa9e4066Sahrens arc_dead = TRUE; 2479fa9e4066Sahrens 2480ea8dc4b6Seschrock mutex_destroy(&arc_eviction_mtx); 2481fa9e4066Sahrens mutex_destroy(&arc_reclaim_lock); 2482fa9e4066Sahrens mutex_destroy(&arc_reclaim_thr_lock); 2483fa9e4066Sahrens cv_destroy(&arc_reclaim_thr_cv); 2484fa9e4066Sahrens 2485ea8dc4b6Seschrock list_destroy(&arc.mru->list); 2486ea8dc4b6Seschrock list_destroy(&arc.mru_ghost->list); 2487ea8dc4b6Seschrock list_destroy(&arc.mfu->list); 2488ea8dc4b6Seschrock list_destroy(&arc.mfu_ghost->list); 2489fa9e4066Sahrens 2490fa9e4066Sahrens buf_fini(); 2491fa9e4066Sahrens } 2492