1fa9e4066Sahrens /* 2fa9e4066Sahrens * CDDL HEADER START 3fa9e4066Sahrens * 4fa9e4066Sahrens * The contents of this file are subject to the terms of the 5*033f9833Sek * Common Development and Distribution License (the "License"). 6*033f9833Sek * You may not use this file except in compliance with the License. 7fa9e4066Sahrens * 8fa9e4066Sahrens * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9fa9e4066Sahrens * or http://www.opensolaris.org/os/licensing. 10fa9e4066Sahrens * See the License for the specific language governing permissions 11fa9e4066Sahrens * and limitations under the License. 12fa9e4066Sahrens * 13fa9e4066Sahrens * When distributing Covered Code, include this CDDL HEADER in each 14fa9e4066Sahrens * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15fa9e4066Sahrens * If applicable, add the following below this CDDL HEADER, with the 16fa9e4066Sahrens * fields enclosed by brackets "[]" replaced with your own identifying 17fa9e4066Sahrens * information: Portions Copyright [yyyy] [name of copyright owner] 18fa9e4066Sahrens * 19fa9e4066Sahrens * CDDL HEADER END 20fa9e4066Sahrens */ 21fa9e4066Sahrens /* 22*033f9833Sek * Copyright 2006 Sun Microsystems, Inc. All rights reserved. 23fa9e4066Sahrens * Use is subject to license terms. 24fa9e4066Sahrens */ 25fa9e4066Sahrens 26fa9e4066Sahrens #pragma ident "%Z%%M% %I% %E% SMI" 27fa9e4066Sahrens 28fa9e4066Sahrens /* 29fa9e4066Sahrens * DVA-based Adjustable Replacement Cache 30fa9e4066Sahrens * 31fa9e4066Sahrens * While much of the theory of operation and algorithms used here 32fa9e4066Sahrens * are based on the self-tuning, low overhead replacement cache 33fa9e4066Sahrens * presented by Megiddo and Modha at FAST 2003, there are some 34fa9e4066Sahrens * significant differences: 35fa9e4066Sahrens * 36fa9e4066Sahrens * 1. The Megiddo and Modha model assumes any page is evictable. 37fa9e4066Sahrens * Pages in its cache cannot be "locked" into memory. This makes 38fa9e4066Sahrens * the eviction algorithm simple: evict the last page in the list. 39fa9e4066Sahrens * This also makes the performance characteristics easy to reason 40fa9e4066Sahrens * about. Our cache is not so simple. At any given moment, some 41fa9e4066Sahrens * subset of the blocks in the cache are un-evictable because we 42fa9e4066Sahrens * have handed out a reference to them. Blocks are only evictable 43fa9e4066Sahrens * when there are no external references active. This makes 44fa9e4066Sahrens * eviction far more problematic: we choose to evict the evictable 45fa9e4066Sahrens * blocks that are the "lowest" in the list. 46fa9e4066Sahrens * 47fa9e4066Sahrens * There are times when it is not possible to evict the requested 48fa9e4066Sahrens * space. In these circumstances we are unable to adjust the cache 49fa9e4066Sahrens * size. To prevent the cache growing unbounded at these times we 50fa9e4066Sahrens * implement a "cache throttle" that slows the flow of new data 51fa9e4066Sahrens * into the cache until we can make space available. 52fa9e4066Sahrens * 53fa9e4066Sahrens * 2. The Megiddo and Modha model assumes a fixed cache size. 54fa9e4066Sahrens * Pages are evicted when the cache is full and there is a cache 55fa9e4066Sahrens * miss. Our model has a variable sized cache. It grows with 56fa9e4066Sahrens * high use, but also tries to react to memory pressure from the 57fa9e4066Sahrens * operating system: decreasing its size when system memory is 58fa9e4066Sahrens * tight. 59fa9e4066Sahrens * 60fa9e4066Sahrens * 3.
The Megiddo and Modha model assumes a fixed page size. All 61fa9e4066Sahrens * elements of the cache are therefore exactly the same size. So 62fa9e4066Sahrens * when adjusting the cache size following a cache miss, it's simply 63fa9e4066Sahrens * a matter of choosing a single page to evict. In our model, we 64fa9e4066Sahrens * have variable sized cache blocks (ranging from 512 bytes to 65fa9e4066Sahrens * 128K bytes). We therefore choose a set of blocks to evict to make 66fa9e4066Sahrens * space for a cache miss that approximates as closely as possible 67fa9e4066Sahrens * the space used by the new block. 68fa9e4066Sahrens * 69fa9e4066Sahrens * See also: "ARC: A Self-Tuning, Low Overhead Replacement Cache" 70fa9e4066Sahrens * by N. Megiddo & D. Modha, FAST 2003 71fa9e4066Sahrens */ 72fa9e4066Sahrens 73fa9e4066Sahrens /* 74fa9e4066Sahrens * The locking model: 75fa9e4066Sahrens * 76fa9e4066Sahrens * A new reference to a cache buffer can be obtained in two 77fa9e4066Sahrens * ways: 1) via a hash table lookup using the DVA as a key, 78fa9e4066Sahrens * or 2) via one of the ARC lists. The arc_read() interface 79fa9e4066Sahrens * uses method 1, while the internal arc algorithms for 80fa9e4066Sahrens * adjusting the cache use method 2. We therefore provide two 81fa9e4066Sahrens * types of locks: 1) the hash table lock array, and 2) the 82fa9e4066Sahrens * arc list locks. 83fa9e4066Sahrens * 84fa9e4066Sahrens * Buffers do not have their own mutexes; rather, they rely on the 85fa9e4066Sahrens * hash table mutexes for the bulk of their protection (i.e. most 86fa9e4066Sahrens * fields in the arc_buf_hdr_t are protected by these mutexes). 87fa9e4066Sahrens * 88fa9e4066Sahrens * buf_hash_find() returns the appropriate mutex (held) when it 89fa9e4066Sahrens * locates the requested buffer in the hash table. It returns 90fa9e4066Sahrens * NULL for the mutex if the buffer was not in the table. 91fa9e4066Sahrens * 92fa9e4066Sahrens * buf_hash_remove() expects the appropriate hash mutex to be 93fa9e4066Sahrens * already held before it is invoked. 94fa9e4066Sahrens * 95fa9e4066Sahrens * Each arc state also has a mutex which is used to protect the 96fa9e4066Sahrens * buffer list associated with the state. When attempting to 97fa9e4066Sahrens * obtain a hash table lock while holding an arc list lock you 98fa9e4066Sahrens * must use mutex_tryenter() to avoid deadlock. Also note that 99fa9e4066Sahrens * the "top" state mutex must be held before the "bot" state mutex. 100fa9e4066Sahrens * 101fa9e4066Sahrens * Note that the majority of the performance stats are manipulated 102fa9e4066Sahrens * with atomic operations.
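 *
 * For illustration only (an editor's sketch, not part of the original
 * source): the mutex_tryenter() rule above means that code walking an
 * arc list must be prepared to skip any buffer whose hash lock is
 * already held, rather than block on it:
 *
 *	mutex_enter(&state->mtx);
 *	for (ab = list_tail(&state->list); ab; ab = ab_prev) {
 *		ab_prev = list_prev(&state->list, ab);
 *		if (!mutex_tryenter(HDR_LOCK(ab)))
 *			continue;	- busy, never block here
 *		... operate on ab, then mutex_exit(HDR_LOCK(ab)) ...
 *	}
 *	mutex_exit(&state->mtx);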
103fa9e4066Sahrens */ 104fa9e4066Sahrens 105fa9e4066Sahrens #include <sys/spa.h> 106fa9e4066Sahrens #include <sys/zio.h> 107fa9e4066Sahrens #include <sys/zfs_context.h> 108fa9e4066Sahrens #include <sys/arc.h> 109fa9e4066Sahrens #include <sys/refcount.h> 110fa9e4066Sahrens #ifdef _KERNEL 111fa9e4066Sahrens #include <sys/vmsystm.h> 112fa9e4066Sahrens #include <vm/anon.h> 113fa9e4066Sahrens #include <sys/fs/swapnode.h> 114*033f9833Sek #include <sys/dnlc.h> 115fa9e4066Sahrens #endif 116fa9e4066Sahrens #include <sys/callb.h> 117fa9e4066Sahrens 118fa9e4066Sahrens static kmutex_t arc_reclaim_thr_lock; 119fa9e4066Sahrens static kcondvar_t arc_reclaim_thr_cv; /* used to signal reclaim thr */ 120fa9e4066Sahrens static uint8_t arc_thread_exit; 121fa9e4066Sahrens 122*033f9833Sek #define ARC_REDUCE_DNLC_PERCENT 3 123*033f9833Sek uint_t arc_reduce_dnlc_percent = ARC_REDUCE_DNLC_PERCENT; 124*033f9833Sek 125fa9e4066Sahrens typedef enum arc_reclaim_strategy { 126fa9e4066Sahrens ARC_RECLAIM_AGGR, /* Aggressive reclaim strategy */ 127fa9e4066Sahrens ARC_RECLAIM_CONS /* Conservative reclaim strategy */ 128fa9e4066Sahrens } arc_reclaim_strategy_t; 129fa9e4066Sahrens 130fa9e4066Sahrens /* number of seconds before growing cache again */ 131fa9e4066Sahrens static int arc_grow_retry = 60; 132fa9e4066Sahrens 133fa9e4066Sahrens static kmutex_t arc_reclaim_lock; 134fa9e4066Sahrens static int arc_dead; 135fa9e4066Sahrens 136fa9e4066Sahrens /* 137fa9e4066Sahrens * Note that buffers can be in one of 5 states: 138fa9e4066Sahrens * ARC_anon - anonymous (discussed below) 139fa9e4066Sahrens * ARC_mru_top - recently used, currently cached 140fa9e4066Sahrens * ARC_mru_bot - recently used, no longer in cache 141fa9e4066Sahrens * ARC_mfu_top - frequently used, currently cached 142fa9e4066Sahrens * ARC_mfu_bot - frequently used, no longer in cache 143fa9e4066Sahrens * When there are no active references to a buffer, it is 144fa9e4066Sahrens * linked onto one of the lists in arc. These are the 145fa9e4066Sahrens * only buffers that can be evicted or deleted. 146fa9e4066Sahrens * 147fa9e4066Sahrens * Anonymous buffers are buffers that are not associated with 148fa9e4066Sahrens * a DVA. These are buffers that hold dirty block copies 149fa9e4066Sahrens * before they are written to stable storage. By definition, 150fa9e4066Sahrens * they are "ref'd" and are considered part of arc_mru_top 151fa9e4066Sahrens * that cannot be freed. Generally, they will acquire a DVA 152fa9e4066Sahrens * as they are written and migrate onto the arc_mru_top list.
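 *
 * As a rough sketch (an editor's addition, not in the original), the
 * resulting lifecycle of a buffer is:
 *
 *	anon --(written, gets DVA)--> mru_top --(evicted)--> mru_bot
 *	mru_top --(second access)--> mfu_top --(evicted)--> mfu_bot
 *	mru_bot, mfu_bot --(accessed again)--> mfu_top
 *
 * with the exception that a prefetched buffer found in mru_bot is
 * moved back to mru_top; see arc_access() below.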
153fa9e4066Sahrens */ 154fa9e4066Sahrens 155fa9e4066Sahrens typedef struct arc_state { 156fa9e4066Sahrens list_t list; /* linked list of evictable buffers in state */ 157fa9e4066Sahrens uint64_t lsize; /* total size of buffers in the linked list */ 158fa9e4066Sahrens uint64_t size; /* total size of all buffers in this state */ 159fa9e4066Sahrens uint64_t hits; 160fa9e4066Sahrens kmutex_t mtx; 161fa9e4066Sahrens } arc_state_t; 162fa9e4066Sahrens 163fa9e4066Sahrens /* The 5 states: */ 164fa9e4066Sahrens static arc_state_t ARC_anon; 165fa9e4066Sahrens static arc_state_t ARC_mru_top; 166fa9e4066Sahrens static arc_state_t ARC_mru_bot; 167fa9e4066Sahrens static arc_state_t ARC_mfu_top; 168fa9e4066Sahrens static arc_state_t ARC_mfu_bot; 169fa9e4066Sahrens 170fa9e4066Sahrens static struct arc { 171fa9e4066Sahrens arc_state_t *anon; 172fa9e4066Sahrens arc_state_t *mru_top; 173fa9e4066Sahrens arc_state_t *mru_bot; 174fa9e4066Sahrens arc_state_t *mfu_top; 175fa9e4066Sahrens arc_state_t *mfu_bot; 176fa9e4066Sahrens uint64_t size; /* Actual total arc size */ 177fa9e4066Sahrens uint64_t p; /* Target size (in bytes) of mru_top */ 178fa9e4066Sahrens uint64_t c; /* Target size of cache (in bytes) */ 179fa9e4066Sahrens uint64_t c_min; /* Minimum target cache size */ 180fa9e4066Sahrens uint64_t c_max; /* Maximum target cache size */ 181fa9e4066Sahrens uint64_t incr; /* Size by which to increment arc.c */ 182fa9e4066Sahrens int64_t size_check; 183fa9e4066Sahrens 184fa9e4066Sahrens /* performance stats */ 185fa9e4066Sahrens uint64_t hits; 186fa9e4066Sahrens uint64_t misses; 187fa9e4066Sahrens uint64_t deleted; 188fa9e4066Sahrens uint64_t skipped; 189fa9e4066Sahrens uint64_t hash_elements; 190fa9e4066Sahrens uint64_t hash_elements_max; 191fa9e4066Sahrens uint64_t hash_collisions; 192fa9e4066Sahrens uint64_t hash_chains; 193fa9e4066Sahrens uint32_t hash_chain_max; 194fa9e4066Sahrens 195fa9e4066Sahrens int no_grow; /* Don't try to grow cache size */ 196fa9e4066Sahrens } arc; 197fa9e4066Sahrens 198fa9e4066Sahrens /* Default amount to grow arc.incr */ 199fa9e4066Sahrens static int64_t arc_incr_size = 1024; 200fa9e4066Sahrens 201fa9e4066Sahrens /* > 0 ==> time to increment arc.c */ 202fa9e4066Sahrens static int64_t arc_size_check_default = -1000; 203fa9e4066Sahrens 204fa9e4066Sahrens static uint64_t arc_tempreserve; 205fa9e4066Sahrens 206fa9e4066Sahrens typedef struct arc_callback arc_callback_t; 207fa9e4066Sahrens 208fa9e4066Sahrens struct arc_callback { 209fa9e4066Sahrens arc_done_func_t *acb_done; 210fa9e4066Sahrens void *acb_private; 211fa9e4066Sahrens arc_byteswap_func_t *acb_byteswap; 212fa9e4066Sahrens arc_buf_t *acb_buf; 213fa9e4066Sahrens zio_t *acb_zio_dummy; 214fa9e4066Sahrens arc_callback_t *acb_next; 215fa9e4066Sahrens }; 216fa9e4066Sahrens 217fa9e4066Sahrens struct arc_buf_hdr { 218fa9e4066Sahrens /* immutable */ 219fa9e4066Sahrens uint64_t b_size; 220fa9e4066Sahrens spa_t *b_spa; 221fa9e4066Sahrens 222fa9e4066Sahrens /* protected by hash lock */ 223fa9e4066Sahrens dva_t b_dva; 224fa9e4066Sahrens uint64_t b_birth; 225fa9e4066Sahrens uint64_t b_cksum0; 226fa9e4066Sahrens 227fa9e4066Sahrens arc_buf_hdr_t *b_hash_next; 228fa9e4066Sahrens arc_buf_t *b_buf; 229fa9e4066Sahrens uint32_t b_flags; 230fa9e4066Sahrens 231fa9e4066Sahrens kcondvar_t b_cv; 232fa9e4066Sahrens arc_callback_t *b_acb; 233fa9e4066Sahrens 234fa9e4066Sahrens /* protected by arc state mutex */ 235fa9e4066Sahrens arc_state_t *b_state; 236fa9e4066Sahrens list_node_t b_arc_node; 237fa9e4066Sahrens 238fa9e4066Sahrens /* updated atomically */
239fa9e4066Sahrens clock_t b_arc_access; 240fa9e4066Sahrens 241fa9e4066Sahrens /* self protecting */ 242fa9e4066Sahrens refcount_t b_refcnt; 243fa9e4066Sahrens }; 244fa9e4066Sahrens 245fa9e4066Sahrens /* 246fa9e4066Sahrens * Private ARC flags. These flags are private ARC only flags that will show up 247fa9e4066Sahrens * in b_flags in the arc_buf_hdr_t. Some flags are publicly declared, and can 248fa9e4066Sahrens * be passed in as arc_flags in things like arc_read. However, these private 249fa9e4066Sahrens * flags should never be passed in and should only be set by ARC code. When 250fa9e4066Sahrens * adding new public flags, make sure not to smash the private ones. 251fa9e4066Sahrens */ 252fa9e4066Sahrens 253fa9e4066Sahrens #define ARC_IO_IN_PROGRESS (1 << 10) /* I/O in progress for buf */ 254fa9e4066Sahrens #define ARC_IO_ERROR (1 << 11) /* I/O failed for buf */ 255fa9e4066Sahrens #define ARC_FREED_IN_READ (1 << 12) /* buf freed while in read */ 256fa9e4066Sahrens 257fa9e4066Sahrens #define HDR_IO_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_IO_IN_PROGRESS) 258fa9e4066Sahrens #define HDR_IO_ERROR(hdr) ((hdr)->b_flags & ARC_IO_ERROR) 259fa9e4066Sahrens #define HDR_FREED_IN_READ(hdr) ((hdr)->b_flags & ARC_FREED_IN_READ) 260fa9e4066Sahrens 261fa9e4066Sahrens /* 262fa9e4066Sahrens * Hash table routines 263fa9e4066Sahrens */ 264fa9e4066Sahrens 265fa9e4066Sahrens #define HT_LOCK_PAD 64 266fa9e4066Sahrens 267fa9e4066Sahrens struct ht_lock { 268fa9e4066Sahrens kmutex_t ht_lock; 269fa9e4066Sahrens #ifdef _KERNEL 270fa9e4066Sahrens unsigned char pad[(HT_LOCK_PAD - sizeof (kmutex_t))]; 271fa9e4066Sahrens #endif 272fa9e4066Sahrens }; 273fa9e4066Sahrens 274fa9e4066Sahrens #define BUF_LOCKS 256 275fa9e4066Sahrens typedef struct buf_hash_table { 276fa9e4066Sahrens uint64_t ht_mask; 277fa9e4066Sahrens arc_buf_hdr_t **ht_table; 278fa9e4066Sahrens struct ht_lock ht_locks[BUF_LOCKS]; 279fa9e4066Sahrens } buf_hash_table_t; 280fa9e4066Sahrens 281fa9e4066Sahrens static buf_hash_table_t buf_hash_table; 282fa9e4066Sahrens 283fa9e4066Sahrens #define BUF_HASH_INDEX(spa, dva, birth) \ 284fa9e4066Sahrens (buf_hash(spa, dva, birth) & buf_hash_table.ht_mask) 285fa9e4066Sahrens #define BUF_HASH_LOCK_NTRY(idx) (buf_hash_table.ht_locks[idx & (BUF_LOCKS-1)]) 286fa9e4066Sahrens #define BUF_HASH_LOCK(idx) (&(BUF_HASH_LOCK_NTRY(idx).ht_lock)) 287fa9e4066Sahrens #define HDR_LOCK(buf) \ 288fa9e4066Sahrens (BUF_HASH_LOCK(BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth))) 289fa9e4066Sahrens 290fa9e4066Sahrens uint64_t zfs_crc64_table[256]; 291fa9e4066Sahrens 292fa9e4066Sahrens static uint64_t 293fa9e4066Sahrens buf_hash(spa_t *spa, dva_t *dva, uint64_t birth) 294fa9e4066Sahrens { 295fa9e4066Sahrens uintptr_t spav = (uintptr_t)spa; 296fa9e4066Sahrens uint8_t *vdva = (uint8_t *)dva; 297fa9e4066Sahrens uint64_t crc = -1ULL; 298fa9e4066Sahrens int i; 299fa9e4066Sahrens 300fa9e4066Sahrens ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY); 301fa9e4066Sahrens 302fa9e4066Sahrens for (i = 0; i < sizeof (dva_t); i++) 303fa9e4066Sahrens crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ vdva[i]) & 0xFF]; 304fa9e4066Sahrens 305fa9e4066Sahrens crc ^= (spav>>8) ^ birth; 306fa9e4066Sahrens 307fa9e4066Sahrens return (crc); 308fa9e4066Sahrens } 309fa9e4066Sahrens 310fa9e4066Sahrens #define BUF_EMPTY(buf) \ 311fa9e4066Sahrens ((buf)->b_dva.dva_word[0] == 0 && \ 312fa9e4066Sahrens (buf)->b_dva.dva_word[1] == 0 && \ 313fa9e4066Sahrens (buf)->b_birth == 0) 314fa9e4066Sahrens 315fa9e4066Sahrens #define BUF_EQUAL(spa, dva, birth, buf) \ 316fa9e4066Sahrens
((buf)->b_dva.dva_word[0] == (dva)->dva_word[0]) && \ 317fa9e4066Sahrens ((buf)->b_dva.dva_word[1] == (dva)->dva_word[1]) && \ 318fa9e4066Sahrens ((buf)->b_birth == birth) && ((buf)->b_spa == spa) 319fa9e4066Sahrens 320fa9e4066Sahrens static arc_buf_hdr_t * 321fa9e4066Sahrens buf_hash_find(spa_t *spa, dva_t *dva, uint64_t birth, kmutex_t **lockp) 322fa9e4066Sahrens { 323fa9e4066Sahrens uint64_t idx = BUF_HASH_INDEX(spa, dva, birth); 324fa9e4066Sahrens kmutex_t *hash_lock = BUF_HASH_LOCK(idx); 325fa9e4066Sahrens arc_buf_hdr_t *buf; 326fa9e4066Sahrens 327fa9e4066Sahrens mutex_enter(hash_lock); 328fa9e4066Sahrens for (buf = buf_hash_table.ht_table[idx]; buf != NULL; 329fa9e4066Sahrens buf = buf->b_hash_next) { 330fa9e4066Sahrens if (BUF_EQUAL(spa, dva, birth, buf)) { 331fa9e4066Sahrens *lockp = hash_lock; 332fa9e4066Sahrens return (buf); 333fa9e4066Sahrens } 334fa9e4066Sahrens } 335fa9e4066Sahrens mutex_exit(hash_lock); 336fa9e4066Sahrens *lockp = NULL; 337fa9e4066Sahrens return (NULL); 338fa9e4066Sahrens } 339fa9e4066Sahrens 340fa9e4066Sahrens /* 341fa9e4066Sahrens * Insert an entry into the hash table. If there is already an element 342fa9e4066Sahrens * equal to elem in the hash table, then the already existing element 343fa9e4066Sahrens * will be returned and the new element will not be inserted. 344fa9e4066Sahrens * Otherwise returns NULL. 345fa9e4066Sahrens */ 346fa9e4066Sahrens static arc_buf_hdr_t *fbufs[4]; /* XXX to find 6341326 */ 347fa9e4066Sahrens static kthread_t *fbufs_lastthread; 348fa9e4066Sahrens static arc_buf_hdr_t * 349fa9e4066Sahrens buf_hash_insert(arc_buf_hdr_t *buf, kmutex_t **lockp) 350fa9e4066Sahrens { 351fa9e4066Sahrens uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth); 352fa9e4066Sahrens kmutex_t *hash_lock = BUF_HASH_LOCK(idx); 353fa9e4066Sahrens arc_buf_hdr_t *fbuf; 354fa9e4066Sahrens uint32_t max, i; 355fa9e4066Sahrens 356fa9e4066Sahrens fbufs_lastthread = curthread; 357fa9e4066Sahrens *lockp = hash_lock; 358fa9e4066Sahrens mutex_enter(hash_lock); 359fa9e4066Sahrens for (fbuf = buf_hash_table.ht_table[idx], i = 0; fbuf != NULL; 360fa9e4066Sahrens fbuf = fbuf->b_hash_next, i++) { 361fa9e4066Sahrens if (i < sizeof (fbufs) / sizeof (fbufs[0])) 362fa9e4066Sahrens fbufs[i] = fbuf; 363fa9e4066Sahrens if (BUF_EQUAL(buf->b_spa, &buf->b_dva, buf->b_birth, fbuf)) 364fa9e4066Sahrens return (fbuf); 365fa9e4066Sahrens } 366fa9e4066Sahrens 367fa9e4066Sahrens buf->b_hash_next = buf_hash_table.ht_table[idx]; 368fa9e4066Sahrens buf_hash_table.ht_table[idx] = buf; 369fa9e4066Sahrens 370fa9e4066Sahrens /* collect some hash table performance data */ 371fa9e4066Sahrens if (i > 0) { 372fa9e4066Sahrens atomic_add_64(&arc.hash_collisions, 1); 373fa9e4066Sahrens if (i == 1) 374fa9e4066Sahrens atomic_add_64(&arc.hash_chains, 1); 375fa9e4066Sahrens } 376fa9e4066Sahrens while (i > (max = arc.hash_chain_max) && 377fa9e4066Sahrens max != atomic_cas_32(&arc.hash_chain_max, max, i)) { 378fa9e4066Sahrens continue; 379fa9e4066Sahrens } 380fa9e4066Sahrens atomic_add_64(&arc.hash_elements, 1); 381fa9e4066Sahrens if (arc.hash_elements > arc.hash_elements_max) 382fa9e4066Sahrens atomic_add_64(&arc.hash_elements_max, 1); 383fa9e4066Sahrens 384fa9e4066Sahrens return (NULL); 385fa9e4066Sahrens } 386fa9e4066Sahrens 387fa9e4066Sahrens static void 388fa9e4066Sahrens buf_hash_remove(arc_buf_hdr_t *buf) 389fa9e4066Sahrens { 390fa9e4066Sahrens arc_buf_hdr_t *fbuf, **bufp; 391fa9e4066Sahrens uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth); 392fa9e4066Sahrens 
393fa9e4066Sahrens ASSERT(MUTEX_HELD(BUF_HASH_LOCK(idx))); 394fa9e4066Sahrens 395fa9e4066Sahrens bufp = &buf_hash_table.ht_table[idx]; 396fa9e4066Sahrens while ((fbuf = *bufp) != buf) { 397fa9e4066Sahrens ASSERT(fbuf != NULL); 398fa9e4066Sahrens bufp = &fbuf->b_hash_next; 399fa9e4066Sahrens } 400fa9e4066Sahrens *bufp = buf->b_hash_next; 401fa9e4066Sahrens buf->b_hash_next = NULL; 402fa9e4066Sahrens 403fa9e4066Sahrens /* collect some hash table performance data */ 404fa9e4066Sahrens atomic_add_64(&arc.hash_elements, -1); 405fa9e4066Sahrens if (buf_hash_table.ht_table[idx] && 406fa9e4066Sahrens buf_hash_table.ht_table[idx]->b_hash_next == NULL) 407fa9e4066Sahrens atomic_add_64(&arc.hash_chains, -1); 408fa9e4066Sahrens } 409fa9e4066Sahrens 410fa9e4066Sahrens /* 411fa9e4066Sahrens * Global data structures and functions for the buf kmem cache. 412fa9e4066Sahrens */ 413fa9e4066Sahrens static kmem_cache_t *hdr_cache; 414fa9e4066Sahrens static kmem_cache_t *buf_cache; 415fa9e4066Sahrens 416fa9e4066Sahrens static void 417fa9e4066Sahrens buf_fini(void) 418fa9e4066Sahrens { 419fa9e4066Sahrens int i; 420fa9e4066Sahrens 421fa9e4066Sahrens kmem_free(buf_hash_table.ht_table, 422fa9e4066Sahrens (buf_hash_table.ht_mask + 1) * sizeof (void *)); 423fa9e4066Sahrens for (i = 0; i < BUF_LOCKS; i++) 424fa9e4066Sahrens mutex_destroy(&buf_hash_table.ht_locks[i].ht_lock); 425fa9e4066Sahrens kmem_cache_destroy(hdr_cache); 426fa9e4066Sahrens kmem_cache_destroy(buf_cache); 427fa9e4066Sahrens } 428fa9e4066Sahrens 429fa9e4066Sahrens /* 430fa9e4066Sahrens * Constructor callback - called when the cache is empty 431fa9e4066Sahrens * and a new buf is requested. 432fa9e4066Sahrens */ 433fa9e4066Sahrens /* ARGSUSED */ 434fa9e4066Sahrens static int 435fa9e4066Sahrens hdr_cons(void *vbuf, void *unused, int kmflag) 436fa9e4066Sahrens { 437fa9e4066Sahrens arc_buf_hdr_t *buf = vbuf; 438fa9e4066Sahrens 439fa9e4066Sahrens bzero(buf, sizeof (arc_buf_hdr_t)); 440fa9e4066Sahrens refcount_create(&buf->b_refcnt); 441fa9e4066Sahrens cv_init(&buf->b_cv, NULL, CV_DEFAULT, NULL); 442fa9e4066Sahrens return (0); 443fa9e4066Sahrens } 444fa9e4066Sahrens 445fa9e4066Sahrens /* 446fa9e4066Sahrens * Destructor callback - called when a cached buf is 447fa9e4066Sahrens * no longer required. 448fa9e4066Sahrens */ 449fa9e4066Sahrens /* ARGSUSED */ 450fa9e4066Sahrens static void 451fa9e4066Sahrens hdr_dest(void *vbuf, void *unused) 452fa9e4066Sahrens { 453fa9e4066Sahrens arc_buf_hdr_t *buf = vbuf; 454fa9e4066Sahrens 455fa9e4066Sahrens refcount_destroy(&buf->b_refcnt); 456fa9e4066Sahrens cv_destroy(&buf->b_cv); 457fa9e4066Sahrens } 458fa9e4066Sahrens 459fa9e4066Sahrens void arc_kmem_reclaim(void); 460fa9e4066Sahrens 461fa9e4066Sahrens /* 462fa9e4066Sahrens * Reclaim callback -- invoked when memory is low. 463fa9e4066Sahrens */ 464fa9e4066Sahrens /* ARGSUSED */ 465fa9e4066Sahrens static void 466fa9e4066Sahrens hdr_recl(void *unused) 467fa9e4066Sahrens { 468fa9e4066Sahrens dprintf("hdr_recl called\n"); 469fa9e4066Sahrens arc_kmem_reclaim(); 470fa9e4066Sahrens } 471fa9e4066Sahrens 472fa9e4066Sahrens static void 473fa9e4066Sahrens buf_init(void) 474fa9e4066Sahrens { 475fa9e4066Sahrens uint64_t *ct; 476fa9e4066Sahrens uint64_t hsize = 1ULL << 10; 477fa9e4066Sahrens int i, j; 478fa9e4066Sahrens 479fa9e4066Sahrens /* 480fa9e4066Sahrens * The hash table is big enough to fill all of physical memory 481fa9e4066Sahrens * with an average 4k block size. The table will take up 482fa9e4066Sahrens * totalmem*sizeof(void*)/4k bytes (eg. 
2MB/GB with 8-byte 483fa9e4066Sahrens * pointers). 484fa9e4066Sahrens */ 485fa9e4066Sahrens while (hsize * 4096 < physmem * PAGESIZE) 486fa9e4066Sahrens hsize <<= 1; 487fa9e4066Sahrens 488fa9e4066Sahrens buf_hash_table.ht_mask = hsize - 1; 489fa9e4066Sahrens buf_hash_table.ht_table = kmem_zalloc(hsize * sizeof (void*), KM_SLEEP); 490fa9e4066Sahrens 491fa9e4066Sahrens hdr_cache = kmem_cache_create("arc_buf_hdr_t", sizeof (arc_buf_hdr_t), 492fa9e4066Sahrens 0, hdr_cons, hdr_dest, hdr_recl, NULL, NULL, 0); 493fa9e4066Sahrens buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t), 494fa9e4066Sahrens 0, NULL, NULL, NULL, NULL, NULL, 0); 495fa9e4066Sahrens 496fa9e4066Sahrens for (i = 0; i < 256; i++) 497fa9e4066Sahrens for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--) 498fa9e4066Sahrens *ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY); 499fa9e4066Sahrens 500fa9e4066Sahrens for (i = 0; i < BUF_LOCKS; i++) { 501fa9e4066Sahrens mutex_init(&buf_hash_table.ht_locks[i].ht_lock, 502fa9e4066Sahrens NULL, MUTEX_DEFAULT, NULL); 503fa9e4066Sahrens } 504fa9e4066Sahrens } 505fa9e4066Sahrens 506fa9e4066Sahrens #define ARC_MINTIME (hz>>4) /* 62 ms */ 507fa9e4066Sahrens 508fa9e4066Sahrens #define ARC_TAG (void *)0x05201962 509fa9e4066Sahrens 510fa9e4066Sahrens static void 511fa9e4066Sahrens add_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag) 512fa9e4066Sahrens { 513fa9e4066Sahrens ASSERT(MUTEX_HELD(hash_lock)); 514fa9e4066Sahrens 515fa9e4066Sahrens if ((refcount_add(&ab->b_refcnt, tag) == 1) && 516fa9e4066Sahrens (ab->b_state != arc.anon)) { 517fa9e4066Sahrens 518fa9e4066Sahrens ASSERT(!MUTEX_HELD(&ab->b_state->mtx)); 519fa9e4066Sahrens mutex_enter(&ab->b_state->mtx); 520fa9e4066Sahrens ASSERT(!refcount_is_zero(&ab->b_refcnt)); 521fa9e4066Sahrens ASSERT(list_link_active(&ab->b_arc_node)); 522fa9e4066Sahrens list_remove(&ab->b_state->list, ab); 523fa9e4066Sahrens ASSERT3U(ab->b_state->lsize, >=, ab->b_size); 524fa9e4066Sahrens ab->b_state->lsize -= ab->b_size; 525fa9e4066Sahrens mutex_exit(&ab->b_state->mtx); 526fa9e4066Sahrens } 527fa9e4066Sahrens } 528fa9e4066Sahrens 529fa9e4066Sahrens static int 530fa9e4066Sahrens remove_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag) 531fa9e4066Sahrens { 532fa9e4066Sahrens int cnt; 533fa9e4066Sahrens 534fa9e4066Sahrens ASSERT(MUTEX_HELD(hash_lock)); 535fa9e4066Sahrens 536fa9e4066Sahrens if (((cnt = refcount_remove(&ab->b_refcnt, tag)) == 0) && 537fa9e4066Sahrens (ab->b_state != arc.anon)) { 538fa9e4066Sahrens 539fa9e4066Sahrens ASSERT(!MUTEX_HELD(&ab->b_state->mtx)); 540fa9e4066Sahrens mutex_enter(&ab->b_state->mtx); 541fa9e4066Sahrens ASSERT(!list_link_active(&ab->b_arc_node)); 542fa9e4066Sahrens list_insert_head(&ab->b_state->list, ab); 543fa9e4066Sahrens ASSERT(ab->b_buf != NULL); 544fa9e4066Sahrens ab->b_state->lsize += ab->b_size; 545fa9e4066Sahrens mutex_exit(&ab->b_state->mtx); 546fa9e4066Sahrens } 547fa9e4066Sahrens return (cnt); 548fa9e4066Sahrens } 549fa9e4066Sahrens 550fa9e4066Sahrens /* 551fa9e4066Sahrens * Move the supplied buffer to the indicated state. The mutex 552fa9e4066Sahrens * for the buffer must be held by the caller. 
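 *
 * For example (an editor's illustration, not original commentary),
 * the typical calling pattern is:
 *
 *	mutex_enter(hash_lock);
 *	arc_change_state(arc.mru_top, ab, hash_lock);
 *	mutex_exit(hash_lock);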
553fa9e4066Sahrens */ 554fa9e4066Sahrens static void 555fa9e4066Sahrens arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, 556fa9e4066Sahrens kmutex_t *hash_lock) 557fa9e4066Sahrens { 558fa9e4066Sahrens arc_buf_t *buf; 559fa9e4066Sahrens 560fa9e4066Sahrens ASSERT(MUTEX_HELD(hash_lock)); 561fa9e4066Sahrens 562fa9e4066Sahrens /* 563fa9e4066Sahrens * If this buffer is evictable, transfer it from the 564fa9e4066Sahrens * old state list to the new state list. 565fa9e4066Sahrens */ 566fa9e4066Sahrens if (refcount_is_zero(&ab->b_refcnt)) { 567fa9e4066Sahrens if (ab->b_state != arc.anon) { 568fa9e4066Sahrens int drop_mutex = FALSE; 569fa9e4066Sahrens 570fa9e4066Sahrens if (!MUTEX_HELD(&ab->b_state->mtx)) { 571fa9e4066Sahrens mutex_enter(&ab->b_state->mtx); 572fa9e4066Sahrens drop_mutex = TRUE; 573fa9e4066Sahrens } 574fa9e4066Sahrens ASSERT(list_link_active(&ab->b_arc_node)); 575fa9e4066Sahrens list_remove(&ab->b_state->list, ab); 576fa9e4066Sahrens ASSERT3U(ab->b_state->lsize, >=, ab->b_size); 577fa9e4066Sahrens ab->b_state->lsize -= ab->b_size; 578fa9e4066Sahrens if (drop_mutex) 579fa9e4066Sahrens mutex_exit(&ab->b_state->mtx); 580fa9e4066Sahrens } 581fa9e4066Sahrens if (new_state != arc.anon) { 582fa9e4066Sahrens int drop_mutex = FALSE; 583fa9e4066Sahrens 584fa9e4066Sahrens if (!MUTEX_HELD(&new_state->mtx)) { 585fa9e4066Sahrens mutex_enter(&new_state->mtx); 586fa9e4066Sahrens drop_mutex = TRUE; 587fa9e4066Sahrens } 588fa9e4066Sahrens list_insert_head(&new_state->list, ab); 589fa9e4066Sahrens ASSERT(ab->b_buf != NULL); 590fa9e4066Sahrens new_state->lsize += ab->b_size; 591fa9e4066Sahrens if (drop_mutex) 592fa9e4066Sahrens mutex_exit(&new_state->mtx); 593fa9e4066Sahrens } 594fa9e4066Sahrens } 595fa9e4066Sahrens 596fa9e4066Sahrens ASSERT(!BUF_EMPTY(ab)); 597fa9e4066Sahrens if (new_state == arc.anon && ab->b_state != arc.anon) { 598fa9e4066Sahrens buf_hash_remove(ab); 599fa9e4066Sahrens } 600fa9e4066Sahrens 601fa9e4066Sahrens /* 602fa9e4066Sahrens * If this buffer isn't being transferred to an MRU 603fa9e4066Sahrens * state (top or bottom), it's safe to clear its prefetch flag 604fa9e4066Sahrens */ 605fa9e4066Sahrens if ((new_state != arc.mru_top) && (new_state != arc.mru_bot)) { 606fa9e4066Sahrens ab->b_flags &= ~ARC_PREFETCH; 607fa9e4066Sahrens } 608fa9e4066Sahrens 609fa9e4066Sahrens buf = ab->b_buf; 610fa9e4066Sahrens if (buf == NULL) { 611fa9e4066Sahrens ASSERT3U(ab->b_state->size, >=, ab->b_size); 612fa9e4066Sahrens atomic_add_64(&ab->b_state->size, -ab->b_size); 613fa9e4066Sahrens /* we should only be here if we are deleting state */ 614fa9e4066Sahrens ASSERT(new_state == arc.anon && 615fa9e4066Sahrens (ab->b_state == arc.mru_bot || ab->b_state == arc.mfu_bot)); 616fa9e4066Sahrens } else while (buf) { 617fa9e4066Sahrens ASSERT3U(ab->b_state->size, >=, ab->b_size); 618fa9e4066Sahrens atomic_add_64(&ab->b_state->size, -ab->b_size); 619fa9e4066Sahrens atomic_add_64(&new_state->size, ab->b_size); 620fa9e4066Sahrens buf = buf->b_next; 621fa9e4066Sahrens } 622fa9e4066Sahrens ab->b_state = new_state; 623fa9e4066Sahrens } 624fa9e4066Sahrens 625fa9e4066Sahrens arc_buf_t * 626fa9e4066Sahrens arc_buf_alloc(spa_t *spa, int size, void *tag) 627fa9e4066Sahrens { 628fa9e4066Sahrens arc_buf_hdr_t *hdr; 629fa9e4066Sahrens arc_buf_t *buf; 630fa9e4066Sahrens 631fa9e4066Sahrens ASSERT3U(size, >, 0); 632fa9e4066Sahrens hdr = kmem_cache_alloc(hdr_cache, KM_SLEEP); 633fa9e4066Sahrens ASSERT(BUF_EMPTY(hdr)); 634fa9e4066Sahrens hdr->b_size = size; 635fa9e4066Sahrens hdr->b_spa = spa; 636fa9e4066Sahrens hdr->b_state =
arc.anon; 637fa9e4066Sahrens hdr->b_arc_access = 0; 638fa9e4066Sahrens buf = kmem_cache_alloc(buf_cache, KM_SLEEP); 639fa9e4066Sahrens buf->b_hdr = hdr; 640fa9e4066Sahrens buf->b_next = NULL; 641fa9e4066Sahrens buf->b_data = zio_buf_alloc(size); 642fa9e4066Sahrens hdr->b_buf = buf; 643fa9e4066Sahrens hdr->b_flags = 0; 644fa9e4066Sahrens ASSERT(refcount_is_zero(&hdr->b_refcnt)); 645fa9e4066Sahrens (void) refcount_add(&hdr->b_refcnt, tag); 646fa9e4066Sahrens 647fa9e4066Sahrens atomic_add_64(&arc.size, size); 648fa9e4066Sahrens atomic_add_64(&arc.anon->size, size); 649fa9e4066Sahrens 650fa9e4066Sahrens return (buf); 651fa9e4066Sahrens } 652fa9e4066Sahrens 653fa9e4066Sahrens static void 654fa9e4066Sahrens arc_hdr_free(arc_buf_hdr_t *hdr) 655fa9e4066Sahrens { 656fa9e4066Sahrens ASSERT(refcount_is_zero(&hdr->b_refcnt)); 657fa9e4066Sahrens ASSERT3P(hdr->b_state, ==, arc.anon); 658fa9e4066Sahrens 659fa9e4066Sahrens if (!BUF_EMPTY(hdr)) { 660fa9e4066Sahrens /* 661fa9e4066Sahrens * We can be called with an arc state lock held, 662fa9e4066Sahrens * so we can't hold a hash lock here. 663fa9e4066Sahrens * ASSERT(not in hash table) 664fa9e4066Sahrens */ 665fa9e4066Sahrens ASSERT(!HDR_IO_IN_PROGRESS(hdr)); 666fa9e4066Sahrens bzero(&hdr->b_dva, sizeof (dva_t)); 667fa9e4066Sahrens hdr->b_birth = 0; 668fa9e4066Sahrens hdr->b_cksum0 = 0; 669fa9e4066Sahrens } 670fa9e4066Sahrens if (hdr->b_buf) { 671fa9e4066Sahrens arc_buf_t *buf = hdr->b_buf; 672fa9e4066Sahrens 673fa9e4066Sahrens ASSERT3U(hdr->b_size, >, 0); 674fa9e4066Sahrens zio_buf_free(buf->b_data, hdr->b_size); 675fa9e4066Sahrens atomic_add_64(&arc.size, -hdr->b_size); 676fa9e4066Sahrens ASSERT3U(arc.anon->size, >=, hdr->b_size); 677fa9e4066Sahrens atomic_add_64(&arc.anon->size, -hdr->b_size); 678fa9e4066Sahrens ASSERT3P(buf->b_next, ==, NULL); 679fa9e4066Sahrens kmem_cache_free(buf_cache, buf); 680fa9e4066Sahrens hdr->b_buf = NULL; 681fa9e4066Sahrens } 682fa9e4066Sahrens ASSERT(!list_link_active(&hdr->b_arc_node)); 683fa9e4066Sahrens ASSERT3P(hdr->b_hash_next, ==, NULL); 684fa9e4066Sahrens ASSERT3P(hdr->b_acb, ==, NULL); 685fa9e4066Sahrens kmem_cache_free(hdr_cache, hdr); 686fa9e4066Sahrens } 687fa9e4066Sahrens 688fa9e4066Sahrens void 689fa9e4066Sahrens arc_buf_free(arc_buf_t *buf, void *tag) 690fa9e4066Sahrens { 691fa9e4066Sahrens arc_buf_hdr_t *hdr = buf->b_hdr; 692fa9e4066Sahrens kmutex_t *hash_lock = HDR_LOCK(hdr); 693fa9e4066Sahrens int freeable; 694fa9e4066Sahrens 695fa9e4066Sahrens mutex_enter(hash_lock); 696fa9e4066Sahrens if (remove_reference(hdr, hash_lock, tag) > 0) { 697fa9e4066Sahrens arc_buf_t **bufp = &hdr->b_buf; 698fa9e4066Sahrens arc_state_t *state = hdr->b_state; 699fa9e4066Sahrens uint64_t size = hdr->b_size; 700fa9e4066Sahrens 701fa9e4066Sahrens ASSERT(hdr->b_state != arc.anon || HDR_IO_ERROR(hdr)); 702fa9e4066Sahrens while (*bufp != buf) { 703fa9e4066Sahrens ASSERT(*bufp); 704fa9e4066Sahrens bufp = &(*bufp)->b_next; 705fa9e4066Sahrens } 706fa9e4066Sahrens *bufp = buf->b_next; 707fa9e4066Sahrens mutex_exit(hash_lock); 708fa9e4066Sahrens zio_buf_free(buf->b_data, size); 709fa9e4066Sahrens atomic_add_64(&arc.size, -size); 710fa9e4066Sahrens kmem_cache_free(buf_cache, buf); 711fa9e4066Sahrens ASSERT3U(state->size, >=, size); 712fa9e4066Sahrens atomic_add_64(&state->size, -size); 713fa9e4066Sahrens return; 714fa9e4066Sahrens } 715fa9e4066Sahrens 716fa9e4066Sahrens /* don't free buffers that are in the middle of an async write */ 717fa9e4066Sahrens freeable = (hdr->b_state == arc.anon && hdr->b_acb == NULL); 718fa9e4066Sahrens 
mutex_exit(hash_lock); 719fa9e4066Sahrens 720fa9e4066Sahrens if (freeable) 721fa9e4066Sahrens arc_hdr_free(hdr); 722fa9e4066Sahrens } 723fa9e4066Sahrens 724fa9e4066Sahrens int 725fa9e4066Sahrens arc_buf_size(arc_buf_t *buf) 726fa9e4066Sahrens { 727fa9e4066Sahrens return (buf->b_hdr->b_size); 728fa9e4066Sahrens } 729fa9e4066Sahrens 730fa9e4066Sahrens /* 731fa9e4066Sahrens * Evict buffers from list until we've removed the specified number of 732fa9e4066Sahrens * bytes. Move the removed buffers to the appropriate evict state. 733fa9e4066Sahrens */ 734fa9e4066Sahrens static uint64_t 735fa9e4066Sahrens arc_evict_state(arc_state_t *state, int64_t bytes) 736fa9e4066Sahrens { 737fa9e4066Sahrens arc_state_t *evicted_state; 738fa9e4066Sahrens uint64_t bytes_evicted = 0; 739fa9e4066Sahrens arc_buf_hdr_t *ab, *ab_prev; 740fa9e4066Sahrens kmutex_t *hash_lock; 741fa9e4066Sahrens 742fa9e4066Sahrens ASSERT(state == arc.mru_top || state == arc.mfu_top); 743fa9e4066Sahrens 744fa9e4066Sahrens if (state == arc.mru_top) 745fa9e4066Sahrens evicted_state = arc.mru_bot; 746fa9e4066Sahrens else 747fa9e4066Sahrens evicted_state = arc.mfu_bot; 748fa9e4066Sahrens 749fa9e4066Sahrens mutex_enter(&state->mtx); 750fa9e4066Sahrens mutex_enter(&evicted_state->mtx); 751fa9e4066Sahrens 752fa9e4066Sahrens for (ab = list_tail(&state->list); ab; ab = ab_prev) { 753fa9e4066Sahrens ab_prev = list_prev(&state->list, ab); 754fa9e4066Sahrens hash_lock = HDR_LOCK(ab); 755fa9e4066Sahrens if (mutex_tryenter(hash_lock)) { 756fa9e4066Sahrens ASSERT3U(refcount_count(&ab->b_refcnt), ==, 0); 757fa9e4066Sahrens arc_change_state(evicted_state, ab, hash_lock); 758fa9e4066Sahrens zio_buf_free(ab->b_buf->b_data, ab->b_size); 759fa9e4066Sahrens atomic_add_64(&arc.size, -ab->b_size); 760fa9e4066Sahrens ASSERT3P(ab->b_buf->b_next, ==, NULL); 761fa9e4066Sahrens kmem_cache_free(buf_cache, ab->b_buf); 762fa9e4066Sahrens ab->b_buf = NULL; 763fa9e4066Sahrens DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, ab); 764fa9e4066Sahrens bytes_evicted += ab->b_size; 765fa9e4066Sahrens mutex_exit(hash_lock); 766fa9e4066Sahrens if (bytes_evicted >= bytes) 767fa9e4066Sahrens break; 768fa9e4066Sahrens } else { 769fa9e4066Sahrens atomic_add_64(&arc.skipped, 1); 770fa9e4066Sahrens } 771fa9e4066Sahrens } 772fa9e4066Sahrens mutex_exit(&evicted_state->mtx); 773fa9e4066Sahrens mutex_exit(&state->mtx); 774fa9e4066Sahrens 775fa9e4066Sahrens if (bytes_evicted < bytes) 776fa9e4066Sahrens dprintf("only evicted %lld bytes from %x", 777fa9e4066Sahrens (longlong_t)bytes_evicted, state); 778fa9e4066Sahrens 779fa9e4066Sahrens return (bytes_evicted); 780fa9e4066Sahrens } 781fa9e4066Sahrens 782fa9e4066Sahrens /* 783fa9e4066Sahrens * Remove buffers from list until we've removed the specified number of 784fa9e4066Sahrens * bytes. Destroy the buffers that are removed. 
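 *
 * Editor's note (not in the original): a negative byte count means
 * "delete everything evictable"; arc_flush() below relies on this:
 *
 *	arc_delete_state(arc.mru_bot, -1);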
785fa9e4066Sahrens */ 786fa9e4066Sahrens static void 787fa9e4066Sahrens arc_delete_state(arc_state_t *state, int64_t bytes) 788fa9e4066Sahrens { 789fa9e4066Sahrens uint_t bufs_skipped = 0; 790fa9e4066Sahrens uint64_t bytes_deleted = 0; 791fa9e4066Sahrens arc_buf_hdr_t *ab, *ab_prev; 792fa9e4066Sahrens kmutex_t *hash_lock; 793fa9e4066Sahrens 794fa9e4066Sahrens top: 795fa9e4066Sahrens mutex_enter(&state->mtx); 796fa9e4066Sahrens for (ab = list_tail(&state->list); ab; ab = ab_prev) { 797fa9e4066Sahrens ab_prev = list_prev(&state->list, ab); 798fa9e4066Sahrens hash_lock = HDR_LOCK(ab); 799fa9e4066Sahrens if (mutex_tryenter(hash_lock)) { 800fa9e4066Sahrens arc_change_state(arc.anon, ab, hash_lock); 801fa9e4066Sahrens mutex_exit(hash_lock); 802fa9e4066Sahrens atomic_add_64(&arc.deleted, 1); 803fa9e4066Sahrens DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, ab); 804fa9e4066Sahrens bytes_deleted += ab->b_size; 805fa9e4066Sahrens arc_hdr_free(ab); 806fa9e4066Sahrens if (bytes >= 0 && bytes_deleted >= bytes) 807fa9e4066Sahrens break; 808fa9e4066Sahrens } else { 809fa9e4066Sahrens if (bytes < 0) { 810fa9e4066Sahrens mutex_exit(&state->mtx); 811fa9e4066Sahrens mutex_enter(hash_lock); 812fa9e4066Sahrens mutex_exit(hash_lock); 813fa9e4066Sahrens goto top; 814fa9e4066Sahrens } 815fa9e4066Sahrens bufs_skipped += 1; 816fa9e4066Sahrens } 817fa9e4066Sahrens } 818fa9e4066Sahrens mutex_exit(&state->mtx); 819fa9e4066Sahrens 820fa9e4066Sahrens if (bufs_skipped) { 821fa9e4066Sahrens atomic_add_64(&arc.skipped, bufs_skipped); 822fa9e4066Sahrens ASSERT(bytes >= 0); 823fa9e4066Sahrens } 824fa9e4066Sahrens 825fa9e4066Sahrens if (bytes_deleted < bytes) 826fa9e4066Sahrens dprintf("only deleted %lld bytes from %p", 827fa9e4066Sahrens (longlong_t)bytes_deleted, state); 828fa9e4066Sahrens } 829fa9e4066Sahrens 830fa9e4066Sahrens static void 831fa9e4066Sahrens arc_adjust(void) 832fa9e4066Sahrens { 833fa9e4066Sahrens int64_t top_sz, mru_over, arc_over; 834fa9e4066Sahrens 835fa9e4066Sahrens top_sz = arc.anon->size + arc.mru_top->size; 836fa9e4066Sahrens 837fa9e4066Sahrens if (top_sz > arc.p && arc.mru_top->lsize > 0) { 838fa9e4066Sahrens int64_t toevict = MIN(arc.mru_top->lsize, top_sz-arc.p); 839fa9e4066Sahrens (void) arc_evict_state(arc.mru_top, toevict); 840fa9e4066Sahrens top_sz = arc.anon->size + arc.mru_top->size; 841fa9e4066Sahrens } 842fa9e4066Sahrens 843fa9e4066Sahrens mru_over = top_sz + arc.mru_bot->size - arc.c; 844fa9e4066Sahrens 845fa9e4066Sahrens if (mru_over > 0) { 846fa9e4066Sahrens if (arc.mru_bot->lsize > 0) { 847fa9e4066Sahrens int64_t todelete = MIN(arc.mru_bot->lsize, mru_over); 848fa9e4066Sahrens arc_delete_state(arc.mru_bot, todelete); 849fa9e4066Sahrens } 850fa9e4066Sahrens } 851fa9e4066Sahrens 852fa9e4066Sahrens if ((arc_over = arc.size - arc.c) > 0) { 853fa9e4066Sahrens int64_t table_over; 854fa9e4066Sahrens 855fa9e4066Sahrens if (arc.mfu_top->lsize > 0) { 856fa9e4066Sahrens int64_t toevict = MIN(arc.mfu_top->lsize, arc_over); 857fa9e4066Sahrens (void) arc_evict_state(arc.mfu_top, toevict); 858fa9e4066Sahrens } 859fa9e4066Sahrens 860fa9e4066Sahrens table_over = arc.size + arc.mru_bot->lsize + arc.mfu_bot->lsize 861fa9e4066Sahrens - arc.c*2; 862fa9e4066Sahrens 863fa9e4066Sahrens if (table_over > 0 && arc.mfu_bot->lsize > 0) { 864fa9e4066Sahrens int64_t todelete = MIN(arc.mfu_bot->lsize, table_over); 865fa9e4066Sahrens arc_delete_state(arc.mfu_bot, todelete); 866fa9e4066Sahrens } 867fa9e4066Sahrens } 868fa9e4066Sahrens } 869fa9e4066Sahrens 870fa9e4066Sahrens /* 871fa9e4066Sahrens * Flush all *evictable* 
data from the cache. 872fa9e4066Sahrens * NOTE: this will not touch "active" (i.e. referenced) data. 873fa9e4066Sahrens */ 874fa9e4066Sahrens void 875fa9e4066Sahrens arc_flush(void) 876fa9e4066Sahrens { 877fa9e4066Sahrens arc_delete_state(arc.mru_top, -1); 878fa9e4066Sahrens arc_delete_state(arc.mfu_top, -1); 879fa9e4066Sahrens 880fa9e4066Sahrens arc_delete_state(arc.mru_bot, -1); 881fa9e4066Sahrens arc_delete_state(arc.mfu_bot, -1); 882fa9e4066Sahrens } 883fa9e4066Sahrens 884fa9e4066Sahrens void 885fa9e4066Sahrens arc_kmem_reclaim(void) 886fa9e4066Sahrens { 887fa9e4066Sahrens /* Remove 6.25% */ 888fa9e4066Sahrens /* 889fa9e4066Sahrens * We need arc_reclaim_lock because we don't want multiple 890fa9e4066Sahrens * threads trying to reclaim concurrently. 891fa9e4066Sahrens */ 892fa9e4066Sahrens 893fa9e4066Sahrens /* 894fa9e4066Sahrens * umem calls the reclaim func when we destroy the buf cache, 895fa9e4066Sahrens * which is after we do arc_fini(). So we set a flag to prevent 896fa9e4066Sahrens * accessing the destroyed mutexes and lists. 897fa9e4066Sahrens */ 898fa9e4066Sahrens if (arc_dead) 899fa9e4066Sahrens return; 900fa9e4066Sahrens 901fa9e4066Sahrens mutex_enter(&arc_reclaim_lock); 902fa9e4066Sahrens 903fa9e4066Sahrens atomic_add_64(&arc.c, -(arc.c >> 4)); 904fa9e4066Sahrens if (arc.c < arc.c_min) 905fa9e4066Sahrens arc.c = arc.c_min; 906fa9e4066Sahrens atomic_add_64(&arc.p, -(arc.p >> 4)); 907fa9e4066Sahrens 908fa9e4066Sahrens arc_adjust(); 909fa9e4066Sahrens 910fa9e4066Sahrens /* Cool it for a while */ 911fa9e4066Sahrens arc.incr = 0; 912fa9e4066Sahrens arc.size_check = arc_size_check_default << 3; 913fa9e4066Sahrens 914fa9e4066Sahrens mutex_exit(&arc_reclaim_lock); 915fa9e4066Sahrens } 916fa9e4066Sahrens 917fa9e4066Sahrens static int 918fa9e4066Sahrens arc_reclaim_needed(void) 919fa9e4066Sahrens { 920fa9e4066Sahrens uint64_t extra; 921fa9e4066Sahrens 922fa9e4066Sahrens #ifdef _KERNEL 923fa9e4066Sahrens /* 924fa9e4066Sahrens * take 'desfree' extra pages, so we reclaim sooner, rather than later 925fa9e4066Sahrens */ 926fa9e4066Sahrens extra = desfree; 927fa9e4066Sahrens 928fa9e4066Sahrens /* 929fa9e4066Sahrens * check that we're out of range of the pageout scanner. It starts to 930fa9e4066Sahrens * schedule paging if freemem is less than lotsfree and needfree. 931fa9e4066Sahrens * lotsfree is the high-water mark for pageout, and needfree is the 932fa9e4066Sahrens * number of needed free pages. We add extra pages here to make sure 933fa9e4066Sahrens * the scanner doesn't start up while we're freeing memory. 934fa9e4066Sahrens */ 935fa9e4066Sahrens if (freemem < lotsfree + needfree + extra) 936fa9e4066Sahrens return (1); 937fa9e4066Sahrens 938fa9e4066Sahrens /* 939fa9e4066Sahrens * check to make sure that swapfs has enough space so that anon 940fa9e4066Sahrens * reservations can still succeed. anon_resvmem() checks that the 941fa9e4066Sahrens * availrmem is greater than swapfs_minfree plus the number of reserved 942fa9e4066Sahrens * swap pages. We also add a bit of extra here just to prevent 943fa9e4066Sahrens * circumstances from getting really dire. 944fa9e4066Sahrens */ 945fa9e4066Sahrens if (availrmem < swapfs_minfree + swapfs_reserve + extra) 946fa9e4066Sahrens return (1); 947fa9e4066Sahrens 948fa9e4066Sahrens /* 949fa9e4066Sahrens * If we're on an i386 platform, it's possible that we'll exhaust the 950fa9e4066Sahrens * kernel heap space before we ever run out of available physical 951fa9e4066Sahrens * memory.
Most checks of the size of the heap_area compare against 952fa9e4066Sahrens * tune.t_minarmem, which is the minimum available real memory that we 953fa9e4066Sahrens * can have in the system. However, this is generally fixed at 25 pages 954fa9e4066Sahrens * which is so low that it's useless. In this comparison, we seek to 955fa9e4066Sahrens * calculate the total heap-size, and reclaim if more than 3/4ths of the 956fa9e4066Sahrens * heap is allocated. (Or, in the calculation, if less than 1/4th is 957fa9e4066Sahrens * free) 958fa9e4066Sahrens */ 959fa9e4066Sahrens #if defined(__i386) 960fa9e4066Sahrens if (btop(vmem_size(heap_arena, VMEM_FREE)) < 961fa9e4066Sahrens (btop(vmem_size(heap_arena, VMEM_FREE | VMEM_ALLOC)) >> 2)) 962fa9e4066Sahrens return (1); 963fa9e4066Sahrens #endif 964fa9e4066Sahrens 965fa9e4066Sahrens #else 966fa9e4066Sahrens if (spa_get_random(100) == 0) 967fa9e4066Sahrens return (1); 968fa9e4066Sahrens #endif 969fa9e4066Sahrens return (0); 970fa9e4066Sahrens } 971fa9e4066Sahrens 972fa9e4066Sahrens static void 973fa9e4066Sahrens arc_kmem_reap_now(arc_reclaim_strategy_t strat) 974fa9e4066Sahrens { 975fa9e4066Sahrens size_t i; 976fa9e4066Sahrens kmem_cache_t *prev_cache = NULL; 977fa9e4066Sahrens extern kmem_cache_t *zio_buf_cache[]; 978fa9e4066Sahrens 979*033f9833Sek #ifdef _KERNEL 980*033f9833Sek /* 981*033f9833Sek * First purge some DNLC entries, in case the DNLC is using 982*033f9833Sek * up too much memory. 983*033f9833Sek */ 984*033f9833Sek dnlc_reduce_cache((void *)arc_reduce_dnlc_percent); 985*033f9833Sek #endif 986*033f9833Sek 987fa9e4066Sahrens /* 988fa9e4066Sahrens * An aggressive reclamation will shrink the cache size as well as reap 989fa9e4066Sahrens * free kmem buffers. The arc_kmem_reclaim function is called when the 990fa9e4066Sahrens * header-cache is reaped, so we only reap the header cache if we're 991fa9e4066Sahrens * performing an aggressive reclaim. If we're not, just clean the kmem 992fa9e4066Sahrens * buffer caches.
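 *
 * Sketch of the resulting behavior (an editor's addition):
 *
 *	arc_kmem_reap_now(ARC_RECLAIM_AGGR);	- reaps hdr_cache too
 *	arc_kmem_reap_now(ARC_RECLAIM_CONS);	- data caches only
 *
 * Reaping hdr_cache fires its reclaim callback, hdr_recl(), which
 * calls arc_kmem_reclaim() to shrink the target cache size.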
993fa9e4066Sahrens */ 994fa9e4066Sahrens if (strat == ARC_RECLAIM_AGGR) 995fa9e4066Sahrens kmem_cache_reap_now(hdr_cache); 996fa9e4066Sahrens 997fa9e4066Sahrens kmem_cache_reap_now(buf_cache); 998fa9e4066Sahrens 999fa9e4066Sahrens for (i = 0; i < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; i++) { 1000fa9e4066Sahrens if (zio_buf_cache[i] != prev_cache) { 1001fa9e4066Sahrens prev_cache = zio_buf_cache[i]; 1002fa9e4066Sahrens kmem_cache_reap_now(zio_buf_cache[i]); 1003fa9e4066Sahrens } 1004fa9e4066Sahrens } 1005fa9e4066Sahrens } 1006fa9e4066Sahrens 1007fa9e4066Sahrens static void 1008fa9e4066Sahrens arc_reclaim_thread(void) 1009fa9e4066Sahrens { 1010fa9e4066Sahrens clock_t growtime = 0; 1011fa9e4066Sahrens arc_reclaim_strategy_t last_reclaim = ARC_RECLAIM_CONS; 1012fa9e4066Sahrens callb_cpr_t cpr; 1013fa9e4066Sahrens 1014fa9e4066Sahrens CALLB_CPR_INIT(&cpr, &arc_reclaim_thr_lock, callb_generic_cpr, FTAG); 1015fa9e4066Sahrens 1016fa9e4066Sahrens mutex_enter(&arc_reclaim_thr_lock); 1017fa9e4066Sahrens while (arc_thread_exit == 0) { 1018fa9e4066Sahrens if (arc_reclaim_needed()) { 1019fa9e4066Sahrens 1020fa9e4066Sahrens if (arc.no_grow) { 1021fa9e4066Sahrens if (last_reclaim == ARC_RECLAIM_CONS) { 1022fa9e4066Sahrens last_reclaim = ARC_RECLAIM_AGGR; 1023fa9e4066Sahrens } else { 1024fa9e4066Sahrens last_reclaim = ARC_RECLAIM_CONS; 1025fa9e4066Sahrens } 1026fa9e4066Sahrens } else { 1027fa9e4066Sahrens arc.no_grow = TRUE; 1028fa9e4066Sahrens last_reclaim = ARC_RECLAIM_AGGR; 1029fa9e4066Sahrens membar_producer(); 1030fa9e4066Sahrens } 1031fa9e4066Sahrens 1032fa9e4066Sahrens /* reset the growth delay for every reclaim */ 1033fa9e4066Sahrens growtime = lbolt + (arc_grow_retry * hz); 1034fa9e4066Sahrens 1035fa9e4066Sahrens arc_kmem_reap_now(last_reclaim); 1036fa9e4066Sahrens 1037fa9e4066Sahrens } else if ((growtime > 0) && ((growtime - lbolt) <= 0)) { 1038fa9e4066Sahrens arc.no_grow = FALSE; 1039fa9e4066Sahrens } 1040fa9e4066Sahrens 1041fa9e4066Sahrens /* block until needed, or one second, whichever is shorter */ 1042fa9e4066Sahrens CALLB_CPR_SAFE_BEGIN(&cpr); 1043fa9e4066Sahrens (void) cv_timedwait(&arc_reclaim_thr_cv, 1044fa9e4066Sahrens &arc_reclaim_thr_lock, (lbolt + hz)); 1045fa9e4066Sahrens CALLB_CPR_SAFE_END(&cpr, &arc_reclaim_thr_lock); 1046fa9e4066Sahrens } 1047fa9e4066Sahrens 1048fa9e4066Sahrens arc_thread_exit = 0; 1049fa9e4066Sahrens cv_broadcast(&arc_reclaim_thr_cv); 1050fa9e4066Sahrens CALLB_CPR_EXIT(&cpr); /* drops arc_reclaim_thr_lock */ 1051fa9e4066Sahrens thread_exit(); 1052fa9e4066Sahrens } 1053fa9e4066Sahrens 1054fa9e4066Sahrens static void 1055fa9e4066Sahrens arc_try_grow(int64_t bytes) 1056fa9e4066Sahrens { 1057fa9e4066Sahrens /* 1058fa9e4066Sahrens * If we're within (2 * maxblocksize) bytes of the target 1059fa9e4066Sahrens * cache size, increment the target cache size 1060fa9e4066Sahrens */ 1061fa9e4066Sahrens atomic_add_64((uint64_t *)&arc.size_check, 1); 1062fa9e4066Sahrens 1063fa9e4066Sahrens if (arc_reclaim_needed()) { 1064fa9e4066Sahrens cv_signal(&arc_reclaim_thr_cv); 1065fa9e4066Sahrens return; 1066fa9e4066Sahrens } 1067fa9e4066Sahrens 1068fa9e4066Sahrens if (arc.no_grow) 1069fa9e4066Sahrens return; 1070fa9e4066Sahrens 1071fa9e4066Sahrens /* 1072fa9e4066Sahrens * Grow the target cache size if we're within (2 * maxblocksize) bytes 1073fa9e4066Sahrens * of the current size, or if the cache has overshot its target. Above, 1074fa9e4066Sahrens * we simply returned without growing if we can't grow, or if we shouldn't because a reclaim is in progress.
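 *
 * Worked example (an editor's addition): with the defaults above,
 * arc.size_check starts at -1000, so roughly a thousand calls must
 * accumulate before it goes positive; the next qualifying call then
 * resets it and bumps arc.incr by arc_incr_size (1024 bytes), after
 * which arc.c can grow by MIN(bytes, arc.incr) per call.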
1075fa9e4066Sahrens */ 1076fa9e4066Sahrens if ((arc.c - arc.size) <= (2ULL << SPA_MAXBLOCKSHIFT)) { 1077fa9e4066Sahrens if (arc.size_check > 0) { 1078fa9e4066Sahrens arc.size_check = arc_size_check_default; 1079fa9e4066Sahrens atomic_add_64(&arc.incr, arc_incr_size); 1080fa9e4066Sahrens } 1081fa9e4066Sahrens atomic_add_64(&arc.c, MIN(bytes, arc.incr)); 1082fa9e4066Sahrens if (arc.c > arc.c_max) 1083fa9e4066Sahrens arc.c = arc.c_max; 1084fa9e4066Sahrens else 1085fa9e4066Sahrens atomic_add_64(&arc.p, MIN(bytes, arc.incr)); 1086fa9e4066Sahrens } else if (arc.size > arc.c) { 1087fa9e4066Sahrens if (arc.size_check > 0) { 1088fa9e4066Sahrens arc.size_check = arc_size_check_default; 1089fa9e4066Sahrens atomic_add_64(&arc.incr, arc_incr_size); 1090fa9e4066Sahrens } 1091fa9e4066Sahrens atomic_add_64(&arc.c, MIN(bytes, arc.incr)); 1092fa9e4066Sahrens if (arc.c > arc.c_max) 1093fa9e4066Sahrens arc.c = arc.c_max; 1094fa9e4066Sahrens else 1095fa9e4066Sahrens atomic_add_64(&arc.p, MIN(bytes, arc.incr)); 1096fa9e4066Sahrens } 1097fa9e4066Sahrens } 1098fa9e4066Sahrens 1099fa9e4066Sahrens /* 1100fa9e4066Sahrens * check if the cache has reached its limits and eviction is required prior to 1101fa9e4066Sahrens * insert. In this situation, we want to evict if no_grow is set. Otherwise, the 1102fa9e4066Sahrens * cache is either big enough that we can insert, or an arc_try_grow will result 1103fa9e4066Sahrens * in more space being made available. 1104fa9e4066Sahrens */ 1105fa9e4066Sahrens 1106fa9e4066Sahrens static int 1107fa9e4066Sahrens arc_evict_needed() 1108fa9e4066Sahrens { 1109fa9e4066Sahrens 1110fa9e4066Sahrens if (arc_reclaim_needed()) 1111fa9e4066Sahrens return (1); 1112fa9e4066Sahrens 1113fa9e4066Sahrens if (arc.no_grow || (arc.c > arc.c_max) || (arc.size > arc.c)) 1114fa9e4066Sahrens return (1); 1115fa9e4066Sahrens 1116fa9e4066Sahrens return (0); 1117fa9e4066Sahrens } 1118fa9e4066Sahrens 1119fa9e4066Sahrens /* 1120fa9e4066Sahrens * The state, supplied as the first argument, is going to have something 1121fa9e4066Sahrens * inserted on its behalf. So, determine which cache must be victimized to 1122fa9e4066Sahrens * satisfy an insertion for this state. We have the following cases: 1123fa9e4066Sahrens * 1124fa9e4066Sahrens * 1. Insert for MRU, p > sizeof(arc.anon + arc.mru_top) -> 1125fa9e4066Sahrens * In this situation if we're out of space, but the resident size of the MFU is 1126fa9e4066Sahrens * under the limit, victimize the MFU cache to satisfy this insertion request. 1127fa9e4066Sahrens * 1128fa9e4066Sahrens * 2. Insert for MRU, p <= sizeof(arc.anon + arc.mru_top) -> 1129fa9e4066Sahrens * Here, we've used up all of the available space for the MRU, so we need to 1130fa9e4066Sahrens * evict from our own cache instead. Evict from the set of resident MRU 1131fa9e4066Sahrens * entries. 1132fa9e4066Sahrens * 1133fa9e4066Sahrens * 3. Insert for MFU, (c - p) > sizeof(arc.mfu_top) -> 1134fa9e4066Sahrens * c minus p represents the MFU space in the cache, since p is the size of the 1135fa9e4066Sahrens * cache that is dedicated to the MRU. In this situation there's still space on 1136fa9e4066Sahrens * the MFU side, so the MRU side needs to be victimized. 1137fa9e4066Sahrens * 1138fa9e4066Sahrens * 4. Insert for MFU, (c - p) < sizeof(arc.mfu_top) -> 1139fa9e4066Sahrens * MFU's resident set is consuming more space than it has been allotted. In 1140fa9e4066Sahrens * this situation, we must victimize our own cache, the MFU, for this insertion.
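 *
 * A concrete illustration (an editor's addition): with arc.c = 100MB
 * and arc.p = 60MB, an MRU insert while arc.anon->size +
 * arc.mru_top->size = 50MB is under p, so the MFU side is victimized
 * (case 1); at 60MB or more the MRU evicts from itself (case 2). On
 * the MFU side the same test is made against c - p = 40MB of MFU
 * space (cases 3 and 4).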
1141fa9e4066Sahrens */ 1142fa9e4066Sahrens static void 1143fa9e4066Sahrens arc_evict_for_state(arc_state_t *state, uint64_t bytes) 1144fa9e4066Sahrens { 1145fa9e4066Sahrens uint64_t mru_used; 1146fa9e4066Sahrens uint64_t mfu_space; 1147fa9e4066Sahrens uint64_t evicted; 1148fa9e4066Sahrens 1149fa9e4066Sahrens ASSERT(state == arc.mru_top || state == arc.mfu_top); 1150fa9e4066Sahrens 1151fa9e4066Sahrens if (state == arc.mru_top) { 1152fa9e4066Sahrens mru_used = arc.anon->size + arc.mru_top->size; 1153fa9e4066Sahrens if (arc.p > mru_used) { 1154fa9e4066Sahrens /* case 1 */ 1155fa9e4066Sahrens evicted = arc_evict_state(arc.mfu_top, bytes); 1156fa9e4066Sahrens if (evicted < bytes) { 1157fa9e4066Sahrens arc_adjust(); 1158fa9e4066Sahrens } 1159fa9e4066Sahrens } else { 1160fa9e4066Sahrens /* case 2 */ 1161fa9e4066Sahrens evicted = arc_evict_state(arc.mru_top, bytes); 1162fa9e4066Sahrens if (evicted < bytes) { 1163fa9e4066Sahrens arc_adjust(); 1164fa9e4066Sahrens } 1165fa9e4066Sahrens } 1166fa9e4066Sahrens } else { 1167fa9e4066Sahrens /* MFU_top case */ 1168fa9e4066Sahrens mfu_space = arc.c - arc.p; 1169fa9e4066Sahrens if (mfu_space > arc.mfu_top->size) { 1170fa9e4066Sahrens /* case 3 */ 1171fa9e4066Sahrens evicted = arc_evict_state(arc.mru_top, bytes); 1172fa9e4066Sahrens if (evicted < bytes) { 1173fa9e4066Sahrens arc_adjust(); 1174fa9e4066Sahrens } 1175fa9e4066Sahrens } else { 1176fa9e4066Sahrens /* case 4 */ 1177fa9e4066Sahrens evicted = arc_evict_state(arc.mfu_top, bytes); 1178fa9e4066Sahrens if (evicted < bytes) { 1179fa9e4066Sahrens arc_adjust(); 1180fa9e4066Sahrens } 1181fa9e4066Sahrens } 1182fa9e4066Sahrens } 1183fa9e4066Sahrens } 1184fa9e4066Sahrens 1185fa9e4066Sahrens /* 1186fa9e4066Sahrens * This routine is called whenever a buffer is accessed. 1187fa9e4066Sahrens */ 1188fa9e4066Sahrens static void 1189fa9e4066Sahrens arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock) 1190fa9e4066Sahrens { 1191fa9e4066Sahrens int blksz, mult; 1192fa9e4066Sahrens 1193fa9e4066Sahrens ASSERT(MUTEX_HELD(hash_lock)); 1194fa9e4066Sahrens 1195fa9e4066Sahrens blksz = buf->b_size; 1196fa9e4066Sahrens 1197fa9e4066Sahrens if (buf->b_state == arc.anon) { 1198fa9e4066Sahrens /* 1199fa9e4066Sahrens * This buffer is not in the cache, and does not 1200fa9e4066Sahrens * appear in our "ghost" list. Add the new buffer 1201fa9e4066Sahrens * to the MRU state. 1202fa9e4066Sahrens */ 1203fa9e4066Sahrens 1204fa9e4066Sahrens arc_try_grow(blksz); 1205fa9e4066Sahrens if (arc_evict_needed()) { 1206fa9e4066Sahrens arc_evict_for_state(arc.mru_top, blksz); 1207fa9e4066Sahrens } 1208fa9e4066Sahrens 1209fa9e4066Sahrens ASSERT(buf->b_arc_access == 0); 1210fa9e4066Sahrens buf->b_arc_access = lbolt; 1211fa9e4066Sahrens DTRACE_PROBE1(new_state__mru_top, arc_buf_hdr_t *, 1212fa9e4066Sahrens buf); 1213fa9e4066Sahrens arc_change_state(arc.mru_top, buf, hash_lock); 1214fa9e4066Sahrens 1215fa9e4066Sahrens /* 1216fa9e4066Sahrens * If we are using less than 2/3 of our total target 1217fa9e4066Sahrens * cache size, bump up the target size for the MRU 1218fa9e4066Sahrens * list. 1219fa9e4066Sahrens */ 1220fa9e4066Sahrens if (arc.size < arc.c*2/3) { 1221fa9e4066Sahrens arc.p = arc.anon->size + arc.mru_top->size + arc.c/6; 1222fa9e4066Sahrens } 1223fa9e4066Sahrens 1224fa9e4066Sahrens } else if (buf->b_state == arc.mru_top) { 1225fa9e4066Sahrens /* 1226fa9e4066Sahrens * If this buffer is in the MRU-top state and has the prefetch 1227fa9e4066Sahrens * flag, the first read was actually part of a prefetch. 
In 1228fa9e4066Sahrens * this situation, we simply want to clear the flag and return. 1229fa9e4066Sahrens * A subsequent access should bump this into the MFU state. 1230fa9e4066Sahrens */ 1231fa9e4066Sahrens if ((buf->b_flags & ARC_PREFETCH) != 0) { 1232fa9e4066Sahrens buf->b_flags &= ~ARC_PREFETCH; 1233fa9e4066Sahrens atomic_add_64(&arc.mru_top->hits, 1); 1234fa9e4066Sahrens return; 1235fa9e4066Sahrens } 1236fa9e4066Sahrens 1237fa9e4066Sahrens /* 1238fa9e4066Sahrens * This buffer has been "accessed" only once so far, 1239fa9e4066Sahrens * but it is still in the cache. Move it to the MFU 1240fa9e4066Sahrens * state. 1241fa9e4066Sahrens */ 1242fa9e4066Sahrens if (lbolt > buf->b_arc_access + ARC_MINTIME) { 1243fa9e4066Sahrens /* 1244fa9e4066Sahrens * More than ARC_MINTIME (about 62ms) has passed since we 1245fa9e4066Sahrens * instantiated this buffer. Move it to the 1246fa9e4066Sahrens * most frequently used state. 1247fa9e4066Sahrens */ 1248fa9e4066Sahrens buf->b_arc_access = lbolt; 1249fa9e4066Sahrens DTRACE_PROBE1(new_state__mfu_top, 1250fa9e4066Sahrens arc_buf_hdr_t *, buf); 1251fa9e4066Sahrens arc_change_state(arc.mfu_top, buf, hash_lock); 1252fa9e4066Sahrens } 1253fa9e4066Sahrens atomic_add_64(&arc.mru_top->hits, 1); 1254fa9e4066Sahrens } else if (buf->b_state == arc.mru_bot) { 1255fa9e4066Sahrens arc_state_t *new_state; 1256fa9e4066Sahrens /* 1257fa9e4066Sahrens * This buffer has been "accessed" recently, but 1258fa9e4066Sahrens * was evicted from the cache. Move it to the 1259fa9e4066Sahrens * MFU state. 1260fa9e4066Sahrens */ 1261fa9e4066Sahrens 1262fa9e4066Sahrens if (buf->b_flags & ARC_PREFETCH) { 1263fa9e4066Sahrens new_state = arc.mru_top; 1264fa9e4066Sahrens DTRACE_PROBE1(new_state__mru_top, 1265fa9e4066Sahrens arc_buf_hdr_t *, buf); 1266fa9e4066Sahrens } else { 1267fa9e4066Sahrens new_state = arc.mfu_top; 1268fa9e4066Sahrens DTRACE_PROBE1(new_state__mfu_top, 1269fa9e4066Sahrens arc_buf_hdr_t *, buf); 1270fa9e4066Sahrens } 1271fa9e4066Sahrens 1272fa9e4066Sahrens arc_try_grow(blksz); 1273fa9e4066Sahrens if (arc_evict_needed()) { 1274fa9e4066Sahrens arc_evict_for_state(new_state, blksz); 1275fa9e4066Sahrens } 1276fa9e4066Sahrens 1277fa9e4066Sahrens /* Bump up the target size of the MRU list */ 1278fa9e4066Sahrens mult = ((arc.mru_bot->size >= arc.mfu_bot->size) ? 1279fa9e4066Sahrens 1 : (arc.mfu_bot->size/arc.mru_bot->size)); 1280fa9e4066Sahrens arc.p = MIN(arc.c, arc.p + blksz * mult); 1281fa9e4066Sahrens 1282fa9e4066Sahrens buf->b_arc_access = lbolt; 1283fa9e4066Sahrens arc_change_state(new_state, buf, hash_lock); 1284fa9e4066Sahrens 1285fa9e4066Sahrens atomic_add_64(&arc.mru_bot->hits, 1); 1286fa9e4066Sahrens } else if (buf->b_state == arc.mfu_top) { 1287fa9e4066Sahrens /* 1288fa9e4066Sahrens * This buffer has been accessed more than once and is 1289fa9e4066Sahrens * still in the cache. Keep it in the MFU state. 1290fa9e4066Sahrens * 1291fa9e4066Sahrens * NOTE: the add_reference() that occurred when we did 1292fa9e4066Sahrens * the arc_read() should have kicked this off the list, 1293fa9e4066Sahrens * so even if it was a prefetch, it will be put back at 1294fa9e4066Sahrens * the head of the list when we remove_reference(). 1295fa9e4066Sahrens */ 1296fa9e4066Sahrens atomic_add_64(&arc.mfu_top->hits, 1); 1297fa9e4066Sahrens } else if (buf->b_state == arc.mfu_bot) { 1298fa9e4066Sahrens /* 1299fa9e4066Sahrens * This buffer has been accessed more than once but has 1300fa9e4066Sahrens * been evicted from the cache. Move it back to the 1301fa9e4066Sahrens * MFU state.
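 *
 * Editor's note on the adjustment below: the ratio of the two ghost
 * lists scales the shift, e.g. if arc.mru_bot->size is four times
 * arc.mfu_bot->size, then mult = 4 and arc.p drops by 4 * blksz,
 * moving target space from the MRU side to the MFU side.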
/* a generic arc_done_func_t which you can use */
/* ARGSUSED */
void
arc_bcopy_func(zio_t *zio, arc_buf_t *buf, void *arg)
{
	bcopy(buf->b_data, arg, buf->b_hdr->b_size);
	arc_buf_free(buf, arg);
}

/* a generic arc_done_func_t which you can use */
void
arc_getbuf_func(zio_t *zio, arc_buf_t *buf, void *arg)
{
	arc_buf_t **bufp = arg;
	if (zio && zio->io_error) {
		arc_buf_free(buf, arg);
		*bufp = NULL;
	} else {
		*bufp = buf;
	}
}
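/*
 * Illustrative sketch, not part of the original file: how a caller
 * might pair arc_bcopy_func with arc_read() (defined below) to land a
 * block in a private, preallocated buffer; `data' must be at least the
 * block size.  The zio priority/flag constants are assumptions drawn
 * from zio.h of this era, and the function name is hypothetical.
 */
static int
arc_read_to_buf_sketch(spa_t *spa, blkptr_t *bp, void *data)
{
	/* the done func copies into `data' and drops the arc buffer */
	return (arc_read(NULL, spa, bp, NULL /* no byteswap */,
	    arc_bcopy_func, data, ZIO_PRIORITY_SYNC_READ,
	    ZIO_FLAG_CANFAIL, ARC_WAIT));
}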
static void
arc_read_done(zio_t *zio)
{
	arc_buf_hdr_t *hdr;
	arc_buf_t *buf;
	arc_buf_t *abuf;	/* buffer we're assigning to callback */
	kmutex_t *hash_lock;
	arc_callback_t *callback_list, *acb;
	int freeable = FALSE;

	buf = zio->io_private;
	hdr = buf->b_hdr;

	if (!HDR_FREED_IN_READ(hdr)) {
		arc_buf_hdr_t *found;

		found = buf_hash_find(zio->io_spa, &hdr->b_dva, hdr->b_birth,
		    &hash_lock);

		/*
		 * Buffer was inserted into hash-table and removed from lists
		 * prior to starting I/O.  We should find this header, since
		 * it's in the hash table, and it should be legit since it's
		 * not possible to evict it during the I/O.
		 */

		ASSERT(found);
		ASSERT(DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp)));
	}

	/* byteswap if necessary */
	callback_list = hdr->b_acb;
	ASSERT(callback_list != NULL);
	if (BP_SHOULD_BYTESWAP(zio->io_bp) && callback_list->acb_byteswap)
		callback_list->acb_byteswap(buf->b_data, hdr->b_size);

	/* create copies of the data buffer for the callers */
	abuf = buf;
	for (acb = callback_list; acb; acb = acb->acb_next) {
		if (acb->acb_done) {
			if (abuf == NULL) {
				abuf = kmem_cache_alloc(buf_cache, KM_SLEEP);
				abuf->b_data = zio_buf_alloc(hdr->b_size);
				atomic_add_64(&arc.size, hdr->b_size);
				bcopy(buf->b_data, abuf->b_data, hdr->b_size);
				abuf->b_hdr = hdr;
				abuf->b_next = hdr->b_buf;
				hdr->b_buf = abuf;
				atomic_add_64(&hdr->b_state->size,
				    hdr->b_size);
			}
			acb->acb_buf = abuf;
			abuf = NULL;
		} else {
			/*
			 * The caller did not provide a callback function.
			 * In this case, we should just remove the reference.
			 */
			if (HDR_FREED_IN_READ(hdr)) {
				ASSERT3P(hdr->b_state, ==, arc.anon);
				(void) refcount_remove(&hdr->b_refcnt,
				    acb->acb_private);
			} else {
				(void) remove_reference(hdr, hash_lock,
				    acb->acb_private);
			}
		}
	}
	hdr->b_acb = NULL;
	hdr->b_flags &= ~ARC_IO_IN_PROGRESS;

	ASSERT(refcount_is_zero(&hdr->b_refcnt) || callback_list != NULL);

	if (zio->io_error != 0) {
		hdr->b_flags |= ARC_IO_ERROR;
		if (hdr->b_state != arc.anon)
			arc_change_state(arc.anon, hdr, hash_lock);
		freeable = refcount_is_zero(&hdr->b_refcnt);
	}

	if (!HDR_FREED_IN_READ(hdr)) {
		/*
		 * Only call arc_access on anonymous buffers.  This is because
		 * if we've issued an I/O for an evicted buffer, we've already
		 * called arc_access (to prevent any simultaneous readers from
		 * getting confused).
		 */
		if (zio->io_error == 0 && hdr->b_state == arc.anon)
			arc_access(hdr, hash_lock);
		mutex_exit(hash_lock);
	} else {
		/*
		 * This block was freed while we waited for the read to
		 * complete.  It has been removed from the hash table and
		 * moved to the anonymous state (so that it won't show up
		 * in the cache).
		 */
		ASSERT3P(hdr->b_state, ==, arc.anon);
		freeable = refcount_is_zero(&hdr->b_refcnt);
	}

	cv_broadcast(&hdr->b_cv);

	/* execute each callback and free its structure */
	while ((acb = callback_list) != NULL) {
		if (acb->acb_done)
			acb->acb_done(zio, acb->acb_buf, acb->acb_private);

		if (acb->acb_zio_dummy != NULL) {
			acb->acb_zio_dummy->io_error = zio->io_error;
			zio_nowait(acb->acb_zio_dummy);
		}

		callback_list = acb->acb_next;
		kmem_free(acb, sizeof (arc_callback_t));
	}

	if (freeable)
		arc_hdr_free(hdr);
}
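/*
 * Illustrative sketch, not part of the original file: the fan-out loop
 * above leaves one arc_buf_t on hdr->b_buf per caller that supplied a
 * done function, all sharing a single header (the first caller reuses
 * the buffer the I/O landed in; later callers get bcopy'd clones).
 * A hypothetical walk of that invariant, never called here:
 */
static int
arc_buf_count_sketch(arc_buf_hdr_t *hdr)
{
	arc_buf_t *b;
	int n = 0;

	for (b = hdr->b_buf; b != NULL; b = b->b_next)
		n++;
	return (n);	/* one buffer per outstanding done callback */
}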
/*
 * "Read" the block at the specified DVA (in bp) via the
 * cache.  If the block is found in the cache, invoke the provided
 * callback immediately and return.  Note that the `zio' parameter
 * in the callback will be NULL in this case, since no I/O was
 * required.  If the block is not in the cache, pass the read request
 * on to the spa with a substitute callback function, so that the
 * requested block will be added to the cache.
 *
 * If a read request arrives for a block that has a read in-progress,
 * either wait for the in-progress read to complete (and return the
 * results); or, if this is a read with a "done" func, add a record
 * to the read to invoke the "done" func when the read completes,
 * and return; or just return.
 *
 * arc_read_done() will invoke all the requested "done" functions
 * for readers of this block.
 */
int
arc_read(zio_t *pio, spa_t *spa, blkptr_t *bp, arc_byteswap_func_t *swap,
    arc_done_func_t *done, void *private, int priority, int flags,
    uint32_t arc_flags)
{
	arc_buf_hdr_t *hdr;
	arc_buf_t *buf;
	kmutex_t *hash_lock;
	zio_t *rzio;

top:
	hdr = buf_hash_find(spa, BP_IDENTITY(bp), bp->blk_birth, &hash_lock);
	if (hdr && hdr->b_buf) {

		ASSERT((hdr->b_state == arc.mru_top) ||
		    (hdr->b_state == arc.mfu_top) ||
		    ((hdr->b_state == arc.anon) &&
		    (HDR_IO_IN_PROGRESS(hdr))));

		if (HDR_IO_IN_PROGRESS(hdr)) {

			if ((arc_flags & ARC_NOWAIT) && done) {
				arc_callback_t *acb = NULL;

				acb = kmem_zalloc(sizeof (arc_callback_t),
				    KM_SLEEP);
				acb->acb_done = done;
				acb->acb_private = private;
				acb->acb_byteswap = swap;
				if (pio != NULL)
					acb->acb_zio_dummy = zio_null(pio,
					    spa, NULL, NULL, flags);

				ASSERT(acb->acb_done != NULL);
				acb->acb_next = hdr->b_acb;
				hdr->b_acb = acb;
				add_reference(hdr, hash_lock, private);
				mutex_exit(hash_lock);
				return (0);
			} else if (arc_flags & ARC_WAIT) {
				cv_wait(&hdr->b_cv, hash_lock);
				mutex_exit(hash_lock);
				goto top;
			}

			mutex_exit(hash_lock);
			return (0);
		}

		/*
		 * If there is already a reference on this block, create
		 * a new copy of the data so that we will be guaranteed
		 * that arc_release() will always succeed.
		 */

		if (done)
			add_reference(hdr, hash_lock, private);
		if (done && refcount_count(&hdr->b_refcnt) > 1) {
			buf = kmem_cache_alloc(buf_cache, KM_SLEEP);
			buf->b_data = zio_buf_alloc(hdr->b_size);
			ASSERT3U(refcount_count(&hdr->b_refcnt), >, 1);
			atomic_add_64(&arc.size, hdr->b_size);
			bcopy(hdr->b_buf->b_data, buf->b_data, hdr->b_size);
			buf->b_hdr = hdr;
			buf->b_next = hdr->b_buf;
			hdr->b_buf = buf;
			atomic_add_64(&hdr->b_state->size, hdr->b_size);
		} else {
			buf = hdr->b_buf;
		}
		DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
		arc_access(hdr, hash_lock);
		mutex_exit(hash_lock);
		atomic_add_64(&arc.hits, 1);
		if (done)
			done(NULL, buf, private);
	} else {
		uint64_t size = BP_GET_LSIZE(bp);
		arc_callback_t *acb;

		if (hdr == NULL) {
			/* this block is not in the cache */
			arc_buf_hdr_t *exists;

			buf = arc_buf_alloc(spa, size, private);
			hdr = buf->b_hdr;
			hdr->b_dva = *BP_IDENTITY(bp);
			hdr->b_birth = bp->blk_birth;
			hdr->b_cksum0 = bp->blk_cksum.zc_word[0];
			exists = buf_hash_insert(hdr, &hash_lock);
			if (exists) {
				/* somebody beat us to the hash insert */
				mutex_exit(hash_lock);
				bzero(&hdr->b_dva, sizeof (dva_t));
				hdr->b_birth = 0;
				hdr->b_cksum0 = 0;
				arc_buf_free(buf, private);
				goto top; /* restart the I/O request */
			}

		} else {
			/* this block is in the ghost cache */
			ASSERT((hdr->b_state == arc.mru_bot) ||
			    (hdr->b_state == arc.mfu_bot));
			add_reference(hdr, hash_lock, private);

			buf = kmem_cache_alloc(buf_cache, KM_SLEEP);
			buf->b_data = zio_buf_alloc(hdr->b_size);
			atomic_add_64(&arc.size, hdr->b_size);
			ASSERT(!HDR_IO_IN_PROGRESS(hdr));
			ASSERT3U(refcount_count(&hdr->b_refcnt), ==, 1);
			buf->b_hdr = hdr;
			buf->b_next = NULL;
			hdr->b_buf = buf;
		}

		acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP);
		acb->acb_done = done;
		acb->acb_private = private;
		acb->acb_byteswap = swap;

		ASSERT(hdr->b_acb == NULL);
		hdr->b_acb = acb;

		/*
		 * If this DVA is part of a prefetch, mark the buf
		 * header with the prefetch flag
		 */
		if (arc_flags & ARC_PREFETCH)
			hdr->b_flags |= ARC_PREFETCH;
		hdr->b_flags |= ARC_IO_IN_PROGRESS;

		/*
		 * If the buffer has been evicted, migrate it to a present
		 * state before issuing the I/O.  Once we drop the hash-table
		 * lock, the header will be marked as I/O in progress and
		 * have an attached buffer.  At this point, anybody who finds
		 * this buffer ought to notice that it's legit but has a
		 * pending I/O.
		 */

		if ((hdr->b_state == arc.mru_bot) ||
		    (hdr->b_state == arc.mfu_bot))
			arc_access(hdr, hash_lock);

		mutex_exit(hash_lock);

		ASSERT3U(hdr->b_size, ==, size);
		DTRACE_PROBE2(arc__miss, blkptr_t *, bp,
		    uint64_t, size);
		atomic_add_64(&arc.misses, 1);
		rzio = zio_read(pio, spa, bp, buf->b_data, size,
		    arc_read_done, buf, priority, flags);

		if (arc_flags & ARC_WAIT)
			return (zio_wait(rzio));

		ASSERT(arc_flags & ARC_NOWAIT);
		zio_nowait(rzio);
	}
	return (0);
}
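/*
 * Illustrative sketch, not part of the original file: an asynchronous
 * caller of arc_read().  With ARC_NOWAIT the call returns immediately;
 * `done' runs from arc_read_done() (zio != NULL) on a miss, or inline
 * with zio == NULL on a cache hit, and must be non-NULL for the
 * read-in-progress path to attach a callback.  The priority/flag
 * constants are assumptions from zio.h of this era, and the function
 * name is hypothetical.
 */
static void
arc_read_async_sketch(zio_t *pio, spa_t *spa, blkptr_t *bp,
    arc_done_func_t *done, void *cb_arg)
{
	(void) arc_read(pio, spa, bp, NULL /* no byteswap */, done, cb_arg,
	    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, ARC_NOWAIT);
}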
/*
 * arc_read() variant to support pool traversal.  If the block is already
 * in the ARC, make a copy of it; otherwise, the caller will do the I/O.
 * The idea is that we don't want pool traversal filling up memory, but
 * if the ARC already has the data anyway, we shouldn't pay for the I/O.
 */
int
arc_tryread(spa_t *spa, blkptr_t *bp, void *data)
{
	arc_buf_hdr_t *hdr;
	kmutex_t *hash_mtx;
	int rc = 0;

	hdr = buf_hash_find(spa, BP_IDENTITY(bp), bp->blk_birth, &hash_mtx);

	if (hdr && hdr->b_buf && !HDR_IO_IN_PROGRESS(hdr))
		bcopy(hdr->b_buf->b_data, data, hdr->b_size);
	else
		rc = ENOENT;

	if (hash_mtx)
		mutex_exit(hash_mtx);

	return (rc);
}
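/*
 * Illustrative sketch, not part of the original file: the calling
 * pattern arc_tryread() is built for, with a hypothetical traversal
 * helper.  On ENOENT the traverser issues its own read into its own
 * scratch buffer, so traversal never grows the cache.
 */
static int
traverse_tryread_sketch(spa_t *spa, blkptr_t *bp, void *scratch)
{
	if (arc_tryread(spa, bp, scratch) == 0)
		return (0);	/* cache hit: data copied into scratch */

	/* miss: fall back to a caller-owned read, e.g. via zio_read() */
	return (ENOENT);
}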
/*
 * Release this buffer from the cache.  This must be done
 * after a read and prior to modifying the buffer contents.
 * If the buffer has more than one reference, we must make
 * a new hdr for the buffer.
 */
void
arc_release(arc_buf_t *buf, void *tag)
{
	arc_buf_hdr_t *hdr = buf->b_hdr;
	kmutex_t *hash_lock = HDR_LOCK(hdr);

	/* this buffer is not on any list */
	ASSERT(refcount_count(&hdr->b_refcnt) > 0);

	if (hdr->b_state == arc.anon) {
		/* this buffer is already released */
		ASSERT3U(refcount_count(&hdr->b_refcnt), ==, 1);
		ASSERT(BUF_EMPTY(hdr));
		return;
	}

	mutex_enter(hash_lock);

	if (refcount_count(&hdr->b_refcnt) > 1) {
		arc_buf_hdr_t *nhdr;
		arc_buf_t **bufp;
		uint64_t blksz = hdr->b_size;
		spa_t *spa = hdr->b_spa;

		/*
		 * Pull the data off of this buf and attach it to
		 * a new anonymous buf.
		 */
		bufp = &hdr->b_buf;
		while (*bufp != buf) {
			ASSERT(*bufp);
			bufp = &(*bufp)->b_next;
		}
		*bufp = (*bufp)->b_next;
		(void) refcount_remove(&hdr->b_refcnt, tag);
		ASSERT3U(hdr->b_state->size, >=, hdr->b_size);
		atomic_add_64(&hdr->b_state->size, -hdr->b_size);
		mutex_exit(hash_lock);

		nhdr = kmem_cache_alloc(hdr_cache, KM_SLEEP);
		nhdr->b_size = blksz;
		nhdr->b_spa = spa;
		nhdr->b_buf = buf;
		nhdr->b_state = arc.anon;
		nhdr->b_arc_access = 0;
		nhdr->b_flags = 0;
		buf->b_hdr = nhdr;
		buf->b_next = NULL;
		(void) refcount_add(&nhdr->b_refcnt, tag);
		atomic_add_64(&arc.anon->size, blksz);

		hdr = nhdr;
	} else {
		ASSERT(!list_link_active(&hdr->b_arc_node));
		ASSERT(!HDR_IO_IN_PROGRESS(hdr));
		arc_change_state(arc.anon, hdr, hash_lock);
		hdr->b_arc_access = 0;
		mutex_exit(hash_lock);
		bzero(&hdr->b_dva, sizeof (dva_t));
		hdr->b_birth = 0;
		hdr->b_cksum0 = 0;
	}
}
int
arc_released(arc_buf_t *buf)
{
	return (buf->b_hdr->b_state == arc.anon);
}
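/*
 * Illustrative sketch, not part of the original file: the
 * read-modify-write pattern arc_release() exists for.  After release
 * the buffer is anonymous, so scribbling on b_data cannot disturb any
 * other reader still holding the old cached copy.  The function name
 * is hypothetical.
 */
static void
arc_make_writable_sketch(arc_buf_t *buf, void *tag)
{
	arc_release(buf, tag);
	ASSERT(arc_released(buf));
	/* buf->b_data may now be modified; arc_write() re-inserts it */
}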
static void
arc_write_done(zio_t *zio)
{
	arc_buf_t *buf;
	arc_buf_hdr_t *hdr;
	arc_callback_t *acb;

	buf = zio->io_private;
	hdr = buf->b_hdr;
	acb = hdr->b_acb;
	hdr->b_acb = NULL;

	/* this buffer is on no lists and is not in the hash table */
	ASSERT3P(hdr->b_state, ==, arc.anon);

	hdr->b_dva = *BP_IDENTITY(zio->io_bp);
	hdr->b_birth = zio->io_bp->blk_birth;
	hdr->b_cksum0 = zio->io_bp->blk_cksum.zc_word[0];
	/* clear the "in-write" flag */
	hdr->b_hash_next = NULL;
	/* This write may be all-zero */
	if (!BUF_EMPTY(hdr)) {
		arc_buf_hdr_t *exists;
		kmutex_t *hash_lock;

		exists = buf_hash_insert(hdr, &hash_lock);
		if (exists) {
			/*
			 * This can only happen if we overwrite for
			 * sync-to-convergence, because we remove
			 * buffers from the hash table when we arc_free().
			 */
			ASSERT(DVA_EQUAL(BP_IDENTITY(&zio->io_bp_orig),
			    BP_IDENTITY(zio->io_bp)));
			ASSERT3U(zio->io_bp_orig.blk_birth, ==,
			    zio->io_bp->blk_birth);

			ASSERT(refcount_is_zero(&exists->b_refcnt));
			arc_change_state(arc.anon, exists, hash_lock);
			mutex_exit(hash_lock);
			arc_hdr_free(exists);
			exists = buf_hash_insert(hdr, &hash_lock);
			ASSERT3P(exists, ==, NULL);
		}
		arc_access(hdr, hash_lock);
		mutex_exit(hash_lock);
	}
	if (acb && acb->acb_done) {
		ASSERT(!refcount_is_zero(&hdr->b_refcnt));
		acb->acb_done(zio, buf, acb->acb_private);
	}

	if (acb)
		kmem_free(acb, sizeof (arc_callback_t));
}
int
arc_write(zio_t *pio, spa_t *spa, int checksum, int compress,
    uint64_t txg, blkptr_t *bp, arc_buf_t *buf,
    arc_done_func_t *done, void *private, int priority, int flags,
    uint32_t arc_flags)
{
	arc_buf_hdr_t *hdr = buf->b_hdr;
	arc_callback_t *acb;
	zio_t *rzio;

	/* this is a private buffer - no locking required */
	ASSERT3P(hdr->b_state, ==, arc.anon);
	ASSERT(BUF_EMPTY(hdr));
	ASSERT(!HDR_IO_ERROR(hdr));
	acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP);
	acb->acb_done = done;
	acb->acb_private = private;
	acb->acb_byteswap = (arc_byteswap_func_t *)-1;
	hdr->b_acb = acb;
	rzio = zio_write(pio, spa, checksum, compress, txg, bp,
	    buf->b_data, hdr->b_size, arc_write_done, buf, priority, flags);

	if (arc_flags & ARC_WAIT)
		return (zio_wait(rzio));

	ASSERT(arc_flags & ARC_NOWAIT);
	zio_nowait(rzio);

	return (0);
}
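/*
 * Illustrative sketch, not part of the original file: the
 * release-then-write round trip.  The checksum/compress selectors are
 * assumptions (a real caller supplies whatever its dataset is
 * configured for), and the function name is hypothetical.  On
 * completion arc_write_done() re-inserts the buffer into the hash
 * table, so subsequent arc_read()s of the new bp hit in the cache.
 */
static int
arc_modify_write_sketch(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
    arc_buf_t *buf, void *tag)
{
	arc_release(buf, tag);	/* detach from the cached identity */
	/* ... modify buf->b_data here ... */
	return (arc_write(pio, spa, ZIO_CHECKSUM_FLETCHER_2, ZIO_COMPRESS_OFF,
	    txg, bp, buf, NULL, NULL, ZIO_PRIORITY_ASYNC_WRITE,
	    ZIO_FLAG_CANFAIL, ARC_WAIT));
}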
int
arc_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
    zio_done_func_t *done, void *private, uint32_t arc_flags)
{
	arc_buf_hdr_t *ab;
	kmutex_t *hash_lock;
	zio_t *zio;

	/*
	 * If this buffer is in the cache, release it, so it
	 * can be re-used.
	 */
	ab = buf_hash_find(spa, BP_IDENTITY(bp), bp->blk_birth, &hash_lock);
	if (ab != NULL) {
		/*
		 * The checksum of blocks to free is not always
		 * preserved (e.g. on the deadlist).  However, if it is
		 * nonzero, it should match what we have in the cache.
		 */
		ASSERT(bp->blk_cksum.zc_word[0] == 0 ||
		    ab->b_cksum0 == bp->blk_cksum.zc_word[0]);
		arc_change_state(arc.anon, ab, hash_lock);
		if (refcount_is_zero(&ab->b_refcnt)) {
			mutex_exit(hash_lock);
			arc_hdr_free(ab);
			atomic_add_64(&arc.deleted, 1);
		} else {
			ASSERT3U(refcount_count(&ab->b_refcnt), ==, 1);
			if (HDR_IO_IN_PROGRESS(ab))
				ab->b_flags |= ARC_FREED_IN_READ;
			ab->b_arc_access = 0;
			bzero(&ab->b_dva, sizeof (dva_t));
			ab->b_birth = 0;
			ab->b_cksum0 = 0;
			mutex_exit(hash_lock);
		}
	}

	zio = zio_free(pio, spa, txg, bp, done, private);

	if (arc_flags & ARC_WAIT)
		return (zio_wait(zio));

	ASSERT(arc_flags & ARC_NOWAIT);
	zio_nowait(zio);

	return (0);
}

void
arc_tempreserve_clear(uint64_t tempreserve)
{
	atomic_add_64(&arc_tempreserve, -tempreserve);
	ASSERT((int64_t)arc_tempreserve >= 0);
}

int
arc_tempreserve_space(uint64_t tempreserve)
{
#ifdef ZFS_DEBUG
	/*
	 * Once in a while, fail for no reason.  Everything should cope.
	 */
	if (spa_get_random(10000) == 0) {
		dprintf("forcing random failure\n");
		return (ERESTART);
	}
#endif
	if (tempreserve > arc.c/4 && !arc.no_grow)
		arc.c = MIN(arc.c_max, tempreserve * 4);
	if (tempreserve > arc.c)
		return (ENOMEM);

	/*
	 * Throttle writes when the amount of dirty data in the cache
	 * gets too large.  We try to keep the cache less than half full
	 * of dirty blocks so that our sync times don't grow too large.
	 * Note: if two requests come in concurrently, we might let them
	 * both succeed, when one of them should fail.  Not a huge deal.
	 *
	 * XXX The limit should be adjusted dynamically to keep the time
	 * to sync a dataset fixed (around 1-5 seconds?).
	 */

	if (tempreserve + arc_tempreserve + arc.anon->size > arc.c / 2 &&
	    arc_tempreserve + arc.anon->size > arc.c / 4) {
		dprintf("failing, arc_tempreserve=%lluK anon=%lluK "
		    "tempreserve=%lluK arc.c=%lluK\n",
		    arc_tempreserve>>10, arc.anon->lsize>>10,
		    tempreserve>>10, arc.c>>10);
		return (ERESTART);
	}
	atomic_add_64(&arc_tempreserve, tempreserve);
	return (0);
}
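/*
 * Illustrative sketch, not part of the original file: the
 * reserve/clear pairing a dirtying caller would follow.  The
 * reservation is taken before any anonymous dirty buffers are created
 * and dropped once those buffers are accounted (or the transaction
 * commits); ERESTART (too much dirty data) and ENOMEM (reservation
 * larger than the cache will grow) are the caller's cue to back off
 * and retry after some data syncs.  The function name is hypothetical.
 */
static int
dirty_data_sketch(uint64_t nbytes)
{
	int err;

	if ((err = arc_tempreserve_space(nbytes)) != 0)
		return (err);	/* back off and retry later */
	/* ... create up to nbytes of anonymous dirty buffers ... */
	arc_tempreserve_clear(nbytes);
	return (0);
}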
void
arc_init(void)
{
	mutex_init(&arc_reclaim_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&arc_reclaim_thr_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&arc_reclaim_thr_cv, NULL, CV_DEFAULT, NULL);

	/* Start out with 1/8 of all memory */
	arc.c = physmem * PAGESIZE / 8;

#ifdef _KERNEL
	/*
	 * On architectures where the physical memory can be larger
	 * than the addressable space (Intel in 32-bit mode), we may
	 * need to limit the cache to 1/8 of VM size.
	 */
	arc.c = MIN(arc.c, vmem_size(heap_arena, VMEM_ALLOC | VMEM_FREE) / 8);
#endif

	/* set min cache to 1/32 of all memory, or 64MB, whichever is more */
	arc.c_min = MAX(arc.c / 4, 64<<20);
	/* set max to 3/4 of all memory, or all but 1GB, whichever is more */
	if (arc.c * 8 >= 1<<30)
		arc.c_max = (arc.c * 8) - (1<<30);
	else
		arc.c_max = arc.c_min;
	arc.c_max = MAX(arc.c * 6, arc.c_max);
	arc.c = arc.c_max;
	arc.p = (arc.c >> 1);

	/* if kmem_flags are set, let's try to use less memory */
	if (kmem_debugging())
		arc.c = arc.c / 2;
	if (arc.c < arc.c_min)
		arc.c = arc.c_min;

	arc.anon = &ARC_anon;
	arc.mru_top = &ARC_mru_top;
	arc.mru_bot = &ARC_mru_bot;
	arc.mfu_top = &ARC_mfu_top;
	arc.mfu_bot = &ARC_mfu_bot;

	list_create(&arc.mru_top->list, sizeof (arc_buf_hdr_t),
	    offsetof(arc_buf_hdr_t, b_arc_node));
	list_create(&arc.mru_bot->list, sizeof (arc_buf_hdr_t),
	    offsetof(arc_buf_hdr_t, b_arc_node));
	list_create(&arc.mfu_top->list, sizeof (arc_buf_hdr_t),
	    offsetof(arc_buf_hdr_t, b_arc_node));
	list_create(&arc.mfu_bot->list, sizeof (arc_buf_hdr_t),
	    offsetof(arc_buf_hdr_t, b_arc_node));

	buf_init();

	arc_thread_exit = 0;

	(void) thread_create(NULL, 0, arc_reclaim_thread, NULL, 0, &p0,
	    TS_RUN, minclsyspri);
}
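/*
 * Illustrative note, not part of the original file: the sizing above,
 * worked for a hypothetical 4GB machine (physmem * PAGESIZE == 4GB):
 *
 *	arc.c     starts at 4GB / 8			= 512MB
 *	arc.c_min = MAX(512MB / 4, 64MB)		= 128MB
 *	arc.c_max = MAX(512MB * 6, 4GB - 1GB)		= 3GB
 *	arc.c     = arc.c_max = 3GB,  arc.p = arc.c / 2	= 1.5GB
 *
 * i.e. the cache may grow to 3/4 of memory and will not shrink below
 * 1/32 of it (subject to the kmem-debugging halving above).
 */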
void
arc_fini(void)
{
	mutex_enter(&arc_reclaim_thr_lock);
	arc_thread_exit = 1;
	while (arc_thread_exit != 0)
		cv_wait(&arc_reclaim_thr_cv, &arc_reclaim_thr_lock);
	mutex_exit(&arc_reclaim_thr_lock);

	arc_flush();

	arc_dead = TRUE;

	mutex_destroy(&arc_reclaim_lock);
	mutex_destroy(&arc_reclaim_thr_lock);
	cv_destroy(&arc_reclaim_thr_cv);

	list_destroy(&arc.mru_top->list);
	list_destroy(&arc.mru_bot->list);
	list_destroy(&arc.mfu_top->list);
	list_destroy(&arc.mfu_bot->list);

	buf_fini();
}