/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License, Version 1.0 only
 * (the "License").  You may not use this file except in compliance
 * with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

/*
 * DVA-based Adjustable Replacement Cache
 *
 * While much of the theory of operation and algorithms used here
 * are based on the self-tuning, low overhead replacement cache
 * presented by Megiddo and Modha at FAST 2003, there are some
 * significant differences:
 *
 * 1. The Megiddo and Modha model assumes any page is evictable.
 * Pages in its cache cannot be "locked" into memory.  This makes
 * the eviction algorithm simple: evict the last page in the list.
 * This also makes the performance characteristics easy to reason
 * about.  Our cache is not so simple.  At any given moment, some
 * subset of the blocks in the cache are un-evictable because we
 * have handed out a reference to them.  Blocks are only evictable
 * when there are no external references active.  This makes
 * eviction far more problematic: we choose to evict the evictable
 * blocks that are the "lowest" in the list.
 *
 * There are times when it is not possible to evict the requested
 * space.  In these circumstances we are unable to adjust the cache
 * size.  To prevent the cache growing unbounded at these times we
 * implement a "cache throttle" that slows the flow of new data
 * into the cache until we can make space available.
 *
 * 2. The Megiddo and Modha model assumes a fixed cache size.
 * Pages are evicted when the cache is full and there is a cache
 * miss.  Our model has a variable sized cache.  It grows with
 * high use, but also tries to react to memory pressure from the
 * operating system: decreasing its size when system memory is
 * tight.
 *
 * 3. The Megiddo and Modha model assumes a fixed page size.  All
 * elements of the cache are therefore exactly the same size.  So
 * when adjusting the cache size following a cache miss, it's simply
 * a matter of choosing a single page to evict.  In our model, we
 * have variable sized cache blocks (ranging from 512 bytes to
 * 128K bytes).  We therefore choose a set of blocks to evict to make
 * space for a cache miss that approximates as closely as possible
 * the space used by the new block.
 *
 * See also:  "ARC: A Self-Tuning, Low Overhead Replacement Cache"
 * by N. Megiddo & D. Modha, FAST 2003
 */

/*
 * The locking model:
 *
 * A new reference to a cache buffer can be obtained in two
 * ways: 1) via a hash table lookup using the DVA as a key,
 * or 2) via one of the ARC lists.  The arc_read() interface
 * uses method 1, while the internal arc algorithms for
 * adjusting the cache use method 2.  We therefore provide two
 * types of locks: 1) the hash table lock array, and 2) the
 * arc list locks.
 *
 * Buffers do not have their own mutexes, rather they rely on the
 * hash table mutexes for the bulk of their protection (i.e. most
 * fields in the arc_buf_hdr_t are protected by these mutexes).
 *
 * buf_hash_find() returns the appropriate mutex (held) when it
 * locates the requested buffer in the hash table.  It returns
 * NULL for the mutex if the buffer was not in the table.
 *
 * buf_hash_remove() expects the appropriate hash mutex to be
 * already held before it is invoked.
 *
 * Each arc state also has a mutex which is used to protect the
 * buffer list associated with the state.
 * When attempting to
 * obtain a hash table lock while holding an arc list lock you
 * must use: mutex_tryenter() to avoid deadlock.  Also note that
 * the "top" state mutex must be held before the "bot" state mutex.
 *
 * Note that the majority of the performance stats are manipulated
 * with atomic operations.
 */

#include <sys/spa.h>
#include <sys/zio.h>
#include <sys/zfs_context.h>
#include <sys/arc.h>
#include <sys/refcount.h>
#ifdef _KERNEL
#include <sys/vmsystm.h>
#include <vm/anon.h>
#include <sys/fs/swapnode.h>
#endif
#include <sys/callb.h>

/* Reclaim thread synchronization state. */
static kmutex_t arc_reclaim_thr_lock;
static kcondvar_t arc_reclaim_thr_cv;	/* used to signal reclaim thr */
static uint8_t arc_thread_exit;

typedef enum arc_reclaim_strategy {
	ARC_RECLAIM_AGGR,		/* Aggressive reclaim strategy */
	ARC_RECLAIM_CONS		/* Conservative reclaim strategy */
} arc_reclaim_strategy_t;

/* number of seconds before growing cache again */
static int arc_grow_retry = 60;

static kmutex_t arc_reclaim_lock;
static int arc_dead;

/*
 * Note that buffers can be in one of 5 states:
 *	ARC_anon    - anonymous (discussed below)
 *	ARC_mru_top - recently used, currently cached
 *	ARC_mru_bot - recently used, no longer in cache
 *	ARC_mfu_top - frequently used, currently cached
 *	ARC_mfu_bot - frequently used, no longer in cache
 * When there are no active references to the buffer, they
 * are linked onto one of the lists in arc.  These are the
 * only buffers that can be evicted or deleted.
 *
 * Anonymous buffers are buffers that are not associated with
 * a DVA.  These are buffers that hold dirty block copies
 * before they are written to stable storage.  By definition,
 * they are "ref'd" and are considered part of arc_mru_top
 * that cannot be freed.  Generally, they will acquire a DVA
 * as they are written and migrate onto the arc_mru_top list.
 */

typedef struct arc_state {
	list_t list;	/* linked list of evictable buffers in state */
	uint64_t lsize;	/* total size of buffers in the linked list */
	uint64_t size;	/* total size of all buffers in this state */
	uint64_t hits;
	kmutex_t mtx;	/* protects list and lsize */
} arc_state_t;

/* The 5 states: */
static arc_state_t ARC_anon;
static arc_state_t ARC_mru_top;
static arc_state_t ARC_mru_bot;
static arc_state_t ARC_mfu_top;
static arc_state_t ARC_mfu_bot;

/* Global ARC bookkeeping: state pointers, size targets, and stats. */
static struct arc {
	arc_state_t *anon;
	arc_state_t *mru_top;
	arc_state_t *mru_bot;
	arc_state_t *mfu_top;
	arc_state_t *mfu_bot;
	uint64_t size;		/* Actual total arc size */
	uint64_t p;		/* Target size (in bytes) of mru_top */
	uint64_t c;		/* Target size of cache (in bytes) */
	uint64_t c_min;		/* Minimum target cache size */
	uint64_t c_max;		/* Maximum target cache size */
	uint64_t incr;		/* Size by which to increment arc.c */
	int64_t size_check;

	/* performance stats (updated with atomic ops; see locking notes) */
	uint64_t hits;
	uint64_t misses;
	uint64_t deleted;
	uint64_t skipped;
	uint64_t hash_elements;
	uint64_t hash_elements_max;
	uint64_t hash_collisions;
	uint64_t hash_chains;
	uint32_t hash_chain_max;

	int no_grow;	/* Don't try to grow cache size */
} arc;

/* Default amount to grow arc.incr */
static int64_t arc_incr_size = 1024;

/* > 0 ==> time to increment arc.c */
static int64_t arc_size_check_default = -1000;

static uint64_t arc_tempreserve;

typedef struct arc_callback arc_callback_t;

/* Per-read completion callback, chained off the header via acb_next. */
struct arc_callback {
	arc_done_func_t *acb_done;
	void *acb_private;
	arc_byteswap_func_t *acb_byteswap;
	arc_buf_t *acb_buf;
	zio_t *acb_zio_dummy;
	arc_callback_t *acb_next;
};

struct arc_buf_hdr {
	/* immutable */
	uint64_t b_size;
	spa_t *b_spa;

	/* protected by hash lock */
	dva_t b_dva;
	uint64_t b_birth;
	uint64_t b_cksum0;

	arc_buf_hdr_t *b_hash_next;
	arc_buf_t *b_buf;
	uint32_t b_flags;

	kcondvar_t b_cv;
	arc_callback_t *b_acb;

	/* protected by arc state mutex */
	arc_state_t *b_state;
	list_node_t b_arc_node;

	/* updated atomically */
	clock_t b_arc_access;

	/* self protecting */
	refcount_t b_refcnt;
};

/*
 * Private ARC flags.  These flags are private ARC only flags that will show up
 * in b_flags in the arc_buf_hdr_t.  Some flags are publicly declared, and can
 * be passed in as arc_flags in things like arc_read.  However, these flags
 * should never be passed and should only be set by ARC code.  When adding new
 * public flags, make sure not to smash the private ones.
248fa9e4066Sahrens */ 249fa9e4066Sahrens 250fa9e4066Sahrens #define ARC_IO_IN_PROGRESS (1 << 10) /* I/O in progress for buf */ 251fa9e4066Sahrens #define ARC_IO_ERROR (1 << 11) /* I/O failed for buf */ 252fa9e4066Sahrens #define ARC_FREED_IN_READ (1 << 12) /* buf freed while in read */ 253fa9e4066Sahrens 254fa9e4066Sahrens #define HDR_IO_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_IO_IN_PROGRESS) 255fa9e4066Sahrens #define HDR_IO_ERROR(hdr) ((hdr)->b_flags & ARC_IO_ERROR) 256fa9e4066Sahrens #define HDR_FREED_IN_READ(hdr) ((hdr)->b_flags & ARC_FREED_IN_READ) 257fa9e4066Sahrens 258fa9e4066Sahrens /* 259fa9e4066Sahrens * Hash table routines 260fa9e4066Sahrens */ 261fa9e4066Sahrens 262fa9e4066Sahrens #define HT_LOCK_PAD 64 263fa9e4066Sahrens 264fa9e4066Sahrens struct ht_lock { 265fa9e4066Sahrens kmutex_t ht_lock; 266fa9e4066Sahrens #ifdef _KERNEL 267fa9e4066Sahrens unsigned char pad[(HT_LOCK_PAD - sizeof (kmutex_t))]; 268fa9e4066Sahrens #endif 269fa9e4066Sahrens }; 270fa9e4066Sahrens 271fa9e4066Sahrens #define BUF_LOCKS 256 272fa9e4066Sahrens typedef struct buf_hash_table { 273fa9e4066Sahrens uint64_t ht_mask; 274fa9e4066Sahrens arc_buf_hdr_t **ht_table; 275fa9e4066Sahrens struct ht_lock ht_locks[BUF_LOCKS]; 276fa9e4066Sahrens } buf_hash_table_t; 277fa9e4066Sahrens 278fa9e4066Sahrens static buf_hash_table_t buf_hash_table; 279fa9e4066Sahrens 280fa9e4066Sahrens #define BUF_HASH_INDEX(spa, dva, birth) \ 281fa9e4066Sahrens (buf_hash(spa, dva, birth) & buf_hash_table.ht_mask) 282fa9e4066Sahrens #define BUF_HASH_LOCK_NTRY(idx) (buf_hash_table.ht_locks[idx & (BUF_LOCKS-1)]) 283fa9e4066Sahrens #define BUF_HASH_LOCK(idx) (&(BUF_HASH_LOCK_NTRY(idx).ht_lock)) 284fa9e4066Sahrens #define HDR_LOCK(buf) \ 285fa9e4066Sahrens (BUF_HASH_LOCK(BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth))) 286fa9e4066Sahrens 287fa9e4066Sahrens uint64_t zfs_crc64_table[256]; 288fa9e4066Sahrens 289fa9e4066Sahrens static uint64_t 290fa9e4066Sahrens buf_hash(spa_t *spa, dva_t *dva, uint64_t birth) 
291fa9e4066Sahrens { 292fa9e4066Sahrens uintptr_t spav = (uintptr_t)spa; 293fa9e4066Sahrens uint8_t *vdva = (uint8_t *)dva; 294fa9e4066Sahrens uint64_t crc = -1ULL; 295fa9e4066Sahrens int i; 296fa9e4066Sahrens 297fa9e4066Sahrens ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY); 298fa9e4066Sahrens 299fa9e4066Sahrens for (i = 0; i < sizeof (dva_t); i++) 300fa9e4066Sahrens crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ vdva[i]) & 0xFF]; 301fa9e4066Sahrens 302fa9e4066Sahrens crc ^= (spav>>8) ^ birth; 303fa9e4066Sahrens 304fa9e4066Sahrens return (crc); 305fa9e4066Sahrens } 306fa9e4066Sahrens 307fa9e4066Sahrens #define BUF_EMPTY(buf) \ 308fa9e4066Sahrens ((buf)->b_dva.dva_word[0] == 0 && \ 309fa9e4066Sahrens (buf)->b_dva.dva_word[1] == 0 && \ 310fa9e4066Sahrens (buf)->b_birth == 0) 311fa9e4066Sahrens 312fa9e4066Sahrens #define BUF_EQUAL(spa, dva, birth, buf) \ 313fa9e4066Sahrens ((buf)->b_dva.dva_word[0] == (dva)->dva_word[0]) && \ 314fa9e4066Sahrens ((buf)->b_dva.dva_word[1] == (dva)->dva_word[1]) && \ 315fa9e4066Sahrens ((buf)->b_birth == birth) && ((buf)->b_spa == spa) 316fa9e4066Sahrens 317fa9e4066Sahrens static arc_buf_hdr_t * 318fa9e4066Sahrens buf_hash_find(spa_t *spa, dva_t *dva, uint64_t birth, kmutex_t **lockp) 319fa9e4066Sahrens { 320fa9e4066Sahrens uint64_t idx = BUF_HASH_INDEX(spa, dva, birth); 321fa9e4066Sahrens kmutex_t *hash_lock = BUF_HASH_LOCK(idx); 322fa9e4066Sahrens arc_buf_hdr_t *buf; 323fa9e4066Sahrens 324fa9e4066Sahrens mutex_enter(hash_lock); 325fa9e4066Sahrens for (buf = buf_hash_table.ht_table[idx]; buf != NULL; 326fa9e4066Sahrens buf = buf->b_hash_next) { 327fa9e4066Sahrens if (BUF_EQUAL(spa, dva, birth, buf)) { 328fa9e4066Sahrens *lockp = hash_lock; 329fa9e4066Sahrens return (buf); 330fa9e4066Sahrens } 331fa9e4066Sahrens } 332fa9e4066Sahrens mutex_exit(hash_lock); 333fa9e4066Sahrens *lockp = NULL; 334fa9e4066Sahrens return (NULL); 335fa9e4066Sahrens } 336fa9e4066Sahrens 337fa9e4066Sahrens /* 338fa9e4066Sahrens * Insert an entry into the hash table. 
If there is already an element 339fa9e4066Sahrens * equal to elem in the hash table, then the already existing element 340fa9e4066Sahrens * will be returned and the new element will not be inserted. 341fa9e4066Sahrens * Otherwise returns NULL. 342fa9e4066Sahrens */ 343fa9e4066Sahrens static arc_buf_hdr_t *fbufs[4]; /* XXX to find 6341326 */ 344fa9e4066Sahrens static kthread_t *fbufs_lastthread; 345fa9e4066Sahrens static arc_buf_hdr_t * 346fa9e4066Sahrens buf_hash_insert(arc_buf_hdr_t *buf, kmutex_t **lockp) 347fa9e4066Sahrens { 348fa9e4066Sahrens uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth); 349fa9e4066Sahrens kmutex_t *hash_lock = BUF_HASH_LOCK(idx); 350fa9e4066Sahrens arc_buf_hdr_t *fbuf; 351fa9e4066Sahrens uint32_t max, i; 352fa9e4066Sahrens 353fa9e4066Sahrens fbufs_lastthread = curthread; 354fa9e4066Sahrens *lockp = hash_lock; 355fa9e4066Sahrens mutex_enter(hash_lock); 356fa9e4066Sahrens for (fbuf = buf_hash_table.ht_table[idx], i = 0; fbuf != NULL; 357fa9e4066Sahrens fbuf = fbuf->b_hash_next, i++) { 358fa9e4066Sahrens if (i < sizeof (fbufs) / sizeof (fbufs[0])) 359fa9e4066Sahrens fbufs[i] = fbuf; 360fa9e4066Sahrens if (BUF_EQUAL(buf->b_spa, &buf->b_dva, buf->b_birth, fbuf)) 361fa9e4066Sahrens return (fbuf); 362fa9e4066Sahrens } 363fa9e4066Sahrens 364fa9e4066Sahrens buf->b_hash_next = buf_hash_table.ht_table[idx]; 365fa9e4066Sahrens buf_hash_table.ht_table[idx] = buf; 366fa9e4066Sahrens 367fa9e4066Sahrens /* collect some hash table performance data */ 368fa9e4066Sahrens if (i > 0) { 369fa9e4066Sahrens atomic_add_64(&arc.hash_collisions, 1); 370fa9e4066Sahrens if (i == 1) 371fa9e4066Sahrens atomic_add_64(&arc.hash_chains, 1); 372fa9e4066Sahrens } 373fa9e4066Sahrens while (i > (max = arc.hash_chain_max) && 374fa9e4066Sahrens max != atomic_cas_32(&arc.hash_chain_max, max, i)) { 375fa9e4066Sahrens continue; 376fa9e4066Sahrens } 377fa9e4066Sahrens atomic_add_64(&arc.hash_elements, 1); 378fa9e4066Sahrens if (arc.hash_elements > 
arc.hash_elements_max) 379fa9e4066Sahrens atomic_add_64(&arc.hash_elements_max, 1); 380fa9e4066Sahrens 381fa9e4066Sahrens return (NULL); 382fa9e4066Sahrens } 383fa9e4066Sahrens 384fa9e4066Sahrens static void 385fa9e4066Sahrens buf_hash_remove(arc_buf_hdr_t *buf) 386fa9e4066Sahrens { 387fa9e4066Sahrens arc_buf_hdr_t *fbuf, **bufp; 388fa9e4066Sahrens uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth); 389fa9e4066Sahrens 390fa9e4066Sahrens ASSERT(MUTEX_HELD(BUF_HASH_LOCK(idx))); 391fa9e4066Sahrens 392fa9e4066Sahrens bufp = &buf_hash_table.ht_table[idx]; 393fa9e4066Sahrens while ((fbuf = *bufp) != buf) { 394fa9e4066Sahrens ASSERT(fbuf != NULL); 395fa9e4066Sahrens bufp = &fbuf->b_hash_next; 396fa9e4066Sahrens } 397fa9e4066Sahrens *bufp = buf->b_hash_next; 398fa9e4066Sahrens buf->b_hash_next = NULL; 399fa9e4066Sahrens 400fa9e4066Sahrens /* collect some hash table performance data */ 401fa9e4066Sahrens atomic_add_64(&arc.hash_elements, -1); 402fa9e4066Sahrens if (buf_hash_table.ht_table[idx] && 403fa9e4066Sahrens buf_hash_table.ht_table[idx]->b_hash_next == NULL) 404fa9e4066Sahrens atomic_add_64(&arc.hash_chains, -1); 405fa9e4066Sahrens } 406fa9e4066Sahrens 407fa9e4066Sahrens /* 408fa9e4066Sahrens * Global data structures and functions for the buf kmem cache. 
 */
static kmem_cache_t *hdr_cache;
static kmem_cache_t *buf_cache;

/*
 * Tear down the hash table and the header/buffer kmem caches.
 * Counterpart of buf_init().
 */
static void
buf_fini(void)
{
	int i;

	kmem_free(buf_hash_table.ht_table,
	    (buf_hash_table.ht_mask + 1) * sizeof (void *));
	for (i = 0; i < BUF_LOCKS; i++)
		mutex_destroy(&buf_hash_table.ht_locks[i].ht_lock);
	kmem_cache_destroy(hdr_cache);
	kmem_cache_destroy(buf_cache);
}

/*
 * Constructor callback - called when the cache is empty
 * and a new buf is requested.
 */
/* ARGSUSED */
static int
hdr_cons(void *vbuf, void *unused, int kmflag)
{
	arc_buf_hdr_t *buf = vbuf;

	bzero(buf, sizeof (arc_buf_hdr_t));
	refcount_create(&buf->b_refcnt);
	cv_init(&buf->b_cv, NULL, CV_DEFAULT, NULL);
	return (0);
}

/*
 * Destructor callback - called when a cached buf is
 * no longer required.
 */
/* ARGSUSED */
static void
hdr_dest(void *vbuf, void *unused)
{
	arc_buf_hdr_t *buf = vbuf;

	refcount_destroy(&buf->b_refcnt);
	cv_destroy(&buf->b_cv);
}

void arc_kmem_reclaim(void);

/*
 * Reclaim callback -- invoked when memory is low.
 */
/* ARGSUSED */
static void
hdr_recl(void *unused)
{
	dprintf("hdr_recl called\n");
	arc_kmem_reclaim();
}

/*
 * Size and allocate the buffer hash table, create the header and
 * buffer kmem caches, build the crc64 table used by buf_hash(), and
 * initialize the per-chain-group lock array.
 */
static void
buf_init(void)
{
	uint64_t *ct;
	uint64_t hsize = 1ULL << 10;
	int i, j;

	/*
	 * The hash table is big enough to fill all of physical memory
	 * with an average 4k block size.  The table will take up
	 * totalmem*sizeof(void*)/4k bytes (eg. 2MB/GB with 8-byte
	 * pointers).
	 */
	while (hsize * 4096 < physmem * PAGESIZE)
		hsize <<= 1;

	buf_hash_table.ht_mask = hsize - 1;
	buf_hash_table.ht_table = kmem_zalloc(hsize * sizeof (void*), KM_SLEEP);

	hdr_cache = kmem_cache_create("arc_buf_hdr_t", sizeof (arc_buf_hdr_t),
	    0, hdr_cons, hdr_dest, hdr_recl, NULL, NULL, 0);
	buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t),
	    0, NULL, NULL, NULL, NULL, NULL, 0);

	/* generate the reflected crc64 table for ZFS_CRC64_POLY */
	for (i = 0; i < 256; i++)
		for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--)
			*ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY);

	for (i = 0; i < BUF_LOCKS; i++) {
		mutex_init(&buf_hash_table.ht_locks[i].ht_lock,
		    NULL, MUTEX_DEFAULT, NULL);
	}
}

#define	ARC_MINTIME	(hz>>4) /* 62 ms */

#define	ARC_TAG		(void *)0x05201962

/*
 * Take a reference on a header.  On the 0 -> 1 transition (for a
 * non-anonymous buffer) the buffer becomes un-evictable: it is removed
 * from its state's evictable list and that list's lsize is debited.
 * The header's hash lock must be held.
 */
static void
add_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag)
{
	ASSERT(MUTEX_HELD(hash_lock));

	if ((refcount_add(&ab->b_refcnt, tag) == 1) &&
	    (ab->b_state != arc.anon)) {

		ASSERT(!MUTEX_HELD(&ab->b_state->mtx));
		mutex_enter(&ab->b_state->mtx);
		ASSERT(!refcount_is_zero(&ab->b_refcnt));
		ASSERT(list_link_active(&ab->b_arc_node));
		list_remove(&ab->b_state->list, ab);
		ASSERT3U(ab->b_state->lsize, >=, ab->b_size);
		ab->b_state->lsize -= ab->b_size;
		mutex_exit(&ab->b_state->mtx);
	}
}

/*
 * Drop a reference on a header; returns the remaining reference count.
 * On the 1 -> 0 transition (for a non-anonymous buffer) the buffer
 * becomes evictable: it is linked onto its state's list and that
 * list's lsize is credited.  The header's hash lock must be held.
 */
static int
remove_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag)
{
	int cnt;

	ASSERT(MUTEX_HELD(hash_lock));

	if (((cnt = refcount_remove(&ab->b_refcnt, tag)) == 0) &&
	    (ab->b_state != arc.anon)) {

		ASSERT(!MUTEX_HELD(&ab->b_state->mtx));
		mutex_enter(&ab->b_state->mtx);
		ASSERT(!list_link_active(&ab->b_arc_node));
		list_insert_head(&ab->b_state->list, ab);
		ASSERT(ab->b_buf != NULL);
		ab->b_state->lsize += ab->b_size;
		mutex_exit(&ab->b_state->mtx);
	}
	return (cnt);
}

/*
 * Move the supplied buffer to the indicated state.  The mutex
 * for the buffer must be held by the caller.
 */
static void
arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab,
    kmutex_t *hash_lock)
{
	arc_buf_t *buf;

	ASSERT(MUTEX_HELD(hash_lock));

	/*
	 * If this buffer is evictable, transfer it from the
	 * old state list to the new state list.
	 */
	if (refcount_is_zero(&ab->b_refcnt)) {
		if (ab->b_state != arc.anon) {
			int drop_mutex = FALSE;

			/* take the state mutex only if not already held */
			if (!MUTEX_HELD(&ab->b_state->mtx)) {
				mutex_enter(&ab->b_state->mtx);
				drop_mutex = TRUE;
			}
			ASSERT(list_link_active(&ab->b_arc_node));
			list_remove(&ab->b_state->list, ab);
			ASSERT3U(ab->b_state->lsize, >=, ab->b_size);
			ab->b_state->lsize -= ab->b_size;
			if (drop_mutex)
				mutex_exit(&ab->b_state->mtx);
		}
		if (new_state != arc.anon) {
			int drop_mutex = FALSE;

			if (!MUTEX_HELD(&new_state->mtx)) {
				mutex_enter(&new_state->mtx);
				drop_mutex = TRUE;
			}
			list_insert_head(&new_state->list, ab);
			ASSERT(ab->b_buf != NULL);
			new_state->lsize += ab->b_size;
			if (drop_mutex)
				mutex_exit(&new_state->mtx);
		}
	}

	ASSERT(!BUF_EMPTY(ab));
	/* buffers going anonymous lose their DVA identity */
	if (new_state == arc.anon && ab->b_state != arc.anon) {
		buf_hash_remove(ab);
	}

	/*
	 * If this buffer isn't being transferred to the MRU-top
	 * state, it's safe to clear its prefetch flag
	 */
	if ((new_state != arc.mru_top) && (new_state != arc.mru_bot)) {
		ab->b_flags &= ~ARC_PREFETCH;
	}

	buf = ab->b_buf;
	if (buf == NULL) {
		ASSERT3U(ab->b_state->size, >=, ab->b_size);
		atomic_add_64(&ab->b_state->size, -ab->b_size);
		/* we should only be here if we are deleting state */
		ASSERT(new_state == arc.anon &&
		    (ab->b_state == arc.mru_bot || ab->b_state == arc.mfu_bot));
	} else while (buf) {
		/* move the accounting for each attached buf */
		ASSERT3U(ab->b_state->size, >=, ab->b_size);
		atomic_add_64(&ab->b_state->size, -ab->b_size);
		atomic_add_64(&new_state->size, ab->b_size);
		buf = buf->b_next;
	}
	ab->b_state = new_state;
}

/*
 * Allocate a new anonymous buffer of the given size, with one
 * reference held by "tag".  Charges arc.size and arc.anon->size.
 */
arc_buf_t *
arc_buf_alloc(spa_t *spa, int size, void *tag)
{
	arc_buf_hdr_t *hdr;
	arc_buf_t *buf;

	ASSERT3U(size, >, 0);
	hdr = kmem_cache_alloc(hdr_cache, KM_SLEEP);
	ASSERT(BUF_EMPTY(hdr));
	hdr->b_size = size;
	hdr->b_spa = spa;
	hdr->b_state = arc.anon;
	hdr->b_arc_access = 0;
	buf = kmem_cache_alloc(buf_cache, KM_SLEEP);
	buf->b_hdr = hdr;
	buf->b_next = NULL;
	buf->b_data = zio_buf_alloc(size);
	hdr->b_buf = buf;
	hdr->b_flags = 0;
	ASSERT(refcount_is_zero(&hdr->b_refcnt));
	(void) refcount_add(&hdr->b_refcnt, tag);

	atomic_add_64(&arc.size, size);
	atomic_add_64(&arc.anon->size, size);

	return (buf);
}

/*
 * Free an anonymous, unreferenced header and its (single) buffer,
 * un-charging arc.size and arc.anon->size.
 */
static void
arc_hdr_free(arc_buf_hdr_t *hdr)
{
	ASSERT(refcount_is_zero(&hdr->b_refcnt));
	ASSERT3P(hdr->b_state, ==, arc.anon);

	if (!BUF_EMPTY(hdr)) {
		/*
		 * We can be called with an arc state lock held,
		 * so we can't hold a hash lock here.
		 * ASSERT(not in hash table)
		 */
		ASSERT(!HDR_IO_IN_PROGRESS(hdr));
		bzero(&hdr->b_dva, sizeof (dva_t));
		hdr->b_birth = 0;
		hdr->b_cksum0 = 0;
	}
	if (hdr->b_buf) {
		arc_buf_t *buf = hdr->b_buf;

		ASSERT3U(hdr->b_size, >, 0);
		zio_buf_free(buf->b_data, hdr->b_size);
		atomic_add_64(&arc.size, -hdr->b_size);
		ASSERT3U(arc.anon->size, >=, hdr->b_size);
		atomic_add_64(&arc.anon->size, -hdr->b_size);
		ASSERT3P(buf->b_next, ==, NULL);
		kmem_cache_free(buf_cache, buf);
		hdr->b_buf = NULL;
	}
	ASSERT(!list_link_active(&hdr->b_arc_node));
	ASSERT3P(hdr->b_hash_next, ==, NULL);
	ASSERT3P(hdr->b_acb, ==, NULL);
	kmem_cache_free(hdr_cache, hdr);
}

/*
 * Release the tag's reference on this buffer.  If other references
 * remain, only this buf (not the header) is unlinked and freed;
 * otherwise the whole header is freed, unless an async write is
 * still in progress on it.
 */
void
arc_buf_free(arc_buf_t *buf, void *tag)
{
	arc_buf_hdr_t *hdr = buf->b_hdr;
	kmutex_t *hash_lock = HDR_LOCK(hdr);
	int freeable;

	mutex_enter(hash_lock);
	if (remove_reference(hdr, hash_lock, tag) > 0) {
		arc_buf_t **bufp = &hdr->b_buf;
		arc_state_t *state = hdr->b_state;
		uint64_t size = hdr->b_size;

		ASSERT(hdr->b_state != arc.anon || HDR_IO_ERROR(hdr));
		/* unlink this buf from the header's buf chain */
		while (*bufp != buf) {
			ASSERT(*bufp);
			bufp = &(*bufp)->b_next;
		}
		*bufp = buf->b_next;
		mutex_exit(hash_lock);
		zio_buf_free(buf->b_data, size);
		atomic_add_64(&arc.size, -size);
		kmem_cache_free(buf_cache, buf);
		ASSERT3U(state->size, >=, size);
		atomic_add_64(&state->size, -size);
		return;
	}

	/* don't free buffers that are in the middle of an async write */
	freeable = (hdr->b_state == arc.anon && hdr->b_acb == NULL);
	mutex_exit(hash_lock);

	if (freeable)
		arc_hdr_free(hdr);
}

/* Return the data size of the buffer (from its header). */
int
arc_buf_size(arc_buf_t *buf)
{
	return (buf->b_hdr->b_size);
}

/*
 * Evict buffers from list until we've removed the specified number of
 * bytes.  Move the removed buffers to the appropriate evict state.
730fa9e4066Sahrens */ 731fa9e4066Sahrens static uint64_t 732fa9e4066Sahrens arc_evict_state(arc_state_t *state, int64_t bytes) 733fa9e4066Sahrens { 734fa9e4066Sahrens arc_state_t *evicted_state; 735fa9e4066Sahrens uint64_t bytes_evicted = 0; 736fa9e4066Sahrens arc_buf_hdr_t *ab, *ab_prev; 737fa9e4066Sahrens kmutex_t *hash_lock; 738fa9e4066Sahrens 739fa9e4066Sahrens ASSERT(state == arc.mru_top || state == arc.mfu_top); 740fa9e4066Sahrens 741fa9e4066Sahrens if (state == arc.mru_top) 742fa9e4066Sahrens evicted_state = arc.mru_bot; 743fa9e4066Sahrens else 744fa9e4066Sahrens evicted_state = arc.mfu_bot; 745fa9e4066Sahrens 746fa9e4066Sahrens mutex_enter(&state->mtx); 747fa9e4066Sahrens mutex_enter(&evicted_state->mtx); 748fa9e4066Sahrens 749fa9e4066Sahrens for (ab = list_tail(&state->list); ab; ab = ab_prev) { 750fa9e4066Sahrens ab_prev = list_prev(&state->list, ab); 751fa9e4066Sahrens hash_lock = HDR_LOCK(ab); 752fa9e4066Sahrens if (mutex_tryenter(hash_lock)) { 753fa9e4066Sahrens ASSERT3U(refcount_count(&ab->b_refcnt), ==, 0); 754fa9e4066Sahrens arc_change_state(evicted_state, ab, hash_lock); 755fa9e4066Sahrens zio_buf_free(ab->b_buf->b_data, ab->b_size); 756fa9e4066Sahrens atomic_add_64(&arc.size, -ab->b_size); 757fa9e4066Sahrens ASSERT3P(ab->b_buf->b_next, ==, NULL); 758fa9e4066Sahrens kmem_cache_free(buf_cache, ab->b_buf); 759fa9e4066Sahrens ab->b_buf = NULL; 760fa9e4066Sahrens DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, ab); 761fa9e4066Sahrens bytes_evicted += ab->b_size; 762fa9e4066Sahrens mutex_exit(hash_lock); 763fa9e4066Sahrens if (bytes_evicted >= bytes) 764fa9e4066Sahrens break; 765fa9e4066Sahrens } else { 766fa9e4066Sahrens atomic_add_64(&arc.skipped, 1); 767fa9e4066Sahrens } 768fa9e4066Sahrens } 769fa9e4066Sahrens mutex_exit(&evicted_state->mtx); 770fa9e4066Sahrens mutex_exit(&state->mtx); 771fa9e4066Sahrens 772fa9e4066Sahrens if (bytes_evicted < bytes) 773fa9e4066Sahrens dprintf("only evicted %lld bytes from %x", 774fa9e4066Sahrens (longlong_t)bytes_evicted, 
state); 775fa9e4066Sahrens 776fa9e4066Sahrens return (bytes_evicted); 777fa9e4066Sahrens } 778fa9e4066Sahrens 779fa9e4066Sahrens /* 780fa9e4066Sahrens * Remove buffers from list until we've removed the specified number of 781fa9e4066Sahrens * bytes. Destroy the buffers that are removed. 782fa9e4066Sahrens */ 783fa9e4066Sahrens static void 784fa9e4066Sahrens arc_delete_state(arc_state_t *state, int64_t bytes) 785fa9e4066Sahrens { 786fa9e4066Sahrens uint_t bufs_skipped = 0; 787fa9e4066Sahrens uint64_t bytes_deleted = 0; 788fa9e4066Sahrens arc_buf_hdr_t *ab, *ab_prev; 789fa9e4066Sahrens kmutex_t *hash_lock; 790fa9e4066Sahrens 791fa9e4066Sahrens top: 792fa9e4066Sahrens mutex_enter(&state->mtx); 793fa9e4066Sahrens for (ab = list_tail(&state->list); ab; ab = ab_prev) { 794fa9e4066Sahrens ab_prev = list_prev(&state->list, ab); 795fa9e4066Sahrens hash_lock = HDR_LOCK(ab); 796fa9e4066Sahrens if (mutex_tryenter(hash_lock)) { 797fa9e4066Sahrens arc_change_state(arc.anon, ab, hash_lock); 798fa9e4066Sahrens mutex_exit(hash_lock); 799fa9e4066Sahrens atomic_add_64(&arc.deleted, 1); 800fa9e4066Sahrens DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, ab); 801fa9e4066Sahrens bytes_deleted += ab->b_size; 802fa9e4066Sahrens arc_hdr_free(ab); 803fa9e4066Sahrens if (bytes >= 0 && bytes_deleted >= bytes) 804fa9e4066Sahrens break; 805fa9e4066Sahrens } else { 806fa9e4066Sahrens if (bytes < 0) { 807fa9e4066Sahrens mutex_exit(&state->mtx); 808fa9e4066Sahrens mutex_enter(hash_lock); 809fa9e4066Sahrens mutex_exit(hash_lock); 810fa9e4066Sahrens goto top; 811fa9e4066Sahrens } 812fa9e4066Sahrens bufs_skipped += 1; 813fa9e4066Sahrens } 814fa9e4066Sahrens } 815fa9e4066Sahrens mutex_exit(&state->mtx); 816fa9e4066Sahrens 817fa9e4066Sahrens if (bufs_skipped) { 818fa9e4066Sahrens atomic_add_64(&arc.skipped, bufs_skipped); 819fa9e4066Sahrens ASSERT(bytes >= 0); 820fa9e4066Sahrens } 821fa9e4066Sahrens 822fa9e4066Sahrens if (bytes_deleted < bytes) 823fa9e4066Sahrens dprintf("only deleted %lld bytes from %p", 
824fa9e4066Sahrens (longlong_t)bytes_deleted, state); 825fa9e4066Sahrens } 826fa9e4066Sahrens 827fa9e4066Sahrens static void 828fa9e4066Sahrens arc_adjust(void) 829fa9e4066Sahrens { 830fa9e4066Sahrens int64_t top_sz, mru_over, arc_over; 831fa9e4066Sahrens 832fa9e4066Sahrens top_sz = arc.anon->size + arc.mru_top->size; 833fa9e4066Sahrens 834fa9e4066Sahrens if (top_sz > arc.p && arc.mru_top->lsize > 0) { 835fa9e4066Sahrens int64_t toevict = MIN(arc.mru_top->lsize, top_sz-arc.p); 836fa9e4066Sahrens (void) arc_evict_state(arc.mru_top, toevict); 837fa9e4066Sahrens top_sz = arc.anon->size + arc.mru_top->size; 838fa9e4066Sahrens } 839fa9e4066Sahrens 840fa9e4066Sahrens mru_over = top_sz + arc.mru_bot->size - arc.c; 841fa9e4066Sahrens 842fa9e4066Sahrens if (mru_over > 0) { 843fa9e4066Sahrens if (arc.mru_bot->lsize > 0) { 844fa9e4066Sahrens int64_t todelete = MIN(arc.mru_bot->lsize, mru_over); 845fa9e4066Sahrens arc_delete_state(arc.mru_bot, todelete); 846fa9e4066Sahrens } 847fa9e4066Sahrens } 848fa9e4066Sahrens 849fa9e4066Sahrens if ((arc_over = arc.size - arc.c) > 0) { 850fa9e4066Sahrens int64_t table_over; 851fa9e4066Sahrens 852fa9e4066Sahrens if (arc.mfu_top->lsize > 0) { 853fa9e4066Sahrens int64_t toevict = MIN(arc.mfu_top->lsize, arc_over); 854fa9e4066Sahrens (void) arc_evict_state(arc.mfu_top, toevict); 855fa9e4066Sahrens } 856fa9e4066Sahrens 857fa9e4066Sahrens table_over = arc.size + arc.mru_bot->lsize + arc.mfu_bot->lsize 858fa9e4066Sahrens - arc.c*2; 859fa9e4066Sahrens 860fa9e4066Sahrens if (table_over > 0 && arc.mfu_bot->lsize > 0) { 861fa9e4066Sahrens int64_t todelete = MIN(arc.mfu_bot->lsize, table_over); 862fa9e4066Sahrens arc_delete_state(arc.mfu_bot, todelete); 863fa9e4066Sahrens } 864fa9e4066Sahrens } 865fa9e4066Sahrens } 866fa9e4066Sahrens 867fa9e4066Sahrens /* 868fa9e4066Sahrens * Flush all *evictable* data from the cache. 869fa9e4066Sahrens * NOTE: this will not touch "active" (i.e. referenced) data. 
 */
void
arc_flush(void)
{
	/* -1 == delete everything evictable from each state */
	arc_delete_state(arc.mru_top, -1);
	arc_delete_state(arc.mfu_top, -1);

	arc_delete_state(arc.mru_bot, -1);
	arc_delete_state(arc.mfu_bot, -1);
}

/*
 * Shrink the cache targets (c and p) by 1/16th (6.25%) and evict down to
 * the new targets.  Called from the kmem reclaim path under memory
 * pressure; also temporarily disables growth (arc.incr = 0) and relaxes
 * the size-check frequency.
 */
void
arc_kmem_reclaim(void)
{
	/* Remove 6.25% */
	/*
	 * We need arc_reclaim_lock because we don't want multiple
	 * threads trying to reclaim concurrently.
	 */

	/*
	 * umem calls the reclaim func when we destroy the buf cache,
	 * which is after we do arc_fini().  So we set a flag to prevent
	 * accessing the destroyed mutexes and lists.
	 */
	if (arc_dead)
		return;

	mutex_enter(&arc_reclaim_lock);

	atomic_add_64(&arc.c, -(arc.c >> 4));
	if (arc.c < arc.c_min)
		arc.c = arc.c_min;
	atomic_add_64(&arc.p, -(arc.p >> 4));

	arc_adjust();

	/* Cool it for a while */
	arc.incr = 0;
	arc.size_check = arc_size_check_default << 3;

	mutex_exit(&arc_reclaim_lock);
}

/*
 * Decide whether the system is short enough on memory that the ARC
 * should shrink.  Returns 1 if reclaim is needed, 0 otherwise.  In
 * userland (!_KERNEL) this randomly returns 1 about 1% of the time to
 * exercise the reclaim path.
 */
static int
arc_reclaim_needed(void)
{
	uint64_t extra;

#ifdef _KERNEL
	/*
	 * take 'desfree' extra pages, so we reclaim sooner, rather than later
	 */
	extra = desfree;

	/*
	 * check that we're out of range of the pageout scanner.  It starts to
	 * schedule paging if freemem is less than lotsfree and needfree.
	 * lotsfree is the high-water mark for pageout, and needfree is the
	 * number of needed free pages.  We add extra pages here to make sure
	 * the scanner doesn't start up while we're freeing memory.
	 */
	if (freemem < lotsfree + needfree + extra)
		return (1);

	/*
	 * check to make sure that swapfs has enough space so that anon
	 * reservations can still succeed.  anon_resvmem() checks that the
	 * availrmem is greater than swapfs_minfree, and the number of reserved
	 * swap pages.  We also add a bit of extra here just to prevent
	 * circumstances from getting really dire.
	 */
	if (availrmem < swapfs_minfree + swapfs_reserve + extra)
		return (1);

	/*
	 * If we're on an i386 platform, it's possible that we'll exhaust the
	 * kernel heap space before we ever run out of available physical
	 * memory.  Most checks of the size of the heap_area compare against
	 * tune.t_minarmem, which is the minimum available real memory that we
	 * can have in the system.  However, this is generally fixed at 25 pages
	 * which is so low that it's useless.  In this comparison, we seek to
	 * calculate the total heap-size, and reclaim if more than 3/4ths of the
	 * heap is allocated.  (Or, in the calculation, if less than 1/4th is
	 * free)
	 */
#if defined(__i386)
	if (btop(vmem_size(heap_arena, VMEM_FREE)) <
	    (btop(vmem_size(heap_arena, VMEM_FREE | VMEM_ALLOC)) >> 2))
		return (1);
#endif

#else
	/* userland: simulate occasional memory pressure for testing */
	if (spa_get_random(100) == 0)
		return (1);
#endif
	return (0);
}

/*
 * Ask kmem to return free memory from the ARC-related caches (and every
 * zio_buf cache).  The hdr cache is reaped only for an aggressive reclaim,
 * since reaping it triggers arc_kmem_reclaim() and shrinks the cache size.
 */
static void
arc_kmem_reap_now(arc_reclaim_strategy_t strat)
{
	size_t i;
	kmem_cache_t *prev_cache = NULL;
	extern kmem_cache_t *zio_buf_cache[];

	/*
	 * an aggressive reclamation will shrink the cache size as well as reap
	 * free kmem buffers.  The arc_kmem_reclaim function is called when the
	 * header-cache is reaped, so we only reap the header cache if we're
	 * performing an aggressive reclaim.  If we're not, just clean the kmem
	 * buffer caches.
	 */
	if (strat == ARC_RECLAIM_AGGR)
		kmem_cache_reap_now(hdr_cache);

	kmem_cache_reap_now(buf_cache);

	/* adjacent sizes may share a cache; skip duplicates */
	for (i = 0; i < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; i++) {
		if (zio_buf_cache[i] != prev_cache) {
			prev_cache = zio_buf_cache[i];
			kmem_cache_reap_now(zio_buf_cache[i]);
		}
	}
}

/*
 * Dedicated reclaim thread: wakes at least once a second (or when
 * signalled via arc_reclaim_thr_cv), reaps caches while memory is tight
 * — alternating conservative/aggressive passes — and re-enables cache
 * growth once arc_grow_retry seconds have passed without pressure.
 */
static void
arc_reclaim_thread(void)
{
	clock_t growtime = 0;
	arc_reclaim_strategy_t last_reclaim = ARC_RECLAIM_CONS;
	callb_cpr_t cpr;

	CALLB_CPR_INIT(&cpr, &arc_reclaim_thr_lock, callb_generic_cpr, FTAG);

	mutex_enter(&arc_reclaim_thr_lock);
	while (arc_thread_exit == 0) {
		if (arc_reclaim_needed()) {

			if (arc.no_grow) {
				/* alternate strategies on repeat pressure */
				if (last_reclaim == ARC_RECLAIM_CONS) {
					last_reclaim = ARC_RECLAIM_AGGR;
				} else {
					last_reclaim = ARC_RECLAIM_CONS;
				}
			} else {
				arc.no_grow = TRUE;
				last_reclaim = ARC_RECLAIM_AGGR;
				membar_producer();
			}

			/* reset the growth delay for every reclaim */
			growtime = lbolt + (arc_grow_retry * hz);

			arc_kmem_reap_now(last_reclaim);

		} else if ((growtime > 0) && ((growtime - lbolt) <= 0)) {
			/* pressure has been gone long enough; allow growth */
			arc.no_grow = FALSE;
		}

		/* block until needed, or one second,
whichever is shorter */ 1031fa9e4066Sahrens CALLB_CPR_SAFE_BEGIN(&cpr); 1032fa9e4066Sahrens (void) cv_timedwait(&arc_reclaim_thr_cv, 1033fa9e4066Sahrens &arc_reclaim_thr_lock, (lbolt + hz)); 1034fa9e4066Sahrens CALLB_CPR_SAFE_END(&cpr, &arc_reclaim_thr_lock); 1035fa9e4066Sahrens } 1036fa9e4066Sahrens 1037fa9e4066Sahrens arc_thread_exit = 0; 1038fa9e4066Sahrens cv_broadcast(&arc_reclaim_thr_cv); 1039fa9e4066Sahrens CALLB_CPR_EXIT(&cpr); /* drops arc_reclaim_thr_lock */ 1040fa9e4066Sahrens thread_exit(); 1041fa9e4066Sahrens } 1042fa9e4066Sahrens 1043fa9e4066Sahrens static void 1044fa9e4066Sahrens arc_try_grow(int64_t bytes) 1045fa9e4066Sahrens { 1046fa9e4066Sahrens /* 1047fa9e4066Sahrens * If we're within (2 * maxblocksize) bytes of the target 1048fa9e4066Sahrens * cache size, increment the target cache size 1049fa9e4066Sahrens */ 1050fa9e4066Sahrens atomic_add_64((uint64_t *)&arc.size_check, 1); 1051fa9e4066Sahrens 1052fa9e4066Sahrens if (arc_reclaim_needed()) { 1053fa9e4066Sahrens cv_signal(&arc_reclaim_thr_cv); 1054fa9e4066Sahrens return; 1055fa9e4066Sahrens } 1056fa9e4066Sahrens 1057fa9e4066Sahrens if (arc.no_grow) 1058fa9e4066Sahrens return; 1059fa9e4066Sahrens 1060fa9e4066Sahrens /* 1061fa9e4066Sahrens * return true if we successfully grow, or if there's enough space that 1062fa9e4066Sahrens * we don't have to grow. Above, we return false if we can't grow, or 1063fa9e4066Sahrens * if we shouldn't because a reclaim is in progress. 
1064fa9e4066Sahrens */ 1065fa9e4066Sahrens if ((arc.c - arc.size) <= (2ULL << SPA_MAXBLOCKSHIFT)) { 1066fa9e4066Sahrens if (arc.size_check > 0) { 1067fa9e4066Sahrens arc.size_check = arc_size_check_default; 1068fa9e4066Sahrens atomic_add_64(&arc.incr, arc_incr_size); 1069fa9e4066Sahrens } 1070fa9e4066Sahrens atomic_add_64(&arc.c, MIN(bytes, arc.incr)); 1071fa9e4066Sahrens if (arc.c > arc.c_max) 1072fa9e4066Sahrens arc.c = arc.c_max; 1073fa9e4066Sahrens else 1074fa9e4066Sahrens atomic_add_64(&arc.p, MIN(bytes, arc.incr)); 1075fa9e4066Sahrens } else if (arc.size > arc.c) { 1076fa9e4066Sahrens if (arc.size_check > 0) { 1077fa9e4066Sahrens arc.size_check = arc_size_check_default; 1078fa9e4066Sahrens atomic_add_64(&arc.incr, arc_incr_size); 1079fa9e4066Sahrens } 1080fa9e4066Sahrens atomic_add_64(&arc.c, MIN(bytes, arc.incr)); 1081fa9e4066Sahrens if (arc.c > arc.c_max) 1082fa9e4066Sahrens arc.c = arc.c_max; 1083fa9e4066Sahrens else 1084fa9e4066Sahrens atomic_add_64(&arc.p, MIN(bytes, arc.incr)); 1085fa9e4066Sahrens } 1086fa9e4066Sahrens } 1087fa9e4066Sahrens 1088fa9e4066Sahrens /* 1089fa9e4066Sahrens * check if the cache has reached its limits and eviction is required prior to 1090fa9e4066Sahrens * insert. In this situation, we want to evict if no_grow is set Otherwise, the 1091fa9e4066Sahrens * cache is either big enough that we can insert, or a arc_try_grow will result 1092fa9e4066Sahrens * in more space being made available. 
1093fa9e4066Sahrens */ 1094fa9e4066Sahrens 1095fa9e4066Sahrens static int 1096fa9e4066Sahrens arc_evict_needed() 1097fa9e4066Sahrens { 1098fa9e4066Sahrens 1099fa9e4066Sahrens if (arc_reclaim_needed()) 1100fa9e4066Sahrens return (1); 1101fa9e4066Sahrens 1102fa9e4066Sahrens if (arc.no_grow || (arc.c > arc.c_max) || (arc.size > arc.c)) 1103fa9e4066Sahrens return (1); 1104fa9e4066Sahrens 1105fa9e4066Sahrens return (0); 1106fa9e4066Sahrens } 1107fa9e4066Sahrens 1108fa9e4066Sahrens /* 1109fa9e4066Sahrens * The state, supplied as the first argument, is going to have something 1110fa9e4066Sahrens * inserted on its behalf. So, determine which cache must be victimized to 1111fa9e4066Sahrens * satisfy an insertion for this state. We have the following cases: 1112fa9e4066Sahrens * 1113fa9e4066Sahrens * 1. Insert for MRU, p > sizeof(arc.anon + arc.mru_top) -> 1114fa9e4066Sahrens * In this situation if we're out of space, but the resident size of the MFU is 1115fa9e4066Sahrens * under the limit, victimize the MFU cache to satisfy this insertion request. 1116fa9e4066Sahrens * 1117fa9e4066Sahrens * 2. Insert for MRU, p <= sizeof(arc.anon + arc.mru_top) -> 1118fa9e4066Sahrens * Here, we've used up all of the available space for the MRU, so we need to 1119fa9e4066Sahrens * evict from our own cache instead. Evict from the set of resident MRU 1120fa9e4066Sahrens * entries. 1121fa9e4066Sahrens * 1122fa9e4066Sahrens * 3. Insert for MFU (c - p) > sizeof(arc.mfu_top) -> 1123fa9e4066Sahrens * c minus p represents the MFU space in the cache, since p is the size of the 1124fa9e4066Sahrens * cache that is dedicated to the MRU. In this situation there's still space on 1125fa9e4066Sahrens * the MFU side, so the MRU side needs to be victimized. 1126fa9e4066Sahrens * 1127fa9e4066Sahrens * 4. Insert for MFU (c - p) < sizeof(arc.mfu_top) -> 1128fa9e4066Sahrens * MFU's resident set is consuming more space than it has been allotted. 
In 1129fa9e4066Sahrens * this situation, we must victimize our own cache, the MFU, for this insertion. 1130fa9e4066Sahrens */ 1131fa9e4066Sahrens static void 1132fa9e4066Sahrens arc_evict_for_state(arc_state_t *state, uint64_t bytes) 1133fa9e4066Sahrens { 1134fa9e4066Sahrens uint64_t mru_used; 1135fa9e4066Sahrens uint64_t mfu_space; 1136fa9e4066Sahrens uint64_t evicted; 1137fa9e4066Sahrens 1138fa9e4066Sahrens ASSERT(state == arc.mru_top || state == arc.mfu_top); 1139fa9e4066Sahrens 1140fa9e4066Sahrens if (state == arc.mru_top) { 1141fa9e4066Sahrens mru_used = arc.anon->size + arc.mru_top->size; 1142fa9e4066Sahrens if (arc.p > mru_used) { 1143fa9e4066Sahrens /* case 1 */ 1144fa9e4066Sahrens evicted = arc_evict_state(arc.mfu_top, bytes); 1145fa9e4066Sahrens if (evicted < bytes) { 1146fa9e4066Sahrens arc_adjust(); 1147fa9e4066Sahrens } 1148fa9e4066Sahrens } else { 1149fa9e4066Sahrens /* case 2 */ 1150fa9e4066Sahrens evicted = arc_evict_state(arc.mru_top, bytes); 1151fa9e4066Sahrens if (evicted < bytes) { 1152fa9e4066Sahrens arc_adjust(); 1153fa9e4066Sahrens } 1154fa9e4066Sahrens } 1155fa9e4066Sahrens } else { 1156fa9e4066Sahrens /* MFU_top case */ 1157fa9e4066Sahrens mfu_space = arc.c - arc.p; 1158fa9e4066Sahrens if (mfu_space > arc.mfu_top->size) { 1159fa9e4066Sahrens /* case 3 */ 1160fa9e4066Sahrens evicted = arc_evict_state(arc.mru_top, bytes); 1161fa9e4066Sahrens if (evicted < bytes) { 1162fa9e4066Sahrens arc_adjust(); 1163fa9e4066Sahrens } 1164fa9e4066Sahrens } else { 1165fa9e4066Sahrens /* case 4 */ 1166fa9e4066Sahrens evicted = arc_evict_state(arc.mfu_top, bytes); 1167fa9e4066Sahrens if (evicted < bytes) { 1168fa9e4066Sahrens arc_adjust(); 1169fa9e4066Sahrens } 1170fa9e4066Sahrens } 1171fa9e4066Sahrens } 1172fa9e4066Sahrens } 1173fa9e4066Sahrens 1174fa9e4066Sahrens /* 1175fa9e4066Sahrens * This routine is called whenever a buffer is accessed. 
 */
/*
 * Apply the ARC replacement policy to an accessed buffer: place anonymous
 * buffers into mru_top, promote re-accessed buffers toward mfu_top, and
 * nudge the MRU/MFU balance point (arc.p) on ghost-list hits.  Called
 * with the buffer's hash lock held.
 */
static void
arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock)
{
	int blksz, mult;

	ASSERT(MUTEX_HELD(hash_lock));

	blksz = buf->b_size;

	if (buf->b_state == arc.anon) {
		/*
		 * This buffer is not in the cache, and does not
		 * appear in our "ghost" list.  Add the new buffer
		 * to the MRU state.
		 */

		arc_try_grow(blksz);
		if (arc_evict_needed()) {
			arc_evict_for_state(arc.mru_top, blksz);
		}

		ASSERT(buf->b_arc_access == 0);
		buf->b_arc_access = lbolt;
		DTRACE_PROBE1(new_state__mru_top, arc_buf_hdr_t *,
		    buf);
		arc_change_state(arc.mru_top, buf, hash_lock);

		/*
		 * If we are using less than 2/3 of our total target
		 * cache size, bump up the target size for the MRU
		 * list.
		 */
		if (arc.size < arc.c*2/3) {
			arc.p = arc.anon->size + arc.mru_top->size + arc.c/6;
		}

	} else if (buf->b_state == arc.mru_top) {
		/*
		 * If this buffer is in the MRU-top state and has the prefetch
		 * flag, the first read was actually part of a prefetch.  In
		 * this situation, we simply want to clear the flag and return.
		 * A subsequent access should bump this into the MFU state.
		 */
		if ((buf->b_flags & ARC_PREFETCH) != 0) {
			buf->b_flags &= ~ARC_PREFETCH;
			atomic_add_64(&arc.mru_top->hits, 1);
			return;
		}

		/*
		 * This buffer has been "accessed" only once so far,
		 * but it is still in the cache.  Move it to the MFU
		 * state.
		 */
		if (lbolt > buf->b_arc_access + ARC_MINTIME) {
			/*
			 * More than 125ms have passed since we
			 * instantiated this buffer.  Move it to the
			 * most frequently used state.
			 */
			buf->b_arc_access = lbolt;
			DTRACE_PROBE1(new_state__mfu_top,
			    arc_buf_hdr_t *, buf);
			arc_change_state(arc.mfu_top, buf, hash_lock);
		}
		atomic_add_64(&arc.mru_top->hits, 1);
	} else if (buf->b_state == arc.mru_bot) {
		arc_state_t *new_state;
		/*
		 * This buffer has been "accessed" recently, but
		 * was evicted from the cache.  Move it to the
		 * MFU state.
		 */

		/* prefetched ghost hits go back to MRU, not MFU */
		if (buf->b_flags & ARC_PREFETCH) {
			new_state = arc.mru_top;
			DTRACE_PROBE1(new_state__mru_top,
			    arc_buf_hdr_t *, buf);
		} else {
			new_state = arc.mfu_top;
			DTRACE_PROBE1(new_state__mfu_top,
			    arc_buf_hdr_t *, buf);
		}

		arc_try_grow(blksz);
		if (arc_evict_needed()) {
			arc_evict_for_state(new_state, blksz);
		}

		/* Bump up the target size of the MRU list */
		mult = ((arc.mru_bot->size >= arc.mfu_bot->size) ?
		    1 : (arc.mfu_bot->size/arc.mru_bot->size));
		arc.p = MIN(arc.c, arc.p + blksz * mult);

		buf->b_arc_access = lbolt;
		arc_change_state(new_state, buf, hash_lock);

		atomic_add_64(&arc.mru_bot->hits, 1);
	} else if (buf->b_state == arc.mfu_top) {
		/*
		 * This buffer has been accessed more than once and is
		 * still in the cache.  Keep it in the MFU state.
		 *
		 * NOTE: the add_reference() that occurred when we did
		 * the arc_read() should have kicked this off the list,
		 * so even if it was a prefetch, it will be put back at
		 * the head of the list when we remove_reference().
		 */
		atomic_add_64(&arc.mfu_top->hits, 1);
	} else if (buf->b_state == arc.mfu_bot) {
		/*
		 * This buffer has been accessed more than once but has
		 * been evicted from the cache.
		 * Move it back to the
		 * MFU state.
		 */

		arc_try_grow(blksz);
		if (arc_evict_needed()) {
			arc_evict_for_state(arc.mfu_top, blksz);
		}

		/* Bump up the target size for the MFU list */
		mult = ((arc.mfu_bot->size >= arc.mru_bot->size) ?
		    1 : (arc.mru_bot->size/arc.mfu_bot->size));
		arc.p = MAX(0, (int64_t)arc.p - blksz * mult);

		buf->b_arc_access = lbolt;
		DTRACE_PROBE1(new_state__mfu_top,
		    arc_buf_hdr_t *, buf);
		arc_change_state(arc.mfu_top, buf, hash_lock);

		atomic_add_64(&arc.mfu_bot->hits, 1);
	} else {
		ASSERT(!"invalid arc state");
	}

}

/* a generic arc_done_func_t which you can use */
/* ARGSUSED */
void
arc_bcopy_func(zio_t *zio, arc_buf_t *buf, void *arg)
{
	/* copy the data out to the caller's buffer, then drop our copy */
	bcopy(buf->b_data, arg, buf->b_hdr->b_size);
	arc_buf_free(buf, arg);
}

/* a generic arc_done_func_t which you can use */
void
arc_getbuf_func(zio_t *zio, arc_buf_t *buf, void *arg)
{
	/* hand the buf itself to the caller; NULL (and free) on I/O error */
	arc_buf_t **bufp = arg;
	if (zio && zio->io_error) {
		arc_buf_free(buf, arg);
		*bufp = NULL;
	} else {
		*bufp = buf;
	}
}

/*
 * I/O completion for arc_read(): verify the hdr, byteswap if needed,
 * fan the data out to every registered callback (copying for all but the
 * first consumer), clear the in-progress state, wake waiters, invoke the
 * callbacks, and free the hdr if the buffer was freed mid-read.
 */
static void
arc_read_done(zio_t *zio)
{
	arc_buf_hdr_t *hdr;
	arc_buf_t *buf;
	arc_buf_t *abuf;	/* buffer we're assigning to callback */
	kmutex_t *hash_lock;
	arc_callback_t *callback_list, *acb;
	int freeable = FALSE;

	buf = zio->io_private;
	hdr = buf->b_hdr;

	if (!HDR_FREED_IN_READ(hdr)) {
		arc_buf_hdr_t *found;

		found = buf_hash_find(zio->io_spa, &hdr->b_dva, hdr->b_birth,
		    &hash_lock);

		/*
		 * Buffer was inserted into hash-table and removed from lists
		 * prior to starting I/O.  We should find this header, since
		 * it's in the hash table, and it should be legit since it's
		 * not possible to evict it during the I/O.
		 */

		ASSERT(found);
		ASSERT(DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp)));
	}

	/* byteswap if necessary */
	callback_list = hdr->b_acb;
	ASSERT(callback_list != NULL);
	if (BP_SHOULD_BYTESWAP(zio->io_bp) && callback_list->acb_byteswap)
		callback_list->acb_byteswap(buf->b_data, hdr->b_size);

	/* create copies of the data buffer for the callers */
	abuf = buf;
	for (acb = callback_list; acb; acb = acb->acb_next) {
		if (acb->acb_done) {
			/* first consumer gets `buf'; later ones get copies */
			if (abuf == NULL) {
				abuf = kmem_cache_alloc(buf_cache, KM_SLEEP);
				abuf->b_data = zio_buf_alloc(hdr->b_size);
				atomic_add_64(&arc.size, hdr->b_size);
				bcopy(buf->b_data, abuf->b_data, hdr->b_size);
				abuf->b_hdr = hdr;
				abuf->b_next = hdr->b_buf;
				hdr->b_buf = abuf;
				atomic_add_64(&hdr->b_state->size, hdr->b_size);
			}
			acb->acb_buf = abuf;
			abuf = NULL;
		} else {
			/*
			 * The caller did not provide a callback function.
			 * In this case, we should just remove the reference.
			 */
			if (HDR_FREED_IN_READ(hdr)) {
				ASSERT3P(hdr->b_state, ==, arc.anon);
				(void) refcount_remove(&hdr->b_refcnt,
				    acb->acb_private);
			} else {
				(void) remove_reference(hdr, hash_lock,
				    acb->acb_private);
			}
		}
	}
	hdr->b_acb = NULL;
	hdr->b_flags &= ~ARC_IO_IN_PROGRESS;

	ASSERT(refcount_is_zero(&hdr->b_refcnt) || callback_list != NULL);

	if (zio->io_error != 0) {
		/* failed reads are pulled out of the cache (anon state) */
		hdr->b_flags |= ARC_IO_ERROR;
		if (hdr->b_state != arc.anon)
			arc_change_state(arc.anon, hdr, hash_lock);
		freeable = refcount_is_zero(&hdr->b_refcnt);
	}

	if (!HDR_FREED_IN_READ(hdr)) {
		/*
		 * Only call arc_access on anonymous buffers.  This is because
		 * if we've issued an I/O for an evicted buffer, we've already
		 * called arc_access (to prevent any simultaneous readers from
		 * getting confused).
		 */
		if (zio->io_error == 0 && hdr->b_state == arc.anon)
			arc_access(hdr, hash_lock);
		mutex_exit(hash_lock);
	} else {
		/*
		 * This block was freed while we waited for the read to
		 * complete.  It has been removed from the hash table and
		 * moved to the anonymous state (so that it won't show up
		 * in the cache).
		 */
		ASSERT3P(hdr->b_state, ==, arc.anon);
		freeable = refcount_is_zero(&hdr->b_refcnt);
	}

	/* wake anyone blocked in arc_read() waiting for this I/O */
	cv_broadcast(&hdr->b_cv);

	/* execute each callback and free its structure */
	while ((acb = callback_list) != NULL) {
		if (acb->acb_done)
			acb->acb_done(zio, acb->acb_buf, acb->acb_private);

		if (acb->acb_zio_dummy != NULL) {
			acb->acb_zio_dummy->io_error = zio->io_error;
			zio_nowait(acb->acb_zio_dummy);
		}

		callback_list = acb->acb_next;
		kmem_free(acb, sizeof (arc_callback_t));
	}

	if (freeable)
		arc_hdr_free(hdr);
}

/*
 * "Read" the block at the specified DVA (in bp) via the
 * cache.  If the block is found in the cache, invoke the provided
 * callback immediately and return.  Note that the `zio' parameter
 * in the callback will be NULL in this case, since no IO was
 * required.
If the block is not in the cache pass the read request 1463fa9e4066Sahrens * on to the spa with a substitute callback function, so that the 1464fa9e4066Sahrens * requested block will be added to the cache. 1465fa9e4066Sahrens * 1466fa9e4066Sahrens * If a read request arrives for a block that has a read in-progress, 1467fa9e4066Sahrens * either wait for the in-progress read to complete (and return the 1468fa9e4066Sahrens * results); or, if this is a read with a "done" func, add a record 1469fa9e4066Sahrens * to the read to invoke the "done" func when the read completes, 1470fa9e4066Sahrens * and return; or just return. 1471fa9e4066Sahrens * 1472fa9e4066Sahrens * arc_read_done() will invoke all the requested "done" functions 1473fa9e4066Sahrens * for readers of this block. 1474fa9e4066Sahrens */ 1475fa9e4066Sahrens int 1476fa9e4066Sahrens arc_read(zio_t *pio, spa_t *spa, blkptr_t *bp, arc_byteswap_func_t *swap, 1477fa9e4066Sahrens arc_done_func_t *done, void *private, int priority, int flags, 1478fa9e4066Sahrens uint32_t arc_flags) 1479fa9e4066Sahrens { 1480fa9e4066Sahrens arc_buf_hdr_t *hdr; 1481fa9e4066Sahrens arc_buf_t *buf; 1482fa9e4066Sahrens kmutex_t *hash_lock; 1483fa9e4066Sahrens zio_t *rzio; 1484fa9e4066Sahrens 1485fa9e4066Sahrens top: 1486fa9e4066Sahrens hdr = buf_hash_find(spa, BP_IDENTITY(bp), bp->blk_birth, &hash_lock); 1487fa9e4066Sahrens if (hdr && hdr->b_buf) { 1488fa9e4066Sahrens 1489fa9e4066Sahrens ASSERT((hdr->b_state == arc.mru_top) || 1490fa9e4066Sahrens (hdr->b_state == arc.mfu_top) || 1491fa9e4066Sahrens ((hdr->b_state == arc.anon) && 1492fa9e4066Sahrens (HDR_IO_IN_PROGRESS(hdr)))); 1493fa9e4066Sahrens 1494fa9e4066Sahrens if (HDR_IO_IN_PROGRESS(hdr)) { 1495fa9e4066Sahrens 1496fa9e4066Sahrens if ((arc_flags & ARC_NOWAIT) && done) { 1497fa9e4066Sahrens arc_callback_t *acb = NULL; 1498fa9e4066Sahrens 1499fa9e4066Sahrens acb = kmem_zalloc(sizeof (arc_callback_t), 1500fa9e4066Sahrens KM_SLEEP); 1501fa9e4066Sahrens acb->acb_done = done; 
1502fa9e4066Sahrens acb->acb_private = private; 1503fa9e4066Sahrens acb->acb_byteswap = swap; 1504fa9e4066Sahrens if (pio != NULL) 1505fa9e4066Sahrens acb->acb_zio_dummy = zio_null(pio, 1506fa9e4066Sahrens spa, NULL, NULL, flags); 1507fa9e4066Sahrens 1508fa9e4066Sahrens ASSERT(acb->acb_done != NULL); 1509fa9e4066Sahrens acb->acb_next = hdr->b_acb; 1510fa9e4066Sahrens hdr->b_acb = acb; 1511fa9e4066Sahrens add_reference(hdr, hash_lock, private); 1512fa9e4066Sahrens mutex_exit(hash_lock); 1513fa9e4066Sahrens return (0); 1514fa9e4066Sahrens } else if (arc_flags & ARC_WAIT) { 1515fa9e4066Sahrens cv_wait(&hdr->b_cv, hash_lock); 1516fa9e4066Sahrens mutex_exit(hash_lock); 1517fa9e4066Sahrens goto top; 1518fa9e4066Sahrens } 1519fa9e4066Sahrens 1520fa9e4066Sahrens mutex_exit(hash_lock); 1521fa9e4066Sahrens return (0); 1522fa9e4066Sahrens } 1523fa9e4066Sahrens 1524fa9e4066Sahrens /* 1525fa9e4066Sahrens * If there is already a reference on this block, create 1526fa9e4066Sahrens * a new copy of the data so that we will be guaranteed 1527fa9e4066Sahrens * that arc_release() will always succeed. 
1528fa9e4066Sahrens */ 1529fa9e4066Sahrens 1530fa9e4066Sahrens if (done) 1531fa9e4066Sahrens add_reference(hdr, hash_lock, private); 1532fa9e4066Sahrens if (done && refcount_count(&hdr->b_refcnt) > 1) { 1533fa9e4066Sahrens buf = kmem_cache_alloc(buf_cache, KM_SLEEP); 1534fa9e4066Sahrens buf->b_data = zio_buf_alloc(hdr->b_size); 1535fa9e4066Sahrens ASSERT3U(refcount_count(&hdr->b_refcnt), >, 1); 1536fa9e4066Sahrens atomic_add_64(&arc.size, hdr->b_size); 1537fa9e4066Sahrens bcopy(hdr->b_buf->b_data, buf->b_data, hdr->b_size); 1538fa9e4066Sahrens buf->b_hdr = hdr; 1539fa9e4066Sahrens buf->b_next = hdr->b_buf; 1540fa9e4066Sahrens hdr->b_buf = buf; 1541fa9e4066Sahrens atomic_add_64(&hdr->b_state->size, hdr->b_size); 1542fa9e4066Sahrens } else { 1543fa9e4066Sahrens buf = hdr->b_buf; 1544fa9e4066Sahrens } 1545fa9e4066Sahrens DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr); 1546fa9e4066Sahrens arc_access(hdr, hash_lock); 1547fa9e4066Sahrens mutex_exit(hash_lock); 1548fa9e4066Sahrens atomic_add_64(&arc.hits, 1); 1549fa9e4066Sahrens if (done) 1550fa9e4066Sahrens done(NULL, buf, private); 1551fa9e4066Sahrens } else { 1552fa9e4066Sahrens uint64_t size = BP_GET_LSIZE(bp); 1553fa9e4066Sahrens arc_callback_t *acb; 1554fa9e4066Sahrens 1555fa9e4066Sahrens if (hdr == NULL) { 1556fa9e4066Sahrens /* this block is not in the cache */ 1557fa9e4066Sahrens arc_buf_hdr_t *exists; 1558fa9e4066Sahrens 1559fa9e4066Sahrens buf = arc_buf_alloc(spa, size, private); 1560fa9e4066Sahrens hdr = buf->b_hdr; 1561fa9e4066Sahrens hdr->b_dva = *BP_IDENTITY(bp); 1562fa9e4066Sahrens hdr->b_birth = bp->blk_birth; 1563fa9e4066Sahrens hdr->b_cksum0 = bp->blk_cksum.zc_word[0]; 1564fa9e4066Sahrens exists = buf_hash_insert(hdr, &hash_lock); 1565fa9e4066Sahrens if (exists) { 1566fa9e4066Sahrens /* somebody beat us to the hash insert */ 1567fa9e4066Sahrens mutex_exit(hash_lock); 1568fa9e4066Sahrens bzero(&hdr->b_dva, sizeof (dva_t)); 1569fa9e4066Sahrens hdr->b_birth = 0; 1570fa9e4066Sahrens hdr->b_cksum0 = 0; 
1571fa9e4066Sahrens arc_buf_free(buf, private); 1572fa9e4066Sahrens goto top; /* restart the IO request */ 1573fa9e4066Sahrens } 1574fa9e4066Sahrens 1575fa9e4066Sahrens } else { 1576fa9e4066Sahrens /* this block is in the ghost cache */ 1577fa9e4066Sahrens ASSERT((hdr->b_state == arc.mru_bot) || 1578fa9e4066Sahrens (hdr->b_state == arc.mfu_bot)); 1579fa9e4066Sahrens add_reference(hdr, hash_lock, private); 1580fa9e4066Sahrens 1581fa9e4066Sahrens buf = kmem_cache_alloc(buf_cache, KM_SLEEP); 1582fa9e4066Sahrens buf->b_data = zio_buf_alloc(hdr->b_size); 1583fa9e4066Sahrens atomic_add_64(&arc.size, hdr->b_size); 1584fa9e4066Sahrens ASSERT(!HDR_IO_IN_PROGRESS(hdr)); 1585fa9e4066Sahrens ASSERT3U(refcount_count(&hdr->b_refcnt), ==, 1); 1586fa9e4066Sahrens buf->b_hdr = hdr; 1587fa9e4066Sahrens buf->b_next = NULL; 1588fa9e4066Sahrens hdr->b_buf = buf; 1589fa9e4066Sahrens } 1590fa9e4066Sahrens 1591fa9e4066Sahrens acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP); 1592fa9e4066Sahrens acb->acb_done = done; 1593fa9e4066Sahrens acb->acb_private = private; 1594fa9e4066Sahrens acb->acb_byteswap = swap; 1595fa9e4066Sahrens 1596fa9e4066Sahrens ASSERT(hdr->b_acb == NULL); 1597fa9e4066Sahrens hdr->b_acb = acb; 1598fa9e4066Sahrens 1599fa9e4066Sahrens /* 1600fa9e4066Sahrens * If this DVA is part of a prefetch, mark the buf 1601fa9e4066Sahrens * header with the prefetch flag 1602fa9e4066Sahrens */ 1603fa9e4066Sahrens if (arc_flags & ARC_PREFETCH) 1604fa9e4066Sahrens hdr->b_flags |= ARC_PREFETCH; 1605fa9e4066Sahrens hdr->b_flags |= ARC_IO_IN_PROGRESS; 1606fa9e4066Sahrens 1607fa9e4066Sahrens /* 1608fa9e4066Sahrens * If the buffer has been evicted, migrate it to a present state 1609fa9e4066Sahrens * before issuing the I/O. Once we drop the hash-table lock, 1610fa9e4066Sahrens * the header will be marked as I/O in progress and have an 1611fa9e4066Sahrens * attached buffer. 
At this point, anybody who finds this 1612fa9e4066Sahrens * buffer ought to notice that it's legit but has a pending I/O. 1613fa9e4066Sahrens */ 1614fa9e4066Sahrens 1615fa9e4066Sahrens if ((hdr->b_state == arc.mru_bot) || 1616fa9e4066Sahrens (hdr->b_state == arc.mfu_bot)) 1617fa9e4066Sahrens arc_access(hdr, hash_lock); 1618fa9e4066Sahrens 1619fa9e4066Sahrens mutex_exit(hash_lock); 1620fa9e4066Sahrens 1621fa9e4066Sahrens ASSERT3U(hdr->b_size, ==, size); 1622fa9e4066Sahrens DTRACE_PROBE2(arc__miss, blkptr_t *, bp, 1623fa9e4066Sahrens uint64_t, size); 1624fa9e4066Sahrens atomic_add_64(&arc.misses, 1); 1625fa9e4066Sahrens rzio = zio_read(pio, spa, bp, buf->b_data, size, 1626fa9e4066Sahrens arc_read_done, buf, priority, flags); 1627fa9e4066Sahrens 1628fa9e4066Sahrens if (arc_flags & ARC_WAIT) 1629fa9e4066Sahrens return (zio_wait(rzio)); 1630fa9e4066Sahrens 1631fa9e4066Sahrens ASSERT(arc_flags & ARC_NOWAIT); 1632fa9e4066Sahrens zio_nowait(rzio); 1633fa9e4066Sahrens } 1634fa9e4066Sahrens return (0); 1635fa9e4066Sahrens } 1636fa9e4066Sahrens 1637fa9e4066Sahrens /* 1638fa9e4066Sahrens * arc_read() variant to support pool traversal. If the block is already 1639fa9e4066Sahrens * in the ARC, make a copy of it; otherwise, the caller will do the I/O. 1640fa9e4066Sahrens * The idea is that we don't want pool traversal filling up memory, but 1641fa9e4066Sahrens * if the ARC already has the data anyway, we shouldn't pay for the I/O. 
1642fa9e4066Sahrens */ 1643fa9e4066Sahrens int 1644fa9e4066Sahrens arc_tryread(spa_t *spa, blkptr_t *bp, void *data) 1645fa9e4066Sahrens { 1646fa9e4066Sahrens arc_buf_hdr_t *hdr; 1647fa9e4066Sahrens kmutex_t *hash_mtx; 1648fa9e4066Sahrens int rc = 0; 1649fa9e4066Sahrens 1650fa9e4066Sahrens hdr = buf_hash_find(spa, BP_IDENTITY(bp), bp->blk_birth, &hash_mtx); 1651fa9e4066Sahrens 1652fa9e4066Sahrens if (hdr && hdr->b_buf && !HDR_IO_IN_PROGRESS(hdr)) 1653fa9e4066Sahrens bcopy(hdr->b_buf->b_data, data, hdr->b_size); 1654fa9e4066Sahrens else 1655fa9e4066Sahrens rc = ENOENT; 1656fa9e4066Sahrens 1657fa9e4066Sahrens if (hash_mtx) 1658fa9e4066Sahrens mutex_exit(hash_mtx); 1659fa9e4066Sahrens 1660fa9e4066Sahrens return (rc); 1661fa9e4066Sahrens } 1662fa9e4066Sahrens 1663fa9e4066Sahrens /* 1664fa9e4066Sahrens * Release this buffer from the cache. This must be done 1665fa9e4066Sahrens * after a read and prior to modifying the buffer contents. 1666fa9e4066Sahrens * If the buffer has more than one reference, we must make 1667fa9e4066Sahrens * make a new hdr for the buffer. 
1668fa9e4066Sahrens */ 1669fa9e4066Sahrens void 1670fa9e4066Sahrens arc_release(arc_buf_t *buf, void *tag) 1671fa9e4066Sahrens { 1672fa9e4066Sahrens arc_buf_hdr_t *hdr = buf->b_hdr; 1673fa9e4066Sahrens kmutex_t *hash_lock = HDR_LOCK(hdr); 1674fa9e4066Sahrens 1675fa9e4066Sahrens /* this buffer is not on any list */ 1676fa9e4066Sahrens ASSERT(refcount_count(&hdr->b_refcnt) > 0); 1677fa9e4066Sahrens 1678fa9e4066Sahrens if (hdr->b_state == arc.anon) { 1679fa9e4066Sahrens /* this buffer is already released */ 1680fa9e4066Sahrens ASSERT3U(refcount_count(&hdr->b_refcnt), ==, 1); 1681fa9e4066Sahrens ASSERT(BUF_EMPTY(hdr)); 1682fa9e4066Sahrens return; 1683fa9e4066Sahrens } 1684fa9e4066Sahrens 1685fa9e4066Sahrens mutex_enter(hash_lock); 1686fa9e4066Sahrens 1687fa9e4066Sahrens if (refcount_count(&hdr->b_refcnt) > 1) { 1688fa9e4066Sahrens arc_buf_hdr_t *nhdr; 1689fa9e4066Sahrens arc_buf_t **bufp; 1690fa9e4066Sahrens uint64_t blksz = hdr->b_size; 1691fa9e4066Sahrens spa_t *spa = hdr->b_spa; 1692fa9e4066Sahrens 1693fa9e4066Sahrens /* 1694fa9e4066Sahrens * Pull the data off of this buf and attach it to 1695fa9e4066Sahrens * a new anonymous buf. 
1696fa9e4066Sahrens */ 1697fa9e4066Sahrens bufp = &hdr->b_buf; 1698fa9e4066Sahrens while (*bufp != buf) { 1699fa9e4066Sahrens ASSERT(*bufp); 1700fa9e4066Sahrens bufp = &(*bufp)->b_next; 1701fa9e4066Sahrens } 1702fa9e4066Sahrens *bufp = (*bufp)->b_next; 1703fa9e4066Sahrens (void) refcount_remove(&hdr->b_refcnt, tag); 1704fa9e4066Sahrens ASSERT3U(hdr->b_state->size, >=, hdr->b_size); 1705fa9e4066Sahrens atomic_add_64(&hdr->b_state->size, -hdr->b_size); 1706fa9e4066Sahrens mutex_exit(hash_lock); 1707fa9e4066Sahrens 1708fa9e4066Sahrens nhdr = kmem_cache_alloc(hdr_cache, KM_SLEEP); 1709fa9e4066Sahrens nhdr->b_size = blksz; 1710fa9e4066Sahrens nhdr->b_spa = spa; 1711fa9e4066Sahrens nhdr->b_buf = buf; 1712fa9e4066Sahrens nhdr->b_state = arc.anon; 1713fa9e4066Sahrens nhdr->b_arc_access = 0; 1714fa9e4066Sahrens nhdr->b_flags = 0; 1715fa9e4066Sahrens buf->b_hdr = nhdr; 1716fa9e4066Sahrens buf->b_next = NULL; 1717fa9e4066Sahrens (void) refcount_add(&nhdr->b_refcnt, tag); 1718fa9e4066Sahrens atomic_add_64(&arc.anon->size, blksz); 1719fa9e4066Sahrens 1720fa9e4066Sahrens hdr = nhdr; 1721fa9e4066Sahrens } else { 1722fa9e4066Sahrens ASSERT(!list_link_active(&hdr->b_arc_node)); 1723fa9e4066Sahrens ASSERT(!HDR_IO_IN_PROGRESS(hdr)); 1724fa9e4066Sahrens arc_change_state(arc.anon, hdr, hash_lock); 1725fa9e4066Sahrens hdr->b_arc_access = 0; 1726fa9e4066Sahrens mutex_exit(hash_lock); 1727fa9e4066Sahrens bzero(&hdr->b_dva, sizeof (dva_t)); 1728fa9e4066Sahrens hdr->b_birth = 0; 1729fa9e4066Sahrens hdr->b_cksum0 = 0; 1730fa9e4066Sahrens } 1731fa9e4066Sahrens } 1732fa9e4066Sahrens 1733fa9e4066Sahrens int 1734fa9e4066Sahrens arc_released(arc_buf_t *buf) 1735fa9e4066Sahrens { 1736fa9e4066Sahrens return (buf->b_hdr->b_state == arc.anon); 1737fa9e4066Sahrens } 1738fa9e4066Sahrens 1739fa9e4066Sahrens static void 1740fa9e4066Sahrens arc_write_done(zio_t *zio) 1741fa9e4066Sahrens { 1742fa9e4066Sahrens arc_buf_t *buf; 1743fa9e4066Sahrens arc_buf_hdr_t *hdr; 1744fa9e4066Sahrens arc_callback_t *acb; 
1745fa9e4066Sahrens 1746fa9e4066Sahrens buf = zio->io_private; 1747fa9e4066Sahrens hdr = buf->b_hdr; 1748fa9e4066Sahrens acb = hdr->b_acb; 1749fa9e4066Sahrens hdr->b_acb = NULL; 1750fa9e4066Sahrens 1751fa9e4066Sahrens /* this buffer is on no lists and is not in the hash table */ 1752fa9e4066Sahrens ASSERT3P(hdr->b_state, ==, arc.anon); 1753fa9e4066Sahrens 1754fa9e4066Sahrens hdr->b_dva = *BP_IDENTITY(zio->io_bp); 1755fa9e4066Sahrens hdr->b_birth = zio->io_bp->blk_birth; 1756fa9e4066Sahrens hdr->b_cksum0 = zio->io_bp->blk_cksum.zc_word[0]; 1757fa9e4066Sahrens /* clear the "in-write" flag */ 1758fa9e4066Sahrens hdr->b_hash_next = NULL; 1759fa9e4066Sahrens /* This write may be all-zero */ 1760fa9e4066Sahrens if (!BUF_EMPTY(hdr)) { 1761fa9e4066Sahrens arc_buf_hdr_t *exists; 1762fa9e4066Sahrens kmutex_t *hash_lock; 1763fa9e4066Sahrens 1764fa9e4066Sahrens exists = buf_hash_insert(hdr, &hash_lock); 1765fa9e4066Sahrens if (exists) { 1766fa9e4066Sahrens /* 1767fa9e4066Sahrens * This can only happen if we overwrite for 1768fa9e4066Sahrens * sync-to-convergence, because we remove 1769fa9e4066Sahrens * buffers from the hash table when we arc_free(). 
1770fa9e4066Sahrens */ 1771fa9e4066Sahrens ASSERT(DVA_EQUAL(BP_IDENTITY(&zio->io_bp_orig), 1772fa9e4066Sahrens BP_IDENTITY(zio->io_bp))); 1773fa9e4066Sahrens ASSERT3U(zio->io_bp_orig.blk_birth, ==, 1774fa9e4066Sahrens zio->io_bp->blk_birth); 1775fa9e4066Sahrens 1776fa9e4066Sahrens ASSERT(refcount_is_zero(&exists->b_refcnt)); 1777fa9e4066Sahrens arc_change_state(arc.anon, exists, hash_lock); 1778fa9e4066Sahrens mutex_exit(hash_lock); 1779fa9e4066Sahrens arc_hdr_free(exists); 1780fa9e4066Sahrens exists = buf_hash_insert(hdr, &hash_lock); 1781fa9e4066Sahrens ASSERT3P(exists, ==, NULL); 1782fa9e4066Sahrens } 1783fa9e4066Sahrens arc_access(hdr, hash_lock); 1784fa9e4066Sahrens mutex_exit(hash_lock); 1785fa9e4066Sahrens } 1786fa9e4066Sahrens if (acb && acb->acb_done) { 1787fa9e4066Sahrens ASSERT(!refcount_is_zero(&hdr->b_refcnt)); 1788fa9e4066Sahrens acb->acb_done(zio, buf, acb->acb_private); 1789fa9e4066Sahrens } 1790fa9e4066Sahrens 1791fa9e4066Sahrens if (acb) 1792fa9e4066Sahrens kmem_free(acb, sizeof (arc_callback_t)); 1793fa9e4066Sahrens } 1794fa9e4066Sahrens 1795fa9e4066Sahrens int 1796fa9e4066Sahrens arc_write(zio_t *pio, spa_t *spa, int checksum, int compress, 1797fa9e4066Sahrens uint64_t txg, blkptr_t *bp, arc_buf_t *buf, 1798fa9e4066Sahrens arc_done_func_t *done, void *private, int priority, int flags, 1799fa9e4066Sahrens uint32_t arc_flags) 1800fa9e4066Sahrens { 1801fa9e4066Sahrens arc_buf_hdr_t *hdr = buf->b_hdr; 1802fa9e4066Sahrens arc_callback_t *acb; 1803fa9e4066Sahrens zio_t *rzio; 1804fa9e4066Sahrens 1805fa9e4066Sahrens /* this is a private buffer - no locking required */ 1806fa9e4066Sahrens ASSERT3P(hdr->b_state, ==, arc.anon); 1807fa9e4066Sahrens ASSERT(BUF_EMPTY(hdr)); 1808fa9e4066Sahrens ASSERT(!HDR_IO_ERROR(hdr)); 1809fa9e4066Sahrens acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP); 1810fa9e4066Sahrens acb->acb_done = done; 1811fa9e4066Sahrens acb->acb_private = private; 1812fa9e4066Sahrens acb->acb_byteswap = (arc_byteswap_func_t *)-1; 
1813fa9e4066Sahrens hdr->b_acb = acb; 1814fa9e4066Sahrens rzio = zio_write(pio, spa, checksum, compress, txg, bp, 1815fa9e4066Sahrens buf->b_data, hdr->b_size, arc_write_done, buf, priority, flags); 1816fa9e4066Sahrens 1817fa9e4066Sahrens if (arc_flags & ARC_WAIT) 1818fa9e4066Sahrens return (zio_wait(rzio)); 1819fa9e4066Sahrens 1820fa9e4066Sahrens ASSERT(arc_flags & ARC_NOWAIT); 1821fa9e4066Sahrens zio_nowait(rzio); 1822fa9e4066Sahrens 1823fa9e4066Sahrens return (0); 1824fa9e4066Sahrens } 1825fa9e4066Sahrens 1826fa9e4066Sahrens int 1827fa9e4066Sahrens arc_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, 1828fa9e4066Sahrens zio_done_func_t *done, void *private, uint32_t arc_flags) 1829fa9e4066Sahrens { 1830fa9e4066Sahrens arc_buf_hdr_t *ab; 1831fa9e4066Sahrens kmutex_t *hash_lock; 1832fa9e4066Sahrens zio_t *zio; 1833fa9e4066Sahrens 1834fa9e4066Sahrens /* 1835fa9e4066Sahrens * If this buffer is in the cache, release it, so it 1836fa9e4066Sahrens * can be re-used. 1837fa9e4066Sahrens */ 1838fa9e4066Sahrens ab = buf_hash_find(spa, BP_IDENTITY(bp), bp->blk_birth, &hash_lock); 1839fa9e4066Sahrens if (ab != NULL) { 1840fa9e4066Sahrens /* 1841fa9e4066Sahrens * The checksum of blocks to free is not always 1842fa9e4066Sahrens * preserved (eg. on the deadlist). However, if it is 1843fa9e4066Sahrens * nonzero, it should match what we have in the cache. 
1844fa9e4066Sahrens */ 1845fa9e4066Sahrens ASSERT(bp->blk_cksum.zc_word[0] == 0 || 1846fa9e4066Sahrens ab->b_cksum0 == bp->blk_cksum.zc_word[0]); 1847fa9e4066Sahrens arc_change_state(arc.anon, ab, hash_lock); 1848fa9e4066Sahrens if (refcount_is_zero(&ab->b_refcnt)) { 1849fa9e4066Sahrens mutex_exit(hash_lock); 1850fa9e4066Sahrens arc_hdr_free(ab); 1851fa9e4066Sahrens atomic_add_64(&arc.deleted, 1); 1852fa9e4066Sahrens } else { 1853fa9e4066Sahrens ASSERT3U(refcount_count(&ab->b_refcnt), ==, 1); 1854fa9e4066Sahrens if (HDR_IO_IN_PROGRESS(ab)) 1855fa9e4066Sahrens ab->b_flags |= ARC_FREED_IN_READ; 1856fa9e4066Sahrens ab->b_arc_access = 0; 1857fa9e4066Sahrens bzero(&ab->b_dva, sizeof (dva_t)); 1858fa9e4066Sahrens ab->b_birth = 0; 1859fa9e4066Sahrens ab->b_cksum0 = 0; 1860fa9e4066Sahrens mutex_exit(hash_lock); 1861fa9e4066Sahrens } 1862fa9e4066Sahrens } 1863fa9e4066Sahrens 1864fa9e4066Sahrens zio = zio_free(pio, spa, txg, bp, done, private); 1865fa9e4066Sahrens 1866fa9e4066Sahrens if (arc_flags & ARC_WAIT) 1867fa9e4066Sahrens return (zio_wait(zio)); 1868fa9e4066Sahrens 1869fa9e4066Sahrens ASSERT(arc_flags & ARC_NOWAIT); 1870fa9e4066Sahrens zio_nowait(zio); 1871fa9e4066Sahrens 1872fa9e4066Sahrens return (0); 1873fa9e4066Sahrens } 1874fa9e4066Sahrens 1875fa9e4066Sahrens void 1876fa9e4066Sahrens arc_tempreserve_clear(uint64_t tempreserve) 1877fa9e4066Sahrens { 1878fa9e4066Sahrens atomic_add_64(&arc_tempreserve, -tempreserve); 1879fa9e4066Sahrens ASSERT((int64_t)arc_tempreserve >= 0); 1880fa9e4066Sahrens } 1881fa9e4066Sahrens 1882fa9e4066Sahrens int 1883fa9e4066Sahrens arc_tempreserve_space(uint64_t tempreserve) 1884fa9e4066Sahrens { 1885fa9e4066Sahrens #ifdef ZFS_DEBUG 1886fa9e4066Sahrens /* 1887fa9e4066Sahrens * Once in a while, fail for no reason. Everything should cope. 
1888fa9e4066Sahrens */ 1889fa9e4066Sahrens if (spa_get_random(10000) == 0) { 1890fa9e4066Sahrens dprintf("forcing random failure\n"); 1891fa9e4066Sahrens return (ERESTART); 1892fa9e4066Sahrens } 1893fa9e4066Sahrens #endif 1894*112fe045Smaybee if (tempreserve > arc.c/4 && !arc.no_grow) 1895*112fe045Smaybee arc.c = MIN(arc.c_max, tempreserve * 4); 1896*112fe045Smaybee if (tempreserve > arc.c) 1897*112fe045Smaybee return (ENOMEM); 1898*112fe045Smaybee 1899fa9e4066Sahrens /* 1900*112fe045Smaybee * Throttle writes when the amount of dirty data in the cache 1901*112fe045Smaybee * gets too large. We try to keep the cache less than half full 1902*112fe045Smaybee * of dirty blocks so that our sync times don't grow too large. 1903*112fe045Smaybee * Note: if two requests come in concurrently, we might let them 1904*112fe045Smaybee * both succeed, when one of them should fail. Not a huge deal. 1905*112fe045Smaybee * 1906*112fe045Smaybee * XXX The limit should be adjusted dynamically to keep the time 1907*112fe045Smaybee * to sync a dataset fixed (around 1-5 seconds?). 
1908fa9e4066Sahrens */ 1909fa9e4066Sahrens 1910*112fe045Smaybee if (tempreserve + arc_tempreserve + arc.anon->size > arc.c / 2 && 1911*112fe045Smaybee arc_tempreserve + arc.anon->size > arc.c / 4) { 1912fa9e4066Sahrens dprintf("failing, arc_tempreserve=%lluK anon=%lluK " 1913fa9e4066Sahrens "tempreserve=%lluK arc.c=%lluK\n", 1914fa9e4066Sahrens arc_tempreserve>>10, arc.anon->lsize>>10, 1915fa9e4066Sahrens tempreserve>>10, arc.c>>10); 1916fa9e4066Sahrens return (ERESTART); 1917fa9e4066Sahrens } 1918fa9e4066Sahrens atomic_add_64(&arc_tempreserve, tempreserve); 1919fa9e4066Sahrens return (0); 1920fa9e4066Sahrens } 1921fa9e4066Sahrens 1922fa9e4066Sahrens void 1923fa9e4066Sahrens arc_init(void) 1924fa9e4066Sahrens { 1925fa9e4066Sahrens mutex_init(&arc_reclaim_lock, NULL, MUTEX_DEFAULT, NULL); 1926fa9e4066Sahrens mutex_init(&arc_reclaim_thr_lock, NULL, MUTEX_DEFAULT, NULL); 1927fa9e4066Sahrens cv_init(&arc_reclaim_thr_cv, NULL, CV_DEFAULT, NULL); 1928fa9e4066Sahrens 1929fa9e4066Sahrens /* Start out with 1/8 of all memory */ 1930fa9e4066Sahrens arc.c = physmem * PAGESIZE / 8; 1931fa9e4066Sahrens 1932fa9e4066Sahrens #ifdef _KERNEL 1933fa9e4066Sahrens /* 1934fa9e4066Sahrens * On architectures where the physical memory can be larger 1935fa9e4066Sahrens * than the addressable space (intel in 32-bit mode), we may 1936fa9e4066Sahrens * need to limit the cache to 1/8 of VM size. 
1937fa9e4066Sahrens */ 1938fa9e4066Sahrens arc.c = MIN(arc.c, vmem_size(heap_arena, VMEM_ALLOC | VMEM_FREE) / 8); 1939fa9e4066Sahrens #endif 1940fa9e4066Sahrens 1941*112fe045Smaybee /* set min cache to 1/32 of all memory, or 64MB, whichever is more */ 1942fa9e4066Sahrens arc.c_min = MAX(arc.c / 4, 64<<20); 1943*112fe045Smaybee /* set max to 3/4 of all memory, or all but 1GB, whichever is more */ 1944fa9e4066Sahrens if (arc.c * 8 >= 1<<30) 1945fa9e4066Sahrens arc.c_max = (arc.c * 8) - (1<<30); 1946fa9e4066Sahrens else 1947fa9e4066Sahrens arc.c_max = arc.c_min; 1948fa9e4066Sahrens arc.c_max = MAX(arc.c * 6, arc.c_max); 1949fa9e4066Sahrens arc.c = arc.c_max; 1950fa9e4066Sahrens arc.p = (arc.c >> 1); 1951fa9e4066Sahrens 1952fa9e4066Sahrens /* if kmem_flags are set, lets try to use less memory */ 1953fa9e4066Sahrens if (kmem_debugging()) 1954fa9e4066Sahrens arc.c = arc.c / 2; 1955fa9e4066Sahrens if (arc.c < arc.c_min) 1956fa9e4066Sahrens arc.c = arc.c_min; 1957fa9e4066Sahrens 1958fa9e4066Sahrens arc.anon = &ARC_anon; 1959fa9e4066Sahrens arc.mru_top = &ARC_mru_top; 1960fa9e4066Sahrens arc.mru_bot = &ARC_mru_bot; 1961fa9e4066Sahrens arc.mfu_top = &ARC_mfu_top; 1962fa9e4066Sahrens arc.mfu_bot = &ARC_mfu_bot; 1963fa9e4066Sahrens 1964fa9e4066Sahrens list_create(&arc.mru_top->list, sizeof (arc_buf_hdr_t), 1965fa9e4066Sahrens offsetof(arc_buf_hdr_t, b_arc_node)); 1966fa9e4066Sahrens list_create(&arc.mru_bot->list, sizeof (arc_buf_hdr_t), 1967fa9e4066Sahrens offsetof(arc_buf_hdr_t, b_arc_node)); 1968fa9e4066Sahrens list_create(&arc.mfu_top->list, sizeof (arc_buf_hdr_t), 1969fa9e4066Sahrens offsetof(arc_buf_hdr_t, b_arc_node)); 1970fa9e4066Sahrens list_create(&arc.mfu_bot->list, sizeof (arc_buf_hdr_t), 1971fa9e4066Sahrens offsetof(arc_buf_hdr_t, b_arc_node)); 1972fa9e4066Sahrens 1973fa9e4066Sahrens buf_init(); 1974fa9e4066Sahrens 1975fa9e4066Sahrens arc_thread_exit = 0; 1976fa9e4066Sahrens 1977fa9e4066Sahrens (void) thread_create(NULL, 0, arc_reclaim_thread, NULL, 0, &p0, 
1978fa9e4066Sahrens TS_RUN, minclsyspri); 1979fa9e4066Sahrens } 1980fa9e4066Sahrens 1981fa9e4066Sahrens void 1982fa9e4066Sahrens arc_fini(void) 1983fa9e4066Sahrens { 1984fa9e4066Sahrens mutex_enter(&arc_reclaim_thr_lock); 1985fa9e4066Sahrens arc_thread_exit = 1; 1986fa9e4066Sahrens while (arc_thread_exit != 0) 1987fa9e4066Sahrens cv_wait(&arc_reclaim_thr_cv, &arc_reclaim_thr_lock); 1988fa9e4066Sahrens mutex_exit(&arc_reclaim_thr_lock); 1989fa9e4066Sahrens 1990fa9e4066Sahrens arc_flush(); 1991fa9e4066Sahrens 1992fa9e4066Sahrens arc_dead = TRUE; 1993fa9e4066Sahrens 1994fa9e4066Sahrens mutex_destroy(&arc_reclaim_lock); 1995fa9e4066Sahrens mutex_destroy(&arc_reclaim_thr_lock); 1996fa9e4066Sahrens cv_destroy(&arc_reclaim_thr_cv); 1997fa9e4066Sahrens 1998fa9e4066Sahrens list_destroy(&arc.mru_top->list); 1999fa9e4066Sahrens list_destroy(&arc.mru_bot->list); 2000fa9e4066Sahrens list_destroy(&arc.mfu_top->list); 2001fa9e4066Sahrens list_destroy(&arc.mfu_bot->list); 2002fa9e4066Sahrens 2003fa9e4066Sahrens buf_fini(); 2004fa9e4066Sahrens } 2005