1fa9e4066Sahrens /* 2fa9e4066Sahrens * CDDL HEADER START 3fa9e4066Sahrens * 4fa9e4066Sahrens * The contents of this file are subject to the terms of the 5033f9833Sek * Common Development and Distribution License (the "License"). 6033f9833Sek * You may not use this file except in compliance with the License. 7fa9e4066Sahrens * 8fa9e4066Sahrens * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9fa9e4066Sahrens * or http://www.opensolaris.org/os/licensing. 10fa9e4066Sahrens * See the License for the specific language governing permissions 11fa9e4066Sahrens * and limitations under the License. 12fa9e4066Sahrens * 13fa9e4066Sahrens * When distributing Covered Code, include this CDDL HEADER in each 14fa9e4066Sahrens * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15fa9e4066Sahrens * If applicable, add the following below this CDDL HEADER, with the 16fa9e4066Sahrens * fields enclosed by brackets "[]" replaced with your own identifying 17fa9e4066Sahrens * information: Portions Copyright [yyyy] [name of copyright owner] 18fa9e4066Sahrens * 19fa9e4066Sahrens * CDDL HEADER END 20fa9e4066Sahrens */ 21fa9e4066Sahrens /* 223f9d6ad7SLin Ling * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 23e9103aaeSGarrett D'Amore * Copyright 2011 Nexenta Systems, Inc. All rights reserved. 24be6fd75aSMatthew Ahrens * Copyright (c) 2013 by Delphix. All rights reserved. 25aad02571SSaso Kiselkov * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. 26fa9e4066Sahrens */ 27fa9e4066Sahrens 28fa9e4066Sahrens /* 2944cb6abcSbmc * DVA-based Adjustable Replacement Cache 30fa9e4066Sahrens * 31ea8dc4b6Seschrock * While much of the theory of operation used here is 32ea8dc4b6Seschrock * based on the self-tuning, low overhead replacement cache 33fa9e4066Sahrens * presented by Megiddo and Modha at FAST 2003, there are some 34fa9e4066Sahrens * significant differences: 35fa9e4066Sahrens * 36fa9e4066Sahrens * 1. 
The Megiddo and Modha model assumes any page is evictable. 37fa9e4066Sahrens * Pages in its cache cannot be "locked" into memory. This makes 38fa9e4066Sahrens * the eviction algorithm simple: evict the last page in the list. 39fa9e4066Sahrens * This also makes the performance characteristics easy to reason 40fa9e4066Sahrens * about. Our cache is not so simple. At any given moment, some 41fa9e4066Sahrens * subset of the blocks in the cache are un-evictable because we 42fa9e4066Sahrens * have handed out a reference to them. Blocks are only evictable 43fa9e4066Sahrens * when there are no external references active. This makes 44fa9e4066Sahrens * eviction far more problematic: we choose to evict the evictable 45fa9e4066Sahrens * blocks that are the "lowest" in the list. 46fa9e4066Sahrens * 47fa9e4066Sahrens * There are times when it is not possible to evict the requested 48fa9e4066Sahrens * space. In these circumstances we are unable to adjust the cache 49fa9e4066Sahrens * size. To prevent the cache growing unbounded at these times we 50fa94a07fSbrendan * implement a "cache throttle" that slows the flow of new data 51fa94a07fSbrendan * into the cache until we can make space available. 52fa9e4066Sahrens * 53fa9e4066Sahrens * 2. The Megiddo and Modha model assumes a fixed cache size. 54fa9e4066Sahrens * Pages are evicted when the cache is full and there is a cache 55fa9e4066Sahrens * miss. Our model has a variable sized cache. It grows with 56fa94a07fSbrendan * high use, but also tries to react to memory pressure from the 57fa9e4066Sahrens * operating system: decreasing its size when system memory is 58fa9e4066Sahrens * tight. 59fa9e4066Sahrens * 60fa9e4066Sahrens * 3. The Megiddo and Modha model assumes a fixed page size. All 61f7170741SWill Andrews * elements of the cache are therefore exactly the same size. So 62fa9e4066Sahrens * when adjusting the cache size following a cache miss, it's simply 63fa9e4066Sahrens * a matter of choosing a single page to evict. 
In our model, we 64fa9e4066Sahrens * have variable sized cache blocks (ranging from 512 bytes to 65f7170741SWill Andrews * 128K bytes). We therefore choose a set of blocks to evict to make 66fa9e4066Sahrens * space for a cache miss that approximates as closely as possible 67fa9e4066Sahrens * the space used by the new block. 68fa9e4066Sahrens * 69fa9e4066Sahrens * See also: "ARC: A Self-Tuning, Low Overhead Replacement Cache" 70fa9e4066Sahrens * by N. Megiddo & D. Modha, FAST 2003 71fa9e4066Sahrens */ 72fa9e4066Sahrens 73fa9e4066Sahrens /* 74fa9e4066Sahrens * The locking model: 75fa9e4066Sahrens * 76fa9e4066Sahrens * A new reference to a cache buffer can be obtained in two 77fa9e4066Sahrens * ways: 1) via a hash table lookup using the DVA as a key, 78fa94a07fSbrendan * or 2) via one of the ARC lists. The arc_read() interface 79fa9e4066Sahrens * uses method 1, while the internal arc algorithms for 80f7170741SWill Andrews * adjusting the cache use method 2. We therefore provide two 81fa9e4066Sahrens * types of locks: 1) the hash table lock array, and 2) the 82fa9e4066Sahrens * arc list locks. 83fa9e4066Sahrens * 84fc98fea5SBart Coddens * Buffers do not have their own mutexes, rather they rely on the 85fc98fea5SBart Coddens * hash table mutexes for the bulk of their protection (i.e. most 86fc98fea5SBart Coddens * fields in the arc_buf_hdr_t are protected by these mutexes). 87fa9e4066Sahrens * 88fa9e4066Sahrens * buf_hash_find() returns the appropriate mutex (held) when it 89fa9e4066Sahrens * locates the requested buffer in the hash table. It returns 90fa9e4066Sahrens * NULL for the mutex if the buffer was not in the table. 91fa9e4066Sahrens * 92fa9e4066Sahrens * buf_hash_remove() expects the appropriate hash mutex to be 93fa9e4066Sahrens * already held before it is invoked. 94fa9e4066Sahrens * 95fa9e4066Sahrens * Each arc state also has a mutex which is used to protect the 96fa9e4066Sahrens * buffer list associated with the state. 
When attempting to 97fa9e4066Sahrens * obtain a hash table lock while holding an arc list lock you 98fa9e4066Sahrens * must use: mutex_tryenter() to avoid deadlock. Also note that 9944eda4d7Smaybee * the active state mutex must be held before the ghost state mutex. 100fa9e4066Sahrens * 101ea8dc4b6Seschrock * Arc buffers may have an associated eviction callback function. 102ea8dc4b6Seschrock * This function will be invoked prior to removing the buffer (e.g. 103ea8dc4b6Seschrock * in arc_do_user_evicts()). Note however that the data associated 104ea8dc4b6Seschrock * with the buffer may be evicted prior to the callback. The callback 105ea8dc4b6Seschrock * must be made with *no locks held* (to prevent deadlock). Additionally, 106ea8dc4b6Seschrock * the users of callbacks must ensure that their private data is 107ea8dc4b6Seschrock * protected from simultaneous callbacks from arc_buf_evict() 108ea8dc4b6Seschrock * and arc_do_user_evicts(). 109ea8dc4b6Seschrock * 110fa9e4066Sahrens * Note that the majority of the performance stats are manipulated 111fa9e4066Sahrens * with atomic operations. 
112fa94a07fSbrendan * 113fa94a07fSbrendan * The L2ARC uses the l2arc_buflist_mtx global mutex for the following: 114fa94a07fSbrendan * 115fa94a07fSbrendan * - L2ARC buflist creation 116fa94a07fSbrendan * - L2ARC buflist eviction 117fa94a07fSbrendan * - L2ARC write completion, which walks L2ARC buflists 118fa94a07fSbrendan * - ARC header destruction, as it removes from L2ARC buflists 119fa94a07fSbrendan * - ARC header release, as it removes from L2ARC buflists 120fa9e4066Sahrens */ 121fa9e4066Sahrens 122fa9e4066Sahrens #include <sys/spa.h> 123fa9e4066Sahrens #include <sys/zio.h> 124aad02571SSaso Kiselkov #include <sys/zio_compress.h> 125fa9e4066Sahrens #include <sys/zfs_context.h> 126fa9e4066Sahrens #include <sys/arc.h> 127fa9e4066Sahrens #include <sys/refcount.h> 128c5904d13Seschrock #include <sys/vdev.h> 129573ca77eSGeorge Wilson #include <sys/vdev_impl.h> 130*69962b56SMatthew Ahrens #include <sys/dsl_pool.h> 131fa9e4066Sahrens #ifdef _KERNEL 132fa9e4066Sahrens #include <sys/vmsystm.h> 133fa9e4066Sahrens #include <vm/anon.h> 134fa9e4066Sahrens #include <sys/fs/swapnode.h> 135033f9833Sek #include <sys/dnlc.h> 136fa9e4066Sahrens #endif 137fa9e4066Sahrens #include <sys/callb.h> 13844cb6abcSbmc #include <sys/kstat.h> 139b24ab676SJeff Bonwick #include <zfs_fletcher.h> 140fa9e4066Sahrens 141cd1c8b85SMatthew Ahrens #ifndef _KERNEL 142cd1c8b85SMatthew Ahrens /* set with ZFS_DEBUG=watch, to enable watchpoints on frozen buffers */ 143cd1c8b85SMatthew Ahrens boolean_t arc_watch = B_FALSE; 144cd1c8b85SMatthew Ahrens int arc_procfd; 145cd1c8b85SMatthew Ahrens #endif 146cd1c8b85SMatthew Ahrens 147fa9e4066Sahrens static kmutex_t arc_reclaim_thr_lock; 148fa9e4066Sahrens static kcondvar_t arc_reclaim_thr_cv; /* used to signal reclaim thr */ 149fa9e4066Sahrens static uint8_t arc_thread_exit; 150fa9e4066Sahrens 151033f9833Sek #define ARC_REDUCE_DNLC_PERCENT 3 152033f9833Sek uint_t arc_reduce_dnlc_percent = ARC_REDUCE_DNLC_PERCENT; 153033f9833Sek 154fa9e4066Sahrens typedef enum 
arc_reclaim_strategy { 155fa9e4066Sahrens ARC_RECLAIM_AGGR, /* Aggressive reclaim strategy */ 156fa9e4066Sahrens ARC_RECLAIM_CONS /* Conservative reclaim strategy */ 157fa9e4066Sahrens } arc_reclaim_strategy_t; 158fa9e4066Sahrens 159*69962b56SMatthew Ahrens /* 160*69962b56SMatthew Ahrens * The number of iterations through arc_evict_*() before we 161*69962b56SMatthew Ahrens * drop & reacquire the lock. 162*69962b56SMatthew Ahrens */ 163*69962b56SMatthew Ahrens int arc_evict_iterations = 100; 164*69962b56SMatthew Ahrens 165fa9e4066Sahrens /* number of seconds before growing cache again */ 166fa9e4066Sahrens static int arc_grow_retry = 60; 167fa9e4066Sahrens 1685a98e54bSBrendan Gregg - Sun Microsystems /* shift of arc_c for calculating both min and max arc_p */ 1695a98e54bSBrendan Gregg - Sun Microsystems static int arc_p_min_shift = 4; 1705a98e54bSBrendan Gregg - Sun Microsystems 1715a98e54bSBrendan Gregg - Sun Microsystems /* log2(fraction of arc to reclaim) */ 1725a98e54bSBrendan Gregg - Sun Microsystems static int arc_shrink_shift = 5; 1735a98e54bSBrendan Gregg - Sun Microsystems 17413506d1eSmaybee /* 175b19a79ecSperrin * minimum lifespan of a prefetch block in clock ticks 176b19a79ecSperrin * (initialized in arc_init()) 17713506d1eSmaybee */ 178b19a79ecSperrin static int arc_min_prefetch_lifespan; 17913506d1eSmaybee 180*69962b56SMatthew Ahrens /* 181*69962b56SMatthew Ahrens * If this percent of memory is free, don't throttle. 182*69962b56SMatthew Ahrens */ 183*69962b56SMatthew Ahrens int arc_lotsfree_percent = 10; 184*69962b56SMatthew Ahrens 185fa9e4066Sahrens static int arc_dead; 186fa9e4066Sahrens 1873a737e0dSbrendan /* 1883a737e0dSbrendan * The arc has filled available memory and has now warmed up. 1893a737e0dSbrendan */ 1903a737e0dSbrendan static boolean_t arc_warm; 1913a737e0dSbrendan 192a2eea2e1Sahrens /* 193a2eea2e1Sahrens * These tunables are for performance analysis. 
194a2eea2e1Sahrens */ 195a2eea2e1Sahrens uint64_t zfs_arc_max; 196a2eea2e1Sahrens uint64_t zfs_arc_min; 1971116048bSek uint64_t zfs_arc_meta_limit = 0; 1985a98e54bSBrendan Gregg - Sun Microsystems int zfs_arc_grow_retry = 0; 1995a98e54bSBrendan Gregg - Sun Microsystems int zfs_arc_shrink_shift = 0; 2005a98e54bSBrendan Gregg - Sun Microsystems int zfs_arc_p_min_shift = 0; 2019253d63dSGeorge Wilson int zfs_disable_dup_eviction = 0; 202a2eea2e1Sahrens 203fa9e4066Sahrens /* 204fa94a07fSbrendan * Note that buffers can be in one of 6 states: 205fa9e4066Sahrens * ARC_anon - anonymous (discussed below) 206ea8dc4b6Seschrock * ARC_mru - recently used, currently cached 207ea8dc4b6Seschrock * ARC_mru_ghost - recently used, no longer in cache 208ea8dc4b6Seschrock * ARC_mfu - frequently used, currently cached 209ea8dc4b6Seschrock * ARC_mfu_ghost - frequently used, no longer in cache 210fa94a07fSbrendan * ARC_l2c_only - exists in L2ARC but not other states 2110e8c6158Smaybee * When there are no active references to the buffer, they are 2120e8c6158Smaybee * linked onto a list in one of these arc states. These are 2130e8c6158Smaybee * the only buffers that can be evicted or deleted. Within each 2140e8c6158Smaybee * state there are multiple lists, one for meta-data and one for 2150e8c6158Smaybee * non-meta-data. Meta-data (indirect blocks, blocks of dnodes, 2160e8c6158Smaybee * etc.) is tracked separately so that it can be managed more 217fa94a07fSbrendan * explicitly: favored over data, limited explicitly. 218fa9e4066Sahrens * 219fa9e4066Sahrens * Anonymous buffers are buffers that are not associated with 220fa9e4066Sahrens * a DVA. These are buffers that hold dirty block copies 221fa9e4066Sahrens * before they are written to stable storage. By definition, 222ea8dc4b6Seschrock * they are "ref'd" and are considered part of arc_mru 223fa9e4066Sahrens * that cannot be freed. 
Generally, they will acquire a DVA 224ea8dc4b6Seschrock * as they are written and migrate onto the arc_mru list. 225fa94a07fSbrendan * 226fa94a07fSbrendan * The ARC_l2c_only state is for buffers that are in the second 227fa94a07fSbrendan * level ARC but no longer in any of the ARC_m* lists. The second 228fa94a07fSbrendan * level ARC itself may also contain buffers that are in any of 229fa94a07fSbrendan * the ARC_m* states - meaning that a buffer can exist in two 230fa94a07fSbrendan * places. The reason for the ARC_l2c_only state is to keep the 231fa94a07fSbrendan * buffer header in the hash table, so that reads that hit the 232fa94a07fSbrendan * second level ARC benefit from these fast lookups. 233fa9e4066Sahrens */ 234fa9e4066Sahrens 235fa9e4066Sahrens typedef struct arc_state { 2360e8c6158Smaybee list_t arcs_list[ARC_BUFC_NUMTYPES]; /* list of evictable buffers */ 2370e8c6158Smaybee uint64_t arcs_lsize[ARC_BUFC_NUMTYPES]; /* amount of evictable data */ 2380e8c6158Smaybee uint64_t arcs_size; /* total amount of data in this state */ 23944cb6abcSbmc kmutex_t arcs_mtx; 240fa9e4066Sahrens } arc_state_t; 241fa9e4066Sahrens 242fa94a07fSbrendan /* The 6 states: */ 243fa9e4066Sahrens static arc_state_t ARC_anon; 244ea8dc4b6Seschrock static arc_state_t ARC_mru; 245ea8dc4b6Seschrock static arc_state_t ARC_mru_ghost; 246ea8dc4b6Seschrock static arc_state_t ARC_mfu; 247ea8dc4b6Seschrock static arc_state_t ARC_mfu_ghost; 248fa94a07fSbrendan static arc_state_t ARC_l2c_only; 249fa9e4066Sahrens 25044cb6abcSbmc typedef struct arc_stats { 25144cb6abcSbmc kstat_named_t arcstat_hits; 25244cb6abcSbmc kstat_named_t arcstat_misses; 25344cb6abcSbmc kstat_named_t arcstat_demand_data_hits; 25444cb6abcSbmc kstat_named_t arcstat_demand_data_misses; 25544cb6abcSbmc kstat_named_t arcstat_demand_metadata_hits; 25644cb6abcSbmc kstat_named_t arcstat_demand_metadata_misses; 25744cb6abcSbmc kstat_named_t arcstat_prefetch_data_hits; 25844cb6abcSbmc kstat_named_t arcstat_prefetch_data_misses; 
25944cb6abcSbmc kstat_named_t arcstat_prefetch_metadata_hits; 26044cb6abcSbmc kstat_named_t arcstat_prefetch_metadata_misses; 26144cb6abcSbmc kstat_named_t arcstat_mru_hits; 26244cb6abcSbmc kstat_named_t arcstat_mru_ghost_hits; 26344cb6abcSbmc kstat_named_t arcstat_mfu_hits; 26444cb6abcSbmc kstat_named_t arcstat_mfu_ghost_hits; 26544cb6abcSbmc kstat_named_t arcstat_deleted; 26644cb6abcSbmc kstat_named_t arcstat_recycle_miss; 2673e30c24aSWill Andrews /* 2683e30c24aSWill Andrews * Number of buffers that could not be evicted because the hash lock 2693e30c24aSWill Andrews * was held by another thread. The lock may not necessarily be held 2703e30c24aSWill Andrews * by something using the same buffer, since hash locks are shared 2713e30c24aSWill Andrews * by multiple buffers. 2723e30c24aSWill Andrews */ 27344cb6abcSbmc kstat_named_t arcstat_mutex_miss; 2743e30c24aSWill Andrews /* 2753e30c24aSWill Andrews * Number of buffers skipped because they have I/O in progress, are 2763e30c24aSWill Andrews * indirect prefetch buffers that have not lived long enough, or are 2773e30c24aSWill Andrews * not from the spa we're trying to evict from. 
2783e30c24aSWill Andrews */ 27944cb6abcSbmc kstat_named_t arcstat_evict_skip; 2805ea40c06SBrendan Gregg - Sun Microsystems kstat_named_t arcstat_evict_l2_cached; 2815ea40c06SBrendan Gregg - Sun Microsystems kstat_named_t arcstat_evict_l2_eligible; 2825ea40c06SBrendan Gregg - Sun Microsystems kstat_named_t arcstat_evict_l2_ineligible; 28344cb6abcSbmc kstat_named_t arcstat_hash_elements; 28444cb6abcSbmc kstat_named_t arcstat_hash_elements_max; 28544cb6abcSbmc kstat_named_t arcstat_hash_collisions; 28644cb6abcSbmc kstat_named_t arcstat_hash_chains; 28744cb6abcSbmc kstat_named_t arcstat_hash_chain_max; 28844cb6abcSbmc kstat_named_t arcstat_p; 28944cb6abcSbmc kstat_named_t arcstat_c; 29044cb6abcSbmc kstat_named_t arcstat_c_min; 29144cb6abcSbmc kstat_named_t arcstat_c_max; 29244cb6abcSbmc kstat_named_t arcstat_size; 293fa94a07fSbrendan kstat_named_t arcstat_hdr_size; 2945a98e54bSBrendan Gregg - Sun Microsystems kstat_named_t arcstat_data_size; 2955a98e54bSBrendan Gregg - Sun Microsystems kstat_named_t arcstat_other_size; 296fa94a07fSbrendan kstat_named_t arcstat_l2_hits; 297fa94a07fSbrendan kstat_named_t arcstat_l2_misses; 298fa94a07fSbrendan kstat_named_t arcstat_l2_feeds; 299fa94a07fSbrendan kstat_named_t arcstat_l2_rw_clash; 3005a98e54bSBrendan Gregg - Sun Microsystems kstat_named_t arcstat_l2_read_bytes; 3015a98e54bSBrendan Gregg - Sun Microsystems kstat_named_t arcstat_l2_write_bytes; 302fa94a07fSbrendan kstat_named_t arcstat_l2_writes_sent; 303fa94a07fSbrendan kstat_named_t arcstat_l2_writes_done; 304fa94a07fSbrendan kstat_named_t arcstat_l2_writes_error; 305fa94a07fSbrendan kstat_named_t arcstat_l2_writes_hdr_miss; 306fa94a07fSbrendan kstat_named_t arcstat_l2_evict_lock_retry; 307fa94a07fSbrendan kstat_named_t arcstat_l2_evict_reading; 308fa94a07fSbrendan kstat_named_t arcstat_l2_free_on_write; 309fa94a07fSbrendan kstat_named_t arcstat_l2_abort_lowmem; 310fa94a07fSbrendan kstat_named_t arcstat_l2_cksum_bad; 311fa94a07fSbrendan kstat_named_t arcstat_l2_io_error; 
312fa94a07fSbrendan kstat_named_t arcstat_l2_size; 313aad02571SSaso Kiselkov kstat_named_t arcstat_l2_asize; 314fa94a07fSbrendan kstat_named_t arcstat_l2_hdr_size; 315aad02571SSaso Kiselkov kstat_named_t arcstat_l2_compress_successes; 316aad02571SSaso Kiselkov kstat_named_t arcstat_l2_compress_zeros; 317aad02571SSaso Kiselkov kstat_named_t arcstat_l2_compress_failures; 3181ab7f2deSmaybee kstat_named_t arcstat_memory_throttle_count; 3199253d63dSGeorge Wilson kstat_named_t arcstat_duplicate_buffers; 3209253d63dSGeorge Wilson kstat_named_t arcstat_duplicate_buffers_size; 3219253d63dSGeorge Wilson kstat_named_t arcstat_duplicate_reads; 32220128a08SGeorge Wilson kstat_named_t arcstat_meta_used; 32320128a08SGeorge Wilson kstat_named_t arcstat_meta_limit; 32420128a08SGeorge Wilson kstat_named_t arcstat_meta_max; 32544cb6abcSbmc } arc_stats_t; 32644cb6abcSbmc 32744cb6abcSbmc static arc_stats_t arc_stats = { 32844cb6abcSbmc { "hits", KSTAT_DATA_UINT64 }, 32944cb6abcSbmc { "misses", KSTAT_DATA_UINT64 }, 33044cb6abcSbmc { "demand_data_hits", KSTAT_DATA_UINT64 }, 33144cb6abcSbmc { "demand_data_misses", KSTAT_DATA_UINT64 }, 33244cb6abcSbmc { "demand_metadata_hits", KSTAT_DATA_UINT64 }, 33344cb6abcSbmc { "demand_metadata_misses", KSTAT_DATA_UINT64 }, 33444cb6abcSbmc { "prefetch_data_hits", KSTAT_DATA_UINT64 }, 33544cb6abcSbmc { "prefetch_data_misses", KSTAT_DATA_UINT64 }, 33644cb6abcSbmc { "prefetch_metadata_hits", KSTAT_DATA_UINT64 }, 33744cb6abcSbmc { "prefetch_metadata_misses", KSTAT_DATA_UINT64 }, 33844cb6abcSbmc { "mru_hits", KSTAT_DATA_UINT64 }, 33944cb6abcSbmc { "mru_ghost_hits", KSTAT_DATA_UINT64 }, 34044cb6abcSbmc { "mfu_hits", KSTAT_DATA_UINT64 }, 34144cb6abcSbmc { "mfu_ghost_hits", KSTAT_DATA_UINT64 }, 34244cb6abcSbmc { "deleted", KSTAT_DATA_UINT64 }, 34344cb6abcSbmc { "recycle_miss", KSTAT_DATA_UINT64 }, 34444cb6abcSbmc { "mutex_miss", KSTAT_DATA_UINT64 }, 34544cb6abcSbmc { "evict_skip", KSTAT_DATA_UINT64 }, 3465ea40c06SBrendan Gregg - Sun Microsystems { 
"evict_l2_cached", KSTAT_DATA_UINT64 }, 3475ea40c06SBrendan Gregg - Sun Microsystems { "evict_l2_eligible", KSTAT_DATA_UINT64 }, 3485ea40c06SBrendan Gregg - Sun Microsystems { "evict_l2_ineligible", KSTAT_DATA_UINT64 }, 34944cb6abcSbmc { "hash_elements", KSTAT_DATA_UINT64 }, 35044cb6abcSbmc { "hash_elements_max", KSTAT_DATA_UINT64 }, 35144cb6abcSbmc { "hash_collisions", KSTAT_DATA_UINT64 }, 35244cb6abcSbmc { "hash_chains", KSTAT_DATA_UINT64 }, 35344cb6abcSbmc { "hash_chain_max", KSTAT_DATA_UINT64 }, 35444cb6abcSbmc { "p", KSTAT_DATA_UINT64 }, 35544cb6abcSbmc { "c", KSTAT_DATA_UINT64 }, 35644cb6abcSbmc { "c_min", KSTAT_DATA_UINT64 }, 35744cb6abcSbmc { "c_max", KSTAT_DATA_UINT64 }, 358fa94a07fSbrendan { "size", KSTAT_DATA_UINT64 }, 359fa94a07fSbrendan { "hdr_size", KSTAT_DATA_UINT64 }, 3605a98e54bSBrendan Gregg - Sun Microsystems { "data_size", KSTAT_DATA_UINT64 }, 3615a98e54bSBrendan Gregg - Sun Microsystems { "other_size", KSTAT_DATA_UINT64 }, 362fa94a07fSbrendan { "l2_hits", KSTAT_DATA_UINT64 }, 363fa94a07fSbrendan { "l2_misses", KSTAT_DATA_UINT64 }, 364fa94a07fSbrendan { "l2_feeds", KSTAT_DATA_UINT64 }, 365fa94a07fSbrendan { "l2_rw_clash", KSTAT_DATA_UINT64 }, 3665a98e54bSBrendan Gregg - Sun Microsystems { "l2_read_bytes", KSTAT_DATA_UINT64 }, 3675a98e54bSBrendan Gregg - Sun Microsystems { "l2_write_bytes", KSTAT_DATA_UINT64 }, 368fa94a07fSbrendan { "l2_writes_sent", KSTAT_DATA_UINT64 }, 369fa94a07fSbrendan { "l2_writes_done", KSTAT_DATA_UINT64 }, 370fa94a07fSbrendan { "l2_writes_error", KSTAT_DATA_UINT64 }, 371fa94a07fSbrendan { "l2_writes_hdr_miss", KSTAT_DATA_UINT64 }, 372fa94a07fSbrendan { "l2_evict_lock_retry", KSTAT_DATA_UINT64 }, 373fa94a07fSbrendan { "l2_evict_reading", KSTAT_DATA_UINT64 }, 374fa94a07fSbrendan { "l2_free_on_write", KSTAT_DATA_UINT64 }, 375fa94a07fSbrendan { "l2_abort_lowmem", KSTAT_DATA_UINT64 }, 376fa94a07fSbrendan { "l2_cksum_bad", KSTAT_DATA_UINT64 }, 377fa94a07fSbrendan { "l2_io_error", KSTAT_DATA_UINT64 }, 378fa94a07fSbrendan { 
"l2_size", KSTAT_DATA_UINT64 }, 379aad02571SSaso Kiselkov { "l2_asize", KSTAT_DATA_UINT64 }, 3801ab7f2deSmaybee { "l2_hdr_size", KSTAT_DATA_UINT64 }, 381aad02571SSaso Kiselkov { "l2_compress_successes", KSTAT_DATA_UINT64 }, 382aad02571SSaso Kiselkov { "l2_compress_zeros", KSTAT_DATA_UINT64 }, 383aad02571SSaso Kiselkov { "l2_compress_failures", KSTAT_DATA_UINT64 }, 3849253d63dSGeorge Wilson { "memory_throttle_count", KSTAT_DATA_UINT64 }, 3859253d63dSGeorge Wilson { "duplicate_buffers", KSTAT_DATA_UINT64 }, 3869253d63dSGeorge Wilson { "duplicate_buffers_size", KSTAT_DATA_UINT64 }, 38720128a08SGeorge Wilson { "duplicate_reads", KSTAT_DATA_UINT64 }, 38820128a08SGeorge Wilson { "arc_meta_used", KSTAT_DATA_UINT64 }, 38920128a08SGeorge Wilson { "arc_meta_limit", KSTAT_DATA_UINT64 }, 39020128a08SGeorge Wilson { "arc_meta_max", KSTAT_DATA_UINT64 } 39144cb6abcSbmc }; 39244cb6abcSbmc 39344cb6abcSbmc #define ARCSTAT(stat) (arc_stats.stat.value.ui64) 39444cb6abcSbmc 39544cb6abcSbmc #define ARCSTAT_INCR(stat, val) \ 396f7170741SWill Andrews atomic_add_64(&arc_stats.stat.value.ui64, (val)) 39744cb6abcSbmc 398b24ab676SJeff Bonwick #define ARCSTAT_BUMP(stat) ARCSTAT_INCR(stat, 1) 39944cb6abcSbmc #define ARCSTAT_BUMPDOWN(stat) ARCSTAT_INCR(stat, -1) 40044cb6abcSbmc 40144cb6abcSbmc #define ARCSTAT_MAX(stat, val) { \ 40244cb6abcSbmc uint64_t m; \ 40344cb6abcSbmc while ((val) > (m = arc_stats.stat.value.ui64) && \ 40444cb6abcSbmc (m != atomic_cas_64(&arc_stats.stat.value.ui64, m, (val)))) \ 40544cb6abcSbmc continue; \ 40644cb6abcSbmc } 40744cb6abcSbmc 40844cb6abcSbmc #define ARCSTAT_MAXSTAT(stat) \ 40944cb6abcSbmc ARCSTAT_MAX(stat##_max, arc_stats.stat.value.ui64) 41044cb6abcSbmc 41144cb6abcSbmc /* 41244cb6abcSbmc * We define a macro to allow ARC hits/misses to be easily broken down by 41344cb6abcSbmc * two separate conditions, giving a total of four different subtypes for 41444cb6abcSbmc * each of hits and misses (so eight statistics total). 
41544cb6abcSbmc */ 41644cb6abcSbmc #define ARCSTAT_CONDSTAT(cond1, stat1, notstat1, cond2, stat2, notstat2, stat) \ 41744cb6abcSbmc if (cond1) { \ 41844cb6abcSbmc if (cond2) { \ 41944cb6abcSbmc ARCSTAT_BUMP(arcstat_##stat1##_##stat2##_##stat); \ 42044cb6abcSbmc } else { \ 42144cb6abcSbmc ARCSTAT_BUMP(arcstat_##stat1##_##notstat2##_##stat); \ 42244cb6abcSbmc } \ 42344cb6abcSbmc } else { \ 42444cb6abcSbmc if (cond2) { \ 42544cb6abcSbmc ARCSTAT_BUMP(arcstat_##notstat1##_##stat2##_##stat); \ 42644cb6abcSbmc } else { \ 42744cb6abcSbmc ARCSTAT_BUMP(arcstat_##notstat1##_##notstat2##_##stat);\ 42844cb6abcSbmc } \ 42944cb6abcSbmc } 43044cb6abcSbmc 43144cb6abcSbmc kstat_t *arc_ksp; 432b24ab676SJeff Bonwick static arc_state_t *arc_anon; 43344cb6abcSbmc static arc_state_t *arc_mru; 43444cb6abcSbmc static arc_state_t *arc_mru_ghost; 43544cb6abcSbmc static arc_state_t *arc_mfu; 43644cb6abcSbmc static arc_state_t *arc_mfu_ghost; 437fa94a07fSbrendan static arc_state_t *arc_l2c_only; 43844cb6abcSbmc 43944cb6abcSbmc /* 44044cb6abcSbmc * There are several ARC variables that are critical to export as kstats -- 44144cb6abcSbmc * but we don't want to have to grovel around in the kstat whenever we wish to 44244cb6abcSbmc * manipulate them. For these variables, we therefore define them to be in 44344cb6abcSbmc * terms of the statistic variable. This assures that we are not introducing 44444cb6abcSbmc * the possibility of inconsistency by having shadow copies of the variables, 44544cb6abcSbmc * while still allowing the code to be readable. 
44644cb6abcSbmc */ 44744cb6abcSbmc #define arc_size ARCSTAT(arcstat_size) /* actual total arc size */ 44844cb6abcSbmc #define arc_p ARCSTAT(arcstat_p) /* target size of MRU */ 44944cb6abcSbmc #define arc_c ARCSTAT(arcstat_c) /* target size of cache */ 45044cb6abcSbmc #define arc_c_min ARCSTAT(arcstat_c_min) /* min target cache size */ 45144cb6abcSbmc #define arc_c_max ARCSTAT(arcstat_c_max) /* max target cache size */ 45220128a08SGeorge Wilson #define arc_meta_limit ARCSTAT(arcstat_meta_limit) /* max size for metadata */ 45320128a08SGeorge Wilson #define arc_meta_used ARCSTAT(arcstat_meta_used) /* size of metadata */ 45420128a08SGeorge Wilson #define arc_meta_max ARCSTAT(arcstat_meta_max) /* max size of metadata */ 45544cb6abcSbmc 456aad02571SSaso Kiselkov #define L2ARC_IS_VALID_COMPRESS(_c_) \ 457aad02571SSaso Kiselkov ((_c_) == ZIO_COMPRESS_LZ4 || (_c_) == ZIO_COMPRESS_EMPTY) 458aad02571SSaso Kiselkov 45944cb6abcSbmc static int arc_no_grow; /* Don't try to grow cache size */ 46044cb6abcSbmc static uint64_t arc_tempreserve; 4612fdbea25SAleksandr Guzovskiy static uint64_t arc_loaned_bytes; 462fa9e4066Sahrens 463fa94a07fSbrendan typedef struct l2arc_buf_hdr l2arc_buf_hdr_t; 464fa94a07fSbrendan 465fa9e4066Sahrens typedef struct arc_callback arc_callback_t; 466fa9e4066Sahrens 467fa9e4066Sahrens struct arc_callback { 468fa9e4066Sahrens void *acb_private; 469c717a561Smaybee arc_done_func_t *acb_done; 470fa9e4066Sahrens arc_buf_t *acb_buf; 471fa9e4066Sahrens zio_t *acb_zio_dummy; 472fa9e4066Sahrens arc_callback_t *acb_next; 473fa9e4066Sahrens }; 474fa9e4066Sahrens 475c717a561Smaybee typedef struct arc_write_callback arc_write_callback_t; 476c717a561Smaybee 477c717a561Smaybee struct arc_write_callback { 478c717a561Smaybee void *awcb_private; 479c717a561Smaybee arc_done_func_t *awcb_ready; 480*69962b56SMatthew Ahrens arc_done_func_t *awcb_physdone; 481c717a561Smaybee arc_done_func_t *awcb_done; 482c717a561Smaybee arc_buf_t *awcb_buf; 483c717a561Smaybee }; 
484c717a561Smaybee 485fa9e4066Sahrens struct arc_buf_hdr { 486fa9e4066Sahrens /* protected by hash lock */ 487fa9e4066Sahrens dva_t b_dva; 488fa9e4066Sahrens uint64_t b_birth; 489fa9e4066Sahrens uint64_t b_cksum0; 490fa9e4066Sahrens 4916b4acc8bSahrens kmutex_t b_freeze_lock; 4926b4acc8bSahrens zio_cksum_t *b_freeze_cksum; 4933f9d6ad7SLin Ling void *b_thawed; 4946b4acc8bSahrens 495fa9e4066Sahrens arc_buf_hdr_t *b_hash_next; 496fa9e4066Sahrens arc_buf_t *b_buf; 497fa9e4066Sahrens uint32_t b_flags; 498ea8dc4b6Seschrock uint32_t b_datacnt; 499fa9e4066Sahrens 500fa9e4066Sahrens arc_callback_t *b_acb; 501ad23a2dbSjohansen kcondvar_t b_cv; 502ad23a2dbSjohansen 503ad23a2dbSjohansen /* immutable */ 504ad23a2dbSjohansen arc_buf_contents_t b_type; 505ad23a2dbSjohansen uint64_t b_size; 506ac05c741SMark Maybee uint64_t b_spa; 507fa9e4066Sahrens 508fa9e4066Sahrens /* protected by arc state mutex */ 509fa9e4066Sahrens arc_state_t *b_state; 510fa9e4066Sahrens list_node_t b_arc_node; 511fa9e4066Sahrens 512fa9e4066Sahrens /* updated atomically */ 513fa9e4066Sahrens clock_t b_arc_access; 514fa9e4066Sahrens 515fa9e4066Sahrens /* self protecting */ 516fa9e4066Sahrens refcount_t b_refcnt; 517fa94a07fSbrendan 518fa94a07fSbrendan l2arc_buf_hdr_t *b_l2hdr; 519fa94a07fSbrendan list_node_t b_l2node; 520fa9e4066Sahrens }; 521fa9e4066Sahrens 522ea8dc4b6Seschrock static arc_buf_t *arc_eviction_list; 523ea8dc4b6Seschrock static kmutex_t arc_eviction_mtx; 52440d7d650Smaybee static arc_buf_hdr_t arc_eviction_hdr; 52544eda4d7Smaybee static void arc_get_data_buf(arc_buf_t *buf); 52644eda4d7Smaybee static void arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock); 5270e8c6158Smaybee static int arc_evict_needed(arc_buf_contents_t type); 528ac05c741SMark Maybee static void arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes); 529cd1c8b85SMatthew Ahrens static void arc_buf_watch(arc_buf_t *buf); 530ea8dc4b6Seschrock 5315ea40c06SBrendan Gregg - Sun Microsystems static boolean_t 
l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *ab); 5325ea40c06SBrendan Gregg - Sun Microsystems 533ea8dc4b6Seschrock #define GHOST_STATE(state) \ 534fa94a07fSbrendan ((state) == arc_mru_ghost || (state) == arc_mfu_ghost || \ 535fa94a07fSbrendan (state) == arc_l2c_only) 536ea8dc4b6Seschrock 537fa9e4066Sahrens /* 538fa9e4066Sahrens * Private ARC flags. These flags are private ARC only flags that will show up 539fa9e4066Sahrens * in b_flags in the arc_hdr_buf_t. Some flags are publicly declared, and can 540fa9e4066Sahrens * be passed in as arc_flags in things like arc_read. However, these flags 541fa9e4066Sahrens * should never be passed and should only be set by ARC code. When adding new 542fa9e4066Sahrens * public flags, make sure not to smash the private ones. 543fa9e4066Sahrens */ 544fa9e4066Sahrens 545ea8dc4b6Seschrock #define ARC_IN_HASH_TABLE (1 << 9) /* this buffer is hashed */ 546fa9e4066Sahrens #define ARC_IO_IN_PROGRESS (1 << 10) /* I/O in progress for buf */ 547fa9e4066Sahrens #define ARC_IO_ERROR (1 << 11) /* I/O failed for buf */ 548fa9e4066Sahrens #define ARC_FREED_IN_READ (1 << 12) /* buf freed while in read */ 549ea8dc4b6Seschrock #define ARC_BUF_AVAILABLE (1 << 13) /* block not in active use */ 55013506d1eSmaybee #define ARC_INDIRECT (1 << 14) /* this is an indirect block */ 551fa94a07fSbrendan #define ARC_FREE_IN_PROGRESS (1 << 15) /* hdr about to be freed */ 5523baa08fcSek #define ARC_L2_WRITING (1 << 16) /* L2ARC write in progress */ 5533baa08fcSek #define ARC_L2_EVICTED (1 << 17) /* evicted during I/O */ 5543baa08fcSek #define ARC_L2_WRITE_HEAD (1 << 18) /* head of write list */ 555fa9e4066Sahrens 556ea8dc4b6Seschrock #define HDR_IN_HASH_TABLE(hdr) ((hdr)->b_flags & ARC_IN_HASH_TABLE) 557fa9e4066Sahrens #define HDR_IO_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_IO_IN_PROGRESS) 558fa9e4066Sahrens #define HDR_IO_ERROR(hdr) ((hdr)->b_flags & ARC_IO_ERROR) 5595a98e54bSBrendan Gregg - Sun Microsystems #define HDR_PREFETCH(hdr) ((hdr)->b_flags & 
ARC_PREFETCH) 560fa9e4066Sahrens #define HDR_FREED_IN_READ(hdr) ((hdr)->b_flags & ARC_FREED_IN_READ) 561ea8dc4b6Seschrock #define HDR_BUF_AVAILABLE(hdr) ((hdr)->b_flags & ARC_BUF_AVAILABLE) 562fa94a07fSbrendan #define HDR_FREE_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_FREE_IN_PROGRESS) 5633baa08fcSek #define HDR_L2CACHE(hdr) ((hdr)->b_flags & ARC_L2CACHE) 5643a737e0dSbrendan #define HDR_L2_READING(hdr) ((hdr)->b_flags & ARC_IO_IN_PROGRESS && \ 5653a737e0dSbrendan (hdr)->b_l2hdr != NULL) 566fa94a07fSbrendan #define HDR_L2_WRITING(hdr) ((hdr)->b_flags & ARC_L2_WRITING) 567fa94a07fSbrendan #define HDR_L2_EVICTED(hdr) ((hdr)->b_flags & ARC_L2_EVICTED) 568fa94a07fSbrendan #define HDR_L2_WRITE_HEAD(hdr) ((hdr)->b_flags & ARC_L2_WRITE_HEAD) 569fa9e4066Sahrens 570e6c728e1Sbrendan /* 571e6c728e1Sbrendan * Other sizes 572e6c728e1Sbrendan */ 573e6c728e1Sbrendan 574e6c728e1Sbrendan #define HDR_SIZE ((int64_t)sizeof (arc_buf_hdr_t)) 575e6c728e1Sbrendan #define L2HDR_SIZE ((int64_t)sizeof (l2arc_buf_hdr_t)) 576e6c728e1Sbrendan 577fa9e4066Sahrens /* 578fa9e4066Sahrens * Hash table routines 579fa9e4066Sahrens */ 580fa9e4066Sahrens 581fa9e4066Sahrens #define HT_LOCK_PAD 64 582fa9e4066Sahrens 583fa9e4066Sahrens struct ht_lock { 584fa9e4066Sahrens kmutex_t ht_lock; 585fa9e4066Sahrens #ifdef _KERNEL 586fa9e4066Sahrens unsigned char pad[(HT_LOCK_PAD - sizeof (kmutex_t))]; 587fa9e4066Sahrens #endif 588fa9e4066Sahrens }; 589fa9e4066Sahrens 590fa9e4066Sahrens #define BUF_LOCKS 256 591fa9e4066Sahrens typedef struct buf_hash_table { 592fa9e4066Sahrens uint64_t ht_mask; 593fa9e4066Sahrens arc_buf_hdr_t **ht_table; 594fa9e4066Sahrens struct ht_lock ht_locks[BUF_LOCKS]; 595fa9e4066Sahrens } buf_hash_table_t; 596fa9e4066Sahrens 597fa9e4066Sahrens static buf_hash_table_t buf_hash_table; 598fa9e4066Sahrens 599fa9e4066Sahrens #define BUF_HASH_INDEX(spa, dva, birth) \ 600fa9e4066Sahrens (buf_hash(spa, dva, birth) & buf_hash_table.ht_mask) 601fa9e4066Sahrens #define BUF_HASH_LOCK_NTRY(idx) 
(buf_hash_table.ht_locks[idx & (BUF_LOCKS-1)]) 602fa9e4066Sahrens #define BUF_HASH_LOCK(idx) (&(BUF_HASH_LOCK_NTRY(idx).ht_lock)) 6033f9d6ad7SLin Ling #define HDR_LOCK(hdr) \ 6043f9d6ad7SLin Ling (BUF_HASH_LOCK(BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth))) 605fa9e4066Sahrens 606fa9e4066Sahrens uint64_t zfs_crc64_table[256]; 607fa9e4066Sahrens 608fa94a07fSbrendan /* 609fa94a07fSbrendan * Level 2 ARC 610fa94a07fSbrendan */ 611fa94a07fSbrendan 612fa94a07fSbrendan #define L2ARC_WRITE_SIZE (8 * 1024 * 1024) /* initial write max */ 613aad02571SSaso Kiselkov #define L2ARC_HEADROOM 2 /* num of writes */ 614aad02571SSaso Kiselkov /* 615aad02571SSaso Kiselkov * If we discover during ARC scan any buffers to be compressed, we boost 616aad02571SSaso Kiselkov * our headroom for the next scanning cycle by this percentage multiple. 617aad02571SSaso Kiselkov */ 618aad02571SSaso Kiselkov #define L2ARC_HEADROOM_BOOST 200 6195a98e54bSBrendan Gregg - Sun Microsystems #define L2ARC_FEED_SECS 1 /* caching interval secs */ 6205a98e54bSBrendan Gregg - Sun Microsystems #define L2ARC_FEED_MIN_MS 200 /* min caching interval ms */ 621fa94a07fSbrendan 622fa94a07fSbrendan #define l2arc_writes_sent ARCSTAT(arcstat_l2_writes_sent) 623fa94a07fSbrendan #define l2arc_writes_done ARCSTAT(arcstat_l2_writes_done) 624fa94a07fSbrendan 625f7170741SWill Andrews /* L2ARC Performance Tunables */ 626fa94a07fSbrendan uint64_t l2arc_write_max = L2ARC_WRITE_SIZE; /* default max write size */ 6273a737e0dSbrendan uint64_t l2arc_write_boost = L2ARC_WRITE_SIZE; /* extra write during warmup */ 628fa94a07fSbrendan uint64_t l2arc_headroom = L2ARC_HEADROOM; /* number of dev writes */ 629aad02571SSaso Kiselkov uint64_t l2arc_headroom_boost = L2ARC_HEADROOM_BOOST; 630fa94a07fSbrendan uint64_t l2arc_feed_secs = L2ARC_FEED_SECS; /* interval seconds */ 6315a98e54bSBrendan Gregg - Sun Microsystems uint64_t l2arc_feed_min_ms = L2ARC_FEED_MIN_MS; /* min interval milliseconds */ 632fa94a07fSbrendan boolean_t 
l2arc_noprefetch = B_TRUE; /* don't cache prefetch bufs */ 6335a98e54bSBrendan Gregg - Sun Microsystems boolean_t l2arc_feed_again = B_TRUE; /* turbo warmup */ 6345a98e54bSBrendan Gregg - Sun Microsystems boolean_t l2arc_norw = B_TRUE; /* no reads during writes */ 635fa94a07fSbrendan 636fa94a07fSbrendan /* 637fa94a07fSbrendan * L2ARC Internals 638fa94a07fSbrendan */ 639fa94a07fSbrendan typedef struct l2arc_dev { 640fa94a07fSbrendan vdev_t *l2ad_vdev; /* vdev */ 641fa94a07fSbrendan spa_t *l2ad_spa; /* spa */ 642fa94a07fSbrendan uint64_t l2ad_hand; /* next write location */ 643fa94a07fSbrendan uint64_t l2ad_start; /* first addr on device */ 644fa94a07fSbrendan uint64_t l2ad_end; /* last addr on device */ 645fa94a07fSbrendan uint64_t l2ad_evict; /* last addr eviction reached */ 646fa94a07fSbrendan boolean_t l2ad_first; /* first sweep through */ 6475a98e54bSBrendan Gregg - Sun Microsystems boolean_t l2ad_writing; /* currently writing */ 648fa94a07fSbrendan list_t *l2ad_buflist; /* buffer list */ 649fa94a07fSbrendan list_node_t l2ad_node; /* device list node */ 650fa94a07fSbrendan } l2arc_dev_t; 651fa94a07fSbrendan 652fa94a07fSbrendan static list_t L2ARC_dev_list; /* device list */ 653fa94a07fSbrendan static list_t *l2arc_dev_list; /* device list pointer */ 654fa94a07fSbrendan static kmutex_t l2arc_dev_mtx; /* device list mutex */ 655fa94a07fSbrendan static l2arc_dev_t *l2arc_dev_last; /* last device used */ 656fa94a07fSbrendan static kmutex_t l2arc_buflist_mtx; /* mutex for all buflists */ 657fa94a07fSbrendan static list_t L2ARC_free_on_write; /* free after write buf list */ 658fa94a07fSbrendan static list_t *l2arc_free_on_write; /* free after write list ptr */ 659fa94a07fSbrendan static kmutex_t l2arc_free_on_write_mtx; /* mutex for list */ 660fa94a07fSbrendan static uint64_t l2arc_ndev; /* number of devices */ 661fa94a07fSbrendan 662fa94a07fSbrendan typedef struct l2arc_read_callback { 663aad02571SSaso Kiselkov arc_buf_t *l2rcb_buf; /* read buffer */ 664aad02571SSaso 
Kiselkov spa_t *l2rcb_spa; /* spa */ 665aad02571SSaso Kiselkov blkptr_t l2rcb_bp; /* original blkptr */ 666aad02571SSaso Kiselkov zbookmark_t l2rcb_zb; /* original bookmark */ 667aad02571SSaso Kiselkov int l2rcb_flags; /* original flags */ 668aad02571SSaso Kiselkov enum zio_compress l2rcb_compress; /* applied compress */ 669fa94a07fSbrendan } l2arc_read_callback_t; 670fa94a07fSbrendan 671fa94a07fSbrendan typedef struct l2arc_write_callback { 672fa94a07fSbrendan l2arc_dev_t *l2wcb_dev; /* device info */ 673fa94a07fSbrendan arc_buf_hdr_t *l2wcb_head; /* head of write buflist */ 674fa94a07fSbrendan } l2arc_write_callback_t; 675fa94a07fSbrendan 676fa94a07fSbrendan struct l2arc_buf_hdr { 677fa94a07fSbrendan /* protected by arc_buf_hdr mutex */ 678aad02571SSaso Kiselkov l2arc_dev_t *b_dev; /* L2ARC device */ 679aad02571SSaso Kiselkov uint64_t b_daddr; /* disk address, offset byte */ 680aad02571SSaso Kiselkov /* compression applied to buffer data */ 681aad02571SSaso Kiselkov enum zio_compress b_compress; 682aad02571SSaso Kiselkov /* real alloc'd buffer size depending on b_compress applied */ 683aad02571SSaso Kiselkov int b_asize; 684aad02571SSaso Kiselkov /* temporary buffer holder for in-flight compressed data */ 685aad02571SSaso Kiselkov void *b_tmp_cdata; 686fa94a07fSbrendan }; 687fa94a07fSbrendan 688fa94a07fSbrendan typedef struct l2arc_data_free { 689fa94a07fSbrendan /* protected by l2arc_free_on_write_mtx */ 690fa94a07fSbrendan void *l2df_data; 691fa94a07fSbrendan size_t l2df_size; 692fa94a07fSbrendan void (*l2df_func)(void *, size_t); 693fa94a07fSbrendan list_node_t l2df_list_node; 694fa94a07fSbrendan } l2arc_data_free_t; 695fa94a07fSbrendan 696fa94a07fSbrendan static kmutex_t l2arc_feed_thr_lock; 697fa94a07fSbrendan static kcondvar_t l2arc_feed_thr_cv; 698fa94a07fSbrendan static uint8_t l2arc_thread_exit; 699fa94a07fSbrendan 700fa94a07fSbrendan static void l2arc_read_done(zio_t *zio); 701fa94a07fSbrendan static void l2arc_hdr_stat_add(void); 702fa94a07fSbrendan 
static void l2arc_hdr_stat_remove(void); 703fa94a07fSbrendan 704aad02571SSaso Kiselkov static boolean_t l2arc_compress_buf(l2arc_buf_hdr_t *l2hdr); 705aad02571SSaso Kiselkov static void l2arc_decompress_zio(zio_t *zio, arc_buf_hdr_t *hdr, 706aad02571SSaso Kiselkov enum zio_compress c); 707aad02571SSaso Kiselkov static void l2arc_release_cdata_buf(arc_buf_hdr_t *ab); 708aad02571SSaso Kiselkov 709fa9e4066Sahrens static uint64_t 710ac05c741SMark Maybee buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth) 711fa9e4066Sahrens { 712fa9e4066Sahrens uint8_t *vdva = (uint8_t *)dva; 713fa9e4066Sahrens uint64_t crc = -1ULL; 714fa9e4066Sahrens int i; 715fa9e4066Sahrens 716fa9e4066Sahrens ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY); 717fa9e4066Sahrens 718fa9e4066Sahrens for (i = 0; i < sizeof (dva_t); i++) 719fa9e4066Sahrens crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ vdva[i]) & 0xFF]; 720fa9e4066Sahrens 721ac05c741SMark Maybee crc ^= (spa>>8) ^ birth; 722fa9e4066Sahrens 723fa9e4066Sahrens return (crc); 724fa9e4066Sahrens } 725fa9e4066Sahrens 726fa9e4066Sahrens #define BUF_EMPTY(buf) \ 727fa9e4066Sahrens ((buf)->b_dva.dva_word[0] == 0 && \ 728fa9e4066Sahrens (buf)->b_dva.dva_word[1] == 0 && \ 729fa9e4066Sahrens (buf)->b_birth == 0) 730fa9e4066Sahrens 731fa9e4066Sahrens #define BUF_EQUAL(spa, dva, birth, buf) \ 732fa9e4066Sahrens ((buf)->b_dva.dva_word[0] == (dva)->dva_word[0]) && \ 733fa9e4066Sahrens ((buf)->b_dva.dva_word[1] == (dva)->dva_word[1]) && \ 734fa9e4066Sahrens ((buf)->b_birth == birth) && ((buf)->b_spa == spa) 735fa9e4066Sahrens 7363f9d6ad7SLin Ling static void 7373f9d6ad7SLin Ling buf_discard_identity(arc_buf_hdr_t *hdr) 7383f9d6ad7SLin Ling { 7393f9d6ad7SLin Ling hdr->b_dva.dva_word[0] = 0; 7403f9d6ad7SLin Ling hdr->b_dva.dva_word[1] = 0; 7413f9d6ad7SLin Ling hdr->b_birth = 0; 7423f9d6ad7SLin Ling hdr->b_cksum0 = 0; 7433f9d6ad7SLin Ling } 7443f9d6ad7SLin Ling 745fa9e4066Sahrens static arc_buf_hdr_t * 746ac05c741SMark Maybee buf_hash_find(uint64_t spa, const 
dva_t *dva, uint64_t birth, kmutex_t **lockp) 747fa9e4066Sahrens { 748fa9e4066Sahrens uint64_t idx = BUF_HASH_INDEX(spa, dva, birth); 749fa9e4066Sahrens kmutex_t *hash_lock = BUF_HASH_LOCK(idx); 750fa9e4066Sahrens arc_buf_hdr_t *buf; 751fa9e4066Sahrens 752fa9e4066Sahrens mutex_enter(hash_lock); 753fa9e4066Sahrens for (buf = buf_hash_table.ht_table[idx]; buf != NULL; 754fa9e4066Sahrens buf = buf->b_hash_next) { 755fa9e4066Sahrens if (BUF_EQUAL(spa, dva, birth, buf)) { 756fa9e4066Sahrens *lockp = hash_lock; 757fa9e4066Sahrens return (buf); 758fa9e4066Sahrens } 759fa9e4066Sahrens } 760fa9e4066Sahrens mutex_exit(hash_lock); 761fa9e4066Sahrens *lockp = NULL; 762fa9e4066Sahrens return (NULL); 763fa9e4066Sahrens } 764fa9e4066Sahrens 765fa9e4066Sahrens /* 766fa9e4066Sahrens * Insert an entry into the hash table. If there is already an element 767fa9e4066Sahrens * equal to elem in the hash table, then the already existing element 768fa9e4066Sahrens * will be returned and the new element will not be inserted. 769fa9e4066Sahrens * Otherwise returns NULL. 
770fa9e4066Sahrens */ 771fa9e4066Sahrens static arc_buf_hdr_t * 772fa9e4066Sahrens buf_hash_insert(arc_buf_hdr_t *buf, kmutex_t **lockp) 773fa9e4066Sahrens { 774fa9e4066Sahrens uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth); 775fa9e4066Sahrens kmutex_t *hash_lock = BUF_HASH_LOCK(idx); 776fa9e4066Sahrens arc_buf_hdr_t *fbuf; 77744cb6abcSbmc uint32_t i; 778fa9e4066Sahrens 779ea8dc4b6Seschrock ASSERT(!HDR_IN_HASH_TABLE(buf)); 780fa9e4066Sahrens *lockp = hash_lock; 781fa9e4066Sahrens mutex_enter(hash_lock); 782fa9e4066Sahrens for (fbuf = buf_hash_table.ht_table[idx], i = 0; fbuf != NULL; 783fa9e4066Sahrens fbuf = fbuf->b_hash_next, i++) { 784fa9e4066Sahrens if (BUF_EQUAL(buf->b_spa, &buf->b_dva, buf->b_birth, fbuf)) 785fa9e4066Sahrens return (fbuf); 786fa9e4066Sahrens } 787fa9e4066Sahrens 788fa9e4066Sahrens buf->b_hash_next = buf_hash_table.ht_table[idx]; 789fa9e4066Sahrens buf_hash_table.ht_table[idx] = buf; 790ea8dc4b6Seschrock buf->b_flags |= ARC_IN_HASH_TABLE; 791fa9e4066Sahrens 792fa9e4066Sahrens /* collect some hash table performance data */ 793fa9e4066Sahrens if (i > 0) { 79444cb6abcSbmc ARCSTAT_BUMP(arcstat_hash_collisions); 795fa9e4066Sahrens if (i == 1) 79644cb6abcSbmc ARCSTAT_BUMP(arcstat_hash_chains); 79744cb6abcSbmc 79844cb6abcSbmc ARCSTAT_MAX(arcstat_hash_chain_max, i); 799fa9e4066Sahrens } 80044cb6abcSbmc 80144cb6abcSbmc ARCSTAT_BUMP(arcstat_hash_elements); 80244cb6abcSbmc ARCSTAT_MAXSTAT(arcstat_hash_elements); 803fa9e4066Sahrens 804fa9e4066Sahrens return (NULL); 805fa9e4066Sahrens } 806fa9e4066Sahrens 807fa9e4066Sahrens static void 808fa9e4066Sahrens buf_hash_remove(arc_buf_hdr_t *buf) 809fa9e4066Sahrens { 810fa9e4066Sahrens arc_buf_hdr_t *fbuf, **bufp; 811fa9e4066Sahrens uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth); 812fa9e4066Sahrens 813fa9e4066Sahrens ASSERT(MUTEX_HELD(BUF_HASH_LOCK(idx))); 814ea8dc4b6Seschrock ASSERT(HDR_IN_HASH_TABLE(buf)); 815fa9e4066Sahrens 816fa9e4066Sahrens bufp = 
&buf_hash_table.ht_table[idx]; 817fa9e4066Sahrens while ((fbuf = *bufp) != buf) { 818fa9e4066Sahrens ASSERT(fbuf != NULL); 819fa9e4066Sahrens bufp = &fbuf->b_hash_next; 820fa9e4066Sahrens } 821fa9e4066Sahrens *bufp = buf->b_hash_next; 822fa9e4066Sahrens buf->b_hash_next = NULL; 823ea8dc4b6Seschrock buf->b_flags &= ~ARC_IN_HASH_TABLE; 824fa9e4066Sahrens 825fa9e4066Sahrens /* collect some hash table performance data */ 82644cb6abcSbmc ARCSTAT_BUMPDOWN(arcstat_hash_elements); 82744cb6abcSbmc 828fa9e4066Sahrens if (buf_hash_table.ht_table[idx] && 829fa9e4066Sahrens buf_hash_table.ht_table[idx]->b_hash_next == NULL) 83044cb6abcSbmc ARCSTAT_BUMPDOWN(arcstat_hash_chains); 831fa9e4066Sahrens } 832fa9e4066Sahrens 833fa9e4066Sahrens /* 834fa9e4066Sahrens * Global data structures and functions for the buf kmem cache. 835fa9e4066Sahrens */ 836fa9e4066Sahrens static kmem_cache_t *hdr_cache; 837fa9e4066Sahrens static kmem_cache_t *buf_cache; 838fa9e4066Sahrens 839fa9e4066Sahrens static void 840fa9e4066Sahrens buf_fini(void) 841fa9e4066Sahrens { 842fa9e4066Sahrens int i; 843fa9e4066Sahrens 844fa9e4066Sahrens kmem_free(buf_hash_table.ht_table, 845fa9e4066Sahrens (buf_hash_table.ht_mask + 1) * sizeof (void *)); 846fa9e4066Sahrens for (i = 0; i < BUF_LOCKS; i++) 847fa9e4066Sahrens mutex_destroy(&buf_hash_table.ht_locks[i].ht_lock); 848fa9e4066Sahrens kmem_cache_destroy(hdr_cache); 849fa9e4066Sahrens kmem_cache_destroy(buf_cache); 850fa9e4066Sahrens } 851fa9e4066Sahrens 852fa9e4066Sahrens /* 853fa9e4066Sahrens * Constructor callback - called when the cache is empty 854fa9e4066Sahrens * and a new buf is requested. 
855fa9e4066Sahrens */ 856fa9e4066Sahrens /* ARGSUSED */ 857fa9e4066Sahrens static int 858fa9e4066Sahrens hdr_cons(void *vbuf, void *unused, int kmflag) 859fa9e4066Sahrens { 860fa9e4066Sahrens arc_buf_hdr_t *buf = vbuf; 861fa9e4066Sahrens 862fa9e4066Sahrens bzero(buf, sizeof (arc_buf_hdr_t)); 863fa9e4066Sahrens refcount_create(&buf->b_refcnt); 864fa9e4066Sahrens cv_init(&buf->b_cv, NULL, CV_DEFAULT, NULL); 865c25056deSgw mutex_init(&buf->b_freeze_lock, NULL, MUTEX_DEFAULT, NULL); 8665a98e54bSBrendan Gregg - Sun Microsystems arc_space_consume(sizeof (arc_buf_hdr_t), ARC_SPACE_HDRS); 867fa94a07fSbrendan 868fa9e4066Sahrens return (0); 869fa9e4066Sahrens } 870fa9e4066Sahrens 8716f83844dSMark Maybee /* ARGSUSED */ 8726f83844dSMark Maybee static int 8736f83844dSMark Maybee buf_cons(void *vbuf, void *unused, int kmflag) 8746f83844dSMark Maybee { 8756f83844dSMark Maybee arc_buf_t *buf = vbuf; 8766f83844dSMark Maybee 8776f83844dSMark Maybee bzero(buf, sizeof (arc_buf_t)); 8783f9d6ad7SLin Ling mutex_init(&buf->b_evict_lock, NULL, MUTEX_DEFAULT, NULL); 8795a98e54bSBrendan Gregg - Sun Microsystems arc_space_consume(sizeof (arc_buf_t), ARC_SPACE_HDRS); 8805a98e54bSBrendan Gregg - Sun Microsystems 8816f83844dSMark Maybee return (0); 8826f83844dSMark Maybee } 8836f83844dSMark Maybee 884fa9e4066Sahrens /* 885fa9e4066Sahrens * Destructor callback - called when a cached buf is 886fa9e4066Sahrens * no longer required. 
887fa9e4066Sahrens */ 888fa9e4066Sahrens /* ARGSUSED */ 889fa9e4066Sahrens static void 890fa9e4066Sahrens hdr_dest(void *vbuf, void *unused) 891fa9e4066Sahrens { 892fa9e4066Sahrens arc_buf_hdr_t *buf = vbuf; 893fa9e4066Sahrens 894b24ab676SJeff Bonwick ASSERT(BUF_EMPTY(buf)); 895fa9e4066Sahrens refcount_destroy(&buf->b_refcnt); 896fa9e4066Sahrens cv_destroy(&buf->b_cv); 897c25056deSgw mutex_destroy(&buf->b_freeze_lock); 8985a98e54bSBrendan Gregg - Sun Microsystems arc_space_return(sizeof (arc_buf_hdr_t), ARC_SPACE_HDRS); 899fa9e4066Sahrens } 900fa9e4066Sahrens 9016f83844dSMark Maybee /* ARGSUSED */ 9026f83844dSMark Maybee static void 9036f83844dSMark Maybee buf_dest(void *vbuf, void *unused) 9046f83844dSMark Maybee { 9056f83844dSMark Maybee arc_buf_t *buf = vbuf; 9066f83844dSMark Maybee 9073f9d6ad7SLin Ling mutex_destroy(&buf->b_evict_lock); 9085a98e54bSBrendan Gregg - Sun Microsystems arc_space_return(sizeof (arc_buf_t), ARC_SPACE_HDRS); 9096f83844dSMark Maybee } 9106f83844dSMark Maybee 911fa9e4066Sahrens /* 912fa9e4066Sahrens * Reclaim callback -- invoked when memory is low. 913fa9e4066Sahrens */ 914fa9e4066Sahrens /* ARGSUSED */ 915fa9e4066Sahrens static void 916fa9e4066Sahrens hdr_recl(void *unused) 917fa9e4066Sahrens { 918fa9e4066Sahrens dprintf("hdr_recl called\n"); 91949e3519aSmaybee /* 92049e3519aSmaybee * umem calls the reclaim func when we destroy the buf cache, 92149e3519aSmaybee * which is after we do arc_fini(). 92249e3519aSmaybee */ 92349e3519aSmaybee if (!arc_dead) 92449e3519aSmaybee cv_signal(&arc_reclaim_thr_cv); 925fa9e4066Sahrens } 926fa9e4066Sahrens 927fa9e4066Sahrens static void 928fa9e4066Sahrens buf_init(void) 929fa9e4066Sahrens { 930fa9e4066Sahrens uint64_t *ct; 931ea8dc4b6Seschrock uint64_t hsize = 1ULL << 12; 932fa9e4066Sahrens int i, j; 933fa9e4066Sahrens 934fa9e4066Sahrens /* 935fa9e4066Sahrens * The hash table is big enough to fill all of physical memory 936ea8dc4b6Seschrock * with an average 64K block size. 
The table will take up 937ea8dc4b6Seschrock * totalmem*sizeof(void*)/64K (eg. 128KB/GB with 8-byte pointers). 938fa9e4066Sahrens */ 939ea8dc4b6Seschrock while (hsize * 65536 < physmem * PAGESIZE) 940fa9e4066Sahrens hsize <<= 1; 941ea8dc4b6Seschrock retry: 942fa9e4066Sahrens buf_hash_table.ht_mask = hsize - 1; 943ea8dc4b6Seschrock buf_hash_table.ht_table = 944ea8dc4b6Seschrock kmem_zalloc(hsize * sizeof (void*), KM_NOSLEEP); 945ea8dc4b6Seschrock if (buf_hash_table.ht_table == NULL) { 946ea8dc4b6Seschrock ASSERT(hsize > (1ULL << 8)); 947ea8dc4b6Seschrock hsize >>= 1; 948ea8dc4b6Seschrock goto retry; 949ea8dc4b6Seschrock } 950fa9e4066Sahrens 951fa9e4066Sahrens hdr_cache = kmem_cache_create("arc_buf_hdr_t", sizeof (arc_buf_hdr_t), 952fa9e4066Sahrens 0, hdr_cons, hdr_dest, hdr_recl, NULL, NULL, 0); 953fa9e4066Sahrens buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t), 9546f83844dSMark Maybee 0, buf_cons, buf_dest, NULL, NULL, NULL, 0); 955fa9e4066Sahrens 956fa9e4066Sahrens for (i = 0; i < 256; i++) 957fa9e4066Sahrens for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--) 958fa9e4066Sahrens *ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY); 959fa9e4066Sahrens 960fa9e4066Sahrens for (i = 0; i < BUF_LOCKS; i++) { 961fa9e4066Sahrens mutex_init(&buf_hash_table.ht_locks[i].ht_lock, 962fa9e4066Sahrens NULL, MUTEX_DEFAULT, NULL); 963fa9e4066Sahrens } 964fa9e4066Sahrens } 965fa9e4066Sahrens 966fa9e4066Sahrens #define ARC_MINTIME (hz>>4) /* 62 ms */ 967fa9e4066Sahrens 9686b4acc8bSahrens static void 9696b4acc8bSahrens arc_cksum_verify(arc_buf_t *buf) 9706b4acc8bSahrens { 9716b4acc8bSahrens zio_cksum_t zc; 9726b4acc8bSahrens 973cc60fd72Sahrens if (!(zfs_flags & ZFS_DEBUG_MODIFY)) 9746b4acc8bSahrens return; 9756b4acc8bSahrens 9766b4acc8bSahrens mutex_enter(&buf->b_hdr->b_freeze_lock); 9773ccfa83cSahrens if (buf->b_hdr->b_freeze_cksum == NULL || 9783ccfa83cSahrens (buf->b_hdr->b_flags & ARC_IO_ERROR)) { 9796b4acc8bSahrens mutex_exit(&buf->b_hdr->b_freeze_lock); 
9806b4acc8bSahrens return; 9816b4acc8bSahrens } 9826b4acc8bSahrens fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc); 9836b4acc8bSahrens if (!ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc)) 9846b4acc8bSahrens panic("buffer modified while frozen!"); 9856b4acc8bSahrens mutex_exit(&buf->b_hdr->b_freeze_lock); 9866b4acc8bSahrens } 9876b4acc8bSahrens 988fa94a07fSbrendan static int 989fa94a07fSbrendan arc_cksum_equal(arc_buf_t *buf) 990fa94a07fSbrendan { 991fa94a07fSbrendan zio_cksum_t zc; 992fa94a07fSbrendan int equal; 993fa94a07fSbrendan 994fa94a07fSbrendan mutex_enter(&buf->b_hdr->b_freeze_lock); 995fa94a07fSbrendan fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc); 996fa94a07fSbrendan equal = ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc); 997fa94a07fSbrendan mutex_exit(&buf->b_hdr->b_freeze_lock); 998fa94a07fSbrendan 999fa94a07fSbrendan return (equal); 1000fa94a07fSbrendan } 1001fa94a07fSbrendan 10026b4acc8bSahrens static void 1003fa94a07fSbrendan arc_cksum_compute(arc_buf_t *buf, boolean_t force) 10046b4acc8bSahrens { 1005fa94a07fSbrendan if (!force && !(zfs_flags & ZFS_DEBUG_MODIFY)) 10066b4acc8bSahrens return; 10076b4acc8bSahrens 10086b4acc8bSahrens mutex_enter(&buf->b_hdr->b_freeze_lock); 10096b4acc8bSahrens if (buf->b_hdr->b_freeze_cksum != NULL) { 10106b4acc8bSahrens mutex_exit(&buf->b_hdr->b_freeze_lock); 10116b4acc8bSahrens return; 10126b4acc8bSahrens } 10136b4acc8bSahrens buf->b_hdr->b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t), KM_SLEEP); 10146b4acc8bSahrens fletcher_2_native(buf->b_data, buf->b_hdr->b_size, 10156b4acc8bSahrens buf->b_hdr->b_freeze_cksum); 10166b4acc8bSahrens mutex_exit(&buf->b_hdr->b_freeze_lock); 1017cd1c8b85SMatthew Ahrens arc_buf_watch(buf); 1018cd1c8b85SMatthew Ahrens } 1019cd1c8b85SMatthew Ahrens 1020cd1c8b85SMatthew Ahrens #ifndef _KERNEL 1021cd1c8b85SMatthew Ahrens typedef struct procctl { 1022cd1c8b85SMatthew Ahrens long cmd; 1023cd1c8b85SMatthew Ahrens prwatch_t prwatch; 1024cd1c8b85SMatthew Ahrens } 
procctl_t; 1025cd1c8b85SMatthew Ahrens #endif 1026cd1c8b85SMatthew Ahrens 1027cd1c8b85SMatthew Ahrens /* ARGSUSED */ 1028cd1c8b85SMatthew Ahrens static void 1029cd1c8b85SMatthew Ahrens arc_buf_unwatch(arc_buf_t *buf) 1030cd1c8b85SMatthew Ahrens { 1031cd1c8b85SMatthew Ahrens #ifndef _KERNEL 1032cd1c8b85SMatthew Ahrens if (arc_watch) { 1033cd1c8b85SMatthew Ahrens int result; 1034cd1c8b85SMatthew Ahrens procctl_t ctl; 1035cd1c8b85SMatthew Ahrens ctl.cmd = PCWATCH; 1036cd1c8b85SMatthew Ahrens ctl.prwatch.pr_vaddr = (uintptr_t)buf->b_data; 1037cd1c8b85SMatthew Ahrens ctl.prwatch.pr_size = 0; 1038cd1c8b85SMatthew Ahrens ctl.prwatch.pr_wflags = 0; 1039cd1c8b85SMatthew Ahrens result = write(arc_procfd, &ctl, sizeof (ctl)); 1040cd1c8b85SMatthew Ahrens ASSERT3U(result, ==, sizeof (ctl)); 1041cd1c8b85SMatthew Ahrens } 1042cd1c8b85SMatthew Ahrens #endif 1043cd1c8b85SMatthew Ahrens } 1044cd1c8b85SMatthew Ahrens 1045cd1c8b85SMatthew Ahrens /* ARGSUSED */ 1046cd1c8b85SMatthew Ahrens static void 1047cd1c8b85SMatthew Ahrens arc_buf_watch(arc_buf_t *buf) 1048cd1c8b85SMatthew Ahrens { 1049cd1c8b85SMatthew Ahrens #ifndef _KERNEL 1050cd1c8b85SMatthew Ahrens if (arc_watch) { 1051cd1c8b85SMatthew Ahrens int result; 1052cd1c8b85SMatthew Ahrens procctl_t ctl; 1053cd1c8b85SMatthew Ahrens ctl.cmd = PCWATCH; 1054cd1c8b85SMatthew Ahrens ctl.prwatch.pr_vaddr = (uintptr_t)buf->b_data; 1055cd1c8b85SMatthew Ahrens ctl.prwatch.pr_size = buf->b_hdr->b_size; 1056cd1c8b85SMatthew Ahrens ctl.prwatch.pr_wflags = WA_WRITE; 1057cd1c8b85SMatthew Ahrens result = write(arc_procfd, &ctl, sizeof (ctl)); 1058cd1c8b85SMatthew Ahrens ASSERT3U(result, ==, sizeof (ctl)); 1059cd1c8b85SMatthew Ahrens } 1060cd1c8b85SMatthew Ahrens #endif 10616b4acc8bSahrens } 10626b4acc8bSahrens 10636b4acc8bSahrens void 10646b4acc8bSahrens arc_buf_thaw(arc_buf_t *buf) 10656b4acc8bSahrens { 1066fa94a07fSbrendan if (zfs_flags & ZFS_DEBUG_MODIFY) { 1067fa94a07fSbrendan if (buf->b_hdr->b_state != arc_anon) 1068fa94a07fSbrendan 
panic("modifying non-anon buffer!"); 1069fa94a07fSbrendan if (buf->b_hdr->b_flags & ARC_IO_IN_PROGRESS) 1070fa94a07fSbrendan panic("modifying buffer while i/o in progress!"); 1071fa94a07fSbrendan arc_cksum_verify(buf); 1072fa94a07fSbrendan } 10736b4acc8bSahrens 10746b4acc8bSahrens mutex_enter(&buf->b_hdr->b_freeze_lock); 10756b4acc8bSahrens if (buf->b_hdr->b_freeze_cksum != NULL) { 10766b4acc8bSahrens kmem_free(buf->b_hdr->b_freeze_cksum, sizeof (zio_cksum_t)); 10776b4acc8bSahrens buf->b_hdr->b_freeze_cksum = NULL; 10786b4acc8bSahrens } 10793f9d6ad7SLin Ling 10803f9d6ad7SLin Ling if (zfs_flags & ZFS_DEBUG_MODIFY) { 10813f9d6ad7SLin Ling if (buf->b_hdr->b_thawed) 10823f9d6ad7SLin Ling kmem_free(buf->b_hdr->b_thawed, 1); 10833f9d6ad7SLin Ling buf->b_hdr->b_thawed = kmem_alloc(1, KM_SLEEP); 10843f9d6ad7SLin Ling } 10853f9d6ad7SLin Ling 10866b4acc8bSahrens mutex_exit(&buf->b_hdr->b_freeze_lock); 1087cd1c8b85SMatthew Ahrens 1088cd1c8b85SMatthew Ahrens arc_buf_unwatch(buf); 10896b4acc8bSahrens } 10906b4acc8bSahrens 10916b4acc8bSahrens void 10926b4acc8bSahrens arc_buf_freeze(arc_buf_t *buf) 10936b4acc8bSahrens { 10943f9d6ad7SLin Ling kmutex_t *hash_lock; 10953f9d6ad7SLin Ling 1096cc60fd72Sahrens if (!(zfs_flags & ZFS_DEBUG_MODIFY)) 1097cc60fd72Sahrens return; 1098cc60fd72Sahrens 10993f9d6ad7SLin Ling hash_lock = HDR_LOCK(buf->b_hdr); 11003f9d6ad7SLin Ling mutex_enter(hash_lock); 11013f9d6ad7SLin Ling 11026b4acc8bSahrens ASSERT(buf->b_hdr->b_freeze_cksum != NULL || 110344cb6abcSbmc buf->b_hdr->b_state == arc_anon); 1104fa94a07fSbrendan arc_cksum_compute(buf, B_FALSE); 11053f9d6ad7SLin Ling mutex_exit(hash_lock); 1106cd1c8b85SMatthew Ahrens 11076b4acc8bSahrens } 11086b4acc8bSahrens 1109fa9e4066Sahrens static void 1110fa9e4066Sahrens add_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag) 1111fa9e4066Sahrens { 1112fa9e4066Sahrens ASSERT(MUTEX_HELD(hash_lock)); 1113fa9e4066Sahrens 1114fa9e4066Sahrens if ((refcount_add(&ab->b_refcnt, tag) == 1) && 111544cb6abcSbmc 
(ab->b_state != arc_anon)) { 1116c0a81264Sek uint64_t delta = ab->b_size * ab->b_datacnt; 11170e8c6158Smaybee list_t *list = &ab->b_state->arcs_list[ab->b_type]; 11180e8c6158Smaybee uint64_t *size = &ab->b_state->arcs_lsize[ab->b_type]; 1119fa9e4066Sahrens 112044cb6abcSbmc ASSERT(!MUTEX_HELD(&ab->b_state->arcs_mtx)); 112144cb6abcSbmc mutex_enter(&ab->b_state->arcs_mtx); 1122fa9e4066Sahrens ASSERT(list_link_active(&ab->b_arc_node)); 11230e8c6158Smaybee list_remove(list, ab); 1124ea8dc4b6Seschrock if (GHOST_STATE(ab->b_state)) { 1125fb09f5aaSMadhav Suresh ASSERT0(ab->b_datacnt); 1126ea8dc4b6Seschrock ASSERT3P(ab->b_buf, ==, NULL); 1127ea8dc4b6Seschrock delta = ab->b_size; 1128ea8dc4b6Seschrock } 1129ea8dc4b6Seschrock ASSERT(delta > 0); 11300e8c6158Smaybee ASSERT3U(*size, >=, delta); 11310e8c6158Smaybee atomic_add_64(size, -delta); 113244cb6abcSbmc mutex_exit(&ab->b_state->arcs_mtx); 1133088f3894Sahrens /* remove the prefetch flag if we get a reference */ 113413506d1eSmaybee if (ab->b_flags & ARC_PREFETCH) 113513506d1eSmaybee ab->b_flags &= ~ARC_PREFETCH; 1136fa9e4066Sahrens } 1137fa9e4066Sahrens } 1138fa9e4066Sahrens 1139fa9e4066Sahrens static int 1140fa9e4066Sahrens remove_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag) 1141fa9e4066Sahrens { 1142fa9e4066Sahrens int cnt; 114344cb6abcSbmc arc_state_t *state = ab->b_state; 1144fa9e4066Sahrens 114544cb6abcSbmc ASSERT(state == arc_anon || MUTEX_HELD(hash_lock)); 114644cb6abcSbmc ASSERT(!GHOST_STATE(state)); 1147fa9e4066Sahrens 1148fa9e4066Sahrens if (((cnt = refcount_remove(&ab->b_refcnt, tag)) == 0) && 114944cb6abcSbmc (state != arc_anon)) { 11500e8c6158Smaybee uint64_t *size = &state->arcs_lsize[ab->b_type]; 11510e8c6158Smaybee 115244cb6abcSbmc ASSERT(!MUTEX_HELD(&state->arcs_mtx)); 115344cb6abcSbmc mutex_enter(&state->arcs_mtx); 1154fa9e4066Sahrens ASSERT(!list_link_active(&ab->b_arc_node)); 11550e8c6158Smaybee list_insert_head(&state->arcs_list[ab->b_type], ab); 1156ea8dc4b6Seschrock ASSERT(ab->b_datacnt 
> 0); 11570e8c6158Smaybee atomic_add_64(size, ab->b_size * ab->b_datacnt); 115844cb6abcSbmc mutex_exit(&state->arcs_mtx); 1159fa9e4066Sahrens } 1160fa9e4066Sahrens return (cnt); 1161fa9e4066Sahrens } 1162fa9e4066Sahrens 1163fa9e4066Sahrens /* 1164fa9e4066Sahrens * Move the supplied buffer to the indicated state. The mutex 1165fa9e4066Sahrens * for the buffer must be held by the caller. 1166fa9e4066Sahrens */ 1167fa9e4066Sahrens static void 1168ea8dc4b6Seschrock arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock) 1169fa9e4066Sahrens { 1170ea8dc4b6Seschrock arc_state_t *old_state = ab->b_state; 1171c0a81264Sek int64_t refcnt = refcount_count(&ab->b_refcnt); 1172c0a81264Sek uint64_t from_delta, to_delta; 1173fa9e4066Sahrens 1174fa9e4066Sahrens ASSERT(MUTEX_HELD(hash_lock)); 1175*69962b56SMatthew Ahrens ASSERT3P(new_state, !=, old_state); 1176ea8dc4b6Seschrock ASSERT(refcnt == 0 || ab->b_datacnt > 0); 1177ea8dc4b6Seschrock ASSERT(ab->b_datacnt == 0 || !GHOST_STATE(new_state)); 1178b24ab676SJeff Bonwick ASSERT(ab->b_datacnt <= 1 || old_state != arc_anon); 1179ea8dc4b6Seschrock 1180ea8dc4b6Seschrock from_delta = to_delta = ab->b_datacnt * ab->b_size; 1181fa9e4066Sahrens 1182fa9e4066Sahrens /* 1183fa9e4066Sahrens * If this buffer is evictable, transfer it from the 1184fa9e4066Sahrens * old state list to the new state list. 
1185fa9e4066Sahrens */ 1186ea8dc4b6Seschrock if (refcnt == 0) { 118744cb6abcSbmc if (old_state != arc_anon) { 118844cb6abcSbmc int use_mutex = !MUTEX_HELD(&old_state->arcs_mtx); 11890e8c6158Smaybee uint64_t *size = &old_state->arcs_lsize[ab->b_type]; 1190ea8dc4b6Seschrock 1191ea8dc4b6Seschrock if (use_mutex) 119244cb6abcSbmc mutex_enter(&old_state->arcs_mtx); 1193fa9e4066Sahrens 1194fa9e4066Sahrens ASSERT(list_link_active(&ab->b_arc_node)); 11950e8c6158Smaybee list_remove(&old_state->arcs_list[ab->b_type], ab); 1196ea8dc4b6Seschrock 119713506d1eSmaybee /* 119813506d1eSmaybee * If prefetching out of the ghost cache, 11993f9d6ad7SLin Ling * we will have a non-zero datacnt. 120013506d1eSmaybee */ 120113506d1eSmaybee if (GHOST_STATE(old_state) && ab->b_datacnt == 0) { 120213506d1eSmaybee /* ghost elements have a ghost size */ 1203ea8dc4b6Seschrock ASSERT(ab->b_buf == NULL); 1204ea8dc4b6Seschrock from_delta = ab->b_size; 1205ea8dc4b6Seschrock } 12060e8c6158Smaybee ASSERT3U(*size, >=, from_delta); 12070e8c6158Smaybee atomic_add_64(size, -from_delta); 1208ea8dc4b6Seschrock 1209ea8dc4b6Seschrock if (use_mutex) 121044cb6abcSbmc mutex_exit(&old_state->arcs_mtx); 1211fa9e4066Sahrens } 121244cb6abcSbmc if (new_state != arc_anon) { 121344cb6abcSbmc int use_mutex = !MUTEX_HELD(&new_state->arcs_mtx); 12140e8c6158Smaybee uint64_t *size = &new_state->arcs_lsize[ab->b_type]; 1215fa9e4066Sahrens 1216ea8dc4b6Seschrock if (use_mutex) 121744cb6abcSbmc mutex_enter(&new_state->arcs_mtx); 1218ea8dc4b6Seschrock 12190e8c6158Smaybee list_insert_head(&new_state->arcs_list[ab->b_type], ab); 1220ea8dc4b6Seschrock 1221ea8dc4b6Seschrock /* ghost elements have a ghost size */ 1222ea8dc4b6Seschrock if (GHOST_STATE(new_state)) { 1223ea8dc4b6Seschrock ASSERT(ab->b_datacnt == 0); 1224ea8dc4b6Seschrock ASSERT(ab->b_buf == NULL); 1225ea8dc4b6Seschrock to_delta = ab->b_size; 1226ea8dc4b6Seschrock } 12270e8c6158Smaybee atomic_add_64(size, to_delta); 1228ea8dc4b6Seschrock 1229ea8dc4b6Seschrock if 
/*
 * NOTE(review): this chunk is a revision-annotated (blame-style) listing of
 * the ZFS ARC; each statement is prefixed with its changeset id and author.
 * The code is reproduced unchanged below; only review comments were added.
 *
 * This span contains: the tail of arc_change_state() (its head is above this
 * chunk), which transfers the header's byte counts from old_state to
 * new_state and keeps the l2arc-only header stats in sync; then
 * arc_space_consume()/arc_space_return(), which charge or uncharge metadata
 * space of the given arc_space_type_t against the matching per-type arcstat
 * (data/other/hdr/l2_hdr), the arcstat_meta_used aggregate, and the global
 * arc_size; then arc_data_buf_alloc()/arc_data_buf_free(), thin wrappers
 * around the zio data-buffer allocator that also maintain arc_size (the
 * alloc path signals the reclaim thread when arc_evict_needed() says the
 * DATA class is under pressure); and the head of arc_buf_alloc(), which
 * builds a fresh anonymous (arc_anon) header + buf pair, attaches backing
 * storage via arc_get_data_buf(), and hands the caller a tagged reference.
 */
(use_mutex) 123044cb6abcSbmc mutex_exit(&new_state->arcs_mtx); 1231fa9e4066Sahrens } 1232fa9e4066Sahrens } 1233fa9e4066Sahrens 1234fa9e4066Sahrens ASSERT(!BUF_EMPTY(ab)); 12353f9d6ad7SLin Ling if (new_state == arc_anon && HDR_IN_HASH_TABLE(ab)) 1236fa9e4066Sahrens buf_hash_remove(ab); 1237fa9e4066Sahrens 1238ea8dc4b6Seschrock /* adjust state sizes */ 1239ea8dc4b6Seschrock if (to_delta) 124044cb6abcSbmc atomic_add_64(&new_state->arcs_size, to_delta); 1241ea8dc4b6Seschrock if (from_delta) { 124244cb6abcSbmc ASSERT3U(old_state->arcs_size, >=, from_delta); 124344cb6abcSbmc atomic_add_64(&old_state->arcs_size, -from_delta); 1244fa9e4066Sahrens } 1245fa9e4066Sahrens ab->b_state = new_state; 1246fa94a07fSbrendan 1247fa94a07fSbrendan /* adjust l2arc hdr stats */ 1248fa94a07fSbrendan if (new_state == arc_l2c_only) 1249fa94a07fSbrendan l2arc_hdr_stat_add(); 1250fa94a07fSbrendan else if (old_state == arc_l2c_only) 1251fa94a07fSbrendan l2arc_hdr_stat_remove(); 1252fa9e4066Sahrens } 1253fa9e4066Sahrens 12540e8c6158Smaybee void 12555a98e54bSBrendan Gregg - Sun Microsystems arc_space_consume(uint64_t space, arc_space_type_t type) 12560e8c6158Smaybee { 12575a98e54bSBrendan Gregg - Sun Microsystems ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES); 12585a98e54bSBrendan Gregg - Sun Microsystems 12595a98e54bSBrendan Gregg - Sun Microsystems switch (type) { 12605a98e54bSBrendan Gregg - Sun Microsystems case ARC_SPACE_DATA: 12615a98e54bSBrendan Gregg - Sun Microsystems ARCSTAT_INCR(arcstat_data_size, space); 12625a98e54bSBrendan Gregg - Sun Microsystems break; 12635a98e54bSBrendan Gregg - Sun Microsystems case ARC_SPACE_OTHER: 12645a98e54bSBrendan Gregg - Sun Microsystems ARCSTAT_INCR(arcstat_other_size, space); 12655a98e54bSBrendan Gregg - Sun Microsystems break; 12665a98e54bSBrendan Gregg - Sun Microsystems case ARC_SPACE_HDRS: 12675a98e54bSBrendan Gregg - Sun Microsystems ARCSTAT_INCR(arcstat_hdr_size, space); 12685a98e54bSBrendan Gregg - Sun Microsystems break; 12695a98e54bSBrendan
Gregg - Sun Microsystems case ARC_SPACE_L2HDRS: 12705a98e54bSBrendan Gregg - Sun Microsystems ARCSTAT_INCR(arcstat_l2_hdr_size, space); 12715a98e54bSBrendan Gregg - Sun Microsystems break; 12725a98e54bSBrendan Gregg - Sun Microsystems } 12735a98e54bSBrendan Gregg - Sun Microsystems 127420128a08SGeorge Wilson ARCSTAT_INCR(arcstat_meta_used, space); 12750e8c6158Smaybee atomic_add_64(&arc_size, space); 12760e8c6158Smaybee } 12770e8c6158Smaybee 12780e8c6158Smaybee void 12795a98e54bSBrendan Gregg - Sun Microsystems arc_space_return(uint64_t space, arc_space_type_t type) 12800e8c6158Smaybee { 12815a98e54bSBrendan Gregg - Sun Microsystems ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES); 12825a98e54bSBrendan Gregg - Sun Microsystems 12835a98e54bSBrendan Gregg - Sun Microsystems switch (type) { 12845a98e54bSBrendan Gregg - Sun Microsystems case ARC_SPACE_DATA: 12855a98e54bSBrendan Gregg - Sun Microsystems ARCSTAT_INCR(arcstat_data_size, -space); 12865a98e54bSBrendan Gregg - Sun Microsystems break; 12875a98e54bSBrendan Gregg - Sun Microsystems case ARC_SPACE_OTHER: 12885a98e54bSBrendan Gregg - Sun Microsystems ARCSTAT_INCR(arcstat_other_size, -space); 12895a98e54bSBrendan Gregg - Sun Microsystems break; 12905a98e54bSBrendan Gregg - Sun Microsystems case ARC_SPACE_HDRS: 12915a98e54bSBrendan Gregg - Sun Microsystems ARCSTAT_INCR(arcstat_hdr_size, -space); 12925a98e54bSBrendan Gregg - Sun Microsystems break; 12935a98e54bSBrendan Gregg - Sun Microsystems case ARC_SPACE_L2HDRS: 12945a98e54bSBrendan Gregg - Sun Microsystems ARCSTAT_INCR(arcstat_l2_hdr_size, -space); 12955a98e54bSBrendan Gregg - Sun Microsystems break; 12965a98e54bSBrendan Gregg - Sun Microsystems } 12975a98e54bSBrendan Gregg - Sun Microsystems 12980e8c6158Smaybee ASSERT(arc_meta_used >= space); 12990e8c6158Smaybee if (arc_meta_max < arc_meta_used) 13000e8c6158Smaybee arc_meta_max = arc_meta_used; 130120128a08SGeorge Wilson ARCSTAT_INCR(arcstat_meta_used, -space); 13020e8c6158Smaybee ASSERT(arc_size >= space);
/*
 * arc_space_return() above records the metadata high-water mark
 * (arc_meta_max) before decrementing; arc_data_buf_alloc()/free() and
 * arc_buf_alloc() follow.
 */
13030e8c6158Smaybee atomic_add_64(&arc_size, -space); 13040e8c6158Smaybee } 13050e8c6158Smaybee 13060e8c6158Smaybee void * 13070e8c6158Smaybee arc_data_buf_alloc(uint64_t size) 13080e8c6158Smaybee { 13090e8c6158Smaybee if (arc_evict_needed(ARC_BUFC_DATA)) 13100e8c6158Smaybee cv_signal(&arc_reclaim_thr_cv); 13110e8c6158Smaybee atomic_add_64(&arc_size, size); 13120e8c6158Smaybee return (zio_data_buf_alloc(size)); 13130e8c6158Smaybee } 13140e8c6158Smaybee 13150e8c6158Smaybee void 13160e8c6158Smaybee arc_data_buf_free(void *buf, uint64_t size) 13170e8c6158Smaybee { 13180e8c6158Smaybee zio_data_buf_free(buf, size); 13190e8c6158Smaybee ASSERT(arc_size >= size); 13200e8c6158Smaybee atomic_add_64(&arc_size, -size); 13210e8c6158Smaybee } 13220e8c6158Smaybee 1323fa9e4066Sahrens arc_buf_t * 1324ad23a2dbSjohansen arc_buf_alloc(spa_t *spa, int size, void *tag, arc_buf_contents_t type) 1325fa9e4066Sahrens { 1326fa9e4066Sahrens arc_buf_hdr_t *hdr; 1327fa9e4066Sahrens arc_buf_t *buf; 1328fa9e4066Sahrens 1329fa9e4066Sahrens ASSERT3U(size, >, 0); 13301ab7f2deSmaybee hdr = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE); 1331fa9e4066Sahrens ASSERT(BUF_EMPTY(hdr)); 1332fa9e4066Sahrens hdr->b_size = size; 1333ad23a2dbSjohansen hdr->b_type = type; 1334e9103aaeSGarrett D'Amore hdr->b_spa = spa_load_guid(spa); 133544cb6abcSbmc hdr->b_state = arc_anon; 1336fa9e4066Sahrens hdr->b_arc_access = 0; 13371ab7f2deSmaybee buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE); 1338fa9e4066Sahrens buf->b_hdr = hdr; 133944eda4d7Smaybee buf->b_data = NULL; 1340ea8dc4b6Seschrock buf->b_efunc = NULL; 1341ea8dc4b6Seschrock buf->b_private = NULL; 1342fa9e4066Sahrens buf->b_next = NULL; 1343fa9e4066Sahrens hdr->b_buf = buf; 134444eda4d7Smaybee arc_get_data_buf(buf); 1345ea8dc4b6Seschrock hdr->b_datacnt = 1; 1346fa9e4066Sahrens hdr->b_flags = 0; 1347fa9e4066Sahrens ASSERT(refcount_is_zero(&hdr->b_refcnt)); 1348fa9e4066Sahrens (void) refcount_add(&hdr->b_refcnt, tag); 1349fa9e4066Sahrens 1350fa9e4066Sahrens return (buf);
/*
 * This span covers the buffer-loan API and per-buf reference management:
 * arc_loan_buf() allocates an anonymous buffer owned by the internal
 * arc_onloan_tag and tracks it in arc_loaned_bytes; arc_return_buf()
 * transfers ownership back by swapping the caller's tag for arc_onloan_tag
 * and decrementing arc_loaned_bytes; arc_loan_inuse_buf() does the reverse
 * swap (detaching the buf from its dbuf tag and clearing its eviction
 * callback state).  arc_buf_clone() creates a second arc_buf_t sharing the
 * same header with its own copy of the data, bumping the duplicate-buffer
 * stats for ARC_BUFC_DATA bufs.  arc_buf_add_ref() takes a new reference on
 * a visible (mru/mfu) buffer under b_evict_lock + the hash lock, treats the
 * access as a cache hit, and updates the demand/prefetch hit stats.
 */
1351fa9e4066Sahrens } 1352fa9e4066Sahrens 13532fdbea25SAleksandr Guzovskiy static char *arc_onloan_tag = "onloan"; 13542fdbea25SAleksandr Guzovskiy 13552fdbea25SAleksandr Guzovskiy /* 13562fdbea25SAleksandr Guzovskiy * Loan out an anonymous arc buffer. Loaned buffers are not counted as in 13572fdbea25SAleksandr Guzovskiy * flight data by arc_tempreserve_space() until they are "returned". Loaned 13582fdbea25SAleksandr Guzovskiy * buffers must be returned to the arc before they can be used by the DMU or 13592fdbea25SAleksandr Guzovskiy * freed. 13602fdbea25SAleksandr Guzovskiy */ 13612fdbea25SAleksandr Guzovskiy arc_buf_t * 13622fdbea25SAleksandr Guzovskiy arc_loan_buf(spa_t *spa, int size) 13632fdbea25SAleksandr Guzovskiy { 13642fdbea25SAleksandr Guzovskiy arc_buf_t *buf; 13652fdbea25SAleksandr Guzovskiy 13662fdbea25SAleksandr Guzovskiy buf = arc_buf_alloc(spa, size, arc_onloan_tag, ARC_BUFC_DATA); 13672fdbea25SAleksandr Guzovskiy 13682fdbea25SAleksandr Guzovskiy atomic_add_64(&arc_loaned_bytes, size); 13692fdbea25SAleksandr Guzovskiy return (buf); 13702fdbea25SAleksandr Guzovskiy } 13712fdbea25SAleksandr Guzovskiy 13722fdbea25SAleksandr Guzovskiy /* 13732fdbea25SAleksandr Guzovskiy * Return a loaned arc buffer to the arc.
13742fdbea25SAleksandr Guzovskiy */ 13752fdbea25SAleksandr Guzovskiy void 13762fdbea25SAleksandr Guzovskiy arc_return_buf(arc_buf_t *buf, void *tag) 13772fdbea25SAleksandr Guzovskiy { 13782fdbea25SAleksandr Guzovskiy arc_buf_hdr_t *hdr = buf->b_hdr; 13792fdbea25SAleksandr Guzovskiy 13802fdbea25SAleksandr Guzovskiy ASSERT(buf->b_data != NULL); 1381c242f9a0Schunli zhang - Sun Microsystems - Irvine United States (void) refcount_add(&hdr->b_refcnt, tag); 1382c242f9a0Schunli zhang - Sun Microsystems - Irvine United States (void) refcount_remove(&hdr->b_refcnt, arc_onloan_tag); 13832fdbea25SAleksandr Guzovskiy 13842fdbea25SAleksandr Guzovskiy atomic_add_64(&arc_loaned_bytes, -hdr->b_size); 13852fdbea25SAleksandr Guzovskiy } 13862fdbea25SAleksandr Guzovskiy 1387c242f9a0Schunli zhang - Sun Microsystems - Irvine United States /* Detach an arc_buf from a dbuf (tag) */ 1388c242f9a0Schunli zhang - Sun Microsystems - Irvine United States void 1389c242f9a0Schunli zhang - Sun Microsystems - Irvine United States arc_loan_inuse_buf(arc_buf_t *buf, void *tag) 1390c242f9a0Schunli zhang - Sun Microsystems - Irvine United States { 1391c242f9a0Schunli zhang - Sun Microsystems - Irvine United States arc_buf_hdr_t *hdr; 1392c242f9a0Schunli zhang - Sun Microsystems - Irvine United States 1393c242f9a0Schunli zhang - Sun Microsystems - Irvine United States ASSERT(buf->b_data != NULL); 1394c242f9a0Schunli zhang - Sun Microsystems - Irvine United States hdr = buf->b_hdr; 1395c242f9a0Schunli zhang - Sun Microsystems - Irvine United States (void) refcount_add(&hdr->b_refcnt, arc_onloan_tag); 1396c242f9a0Schunli zhang - Sun Microsystems - Irvine United States (void) refcount_remove(&hdr->b_refcnt, tag); 1397c242f9a0Schunli zhang - Sun Microsystems - Irvine United States buf->b_efunc = NULL; 1398c242f9a0Schunli zhang - Sun Microsystems - Irvine United States buf->b_private = NULL; 1399c242f9a0Schunli zhang - Sun Microsystems - Irvine United States 1400c242f9a0Schunli zhang - Sun Microsystems -
Irvine United States atomic_add_64(&arc_loaned_bytes, hdr->b_size); 1401c242f9a0Schunli zhang - Sun Microsystems - Irvine United States } 1402c242f9a0Schunli zhang - Sun Microsystems - Irvine United States 140344eda4d7Smaybee static arc_buf_t * 140444eda4d7Smaybee arc_buf_clone(arc_buf_t *from) 1405ea8dc4b6Seschrock { 140644eda4d7Smaybee arc_buf_t *buf; 140744eda4d7Smaybee arc_buf_hdr_t *hdr = from->b_hdr; 140844eda4d7Smaybee uint64_t size = hdr->b_size; 1409ea8dc4b6Seschrock 1410b24ab676SJeff Bonwick ASSERT(hdr->b_state != arc_anon); 1411b24ab676SJeff Bonwick 14121ab7f2deSmaybee buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE); 141344eda4d7Smaybee buf->b_hdr = hdr; 141444eda4d7Smaybee buf->b_data = NULL; 141544eda4d7Smaybee buf->b_efunc = NULL; 141644eda4d7Smaybee buf->b_private = NULL; 141744eda4d7Smaybee buf->b_next = hdr->b_buf; 141844eda4d7Smaybee hdr->b_buf = buf; 141944eda4d7Smaybee arc_get_data_buf(buf); 142044eda4d7Smaybee bcopy(from->b_data, buf->b_data, size); 14219253d63dSGeorge Wilson 14229253d63dSGeorge Wilson /* 14239253d63dSGeorge Wilson * This buffer already exists in the arc so create a duplicate 14249253d63dSGeorge Wilson * copy for the caller. If the buffer is associated with user data 14259253d63dSGeorge Wilson * then track the size and number of duplicates. These stats will be 14269253d63dSGeorge Wilson * updated as duplicate buffers are created and destroyed.
14279253d63dSGeorge Wilson */ 14289253d63dSGeorge Wilson if (hdr->b_type == ARC_BUFC_DATA) { 14299253d63dSGeorge Wilson ARCSTAT_BUMP(arcstat_duplicate_buffers); 14309253d63dSGeorge Wilson ARCSTAT_INCR(arcstat_duplicate_buffers_size, size); 14319253d63dSGeorge Wilson } 143244eda4d7Smaybee hdr->b_datacnt += 1; 143344eda4d7Smaybee return (buf); 1434ea8dc4b6Seschrock } 1435ea8dc4b6Seschrock 1436ea8dc4b6Seschrock void 1437ea8dc4b6Seschrock arc_buf_add_ref(arc_buf_t *buf, void* tag) 1438ea8dc4b6Seschrock { 143940d7d650Smaybee arc_buf_hdr_t *hdr; 1440ea8dc4b6Seschrock kmutex_t *hash_lock; 1441ea8dc4b6Seschrock 14429b23f181Smaybee /* 14436f83844dSMark Maybee * Check to see if this buffer is evicted. Callers 14446f83844dSMark Maybee * must verify b_data != NULL to know if the add_ref 14456f83844dSMark Maybee * was successful. 14469b23f181Smaybee */ 14473f9d6ad7SLin Ling mutex_enter(&buf->b_evict_lock); 14486f83844dSMark Maybee if (buf->b_data == NULL) { 14493f9d6ad7SLin Ling mutex_exit(&buf->b_evict_lock); 14509b23f181Smaybee return; 145140d7d650Smaybee } 14523f9d6ad7SLin Ling hash_lock = HDR_LOCK(buf->b_hdr); 14539b23f181Smaybee mutex_enter(hash_lock); 14543f9d6ad7SLin Ling hdr = buf->b_hdr; 14553f9d6ad7SLin Ling ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); 14563f9d6ad7SLin Ling mutex_exit(&buf->b_evict_lock); 1457ea8dc4b6Seschrock 145844cb6abcSbmc ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu); 1459ea8dc4b6Seschrock add_reference(hdr, hash_lock, tag); 14605a98e54bSBrendan Gregg - Sun Microsystems DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr); 146144eda4d7Smaybee arc_access(hdr, hash_lock); 146244eda4d7Smaybee mutex_exit(hash_lock); 146344cb6abcSbmc ARCSTAT_BUMP(arcstat_hits); 146444cb6abcSbmc ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH), 146544cb6abcSbmc demand, prefetch, hdr->b_type != ARC_BUFC_METADATA, 146644cb6abcSbmc data, metadata, hits); 1467ea8dc4b6Seschrock } 1468ea8dc4b6Seschrock 1469fa94a07fSbrendan /* 1470fa94a07fSbrendan * Free the arc data buffer.
/*
 * This span covers data-buffer teardown: arc_buf_data_free() frees b_data
 * via the supplied free_func, but if an L2ARC write of this header is in
 * flight (HDR_L2_WRITING) it instead queues an l2arc_data_free_t on
 * l2arc_free_on_write so the free happens after the write completes.
 * arc_buf_destroy() releases a buf's data (skipping the free when the
 * caller is recycling the block), fixes the per-type/per-state byte
 * accounting (arcs_lsize, arcs_size, arc_size) and duplicate-buffer stats,
 * and when `all' is set also unlinks the arc_buf_t from its header and
 * returns it to buf_cache.  The head of arc_hdr_destroy() follows.
 */
If it is an l2arc write in progress, 1471fa94a07fSbrendan * the buffer is placed on l2arc_free_on_write to be freed later. 1472fa94a07fSbrendan */ 1473fa94a07fSbrendan static void 1474cd1c8b85SMatthew Ahrens arc_buf_data_free(arc_buf_t *buf, void (*free_func)(void *, size_t)) 1475fa94a07fSbrendan { 1476cd1c8b85SMatthew Ahrens arc_buf_hdr_t *hdr = buf->b_hdr; 1477cd1c8b85SMatthew Ahrens 1478fa94a07fSbrendan if (HDR_L2_WRITING(hdr)) { 1479fa94a07fSbrendan l2arc_data_free_t *df; 1480fa94a07fSbrendan df = kmem_alloc(sizeof (l2arc_data_free_t), KM_SLEEP); 1481cd1c8b85SMatthew Ahrens df->l2df_data = buf->b_data; 1482cd1c8b85SMatthew Ahrens df->l2df_size = hdr->b_size; 1483fa94a07fSbrendan df->l2df_func = free_func; 1484fa94a07fSbrendan mutex_enter(&l2arc_free_on_write_mtx); 1485fa94a07fSbrendan list_insert_head(l2arc_free_on_write, df); 1486fa94a07fSbrendan mutex_exit(&l2arc_free_on_write_mtx); 1487fa94a07fSbrendan ARCSTAT_BUMP(arcstat_l2_free_on_write); 1488fa94a07fSbrendan } else { 1489cd1c8b85SMatthew Ahrens free_func(buf->b_data, hdr->b_size); 1490fa94a07fSbrendan } 1491fa94a07fSbrendan } 1492fa94a07fSbrendan 1493ea8dc4b6Seschrock static void 149444eda4d7Smaybee arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t all) 1495ea8dc4b6Seschrock { 1496ea8dc4b6Seschrock arc_buf_t **bufp; 1497ea8dc4b6Seschrock 1498ea8dc4b6Seschrock /* free up data associated with the buf */ 1499ea8dc4b6Seschrock if (buf->b_data) { 1500ea8dc4b6Seschrock arc_state_t *state = buf->b_hdr->b_state; 1501ea8dc4b6Seschrock uint64_t size = buf->b_hdr->b_size; 1502ad23a2dbSjohansen arc_buf_contents_t type = buf->b_hdr->b_type; 1503ea8dc4b6Seschrock 15046b4acc8bSahrens arc_cksum_verify(buf); 1505cd1c8b85SMatthew Ahrens arc_buf_unwatch(buf); 1506b24ab676SJeff Bonwick 150744eda4d7Smaybee if (!recycle) { 1508ad23a2dbSjohansen if (type == ARC_BUFC_METADATA) { 1509cd1c8b85SMatthew Ahrens arc_buf_data_free(buf, zio_buf_free); 15105a98e54bSBrendan Gregg - Sun Microsystems arc_space_return(size,
ARC_SPACE_DATA); 1511ad23a2dbSjohansen } else { 1512ad23a2dbSjohansen ASSERT(type == ARC_BUFC_DATA); 1513cd1c8b85SMatthew Ahrens arc_buf_data_free(buf, zio_data_buf_free); 15145a98e54bSBrendan Gregg - Sun Microsystems ARCSTAT_INCR(arcstat_data_size, -size); 15150e8c6158Smaybee atomic_add_64(&arc_size, -size); 1516ad23a2dbSjohansen } 151744eda4d7Smaybee } 1518ea8dc4b6Seschrock if (list_link_active(&buf->b_hdr->b_arc_node)) { 15190e8c6158Smaybee uint64_t *cnt = &state->arcs_lsize[type]; 15200e8c6158Smaybee 1521ea8dc4b6Seschrock ASSERT(refcount_is_zero(&buf->b_hdr->b_refcnt)); 152244cb6abcSbmc ASSERT(state != arc_anon); 15230e8c6158Smaybee 15240e8c6158Smaybee ASSERT3U(*cnt, >=, size); 15250e8c6158Smaybee atomic_add_64(cnt, -size); 1526ea8dc4b6Seschrock } 152744cb6abcSbmc ASSERT3U(state->arcs_size, >=, size); 152844cb6abcSbmc atomic_add_64(&state->arcs_size, -size); 1529ea8dc4b6Seschrock buf->b_data = NULL; 15309253d63dSGeorge Wilson 15319253d63dSGeorge Wilson /* 15329253d63dSGeorge Wilson * If we're destroying a duplicate buffer make sure 15339253d63dSGeorge Wilson * that the appropriate statistics are updated.
15349253d63dSGeorge Wilson */ 15359253d63dSGeorge Wilson if (buf->b_hdr->b_datacnt > 1 && 15369253d63dSGeorge Wilson buf->b_hdr->b_type == ARC_BUFC_DATA) { 15379253d63dSGeorge Wilson ARCSTAT_BUMPDOWN(arcstat_duplicate_buffers); 15389253d63dSGeorge Wilson ARCSTAT_INCR(arcstat_duplicate_buffers_size, -size); 15399253d63dSGeorge Wilson } 1540ea8dc4b6Seschrock ASSERT(buf->b_hdr->b_datacnt > 0); 1541ea8dc4b6Seschrock buf->b_hdr->b_datacnt -= 1; 1542ea8dc4b6Seschrock } 1543ea8dc4b6Seschrock 1544ea8dc4b6Seschrock /* only remove the buf if requested */ 1545ea8dc4b6Seschrock if (!all) 1546ea8dc4b6Seschrock return; 1547ea8dc4b6Seschrock 1548ea8dc4b6Seschrock /* remove the buf from the hdr list */ 1549ea8dc4b6Seschrock for (bufp = &buf->b_hdr->b_buf; *bufp != buf; bufp = &(*bufp)->b_next) 1550ea8dc4b6Seschrock continue; 1551ea8dc4b6Seschrock *bufp = buf->b_next; 15523f9d6ad7SLin Ling buf->b_next = NULL; 1553ea8dc4b6Seschrock 1554ea8dc4b6Seschrock ASSERT(buf->b_efunc == NULL); 1555ea8dc4b6Seschrock 1556ea8dc4b6Seschrock /* clean up the buf */ 1557ea8dc4b6Seschrock buf->b_hdr = NULL; 1558ea8dc4b6Seschrock kmem_cache_free(buf_cache, buf); 1559ea8dc4b6Seschrock } 1560ea8dc4b6Seschrock 1561fa9e4066Sahrens static void 1562ea8dc4b6Seschrock arc_hdr_destroy(arc_buf_hdr_t *hdr) 1563fa9e4066Sahrens { 1564fa9e4066Sahrens ASSERT(refcount_is_zero(&hdr->b_refcnt)); 156544cb6abcSbmc ASSERT3P(hdr->b_state, ==, arc_anon); 1566ea8dc4b6Seschrock ASSERT(!HDR_IO_IN_PROGRESS(hdr)); 1567b24ab676SJeff Bonwick l2arc_buf_hdr_t *l2hdr = hdr->b_l2hdr; 1568fa9e4066Sahrens 1569b24ab676SJeff Bonwick if (l2hdr != NULL) { 1570b24ab676SJeff Bonwick boolean_t buflist_held = MUTEX_HELD(&l2arc_buflist_mtx); 1571b24ab676SJeff Bonwick /* 1572b24ab676SJeff Bonwick * To prevent arc_free() and l2arc_evict() from 1573b24ab676SJeff Bonwick * attempting to free the same buffer at the same time, 1574b24ab676SJeff Bonwick * a FREE_IN_PROGRESS flag is given to arc_free() to 1575b24ab676SJeff Bonwick * give it priority.
/*
 * This span completes arc_hdr_destroy(): the header is unlinked from its
 * L2ARC device buflist (rechecking b_l2hdr after taking l2arc_buflist_mtx,
 * since l2arc_evict() may have removed it first), its identity is discarded
 * and every remaining buf is torn down — bufs with an eviction callback
 * (b_efunc) are parked on arc_eviction_list for later callback delivery —
 * then the freeze checksum/thaw bookkeeping is freed and the header is
 * returned to hdr_cache.  Next, arc_buf_free() drops a caller's reference:
 * for hashed (non-anonymous) headers it destroys the buf or marks the
 * header ARC_BUF_AVAILABLE under the hash lock; while an async write is in
 * progress it defers header destruction to the write completion; otherwise
 * it destroys buf and/or header outright.  arc_buf_remove_ref() and
 * arc_buf_size() follow, then the head of arc_buf_eviction_needed().
 */
l2arc_evict() can't destroy this 1576b24ab676SJeff Bonwick * header while we are waiting on l2arc_buflist_mtx. 1577b24ab676SJeff Bonwick * 1578b24ab676SJeff Bonwick * The hdr may be removed from l2ad_buflist before we 1579b24ab676SJeff Bonwick * grab l2arc_buflist_mtx, so b_l2hdr is rechecked. 1580b24ab676SJeff Bonwick */ 1581b24ab676SJeff Bonwick if (!buflist_held) { 1582fa94a07fSbrendan mutex_enter(&l2arc_buflist_mtx); 1583b24ab676SJeff Bonwick l2hdr = hdr->b_l2hdr; 1584fa94a07fSbrendan } 1585b24ab676SJeff Bonwick 1586b24ab676SJeff Bonwick if (l2hdr != NULL) { 1587b24ab676SJeff Bonwick list_remove(l2hdr->b_dev->l2ad_buflist, hdr); 1588b24ab676SJeff Bonwick ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size); 1589aad02571SSaso Kiselkov ARCSTAT_INCR(arcstat_l2_asize, -l2hdr->b_asize); 1590b24ab676SJeff Bonwick kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t)); 1591b24ab676SJeff Bonwick if (hdr->b_state == arc_l2c_only) 1592b24ab676SJeff Bonwick l2arc_hdr_stat_remove(); 1593b24ab676SJeff Bonwick hdr->b_l2hdr = NULL; 1594b24ab676SJeff Bonwick } 1595b24ab676SJeff Bonwick 1596b24ab676SJeff Bonwick if (!buflist_held) 1597b24ab676SJeff Bonwick mutex_exit(&l2arc_buflist_mtx); 1598fa94a07fSbrendan } 1599fa94a07fSbrendan 1600fa9e4066Sahrens if (!BUF_EMPTY(hdr)) { 1601ea8dc4b6Seschrock ASSERT(!HDR_IN_HASH_TABLE(hdr)); 16023f9d6ad7SLin Ling buf_discard_identity(hdr); 1603fa9e4066Sahrens } 1604ea8dc4b6Seschrock while (hdr->b_buf) { 1605fa9e4066Sahrens arc_buf_t *buf = hdr->b_buf; 1606fa9e4066Sahrens 1607ea8dc4b6Seschrock if (buf->b_efunc) { 1608ea8dc4b6Seschrock mutex_enter(&arc_eviction_mtx); 16093f9d6ad7SLin Ling mutex_enter(&buf->b_evict_lock); 1610ea8dc4b6Seschrock ASSERT(buf->b_hdr != NULL); 161144eda4d7Smaybee arc_buf_destroy(hdr->b_buf, FALSE, FALSE); 1612ea8dc4b6Seschrock hdr->b_buf = buf->b_next; 161340d7d650Smaybee buf->b_hdr = &arc_eviction_hdr; 1614ea8dc4b6Seschrock buf->b_next = arc_eviction_list; 1615ea8dc4b6Seschrock arc_eviction_list = buf; 16163f9d6ad7SLin Ling
mutex_exit(&buf->b_evict_lock); 1617ea8dc4b6Seschrock mutex_exit(&arc_eviction_mtx); 1618ea8dc4b6Seschrock } else { 161944eda4d7Smaybee arc_buf_destroy(hdr->b_buf, FALSE, TRUE); 1620ea8dc4b6Seschrock } 1621fa9e4066Sahrens } 16226b4acc8bSahrens if (hdr->b_freeze_cksum != NULL) { 16236b4acc8bSahrens kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t)); 16246b4acc8bSahrens hdr->b_freeze_cksum = NULL; 16256b4acc8bSahrens } 16263f9d6ad7SLin Ling if (hdr->b_thawed) { 16273f9d6ad7SLin Ling kmem_free(hdr->b_thawed, 1); 16283f9d6ad7SLin Ling hdr->b_thawed = NULL; 16293f9d6ad7SLin Ling } 1630ea8dc4b6Seschrock 1631fa9e4066Sahrens ASSERT(!list_link_active(&hdr->b_arc_node)); 1632fa9e4066Sahrens ASSERT3P(hdr->b_hash_next, ==, NULL); 1633fa9e4066Sahrens ASSERT3P(hdr->b_acb, ==, NULL); 1634fa9e4066Sahrens kmem_cache_free(hdr_cache, hdr); 1635fa9e4066Sahrens } 1636fa9e4066Sahrens 1637fa9e4066Sahrens void 1638fa9e4066Sahrens arc_buf_free(arc_buf_t *buf, void *tag) 1639fa9e4066Sahrens { 1640fa9e4066Sahrens arc_buf_hdr_t *hdr = buf->b_hdr; 164144cb6abcSbmc int hashed = hdr->b_state != arc_anon; 1642fa9e4066Sahrens 1643ea8dc4b6Seschrock ASSERT(buf->b_efunc == NULL); 1644ea8dc4b6Seschrock ASSERT(buf->b_data != NULL); 1645ea8dc4b6Seschrock 1646ea8dc4b6Seschrock if (hashed) { 1647ea8dc4b6Seschrock kmutex_t *hash_lock = HDR_LOCK(hdr); 1648ea8dc4b6Seschrock 1649ea8dc4b6Seschrock mutex_enter(hash_lock); 16503f9d6ad7SLin Ling hdr = buf->b_hdr; 16513f9d6ad7SLin Ling ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); 16523f9d6ad7SLin Ling 1653ea8dc4b6Seschrock (void) remove_reference(hdr, hash_lock, tag); 1654b24ab676SJeff Bonwick if (hdr->b_datacnt > 1) { 165544eda4d7Smaybee arc_buf_destroy(buf, FALSE, TRUE); 1656b24ab676SJeff Bonwick } else { 1657b24ab676SJeff Bonwick ASSERT(buf == hdr->b_buf); 1658b24ab676SJeff Bonwick ASSERT(buf->b_efunc == NULL); 1659ea8dc4b6Seschrock hdr->b_flags |= ARC_BUF_AVAILABLE; 1660b24ab676SJeff Bonwick } 1661fa9e4066Sahrens mutex_exit(hash_lock); 1662ea8dc4b6Seschrock }
else if (HDR_IO_IN_PROGRESS(hdr)) { 1663ea8dc4b6Seschrock int destroy_hdr; 1664ea8dc4b6Seschrock /* 1665ea8dc4b6Seschrock * We are in the middle of an async write. Don't destroy 1666ea8dc4b6Seschrock * this buffer unless the write completes before we finish 1667ea8dc4b6Seschrock * decrementing the reference count. 1668ea8dc4b6Seschrock */ 1669ea8dc4b6Seschrock mutex_enter(&arc_eviction_mtx); 1670ea8dc4b6Seschrock (void) remove_reference(hdr, NULL, tag); 1671ea8dc4b6Seschrock ASSERT(refcount_is_zero(&hdr->b_refcnt)); 1672ea8dc4b6Seschrock destroy_hdr = !HDR_IO_IN_PROGRESS(hdr); 1673ea8dc4b6Seschrock mutex_exit(&arc_eviction_mtx); 1674ea8dc4b6Seschrock if (destroy_hdr) 1675ea8dc4b6Seschrock arc_hdr_destroy(hdr); 1676ea8dc4b6Seschrock } else { 16773f9d6ad7SLin Ling if (remove_reference(hdr, NULL, tag) > 0) 167844eda4d7Smaybee arc_buf_destroy(buf, FALSE, TRUE); 16793f9d6ad7SLin Ling else 1680ea8dc4b6Seschrock arc_hdr_destroy(hdr); 1681fa9e4066Sahrens } 1682ea8dc4b6Seschrock } 1683fa9e4066Sahrens 16843b2aab18SMatthew Ahrens boolean_t 1685ea8dc4b6Seschrock arc_buf_remove_ref(arc_buf_t *buf, void* tag) 1686ea8dc4b6Seschrock { 1687ea8dc4b6Seschrock arc_buf_hdr_t *hdr = buf->b_hdr; 1688ea8dc4b6Seschrock kmutex_t *hash_lock = HDR_LOCK(hdr); 16893b2aab18SMatthew Ahrens boolean_t no_callback = (buf->b_efunc == NULL); 1690fa9e4066Sahrens 169144cb6abcSbmc if (hdr->b_state == arc_anon) { 1692b24ab676SJeff Bonwick ASSERT(hdr->b_datacnt == 1); 1693ea8dc4b6Seschrock arc_buf_free(buf, tag); 1694ea8dc4b6Seschrock return (no_callback); 1695ea8dc4b6Seschrock } 1696ea8dc4b6Seschrock 1697ea8dc4b6Seschrock mutex_enter(hash_lock); 16983f9d6ad7SLin Ling hdr = buf->b_hdr; 16993f9d6ad7SLin Ling ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); 170044cb6abcSbmc ASSERT(hdr->b_state != arc_anon); 1701ea8dc4b6Seschrock ASSERT(buf->b_data != NULL); 1702ea8dc4b6Seschrock 1703ea8dc4b6Seschrock (void) remove_reference(hdr, hash_lock, tag); 1704ea8dc4b6Seschrock if (hdr->b_datacnt > 1) { 1705ea8dc4b6Seschrock if
(no_callback) 170644eda4d7Smaybee arc_buf_destroy(buf, FALSE, TRUE); 1707ea8dc4b6Seschrock } else if (no_callback) { 1708ea8dc4b6Seschrock ASSERT(hdr->b_buf == buf && buf->b_next == NULL); 1709b24ab676SJeff Bonwick ASSERT(buf->b_efunc == NULL); 1710ea8dc4b6Seschrock hdr->b_flags |= ARC_BUF_AVAILABLE; 1711ea8dc4b6Seschrock } 1712ea8dc4b6Seschrock ASSERT(no_callback || hdr->b_datacnt > 1 || 1713ea8dc4b6Seschrock refcount_is_zero(&hdr->b_refcnt)); 1714ea8dc4b6Seschrock mutex_exit(hash_lock); 1715ea8dc4b6Seschrock return (no_callback); 1716fa9e4066Sahrens } 1717fa9e4066Sahrens 1718fa9e4066Sahrens int 1719fa9e4066Sahrens arc_buf_size(arc_buf_t *buf) 1720fa9e4066Sahrens { 1721fa9e4066Sahrens return (buf->b_hdr->b_size); 1722fa9e4066Sahrens } 1723fa9e4066Sahrens 17249253d63dSGeorge Wilson /* 17259253d63dSGeorge Wilson * Called from the DMU to determine if the current buffer should be 17269253d63dSGeorge Wilson * evicted. In order to ensure proper locking, the eviction must be initiated 17279253d63dSGeorge Wilson * from the DMU. Return true if the buffer is associated with user data and 17289253d63dSGeorge Wilson * duplicate buffers still exist. 17299253d63dSGeorge Wilson */ 17309253d63dSGeorge Wilson boolean_t 17319253d63dSGeorge Wilson arc_buf_eviction_needed(arc_buf_t *buf) 17329253d63dSGeorge Wilson { 17339253d63dSGeorge Wilson arc_buf_hdr_t *hdr; 17349253d63dSGeorge Wilson boolean_t evict_needed = B_FALSE; 17359253d63dSGeorge Wilson 17369253d63dSGeorge Wilson if (zfs_disable_dup_eviction) 17379253d63dSGeorge Wilson return (B_FALSE); 17389253d63dSGeorge Wilson 17399253d63dSGeorge Wilson mutex_enter(&buf->b_evict_lock); 17409253d63dSGeorge Wilson hdr = buf->b_hdr; 17419253d63dSGeorge Wilson if (hdr == NULL) { 17429253d63dSGeorge Wilson /* 17439253d63dSGeorge Wilson * We are in arc_do_user_evicts(); let that function 17449253d63dSGeorge Wilson * perform the eviction.
/*
 * This span completes arc_buf_eviction_needed() (recommend eviction for a
 * DATA buf whose header still has duplicates, under b_evict_lock), then
 * contains all of arc_evict(): walking an mru/mfu list from the tail with
 * both the state's and the ghost state's arcs_mtx held, it skips buffers
 * with I/O in progress, wrong-spa buffers, prefetch buffers younger than
 * arc_min_prefetch_lifespan, and list markers (b_spa == 0); every
 * arc_evict_iterations headers it parks a marker in the list and drops both
 * mutexes (kpreempt) so other ARC activity can proceed — except on the hot
 * recycle path.  When `recycle' is set and an exactly `bytes'-sized,
 * non-L2-writing buffer is found, its b_data is "stolen" and returned to
 * the caller instead of being freed.  Bufs with eviction callbacks go onto
 * arc_eviction_list; fully-emptied headers move to the ghost state.  Ghost
 * list overshoot is deliberately left for the arc_reclaim_thread().
 */
17459253d63dSGeorge Wilson */ 17469253d63dSGeorge Wilson ASSERT(buf->b_data == NULL); 17479253d63dSGeorge Wilson mutex_exit(&buf->b_evict_lock); 17489253d63dSGeorge Wilson return (B_FALSE); 17499253d63dSGeorge Wilson } else if (buf->b_data == NULL) { 17509253d63dSGeorge Wilson /* 17519253d63dSGeorge Wilson * We have already been added to the arc eviction list; 17529253d63dSGeorge Wilson * recommend eviction. 17539253d63dSGeorge Wilson */ 17549253d63dSGeorge Wilson ASSERT3P(hdr, ==, &arc_eviction_hdr); 17559253d63dSGeorge Wilson mutex_exit(&buf->b_evict_lock); 17569253d63dSGeorge Wilson return (B_TRUE); 17579253d63dSGeorge Wilson } 17589253d63dSGeorge Wilson 17599253d63dSGeorge Wilson if (hdr->b_datacnt > 1 && hdr->b_type == ARC_BUFC_DATA) 17609253d63dSGeorge Wilson evict_needed = B_TRUE; 17619253d63dSGeorge Wilson 17629253d63dSGeorge Wilson mutex_exit(&buf->b_evict_lock); 17639253d63dSGeorge Wilson return (evict_needed); 17649253d63dSGeorge Wilson } 17659253d63dSGeorge Wilson 1766fa9e4066Sahrens /* 1767fa9e4066Sahrens * Evict buffers from list until we've removed the specified number of 1768fa9e4066Sahrens * bytes. Move the removed buffers to the appropriate evict state. 176944eda4d7Smaybee * If the recycle flag is set, then attempt to "recycle" a buffer: 177044eda4d7Smaybee * - look for a buffer to evict that is `bytes' long. 177144eda4d7Smaybee * - return the data block from this buffer rather than freeing it. 177244eda4d7Smaybee * This flag is used by callers that are trying to make space for a 177344eda4d7Smaybee * new buffer in a full arc cache. 1774874395d5Smaybee * 1775874395d5Smaybee * This function makes a "best effort". It skips over any buffers 1776874395d5Smaybee * it can't get a hash_lock on, and so may not catch all candidates. 1777874395d5Smaybee * It may also return without evicting as much space as requested.
1778fa9e4066Sahrens */ 177944eda4d7Smaybee static void * 1780ac05c741SMark Maybee arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle, 1781ad23a2dbSjohansen arc_buf_contents_t type) 1782fa9e4066Sahrens { 1783fa9e4066Sahrens arc_state_t *evicted_state; 178444eda4d7Smaybee uint64_t bytes_evicted = 0, skipped = 0, missed = 0; 17853fa51506Smaybee arc_buf_hdr_t *ab, *ab_prev = NULL; 17860e8c6158Smaybee list_t *list = &state->arcs_list[type]; 1787fa9e4066Sahrens kmutex_t *hash_lock; 178844eda4d7Smaybee boolean_t have_lock; 17893fa51506Smaybee void *stolen = NULL; 1790*69962b56SMatthew Ahrens arc_buf_hdr_t marker = { 0 }; 1791*69962b56SMatthew Ahrens int count = 0; 1792fa9e4066Sahrens 179344cb6abcSbmc ASSERT(state == arc_mru || state == arc_mfu); 1794fa9e4066Sahrens 179544cb6abcSbmc evicted_state = (state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost; 1796fa9e4066Sahrens 179744cb6abcSbmc mutex_enter(&state->arcs_mtx); 179844cb6abcSbmc mutex_enter(&evicted_state->arcs_mtx); 1799fa9e4066Sahrens 18000e8c6158Smaybee for (ab = list_tail(list); ab; ab = ab_prev) { 18010e8c6158Smaybee ab_prev = list_prev(list, ab); 180213506d1eSmaybee /* prefetch buffers have a minimum lifespan */ 180344eda4d7Smaybee if (HDR_IO_IN_PROGRESS(ab) || 1804874395d5Smaybee (spa && ab->b_spa != spa) || 180544eda4d7Smaybee (ab->b_flags & (ARC_PREFETCH|ARC_INDIRECT) && 1806d3d50737SRafael Vanoni ddi_get_lbolt() - ab->b_arc_access < 1807d3d50737SRafael Vanoni arc_min_prefetch_lifespan)) { 180813506d1eSmaybee skipped++; 180913506d1eSmaybee continue; 181013506d1eSmaybee } 18113fa51506Smaybee /* "lookahead" for better eviction candidate */ 18123fa51506Smaybee if (recycle && ab->b_size != bytes && 18133fa51506Smaybee ab_prev && ab_prev->b_size == bytes) 181444eda4d7Smaybee continue; 1815*69962b56SMatthew Ahrens 1816*69962b56SMatthew Ahrens /* ignore markers */ 1817*69962b56SMatthew Ahrens if (ab->b_spa == 0) 1818*69962b56SMatthew Ahrens continue; 1819*69962b56SMatthew Ahrens
1820*69962b56SMatthew Ahrens /* 1821*69962b56SMatthew Ahrens * It may take a long time to evict all the bufs requested. 1822*69962b56SMatthew Ahrens * To avoid blocking all arc activity, periodically drop 1823*69962b56SMatthew Ahrens * the arcs_mtx and give other threads a chance to run 1824*69962b56SMatthew Ahrens * before reacquiring the lock. 1825*69962b56SMatthew Ahrens * 1826*69962b56SMatthew Ahrens * If we are looking for a buffer to recycle, we are in 1827*69962b56SMatthew Ahrens * the hot code path, so don't sleep. 1828*69962b56SMatthew Ahrens */ 1829*69962b56SMatthew Ahrens if (!recycle && count++ > arc_evict_iterations) { 1830*69962b56SMatthew Ahrens list_insert_after(list, ab, &marker); 1831*69962b56SMatthew Ahrens mutex_exit(&evicted_state->arcs_mtx); 1832*69962b56SMatthew Ahrens mutex_exit(&state->arcs_mtx); 1833*69962b56SMatthew Ahrens kpreempt(KPREEMPT_SYNC); 1834*69962b56SMatthew Ahrens mutex_enter(&state->arcs_mtx); 1835*69962b56SMatthew Ahrens mutex_enter(&evicted_state->arcs_mtx); 1836*69962b56SMatthew Ahrens ab_prev = list_prev(list, &marker); 1837*69962b56SMatthew Ahrens list_remove(list, &marker); 1838*69962b56SMatthew Ahrens count = 0; 1839*69962b56SMatthew Ahrens continue; 1840*69962b56SMatthew Ahrens } 1841*69962b56SMatthew Ahrens 1842fa9e4066Sahrens hash_lock = HDR_LOCK(ab); 184344eda4d7Smaybee have_lock = MUTEX_HELD(hash_lock); 184444eda4d7Smaybee if (have_lock || mutex_tryenter(hash_lock)) { 1845fb09f5aaSMadhav Suresh ASSERT0(refcount_count(&ab->b_refcnt)); 1846ea8dc4b6Seschrock ASSERT(ab->b_datacnt > 0); 1847ea8dc4b6Seschrock while (ab->b_buf) { 1848ea8dc4b6Seschrock arc_buf_t *buf = ab->b_buf; 18493f9d6ad7SLin Ling if (!mutex_tryenter(&buf->b_evict_lock)) { 18506f83844dSMark Maybee missed += 1; 18516f83844dSMark Maybee break; 18526f83844dSMark Maybee } 185344eda4d7Smaybee if (buf->b_data) { 1854ea8dc4b6Seschrock bytes_evicted += ab->b_size; 1855ad23a2dbSjohansen if (recycle && ab->b_type == type && 1856fa94a07fSbrendan ab->b_size ==
bytes && 1857fa94a07fSbrendan !HDR_L2_WRITING(ab)) { 18583fa51506Smaybee stolen = buf->b_data; 18593fa51506Smaybee recycle = FALSE; 18603fa51506Smaybee } 186144eda4d7Smaybee } 1862ea8dc4b6Seschrock if (buf->b_efunc) { 1863ea8dc4b6Seschrock mutex_enter(&arc_eviction_mtx); 18643fa51506Smaybee arc_buf_destroy(buf, 18653fa51506Smaybee buf->b_data == stolen, FALSE); 1866ea8dc4b6Seschrock ab->b_buf = buf->b_next; 186740d7d650Smaybee buf->b_hdr = &arc_eviction_hdr; 1868ea8dc4b6Seschrock buf->b_next = arc_eviction_list; 1869ea8dc4b6Seschrock arc_eviction_list = buf; 1870ea8dc4b6Seschrock mutex_exit(&arc_eviction_mtx); 18713f9d6ad7SLin Ling mutex_exit(&buf->b_evict_lock); 1872ea8dc4b6Seschrock } else { 18733f9d6ad7SLin Ling mutex_exit(&buf->b_evict_lock); 18743fa51506Smaybee arc_buf_destroy(buf, 18753fa51506Smaybee buf->b_data == stolen, TRUE); 1876ea8dc4b6Seschrock } 1877ea8dc4b6Seschrock } 18785ea40c06SBrendan Gregg - Sun Microsystems 18795ea40c06SBrendan Gregg - Sun Microsystems if (ab->b_l2hdr) { 18805ea40c06SBrendan Gregg - Sun Microsystems ARCSTAT_INCR(arcstat_evict_l2_cached, 18815ea40c06SBrendan Gregg - Sun Microsystems ab->b_size); 18825ea40c06SBrendan Gregg - Sun Microsystems } else { 18835ea40c06SBrendan Gregg - Sun Microsystems if (l2arc_write_eligible(ab->b_spa, ab)) { 18845ea40c06SBrendan Gregg - Sun Microsystems ARCSTAT_INCR(arcstat_evict_l2_eligible, 18855ea40c06SBrendan Gregg - Sun Microsystems ab->b_size); 18865ea40c06SBrendan Gregg - Sun Microsystems } else { 18875ea40c06SBrendan Gregg - Sun Microsystems ARCSTAT_INCR( 18885ea40c06SBrendan Gregg - Sun Microsystems arcstat_evict_l2_ineligible, 18895ea40c06SBrendan Gregg - Sun Microsystems ab->b_size); 18905ea40c06SBrendan Gregg - Sun Microsystems } 18915ea40c06SBrendan Gregg - Sun Microsystems } 18925ea40c06SBrendan Gregg - Sun Microsystems 18936f83844dSMark Maybee if (ab->b_datacnt == 0) { 18946f83844dSMark Maybee arc_change_state(evicted_state, ab, hash_lock); 18956f83844dSMark Maybee
ASSERT(HDR_IN_HASH_TABLE(ab)); 18966f83844dSMark Maybee ab->b_flags |= ARC_IN_HASH_TABLE; 18976f83844dSMark Maybee ab->b_flags &= ~ARC_BUF_AVAILABLE; 18986f83844dSMark Maybee DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, ab); 18996f83844dSMark Maybee } 190044eda4d7Smaybee if (!have_lock) 190144eda4d7Smaybee mutex_exit(hash_lock); 1902ea8dc4b6Seschrock if (bytes >= 0 && bytes_evicted >= bytes) 1903fa9e4066Sahrens break; 1904fa9e4066Sahrens } else { 190544eda4d7Smaybee missed += 1; 1906fa9e4066Sahrens } 1907fa9e4066Sahrens } 190844cb6abcSbmc 190944cb6abcSbmc mutex_exit(&evicted_state->arcs_mtx); 191044cb6abcSbmc mutex_exit(&state->arcs_mtx); 1911fa9e4066Sahrens 1912fa9e4066Sahrens if (bytes_evicted < bytes) 1913fa9e4066Sahrens dprintf("only evicted %lld bytes from %x", 1914fa9e4066Sahrens (longlong_t)bytes_evicted, state); 1915fa9e4066Sahrens 191644eda4d7Smaybee if (skipped) 191744cb6abcSbmc ARCSTAT_INCR(arcstat_evict_skip, skipped); 191844cb6abcSbmc 191944eda4d7Smaybee if (missed) 192044cb6abcSbmc ARCSTAT_INCR(arcstat_mutex_miss, missed); 1921f4d2e9e6Smaybee 1922f4d2e9e6Smaybee /* 1923*69962b56SMatthew Ahrens * Note: we have just evicted some data into the ghost state, 1924*69962b56SMatthew Ahrens * potentially putting the ghost size over the desired size. Rather 1925*69962b56SMatthew Ahrens * that evicting from the ghost list in this hot code path, leave 1926*69962b56SMatthew Ahrens * this chore to the arc_reclaim_thread(). 1927f4d2e9e6Smaybee */ 192844cb6abcSbmc 19293fa51506Smaybee return (stolen); 1930fa9e4066Sahrens } 1931fa9e4066Sahrens 1932fa9e4066Sahrens /* 1933fa9e4066Sahrens * Remove buffers from list until we've removed the specified number of 1934fa9e4066Sahrens * bytes. Destroy the buffers that are removed.
/*
 * This span contains arc_evict_ghost(): it deletes headers from a ghost
 * state (the DATA list first, then — if the byte target isn't met — the
 * METADATA list via `goto top'), skipping wrong-spa headers, list markers
 * (b_spa == 0), and headers whose hash lock the caller already holds.
 * Every arc_evict_iterations headers it parks a marker and briefly drops
 * arcs_mtx (kpreempt) to avoid stalling other ARC activity; with an
 * unbounded request (bytes < 0) a missed hash lock is instead waited for
 * via the marker-insert / lock-handshake path.  Headers still cached in
 * the L2ARC are moved to arc_l2c_only rather than destroyed; the rest go
 * to arc_anon and are freed with arc_hdr_destroy().  The tail of this span
 * is the head of arc_adjust() (it continues past this chunk), which begins
 * by computing how far the MRU side is over its arc_p target.
 */
1935fa9e4066Sahrens */ 1936fa9e4066Sahrens static void 1937ac05c741SMark Maybee arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes) 1938fa9e4066Sahrens { 1939fa9e4066Sahrens arc_buf_hdr_t *ab, *ab_prev; 1940b802aa8cSSanjeev Bagewadi arc_buf_hdr_t marker = { 0 }; 19410e8c6158Smaybee list_t *list = &state->arcs_list[ARC_BUFC_DATA]; 1942fa9e4066Sahrens kmutex_t *hash_lock; 1943ea8dc4b6Seschrock uint64_t bytes_deleted = 0; 1944c0a81264Sek uint64_t bufs_skipped = 0; 1945*69962b56SMatthew Ahrens int count = 0; 1946fa9e4066Sahrens 1947ea8dc4b6Seschrock ASSERT(GHOST_STATE(state)); 1948fa9e4066Sahrens top: 194944cb6abcSbmc mutex_enter(&state->arcs_mtx); 19500e8c6158Smaybee for (ab = list_tail(list); ab; ab = ab_prev) { 19510e8c6158Smaybee ab_prev = list_prev(list, ab); 1952*69962b56SMatthew Ahrens if (ab->b_type > ARC_BUFC_NUMTYPES) 1953*69962b56SMatthew Ahrens panic("invalid ab=%p", (void *)ab); 1954874395d5Smaybee if (spa && ab->b_spa != spa) 1955874395d5Smaybee continue; 1956b802aa8cSSanjeev Bagewadi 1957b802aa8cSSanjeev Bagewadi /* ignore markers */ 1958b802aa8cSSanjeev Bagewadi if (ab->b_spa == 0) 1959b802aa8cSSanjeev Bagewadi continue; 1960b802aa8cSSanjeev Bagewadi 1961fa9e4066Sahrens hash_lock = HDR_LOCK(ab); 19627e453561SWilliam Gorrell /* caller may be trying to modify this buffer, skip it */ 19637e453561SWilliam Gorrell if (MUTEX_HELD(hash_lock)) 19647e453561SWilliam Gorrell continue; 1965*69962b56SMatthew Ahrens 1966*69962b56SMatthew Ahrens /* 1967*69962b56SMatthew Ahrens * It may take a long time to evict all the bufs requested. 1968*69962b56SMatthew Ahrens * To avoid blocking all arc activity, periodically drop 1969*69962b56SMatthew Ahrens * the arcs_mtx and give other threads a chance to run 1970*69962b56SMatthew Ahrens * before reacquiring the lock.
1971*69962b56SMatthew Ahrens */ 1972*69962b56SMatthew Ahrens if (count++ > arc_evict_iterations) { 1973*69962b56SMatthew Ahrens list_insert_after(list, ab, &marker); 1974*69962b56SMatthew Ahrens mutex_exit(&state->arcs_mtx); 1975*69962b56SMatthew Ahrens kpreempt(KPREEMPT_SYNC); 1976*69962b56SMatthew Ahrens mutex_enter(&state->arcs_mtx); 1977*69962b56SMatthew Ahrens ab_prev = list_prev(list, &marker); 1978*69962b56SMatthew Ahrens list_remove(list, &marker); 1979*69962b56SMatthew Ahrens count = 0; 1980*69962b56SMatthew Ahrens continue; 1981*69962b56SMatthew Ahrens } 19827e453561SWilliam Gorrell if (mutex_tryenter(hash_lock)) { 198313506d1eSmaybee ASSERT(!HDR_IO_IN_PROGRESS(ab)); 1984ea8dc4b6Seschrock ASSERT(ab->b_buf == NULL); 198544cb6abcSbmc ARCSTAT_BUMP(arcstat_deleted); 1986fa9e4066Sahrens bytes_deleted += ab->b_size; 1987fa94a07fSbrendan 1988fa94a07fSbrendan if (ab->b_l2hdr != NULL) { 1989fa94a07fSbrendan /* 1990fa94a07fSbrendan * This buffer is cached on the 2nd Level ARC; 1991fa94a07fSbrendan * don't destroy the header. 1992fa94a07fSbrendan */ 1993fa94a07fSbrendan arc_change_state(arc_l2c_only, ab, hash_lock); 19947e453561SWilliam Gorrell mutex_exit(hash_lock); 1995fa94a07fSbrendan } else { 1996fa94a07fSbrendan arc_change_state(arc_anon, ab, hash_lock); 19977e453561SWilliam Gorrell mutex_exit(hash_lock); 1998fa94a07fSbrendan arc_hdr_destroy(ab); 1999fa94a07fSbrendan } 2000fa94a07fSbrendan 2001ea8dc4b6Seschrock DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, ab); 2002fa9e4066Sahrens if (bytes >= 0 && bytes_deleted >= bytes) 2003fa9e4066Sahrens break; 2004b802aa8cSSanjeev Bagewadi } else if (bytes < 0) { 2005b802aa8cSSanjeev Bagewadi /* 2006b802aa8cSSanjeev Bagewadi * Insert a list marker and then wait for the 2007b802aa8cSSanjeev Bagewadi * hash lock to become available. Once its 2008b802aa8cSSanjeev Bagewadi * available, restart from where we left off.
2009b802aa8cSSanjeev Bagewadi */ 2010b802aa8cSSanjeev Bagewadi list_insert_after(list, ab, &marker); 2011b802aa8cSSanjeev Bagewadi mutex_exit(&state->arcs_mtx); 2012b802aa8cSSanjeev Bagewadi mutex_enter(hash_lock); 2013b802aa8cSSanjeev Bagewadi mutex_exit(hash_lock); 2014b802aa8cSSanjeev Bagewadi mutex_enter(&state->arcs_mtx); 2015b802aa8cSSanjeev Bagewadi ab_prev = list_prev(list, &marker); 2016b802aa8cSSanjeev Bagewadi list_remove(list, &marker); 2017*69962b56SMatthew Ahrens } else { 2018fa9e4066Sahrens bufs_skipped += 1; 2019*69962b56SMatthew Ahrens } 2020*69962b56SMatthew Ahrens 2021fa9e4066Sahrens } 202244cb6abcSbmc mutex_exit(&state->arcs_mtx); 2023fa9e4066Sahrens 20240e8c6158Smaybee if (list == &state->arcs_list[ARC_BUFC_DATA] && 20250e8c6158Smaybee (bytes < 0 || bytes_deleted < bytes)) { 20260e8c6158Smaybee list = &state->arcs_list[ARC_BUFC_METADATA]; 20270e8c6158Smaybee goto top; 20280e8c6158Smaybee } 20290e8c6158Smaybee 2030fa9e4066Sahrens if (bufs_skipped) { 203144cb6abcSbmc ARCSTAT_INCR(arcstat_mutex_miss, bufs_skipped); 2032fa9e4066Sahrens ASSERT(bytes >= 0); 2033fa9e4066Sahrens } 2034fa9e4066Sahrens 2035fa9e4066Sahrens if (bytes_deleted < bytes) 2036fa9e4066Sahrens dprintf("only deleted %lld bytes from %p", 2037fa9e4066Sahrens (longlong_t)bytes_deleted, state); 2038fa9e4066Sahrens } 2039fa9e4066Sahrens 2040fa9e4066Sahrens static void 2041fa9e4066Sahrens arc_adjust(void) 2042fa9e4066Sahrens { 20435a98e54bSBrendan Gregg - Sun Microsystems int64_t adjustment, delta; 2044fa9e4066Sahrens 20455a98e54bSBrendan Gregg - Sun Microsystems /* 20465a98e54bSBrendan Gregg - Sun Microsystems * Adjust MRU size 20475a98e54bSBrendan Gregg - Sun Microsystems */ 20485a98e54bSBrendan Gregg - Sun Microsystems 20493e4e8481STom Erickson adjustment = MIN((int64_t)(arc_size - arc_c), 20503e4e8481STom Erickson (int64_t)(arc_anon->arcs_size + arc_mru->arcs_size + arc_meta_used - 20513e4e8481STom Erickson arc_p)); 2052fa9e4066Sahrens 20535a98e54bSBrendan Gregg - Sun Microsystems
if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_DATA] > 0) { 20545a98e54bSBrendan Gregg - Sun Microsystems delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_DATA], adjustment); 20555a98e54bSBrendan Gregg - Sun Microsystems (void) arc_evict(arc_mru, NULL, delta, FALSE, ARC_BUFC_DATA); 20565a98e54bSBrendan Gregg - Sun Microsystems adjustment -= delta; 20570e8c6158Smaybee } 20580e8c6158Smaybee 20595a98e54bSBrendan Gregg - Sun Microsystems if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_METADATA] > 0) { 20605a98e54bSBrendan Gregg - Sun Microsystems delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_METADATA], adjustment); 20615a98e54bSBrendan Gregg - Sun Microsystems (void) arc_evict(arc_mru, NULL, delta, FALSE, 2062874395d5Smaybee ARC_BUFC_METADATA); 2063fa9e4066Sahrens } 2064fa9e4066Sahrens 20655a98e54bSBrendan Gregg - Sun Microsystems /* 20665a98e54bSBrendan Gregg - Sun Microsystems * Adjust MFU size 20675a98e54bSBrendan Gregg - Sun Microsystems */ 2068fa9e4066Sahrens 20695a98e54bSBrendan Gregg - Sun Microsystems adjustment = arc_size - arc_c; 20705a98e54bSBrendan Gregg - Sun Microsystems 20715a98e54bSBrendan Gregg - Sun Microsystems if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_DATA] > 0) { 20725a98e54bSBrendan Gregg - Sun Microsystems delta = MIN(adjustment, arc_mfu->arcs_lsize[ARC_BUFC_DATA]); 20735a98e54bSBrendan Gregg - Sun Microsystems (void) arc_evict(arc_mfu, NULL, delta, FALSE, ARC_BUFC_DATA); 20745a98e54bSBrendan Gregg - Sun Microsystems adjustment -= delta; 2075fa9e4066Sahrens } 2076fa9e4066Sahrens 20775a98e54bSBrendan Gregg - Sun Microsystems if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_METADATA] > 0) { 20785a98e54bSBrendan Gregg - Sun Microsystems int64_t delta = MIN(adjustment, 20795a98e54bSBrendan Gregg - Sun Microsystems arc_mfu->arcs_lsize[ARC_BUFC_METADATA]); 20805a98e54bSBrendan Gregg - Sun Microsystems (void) arc_evict(arc_mfu, NULL, delta, FALSE, 20815a98e54bSBrendan Gregg - Sun Microsystems ARC_BUFC_METADATA); 20825a98e54bSBrendan Gregg - Sun 
Microsystems } 2083fa9e4066Sahrens 20845a98e54bSBrendan Gregg - Sun Microsystems /* 20855a98e54bSBrendan Gregg - Sun Microsystems * Adjust ghost lists 20865a98e54bSBrendan Gregg - Sun Microsystems */ 2087fa9e4066Sahrens 20885a98e54bSBrendan Gregg - Sun Microsystems adjustment = arc_mru->arcs_size + arc_mru_ghost->arcs_size - arc_c; 2089fa9e4066Sahrens 20905a98e54bSBrendan Gregg - Sun Microsystems if (adjustment > 0 && arc_mru_ghost->arcs_size > 0) { 20915a98e54bSBrendan Gregg - Sun Microsystems delta = MIN(arc_mru_ghost->arcs_size, adjustment); 20925a98e54bSBrendan Gregg - Sun Microsystems arc_evict_ghost(arc_mru_ghost, NULL, delta); 20935a98e54bSBrendan Gregg - Sun Microsystems } 20940e8c6158Smaybee 20955a98e54bSBrendan Gregg - Sun Microsystems adjustment = 20965a98e54bSBrendan Gregg - Sun Microsystems arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size - arc_c; 20975a98e54bSBrendan Gregg - Sun Microsystems 20985a98e54bSBrendan Gregg - Sun Microsystems if (adjustment > 0 && arc_mfu_ghost->arcs_size > 0) { 20995a98e54bSBrendan Gregg - Sun Microsystems delta = MIN(arc_mfu_ghost->arcs_size, adjustment); 21005a98e54bSBrendan Gregg - Sun Microsystems arc_evict_ghost(arc_mfu_ghost, NULL, delta); 2101fa9e4066Sahrens } 2102fa9e4066Sahrens } 2103fa9e4066Sahrens 2104ea8dc4b6Seschrock static void 2105ea8dc4b6Seschrock arc_do_user_evicts(void) 2106ea8dc4b6Seschrock { 2107ea8dc4b6Seschrock mutex_enter(&arc_eviction_mtx); 2108ea8dc4b6Seschrock while (arc_eviction_list != NULL) { 2109ea8dc4b6Seschrock arc_buf_t *buf = arc_eviction_list; 2110ea8dc4b6Seschrock arc_eviction_list = buf->b_next; 21113f9d6ad7SLin Ling mutex_enter(&buf->b_evict_lock); 2112ea8dc4b6Seschrock buf->b_hdr = NULL; 21133f9d6ad7SLin Ling mutex_exit(&buf->b_evict_lock); 2114ea8dc4b6Seschrock mutex_exit(&arc_eviction_mtx); 2115ea8dc4b6Seschrock 2116dd6ef538Smaybee if (buf->b_efunc != NULL) 2117dd6ef538Smaybee VERIFY(buf->b_efunc(buf) == 0); 2118ea8dc4b6Seschrock 2119ea8dc4b6Seschrock buf->b_efunc = NULL; 
2120ea8dc4b6Seschrock buf->b_private = NULL; 2121ea8dc4b6Seschrock kmem_cache_free(buf_cache, buf); 2122ea8dc4b6Seschrock mutex_enter(&arc_eviction_mtx); 2123ea8dc4b6Seschrock } 2124ea8dc4b6Seschrock mutex_exit(&arc_eviction_mtx); 2125ea8dc4b6Seschrock } 2126ea8dc4b6Seschrock 2127fa9e4066Sahrens /* 2128874395d5Smaybee * Flush all *evictable* data from the cache for the given spa. 2129fa9e4066Sahrens * NOTE: this will not touch "active" (i.e. referenced) data. 2130fa9e4066Sahrens */ 2131fa9e4066Sahrens void 2132874395d5Smaybee arc_flush(spa_t *spa) 2133fa9e4066Sahrens { 2134ac05c741SMark Maybee uint64_t guid = 0; 2135ac05c741SMark Maybee 2136ac05c741SMark Maybee if (spa) 2137e9103aaeSGarrett D'Amore guid = spa_load_guid(spa); 2138ac05c741SMark Maybee 2139874395d5Smaybee while (list_head(&arc_mru->arcs_list[ARC_BUFC_DATA])) { 2140ac05c741SMark Maybee (void) arc_evict(arc_mru, guid, -1, FALSE, ARC_BUFC_DATA); 2141874395d5Smaybee if (spa) 2142874395d5Smaybee break; 2143874395d5Smaybee } 2144874395d5Smaybee while (list_head(&arc_mru->arcs_list[ARC_BUFC_METADATA])) { 2145ac05c741SMark Maybee (void) arc_evict(arc_mru, guid, -1, FALSE, ARC_BUFC_METADATA); 2146874395d5Smaybee if (spa) 2147874395d5Smaybee break; 2148874395d5Smaybee } 2149874395d5Smaybee while (list_head(&arc_mfu->arcs_list[ARC_BUFC_DATA])) { 2150ac05c741SMark Maybee (void) arc_evict(arc_mfu, guid, -1, FALSE, ARC_BUFC_DATA); 2151874395d5Smaybee if (spa) 2152874395d5Smaybee break; 2153874395d5Smaybee } 2154874395d5Smaybee while (list_head(&arc_mfu->arcs_list[ARC_BUFC_METADATA])) { 2155ac05c741SMark Maybee (void) arc_evict(arc_mfu, guid, -1, FALSE, ARC_BUFC_METADATA); 2156874395d5Smaybee if (spa) 2157874395d5Smaybee break; 2158874395d5Smaybee } 2159874395d5Smaybee 2160ac05c741SMark Maybee arc_evict_ghost(arc_mru_ghost, guid, -1); 2161ac05c741SMark Maybee arc_evict_ghost(arc_mfu_ghost, guid, -1); 2162ea8dc4b6Seschrock 2163ea8dc4b6Seschrock mutex_enter(&arc_reclaim_thr_lock); 2164ea8dc4b6Seschrock 
arc_do_user_evicts(); 2165ea8dc4b6Seschrock mutex_exit(&arc_reclaim_thr_lock); 2166874395d5Smaybee ASSERT(spa || arc_eviction_list == NULL); 2167fa9e4066Sahrens } 2168fa9e4066Sahrens 2169fa9e4066Sahrens void 217049e3519aSmaybee arc_shrink(void) 2171fa9e4066Sahrens { 217244cb6abcSbmc if (arc_c > arc_c_min) { 217349e3519aSmaybee uint64_t to_free; 2174fa9e4066Sahrens 21753cff2f43Sstans #ifdef _KERNEL 217644cb6abcSbmc to_free = MAX(arc_c >> arc_shrink_shift, ptob(needfree)); 21773cff2f43Sstans #else 217844cb6abcSbmc to_free = arc_c >> arc_shrink_shift; 21793cff2f43Sstans #endif 218044cb6abcSbmc if (arc_c > arc_c_min + to_free) 218144cb6abcSbmc atomic_add_64(&arc_c, -to_free); 218249e3519aSmaybee else 218344cb6abcSbmc arc_c = arc_c_min; 218444cb6abcSbmc 218544cb6abcSbmc atomic_add_64(&arc_p, -(arc_p >> arc_shrink_shift)); 218644cb6abcSbmc if (arc_c > arc_size) 218744cb6abcSbmc arc_c = MAX(arc_size, arc_c_min); 218844cb6abcSbmc if (arc_p > arc_c) 218944cb6abcSbmc arc_p = (arc_c >> 1); 219044cb6abcSbmc ASSERT(arc_c >= arc_c_min); 219144cb6abcSbmc ASSERT((int64_t)arc_p >= 0); 219249e3519aSmaybee } 2193fa9e4066Sahrens 219444cb6abcSbmc if (arc_size > arc_c) 219549e3519aSmaybee arc_adjust(); 2196fa9e4066Sahrens } 2197fa9e4066Sahrens 219894dd93aeSGeorge Wilson /* 219994dd93aeSGeorge Wilson * Determine if the system is under memory pressure and is asking 220094dd93aeSGeorge Wilson * to reclaim memory. A return value of 1 indicates that the system 220194dd93aeSGeorge Wilson * is under memory pressure and that the arc should adjust accordingly. 
220294dd93aeSGeorge Wilson */ 2203fa9e4066Sahrens static int 2204fa9e4066Sahrens arc_reclaim_needed(void) 2205fa9e4066Sahrens { 2206fa9e4066Sahrens uint64_t extra; 2207fa9e4066Sahrens 2208fa9e4066Sahrens #ifdef _KERNEL 22093cff2f43Sstans 22103cff2f43Sstans if (needfree) 22113cff2f43Sstans return (1); 22123cff2f43Sstans 2213fa9e4066Sahrens /* 2214fa9e4066Sahrens * take 'desfree' extra pages, so we reclaim sooner, rather than later 2215fa9e4066Sahrens */ 2216fa9e4066Sahrens extra = desfree; 2217fa9e4066Sahrens 2218fa9e4066Sahrens /* 2219fa9e4066Sahrens * check that we're out of range of the pageout scanner. It starts to 2220fa9e4066Sahrens * schedule paging if freemem is less than lotsfree and needfree. 2221fa9e4066Sahrens * lotsfree is the high-water mark for pageout, and needfree is the 2222fa9e4066Sahrens * number of needed free pages. We add extra pages here to make sure 2223fa9e4066Sahrens * the scanner doesn't start up while we're freeing memory. 2224fa9e4066Sahrens */ 2225fa9e4066Sahrens if (freemem < lotsfree + needfree + extra) 2226fa9e4066Sahrens return (1); 2227fa9e4066Sahrens 2228fa9e4066Sahrens /* 2229fa9e4066Sahrens * check to make sure that swapfs has enough space so that anon 2230fa94a07fSbrendan * reservations can still succeed. anon_resvmem() checks that the 2231fa9e4066Sahrens * availrmem is greater than swapfs_minfree, and the number of reserved 2232fa9e4066Sahrens * swap pages. We also add a bit of extra here just to prevent 2233fa9e4066Sahrens * circumstances from getting really dire. 2234fa9e4066Sahrens */ 2235fa9e4066Sahrens if (availrmem < swapfs_minfree + swapfs_reserve + extra) 2236fa9e4066Sahrens return (1); 2237fa9e4066Sahrens 22385dc8af33Smaybee #if defined(__i386) 2239fa9e4066Sahrens /* 2240fa9e4066Sahrens * If we're on an i386 platform, it's possible that we'll exhaust the 2241fa9e4066Sahrens * kernel heap space before we ever run out of available physical 2242fa9e4066Sahrens * memory. 
Most checks of the size of the heap_area compare against 2243fa9e4066Sahrens * tune.t_minarmem, which is the minimum available real memory that we 2244fa9e4066Sahrens * can have in the system. However, this is generally fixed at 25 pages 2245fa9e4066Sahrens * which is so low that it's useless. In this comparison, we seek to 2246fa9e4066Sahrens * calculate the total heap-size, and reclaim if more than 3/4ths of the 2247fa94a07fSbrendan * heap is allocated. (Or, in the calculation, if less than 1/4th is 2248fa9e4066Sahrens * free) 2249fa9e4066Sahrens */ 225094dd93aeSGeorge Wilson if (vmem_size(heap_arena, VMEM_FREE) < 225194dd93aeSGeorge Wilson (vmem_size(heap_arena, VMEM_FREE | VMEM_ALLOC) >> 2)) 2252fa9e4066Sahrens return (1); 2253fa9e4066Sahrens #endif 2254fa9e4066Sahrens 225594dd93aeSGeorge Wilson /* 225694dd93aeSGeorge Wilson * If zio data pages are being allocated out of a separate heap segment, 225794dd93aeSGeorge Wilson * then enforce that the size of available vmem for this arena remains 225894dd93aeSGeorge Wilson * above about 1/16th free. 225994dd93aeSGeorge Wilson * 226094dd93aeSGeorge Wilson * Note: The 1/16th arena free requirement was put in place 226194dd93aeSGeorge Wilson * to aggressively evict memory from the arc in order to avoid 226294dd93aeSGeorge Wilson * memory fragmentation issues. 
226394dd93aeSGeorge Wilson */ 226494dd93aeSGeorge Wilson if (zio_arena != NULL && 226594dd93aeSGeorge Wilson vmem_size(zio_arena, VMEM_FREE) < 226694dd93aeSGeorge Wilson (vmem_size(zio_arena, VMEM_ALLOC) >> 4)) 226794dd93aeSGeorge Wilson return (1); 2268fa9e4066Sahrens #else 2269fa9e4066Sahrens if (spa_get_random(100) == 0) 2270fa9e4066Sahrens return (1); 2271fa9e4066Sahrens #endif 2272fa9e4066Sahrens return (0); 2273fa9e4066Sahrens } 2274fa9e4066Sahrens 2275fa9e4066Sahrens static void 2276fa9e4066Sahrens arc_kmem_reap_now(arc_reclaim_strategy_t strat) 2277fa9e4066Sahrens { 2278fa9e4066Sahrens size_t i; 2279fa9e4066Sahrens kmem_cache_t *prev_cache = NULL; 2280ad23a2dbSjohansen kmem_cache_t *prev_data_cache = NULL; 2281fa9e4066Sahrens extern kmem_cache_t *zio_buf_cache[]; 2282ad23a2dbSjohansen extern kmem_cache_t *zio_data_buf_cache[]; 2283fa9e4066Sahrens 2284033f9833Sek #ifdef _KERNEL 22850e8c6158Smaybee if (arc_meta_used >= arc_meta_limit) { 22860e8c6158Smaybee /* 22870e8c6158Smaybee * We are exceeding our meta-data cache limit. 22880e8c6158Smaybee * Purge some DNLC entries to release holds on meta-data. 22890e8c6158Smaybee */ 22900e8c6158Smaybee dnlc_reduce_cache((void *)(uintptr_t)arc_reduce_dnlc_percent); 22910e8c6158Smaybee } 22925dc8af33Smaybee #if defined(__i386) 22935dc8af33Smaybee /* 22945dc8af33Smaybee * Reclaim unused memory from all kmem caches. 22955dc8af33Smaybee */ 22965dc8af33Smaybee kmem_reap(); 22975dc8af33Smaybee #endif 2298033f9833Sek #endif 2299033f9833Sek 2300fa9e4066Sahrens /* 2301fa94a07fSbrendan * An aggressive reclamation will shrink the cache size as well as 2302ea8dc4b6Seschrock * reap free buffers from the arc kmem caches. 
2303fa9e4066Sahrens */ 2304fa9e4066Sahrens if (strat == ARC_RECLAIM_AGGR) 230549e3519aSmaybee arc_shrink(); 2306fa9e4066Sahrens 2307fa9e4066Sahrens for (i = 0; i < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; i++) { 2308fa9e4066Sahrens if (zio_buf_cache[i] != prev_cache) { 2309fa9e4066Sahrens prev_cache = zio_buf_cache[i]; 2310fa9e4066Sahrens kmem_cache_reap_now(zio_buf_cache[i]); 2311fa9e4066Sahrens } 2312ad23a2dbSjohansen if (zio_data_buf_cache[i] != prev_data_cache) { 2313ad23a2dbSjohansen prev_data_cache = zio_data_buf_cache[i]; 2314ad23a2dbSjohansen kmem_cache_reap_now(zio_data_buf_cache[i]); 2315ad23a2dbSjohansen } 2316fa9e4066Sahrens } 2317ea8dc4b6Seschrock kmem_cache_reap_now(buf_cache); 2318ea8dc4b6Seschrock kmem_cache_reap_now(hdr_cache); 231994dd93aeSGeorge Wilson 232094dd93aeSGeorge Wilson /* 232194dd93aeSGeorge Wilson * Ask the vmem areana to reclaim unused memory from its 232294dd93aeSGeorge Wilson * quantum caches. 232394dd93aeSGeorge Wilson */ 232494dd93aeSGeorge Wilson if (zio_arena != NULL && strat == ARC_RECLAIM_AGGR) 232594dd93aeSGeorge Wilson vmem_qcache_reap(zio_arena); 2326fa9e4066Sahrens } 2327fa9e4066Sahrens 2328fa9e4066Sahrens static void 2329fa9e4066Sahrens arc_reclaim_thread(void) 2330fa9e4066Sahrens { 2331fa9e4066Sahrens clock_t growtime = 0; 2332fa9e4066Sahrens arc_reclaim_strategy_t last_reclaim = ARC_RECLAIM_CONS; 2333fa9e4066Sahrens callb_cpr_t cpr; 2334fa9e4066Sahrens 2335fa9e4066Sahrens CALLB_CPR_INIT(&cpr, &arc_reclaim_thr_lock, callb_generic_cpr, FTAG); 2336fa9e4066Sahrens 2337fa9e4066Sahrens mutex_enter(&arc_reclaim_thr_lock); 2338fa9e4066Sahrens while (arc_thread_exit == 0) { 2339fa9e4066Sahrens if (arc_reclaim_needed()) { 2340fa9e4066Sahrens 234144cb6abcSbmc if (arc_no_grow) { 2342fa9e4066Sahrens if (last_reclaim == ARC_RECLAIM_CONS) { 2343fa9e4066Sahrens last_reclaim = ARC_RECLAIM_AGGR; 2344fa9e4066Sahrens } else { 2345fa9e4066Sahrens last_reclaim = ARC_RECLAIM_CONS; 2346fa9e4066Sahrens } 2347fa9e4066Sahrens } else { 
234844cb6abcSbmc arc_no_grow = TRUE; 2349fa9e4066Sahrens last_reclaim = ARC_RECLAIM_AGGR; 2350fa9e4066Sahrens membar_producer(); 2351fa9e4066Sahrens } 2352fa9e4066Sahrens 2353fa9e4066Sahrens /* reset the growth delay for every reclaim */ 2354d3d50737SRafael Vanoni growtime = ddi_get_lbolt() + (arc_grow_retry * hz); 2355fa9e4066Sahrens 2356fa9e4066Sahrens arc_kmem_reap_now(last_reclaim); 23573a737e0dSbrendan arc_warm = B_TRUE; 2358fa9e4066Sahrens 2359d3d50737SRafael Vanoni } else if (arc_no_grow && ddi_get_lbolt() >= growtime) { 236044cb6abcSbmc arc_no_grow = FALSE; 2361fa9e4066Sahrens } 2362fa9e4066Sahrens 23633e4e8481STom Erickson arc_adjust(); 2364641fbdaeSmaybee 2365ea8dc4b6Seschrock if (arc_eviction_list != NULL) 2366ea8dc4b6Seschrock arc_do_user_evicts(); 2367ea8dc4b6Seschrock 2368fa9e4066Sahrens /* block until needed, or one second, whichever is shorter */ 2369fa9e4066Sahrens CALLB_CPR_SAFE_BEGIN(&cpr); 2370fa9e4066Sahrens (void) cv_timedwait(&arc_reclaim_thr_cv, 2371d3d50737SRafael Vanoni &arc_reclaim_thr_lock, (ddi_get_lbolt() + hz)); 2372fa9e4066Sahrens CALLB_CPR_SAFE_END(&cpr, &arc_reclaim_thr_lock); 2373fa9e4066Sahrens } 2374fa9e4066Sahrens 2375fa9e4066Sahrens arc_thread_exit = 0; 2376fa9e4066Sahrens cv_broadcast(&arc_reclaim_thr_cv); 2377fa9e4066Sahrens CALLB_CPR_EXIT(&cpr); /* drops arc_reclaim_thr_lock */ 2378fa9e4066Sahrens thread_exit(); 2379fa9e4066Sahrens } 2380fa9e4066Sahrens 2381ea8dc4b6Seschrock /* 2382ea8dc4b6Seschrock * Adapt arc info given the number of bytes we are trying to add and 2383ea8dc4b6Seschrock * the state that we are comming from. This function is only called 2384ea8dc4b6Seschrock * when we are adding new content to the cache. 
2385ea8dc4b6Seschrock */ 2386fa9e4066Sahrens static void 2387ea8dc4b6Seschrock arc_adapt(int bytes, arc_state_t *state) 2388fa9e4066Sahrens { 2389ea8dc4b6Seschrock int mult; 23905a98e54bSBrendan Gregg - Sun Microsystems uint64_t arc_p_min = (arc_c >> arc_p_min_shift); 2391ea8dc4b6Seschrock 2392fa94a07fSbrendan if (state == arc_l2c_only) 2393fa94a07fSbrendan return; 2394fa94a07fSbrendan 2395ea8dc4b6Seschrock ASSERT(bytes > 0); 2396fa9e4066Sahrens /* 2397ea8dc4b6Seschrock * Adapt the target size of the MRU list: 2398ea8dc4b6Seschrock * - if we just hit in the MRU ghost list, then increase 2399ea8dc4b6Seschrock * the target size of the MRU list. 2400ea8dc4b6Seschrock * - if we just hit in the MFU ghost list, then increase 2401ea8dc4b6Seschrock * the target size of the MFU list by decreasing the 2402ea8dc4b6Seschrock * target size of the MRU list. 2403fa9e4066Sahrens */ 240444cb6abcSbmc if (state == arc_mru_ghost) { 240544cb6abcSbmc mult = ((arc_mru_ghost->arcs_size >= arc_mfu_ghost->arcs_size) ? 240644cb6abcSbmc 1 : (arc_mfu_ghost->arcs_size/arc_mru_ghost->arcs_size)); 24073e4e8481STom Erickson mult = MIN(mult, 10); /* avoid wild arc_p adjustment */ 2408ea8dc4b6Seschrock 24095a98e54bSBrendan Gregg - Sun Microsystems arc_p = MIN(arc_c - arc_p_min, arc_p + bytes * mult); 241044cb6abcSbmc } else if (state == arc_mfu_ghost) { 24115a98e54bSBrendan Gregg - Sun Microsystems uint64_t delta; 24125a98e54bSBrendan Gregg - Sun Microsystems 241344cb6abcSbmc mult = ((arc_mfu_ghost->arcs_size >= arc_mru_ghost->arcs_size) ? 
241444cb6abcSbmc 1 : (arc_mru_ghost->arcs_size/arc_mfu_ghost->arcs_size)); 24153e4e8481STom Erickson mult = MIN(mult, 10); 2416ea8dc4b6Seschrock 24175a98e54bSBrendan Gregg - Sun Microsystems delta = MIN(bytes * mult, arc_p); 24185a98e54bSBrendan Gregg - Sun Microsystems arc_p = MAX(arc_p_min, arc_p - delta); 2419ea8dc4b6Seschrock } 242044cb6abcSbmc ASSERT((int64_t)arc_p >= 0); 2421fa9e4066Sahrens 2422fa9e4066Sahrens if (arc_reclaim_needed()) { 2423fa9e4066Sahrens cv_signal(&arc_reclaim_thr_cv); 2424fa9e4066Sahrens return; 2425fa9e4066Sahrens } 2426fa9e4066Sahrens 242744cb6abcSbmc if (arc_no_grow) 2428fa9e4066Sahrens return; 2429fa9e4066Sahrens 243044cb6abcSbmc if (arc_c >= arc_c_max) 2431ea8dc4b6Seschrock return; 2432ea8dc4b6Seschrock 2433fa9e4066Sahrens /* 2434ea8dc4b6Seschrock * If we're within (2 * maxblocksize) bytes of the target 2435ea8dc4b6Seschrock * cache size, increment the target cache size 2436fa9e4066Sahrens */ 243744cb6abcSbmc if (arc_size > arc_c - (2ULL << SPA_MAXBLOCKSHIFT)) { 243844cb6abcSbmc atomic_add_64(&arc_c, (int64_t)bytes); 243944cb6abcSbmc if (arc_c > arc_c_max) 244044cb6abcSbmc arc_c = arc_c_max; 244144cb6abcSbmc else if (state == arc_anon) 244244cb6abcSbmc atomic_add_64(&arc_p, (int64_t)bytes); 244344cb6abcSbmc if (arc_p > arc_c) 244444cb6abcSbmc arc_p = arc_c; 2445fa9e4066Sahrens } 244644cb6abcSbmc ASSERT((int64_t)arc_p >= 0); 2447fa9e4066Sahrens } 2448fa9e4066Sahrens 2449fa9e4066Sahrens /* 2450ea8dc4b6Seschrock * Check if the cache has reached its limits and eviction is required 2451ea8dc4b6Seschrock * prior to insert. 
2452fa9e4066Sahrens */ 2453fa9e4066Sahrens static int 24540e8c6158Smaybee arc_evict_needed(arc_buf_contents_t type) 2455fa9e4066Sahrens { 24560e8c6158Smaybee if (type == ARC_BUFC_METADATA && arc_meta_used >= arc_meta_limit) 24570e8c6158Smaybee return (1); 24580e8c6158Smaybee 2459fa9e4066Sahrens if (arc_reclaim_needed()) 2460fa9e4066Sahrens return (1); 2461fa9e4066Sahrens 246244cb6abcSbmc return (arc_size > arc_c); 2463fa9e4066Sahrens } 2464fa9e4066Sahrens 2465fa9e4066Sahrens /* 246644eda4d7Smaybee * The buffer, supplied as the first argument, needs a data block. 246744eda4d7Smaybee * So, if we are at cache max, determine which cache should be victimized. 246844eda4d7Smaybee * We have the following cases: 2469fa9e4066Sahrens * 247044cb6abcSbmc * 1. Insert for MRU, p > sizeof(arc_anon + arc_mru) -> 2471fa9e4066Sahrens * In this situation if we're out of space, but the resident size of the MFU is 2472fa9e4066Sahrens * under the limit, victimize the MFU cache to satisfy this insertion request. 2473fa9e4066Sahrens * 247444cb6abcSbmc * 2. Insert for MRU, p <= sizeof(arc_anon + arc_mru) -> 2475fa9e4066Sahrens * Here, we've used up all of the available space for the MRU, so we need to 2476fa9e4066Sahrens * evict from our own cache instead. Evict from the set of resident MRU 2477fa9e4066Sahrens * entries. 2478fa9e4066Sahrens * 247944cb6abcSbmc * 3. Insert for MFU (c - p) > sizeof(arc_mfu) -> 2480fa9e4066Sahrens * c minus p represents the MFU space in the cache, since p is the size of the 2481fa9e4066Sahrens * cache that is dedicated to the MRU. In this situation there's still space on 2482fa9e4066Sahrens * the MFU side, so the MRU side needs to be victimized. 2483fa9e4066Sahrens * 248444cb6abcSbmc * 4. Insert for MFU (c - p) < sizeof(arc_mfu) -> 2485fa9e4066Sahrens * MFU's resident set is consuming more space than it has been allotted. In 2486fa9e4066Sahrens * this situation, we must victimize our own cache, the MFU, for this insertion. 
2487fa9e4066Sahrens */ 2488fa9e4066Sahrens static void 248944eda4d7Smaybee arc_get_data_buf(arc_buf_t *buf) 2490fa9e4066Sahrens { 2491ad23a2dbSjohansen arc_state_t *state = buf->b_hdr->b_state; 2492ad23a2dbSjohansen uint64_t size = buf->b_hdr->b_size; 2493ad23a2dbSjohansen arc_buf_contents_t type = buf->b_hdr->b_type; 2494fa9e4066Sahrens 249544eda4d7Smaybee arc_adapt(size, state); 2496fa9e4066Sahrens 249744eda4d7Smaybee /* 249844eda4d7Smaybee * We have not yet reached cache maximum size, 249944eda4d7Smaybee * just allocate a new buffer. 250044eda4d7Smaybee */ 25010e8c6158Smaybee if (!arc_evict_needed(type)) { 2502ad23a2dbSjohansen if (type == ARC_BUFC_METADATA) { 2503ad23a2dbSjohansen buf->b_data = zio_buf_alloc(size); 25045a98e54bSBrendan Gregg - Sun Microsystems arc_space_consume(size, ARC_SPACE_DATA); 2505ad23a2dbSjohansen } else { 2506ad23a2dbSjohansen ASSERT(type == ARC_BUFC_DATA); 2507ad23a2dbSjohansen buf->b_data = zio_data_buf_alloc(size); 25085a98e54bSBrendan Gregg - Sun Microsystems ARCSTAT_INCR(arcstat_data_size, size); 25090e8c6158Smaybee atomic_add_64(&arc_size, size); 2510ad23a2dbSjohansen } 251144eda4d7Smaybee goto out; 251244eda4d7Smaybee } 251344eda4d7Smaybee 251444eda4d7Smaybee /* 251544eda4d7Smaybee * If we are prefetching from the mfu ghost list, this buffer 251644eda4d7Smaybee * will end up on the mru list; so steal space from there. 251744eda4d7Smaybee */ 251844cb6abcSbmc if (state == arc_mfu_ghost) 251944cb6abcSbmc state = buf->b_hdr->b_flags & ARC_PREFETCH ? arc_mru : arc_mfu; 252044cb6abcSbmc else if (state == arc_mru_ghost) 252144cb6abcSbmc state = arc_mru; 252244cb6abcSbmc 252344cb6abcSbmc if (state == arc_mru || state == arc_anon) { 252444cb6abcSbmc uint64_t mru_used = arc_anon->arcs_size + arc_mru->arcs_size; 25255a98e54bSBrendan Gregg - Sun Microsystems state = (arc_mfu->arcs_lsize[type] >= size && 25260e8c6158Smaybee arc_p > mru_used) ? 
arc_mfu : arc_mru; 2527fa9e4066Sahrens } else { 252844eda4d7Smaybee /* MFU cases */ 252944cb6abcSbmc uint64_t mfu_space = arc_c - arc_p; 25305a98e54bSBrendan Gregg - Sun Microsystems state = (arc_mru->arcs_lsize[type] >= size && 25310e8c6158Smaybee mfu_space > arc_mfu->arcs_size) ? arc_mru : arc_mfu; 253244eda4d7Smaybee } 2533874395d5Smaybee if ((buf->b_data = arc_evict(state, NULL, size, TRUE, type)) == NULL) { 2534ad23a2dbSjohansen if (type == ARC_BUFC_METADATA) { 2535ad23a2dbSjohansen buf->b_data = zio_buf_alloc(size); 25365a98e54bSBrendan Gregg - Sun Microsystems arc_space_consume(size, ARC_SPACE_DATA); 2537ad23a2dbSjohansen } else { 2538ad23a2dbSjohansen ASSERT(type == ARC_BUFC_DATA); 2539ad23a2dbSjohansen buf->b_data = zio_data_buf_alloc(size); 25405a98e54bSBrendan Gregg - Sun Microsystems ARCSTAT_INCR(arcstat_data_size, size); 25410e8c6158Smaybee atomic_add_64(&arc_size, size); 2542ad23a2dbSjohansen } 254344cb6abcSbmc ARCSTAT_BUMP(arcstat_recycle_miss); 254444eda4d7Smaybee } 254544eda4d7Smaybee ASSERT(buf->b_data != NULL); 254644eda4d7Smaybee out: 254744eda4d7Smaybee /* 254844eda4d7Smaybee * Update the state size. Note that ghost states have a 254944eda4d7Smaybee * "ghost size" and so don't need to be updated. 
255044eda4d7Smaybee */ 255144eda4d7Smaybee if (!GHOST_STATE(buf->b_hdr->b_state)) { 255244eda4d7Smaybee arc_buf_hdr_t *hdr = buf->b_hdr; 255344eda4d7Smaybee 255444cb6abcSbmc atomic_add_64(&hdr->b_state->arcs_size, size); 255544eda4d7Smaybee if (list_link_active(&hdr->b_arc_node)) { 255644eda4d7Smaybee ASSERT(refcount_is_zero(&hdr->b_refcnt)); 25570e8c6158Smaybee atomic_add_64(&hdr->b_state->arcs_lsize[type], size); 2558fa9e4066Sahrens } 2559641fbdaeSmaybee /* 2560641fbdaeSmaybee * If we are growing the cache, and we are adding anonymous 256144cb6abcSbmc * data, and we have outgrown arc_p, update arc_p 2562641fbdaeSmaybee */ 256344cb6abcSbmc if (arc_size < arc_c && hdr->b_state == arc_anon && 256444cb6abcSbmc arc_anon->arcs_size + arc_mru->arcs_size > arc_p) 256544cb6abcSbmc arc_p = MIN(arc_c, arc_p + size); 2566fa9e4066Sahrens } 2567fa9e4066Sahrens } 2568fa9e4066Sahrens 2569fa9e4066Sahrens /* 2570fa9e4066Sahrens * This routine is called whenever a buffer is accessed. 2571ea8dc4b6Seschrock * NOTE: the hash lock is dropped in this function. 2572fa9e4066Sahrens */ 2573fa9e4066Sahrens static void 257444eda4d7Smaybee arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock) 2575fa9e4066Sahrens { 2576d3d50737SRafael Vanoni clock_t now; 2577d3d50737SRafael Vanoni 2578fa9e4066Sahrens ASSERT(MUTEX_HELD(hash_lock)); 2579fa9e4066Sahrens 258044cb6abcSbmc if (buf->b_state == arc_anon) { 2581fa9e4066Sahrens /* 2582fa9e4066Sahrens * This buffer is not in the cache, and does not 2583fa9e4066Sahrens * appear in our "ghost" list. Add the new buffer 2584fa9e4066Sahrens * to the MRU state. 
2585fa9e4066Sahrens */ 2586fa9e4066Sahrens 2587fa9e4066Sahrens ASSERT(buf->b_arc_access == 0); 2588d3d50737SRafael Vanoni buf->b_arc_access = ddi_get_lbolt(); 2589ea8dc4b6Seschrock DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, buf); 259044cb6abcSbmc arc_change_state(arc_mru, buf, hash_lock); 2591fa9e4066Sahrens 259244cb6abcSbmc } else if (buf->b_state == arc_mru) { 2593d3d50737SRafael Vanoni now = ddi_get_lbolt(); 2594d3d50737SRafael Vanoni 2595fa9e4066Sahrens /* 259613506d1eSmaybee * If this buffer is here because of a prefetch, then either: 259713506d1eSmaybee * - clear the flag if this is a "referencing" read 259813506d1eSmaybee * (any subsequent access will bump this into the MFU state). 259913506d1eSmaybee * or 260013506d1eSmaybee * - move the buffer to the head of the list if this is 260113506d1eSmaybee * another prefetch (to make it less likely to be evicted). 2602fa9e4066Sahrens */ 2603fa9e4066Sahrens if ((buf->b_flags & ARC_PREFETCH) != 0) { 260413506d1eSmaybee if (refcount_count(&buf->b_refcnt) == 0) { 260513506d1eSmaybee ASSERT(list_link_active(&buf->b_arc_node)); 260613506d1eSmaybee } else { 260713506d1eSmaybee buf->b_flags &= ~ARC_PREFETCH; 260844cb6abcSbmc ARCSTAT_BUMP(arcstat_mru_hits); 260913506d1eSmaybee } 2610d3d50737SRafael Vanoni buf->b_arc_access = now; 2611fa9e4066Sahrens return; 2612fa9e4066Sahrens } 2613fa9e4066Sahrens 2614fa9e4066Sahrens /* 2615fa9e4066Sahrens * This buffer has been "accessed" only once so far, 2616fa9e4066Sahrens * but it is still in the cache. Move it to the MFU 2617fa9e4066Sahrens * state. 2618fa9e4066Sahrens */ 2619d3d50737SRafael Vanoni if (now > buf->b_arc_access + ARC_MINTIME) { 2620fa9e4066Sahrens /* 2621fa9e4066Sahrens * More than 125ms have passed since we 2622fa9e4066Sahrens * instantiated this buffer. Move it to the 2623fa9e4066Sahrens * most frequently used state. 
2624fa9e4066Sahrens */ 2625d3d50737SRafael Vanoni buf->b_arc_access = now; 2626ea8dc4b6Seschrock DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf); 262744cb6abcSbmc arc_change_state(arc_mfu, buf, hash_lock); 2628fa9e4066Sahrens } 262944cb6abcSbmc ARCSTAT_BUMP(arcstat_mru_hits); 263044cb6abcSbmc } else if (buf->b_state == arc_mru_ghost) { 2631fa9e4066Sahrens arc_state_t *new_state; 2632fa9e4066Sahrens /* 2633fa9e4066Sahrens * This buffer has been "accessed" recently, but 2634fa9e4066Sahrens * was evicted from the cache. Move it to the 2635fa9e4066Sahrens * MFU state. 2636fa9e4066Sahrens */ 2637fa9e4066Sahrens 2638fa9e4066Sahrens if (buf->b_flags & ARC_PREFETCH) { 263944cb6abcSbmc new_state = arc_mru; 264013506d1eSmaybee if (refcount_count(&buf->b_refcnt) > 0) 264113506d1eSmaybee buf->b_flags &= ~ARC_PREFETCH; 2642ea8dc4b6Seschrock DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, buf); 2643fa9e4066Sahrens } else { 264444cb6abcSbmc new_state = arc_mfu; 2645ea8dc4b6Seschrock DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf); 2646fa9e4066Sahrens } 2647fa9e4066Sahrens 2648d3d50737SRafael Vanoni buf->b_arc_access = ddi_get_lbolt(); 2649fa9e4066Sahrens arc_change_state(new_state, buf, hash_lock); 2650fa9e4066Sahrens 265144cb6abcSbmc ARCSTAT_BUMP(arcstat_mru_ghost_hits); 265244cb6abcSbmc } else if (buf->b_state == arc_mfu) { 2653fa9e4066Sahrens /* 2654fa9e4066Sahrens * This buffer has been accessed more than once and is 2655fa9e4066Sahrens * still in the cache. Keep it in the MFU state. 2656fa9e4066Sahrens * 265713506d1eSmaybee * NOTE: an add_reference() that occurred when we did 265813506d1eSmaybee * the arc_read() will have kicked this off the list. 265913506d1eSmaybee * If it was a prefetch, we will explicitly move it to 266013506d1eSmaybee * the head of the list now. 
2661fa9e4066Sahrens */ 266213506d1eSmaybee if ((buf->b_flags & ARC_PREFETCH) != 0) { 266313506d1eSmaybee ASSERT(refcount_count(&buf->b_refcnt) == 0); 266413506d1eSmaybee ASSERT(list_link_active(&buf->b_arc_node)); 266513506d1eSmaybee } 266644cb6abcSbmc ARCSTAT_BUMP(arcstat_mfu_hits); 2667d3d50737SRafael Vanoni buf->b_arc_access = ddi_get_lbolt(); 266844cb6abcSbmc } else if (buf->b_state == arc_mfu_ghost) { 266944cb6abcSbmc arc_state_t *new_state = arc_mfu; 2670fa9e4066Sahrens /* 2671fa9e4066Sahrens * This buffer has been accessed more than once but has 2672fa9e4066Sahrens * been evicted from the cache. Move it back to the 2673fa9e4066Sahrens * MFU state. 2674fa9e4066Sahrens */ 2675fa9e4066Sahrens 267613506d1eSmaybee if (buf->b_flags & ARC_PREFETCH) { 267713506d1eSmaybee /* 267813506d1eSmaybee * This is a prefetch access... 267913506d1eSmaybee * move this block back to the MRU state. 268013506d1eSmaybee */ 2681fb09f5aaSMadhav Suresh ASSERT0(refcount_count(&buf->b_refcnt)); 268244cb6abcSbmc new_state = arc_mru; 268313506d1eSmaybee } 268413506d1eSmaybee 2685d3d50737SRafael Vanoni buf->b_arc_access = ddi_get_lbolt(); 2686ea8dc4b6Seschrock DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf); 268713506d1eSmaybee arc_change_state(new_state, buf, hash_lock); 2688fa9e4066Sahrens 268944cb6abcSbmc ARCSTAT_BUMP(arcstat_mfu_ghost_hits); 2690fa94a07fSbrendan } else if (buf->b_state == arc_l2c_only) { 2691fa94a07fSbrendan /* 2692fa94a07fSbrendan * This buffer is on the 2nd Level ARC. 
2693fa94a07fSbrendan */ 2694fa94a07fSbrendan 2695d3d50737SRafael Vanoni buf->b_arc_access = ddi_get_lbolt(); 2696fa94a07fSbrendan DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf); 2697fa94a07fSbrendan arc_change_state(arc_mfu, buf, hash_lock); 2698fa9e4066Sahrens } else { 2699fa9e4066Sahrens ASSERT(!"invalid arc state"); 2700fa9e4066Sahrens } 2701fa9e4066Sahrens } 2702fa9e4066Sahrens 2703fa9e4066Sahrens /* a generic arc_done_func_t which you can use */ 2704fa9e4066Sahrens /* ARGSUSED */ 2705fa9e4066Sahrens void 2706fa9e4066Sahrens arc_bcopy_func(zio_t *zio, arc_buf_t *buf, void *arg) 2707fa9e4066Sahrens { 27083f9d6ad7SLin Ling if (zio == NULL || zio->io_error == 0) 27093f9d6ad7SLin Ling bcopy(buf->b_data, arg, buf->b_hdr->b_size); 27103b2aab18SMatthew Ahrens VERIFY(arc_buf_remove_ref(buf, arg)); 2711fa9e4066Sahrens } 2712fa9e4066Sahrens 27130e8c6158Smaybee /* a generic arc_done_func_t */ 2714fa9e4066Sahrens void 2715fa9e4066Sahrens arc_getbuf_func(zio_t *zio, arc_buf_t *buf, void *arg) 2716fa9e4066Sahrens { 2717fa9e4066Sahrens arc_buf_t **bufp = arg; 2718fa9e4066Sahrens if (zio && zio->io_error) { 27193b2aab18SMatthew Ahrens VERIFY(arc_buf_remove_ref(buf, arg)); 2720fa9e4066Sahrens *bufp = NULL; 2721fa9e4066Sahrens } else { 2722fa9e4066Sahrens *bufp = buf; 27233f9d6ad7SLin Ling ASSERT(buf->b_data); 2724fa9e4066Sahrens } 2725fa9e4066Sahrens } 2726fa9e4066Sahrens 2727fa9e4066Sahrens static void 2728fa9e4066Sahrens arc_read_done(zio_t *zio) 2729fa9e4066Sahrens { 2730bbf4a8dfSmaybee arc_buf_hdr_t *hdr, *found; 2731fa9e4066Sahrens arc_buf_t *buf; 2732fa9e4066Sahrens arc_buf_t *abuf; /* buffer we're assigning to callback */ 2733fa9e4066Sahrens kmutex_t *hash_lock; 2734fa9e4066Sahrens arc_callback_t *callback_list, *acb; 2735fa9e4066Sahrens int freeable = FALSE; 2736fa9e4066Sahrens 2737fa9e4066Sahrens buf = zio->io_private; 2738fa9e4066Sahrens hdr = buf->b_hdr; 2739fa9e4066Sahrens 2740bbf4a8dfSmaybee /* 2741bbf4a8dfSmaybee * The hdr was inserted into hash-table and 
removed from lists 2742bbf4a8dfSmaybee * prior to starting I/O. We should find this header, since 2743bbf4a8dfSmaybee * it's in the hash table, and it should be legit since it's 2744bbf4a8dfSmaybee * not possible to evict it during the I/O. The only possible 2745bbf4a8dfSmaybee * reason for it not to be found is if we were freed during the 2746bbf4a8dfSmaybee * read. 2747bbf4a8dfSmaybee */ 2748ac05c741SMark Maybee found = buf_hash_find(hdr->b_spa, &hdr->b_dva, hdr->b_birth, 27496b4acc8bSahrens &hash_lock); 2750fa9e4066Sahrens 2751bbf4a8dfSmaybee ASSERT((found == NULL && HDR_FREED_IN_READ(hdr) && hash_lock == NULL) || 2752fa94a07fSbrendan (found == hdr && DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp))) || 2753fa94a07fSbrendan (found == hdr && HDR_L2_READING(hdr))); 2754fa94a07fSbrendan 27553a737e0dSbrendan hdr->b_flags &= ~ARC_L2_EVICTED; 2756fa94a07fSbrendan if (l2arc_noprefetch && (hdr->b_flags & ARC_PREFETCH)) 27573baa08fcSek hdr->b_flags &= ~ARC_L2CACHE; 2758fa9e4066Sahrens 2759fa9e4066Sahrens /* byteswap if necessary */ 2760fa9e4066Sahrens callback_list = hdr->b_acb; 2761fa9e4066Sahrens ASSERT(callback_list != NULL); 27628e0f0d3dSWilliam Gorrell if (BP_SHOULD_BYTESWAP(zio->io_bp) && zio->io_error == 0) { 2763ad135b5dSChristopher Siden dmu_object_byteswap_t bswap = 2764ad135b5dSChristopher Siden DMU_OT_BYTESWAP(BP_GET_TYPE(zio->io_bp)); 2765088f3894Sahrens arc_byteswap_func_t *func = BP_GET_LEVEL(zio->io_bp) > 0 ? 2766088f3894Sahrens byteswap_uint64_array : 2767ad135b5dSChristopher Siden dmu_ot_byteswap[bswap].ob_func; 2768088f3894Sahrens func(buf->b_data, hdr->b_size); 2769088f3894Sahrens } 2770fa9e4066Sahrens 2771fa94a07fSbrendan arc_cksum_compute(buf, B_FALSE); 2772cd1c8b85SMatthew Ahrens arc_buf_watch(buf); 27736b4acc8bSahrens 2774b24ab676SJeff Bonwick if (hash_lock && zio->io_error == 0 && hdr->b_state == arc_anon) { 2775b24ab676SJeff Bonwick /* 2776b24ab676SJeff Bonwick * Only call arc_access on anonymous buffers. 
This is because 2777b24ab676SJeff Bonwick * if we've issued an I/O for an evicted buffer, we've already 2778b24ab676SJeff Bonwick * called arc_access (to prevent any simultaneous readers from 2779b24ab676SJeff Bonwick * getting confused). 2780b24ab676SJeff Bonwick */ 2781b24ab676SJeff Bonwick arc_access(hdr, hash_lock); 2782b24ab676SJeff Bonwick } 2783b24ab676SJeff Bonwick 2784fa9e4066Sahrens /* create copies of the data buffer for the callers */ 2785fa9e4066Sahrens abuf = buf; 2786fa9e4066Sahrens for (acb = callback_list; acb; acb = acb->acb_next) { 2787fa9e4066Sahrens if (acb->acb_done) { 27889253d63dSGeorge Wilson if (abuf == NULL) { 27899253d63dSGeorge Wilson ARCSTAT_BUMP(arcstat_duplicate_reads); 279044eda4d7Smaybee abuf = arc_buf_clone(buf); 27919253d63dSGeorge Wilson } 2792fa9e4066Sahrens acb->acb_buf = abuf; 2793fa9e4066Sahrens abuf = NULL; 2794fa9e4066Sahrens } 2795fa9e4066Sahrens } 2796fa9e4066Sahrens hdr->b_acb = NULL; 2797fa9e4066Sahrens hdr->b_flags &= ~ARC_IO_IN_PROGRESS; 2798ea8dc4b6Seschrock ASSERT(!HDR_BUF_AVAILABLE(hdr)); 2799b24ab676SJeff Bonwick if (abuf == buf) { 2800b24ab676SJeff Bonwick ASSERT(buf->b_efunc == NULL); 2801b24ab676SJeff Bonwick ASSERT(hdr->b_datacnt == 1); 2802ea8dc4b6Seschrock hdr->b_flags |= ARC_BUF_AVAILABLE; 2803b24ab676SJeff Bonwick } 2804fa9e4066Sahrens 2805fa9e4066Sahrens ASSERT(refcount_is_zero(&hdr->b_refcnt) || callback_list != NULL); 2806fa9e4066Sahrens 2807fa9e4066Sahrens if (zio->io_error != 0) { 2808fa9e4066Sahrens hdr->b_flags |= ARC_IO_ERROR; 280944cb6abcSbmc if (hdr->b_state != arc_anon) 281044cb6abcSbmc arc_change_state(arc_anon, hdr, hash_lock); 2811ea8dc4b6Seschrock if (HDR_IN_HASH_TABLE(hdr)) 2812ea8dc4b6Seschrock buf_hash_remove(hdr); 2813fa9e4066Sahrens freeable = refcount_is_zero(&hdr->b_refcnt); 2814fa9e4066Sahrens } 2815fa9e4066Sahrens 2816ea8dc4b6Seschrock /* 281713506d1eSmaybee * Broadcast before we drop the hash_lock to avoid the possibility 281813506d1eSmaybee * that the hdr (and hence the cv) might 
be freed before we get to 281913506d1eSmaybee * the cv_broadcast(). 2820ea8dc4b6Seschrock */ 2821ea8dc4b6Seschrock cv_broadcast(&hdr->b_cv); 2822ea8dc4b6Seschrock 2823bbf4a8dfSmaybee if (hash_lock) { 282444eda4d7Smaybee mutex_exit(hash_lock); 2825fa9e4066Sahrens } else { 2826fa9e4066Sahrens /* 2827fa9e4066Sahrens * This block was freed while we waited for the read to 2828fa9e4066Sahrens * complete. It has been removed from the hash table and 2829fa9e4066Sahrens * moved to the anonymous state (so that it won't show up 2830fa9e4066Sahrens * in the cache). 2831fa9e4066Sahrens */ 283244cb6abcSbmc ASSERT3P(hdr->b_state, ==, arc_anon); 2833fa9e4066Sahrens freeable = refcount_is_zero(&hdr->b_refcnt); 2834fa9e4066Sahrens } 2835fa9e4066Sahrens 2836fa9e4066Sahrens /* execute each callback and free its structure */ 2837fa9e4066Sahrens while ((acb = callback_list) != NULL) { 2838fa9e4066Sahrens if (acb->acb_done) 2839fa9e4066Sahrens acb->acb_done(zio, acb->acb_buf, acb->acb_private); 2840fa9e4066Sahrens 2841fa9e4066Sahrens if (acb->acb_zio_dummy != NULL) { 2842fa9e4066Sahrens acb->acb_zio_dummy->io_error = zio->io_error; 2843fa9e4066Sahrens zio_nowait(acb->acb_zio_dummy); 2844fa9e4066Sahrens } 2845fa9e4066Sahrens 2846fa9e4066Sahrens callback_list = acb->acb_next; 2847fa9e4066Sahrens kmem_free(acb, sizeof (arc_callback_t)); 2848fa9e4066Sahrens } 2849fa9e4066Sahrens 2850fa9e4066Sahrens if (freeable) 2851ea8dc4b6Seschrock arc_hdr_destroy(hdr); 2852fa9e4066Sahrens } 2853fa9e4066Sahrens 2854fa9e4066Sahrens /* 2855fc98fea5SBart Coddens * "Read" the block at the specified DVA (in bp) via the 2856fa9e4066Sahrens * cache. If the block is found in the cache, invoke the provided 2857fa9e4066Sahrens * callback immediately and return. Note that the `zio' parameter 2858fa9e4066Sahrens * in the callback will be NULL in this case, since no IO was 2859fa9e4066Sahrens * required. 
If the block is not in the cache pass the read request 2860fa9e4066Sahrens * on to the spa with a substitute callback function, so that the 2861fa9e4066Sahrens * requested block will be added to the cache. 2862fa9e4066Sahrens * 2863fa9e4066Sahrens * If a read request arrives for a block that has a read in-progress, 2864fa9e4066Sahrens * either wait for the in-progress read to complete (and return the 2865fa9e4066Sahrens * results); or, if this is a read with a "done" func, add a record 2866fa9e4066Sahrens * to the read to invoke the "done" func when the read completes, 2867fa9e4066Sahrens * and return; or just return. 2868fa9e4066Sahrens * 2869fa9e4066Sahrens * arc_read_done() will invoke all the requested "done" functions 2870fa9e4066Sahrens * for readers of this block. 2871fa9e4066Sahrens */ 2872fa9e4066Sahrens int 28731b912ec7SGeorge Wilson arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done, 2874*69962b56SMatthew Ahrens void *private, zio_priority_t priority, int zio_flags, uint32_t *arc_flags, 28751b912ec7SGeorge Wilson const zbookmark_t *zb) 2876fa9e4066Sahrens { 2877fa9e4066Sahrens arc_buf_hdr_t *hdr; 2878d5285caeSGeorge Wilson arc_buf_t *buf = NULL; 2879fa9e4066Sahrens kmutex_t *hash_lock; 2880fa94a07fSbrendan zio_t *rzio; 2881e9103aaeSGarrett D'Amore uint64_t guid = spa_load_guid(spa); 2882fa9e4066Sahrens 2883fa9e4066Sahrens top: 2884b24ab676SJeff Bonwick hdr = buf_hash_find(guid, BP_IDENTITY(bp), BP_PHYSICAL_BIRTH(bp), 2885b24ab676SJeff Bonwick &hash_lock); 2886ea8dc4b6Seschrock if (hdr && hdr->b_datacnt > 0) { 2887fa9e4066Sahrens 288813506d1eSmaybee *arc_flags |= ARC_CACHED; 288913506d1eSmaybee 2890fa9e4066Sahrens if (HDR_IO_IN_PROGRESS(hdr)) { 289113506d1eSmaybee 289213506d1eSmaybee if (*arc_flags & ARC_WAIT) { 289313506d1eSmaybee cv_wait(&hdr->b_cv, hash_lock); 289413506d1eSmaybee mutex_exit(hash_lock); 289513506d1eSmaybee goto top; 289613506d1eSmaybee } 289713506d1eSmaybee ASSERT(*arc_flags & ARC_NOWAIT); 289813506d1eSmaybee 
289913506d1eSmaybee if (done) { 2900fa9e4066Sahrens arc_callback_t *acb = NULL; 2901fa9e4066Sahrens 2902fa9e4066Sahrens acb = kmem_zalloc(sizeof (arc_callback_t), 2903fa9e4066Sahrens KM_SLEEP); 2904fa9e4066Sahrens acb->acb_done = done; 2905fa9e4066Sahrens acb->acb_private = private; 2906fa9e4066Sahrens if (pio != NULL) 2907fa9e4066Sahrens acb->acb_zio_dummy = zio_null(pio, 2908a3f829aeSBill Moore spa, NULL, NULL, NULL, zio_flags); 2909fa9e4066Sahrens 2910fa9e4066Sahrens ASSERT(acb->acb_done != NULL); 2911fa9e4066Sahrens acb->acb_next = hdr->b_acb; 2912fa9e4066Sahrens hdr->b_acb = acb; 2913fa9e4066Sahrens add_reference(hdr, hash_lock, private); 2914fa9e4066Sahrens mutex_exit(hash_lock); 2915fa9e4066Sahrens return (0); 2916fa9e4066Sahrens } 2917fa9e4066Sahrens mutex_exit(hash_lock); 2918fa9e4066Sahrens return (0); 2919fa9e4066Sahrens } 2920fa9e4066Sahrens 292144cb6abcSbmc ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu); 2922fa9e4066Sahrens 2923ea8dc4b6Seschrock if (done) { 292444eda4d7Smaybee add_reference(hdr, hash_lock, private); 2925ea8dc4b6Seschrock /* 2926ea8dc4b6Seschrock * If this block is already in use, create a new 2927ea8dc4b6Seschrock * copy of the data so that we will be guaranteed 2928ea8dc4b6Seschrock * that arc_release() will always succeed. 
2929ea8dc4b6Seschrock */ 2930fa9e4066Sahrens buf = hdr->b_buf; 2931ea8dc4b6Seschrock ASSERT(buf); 2932ea8dc4b6Seschrock ASSERT(buf->b_data); 293344eda4d7Smaybee if (HDR_BUF_AVAILABLE(hdr)) { 2934ea8dc4b6Seschrock ASSERT(buf->b_efunc == NULL); 2935ea8dc4b6Seschrock hdr->b_flags &= ~ARC_BUF_AVAILABLE; 293644eda4d7Smaybee } else { 293744eda4d7Smaybee buf = arc_buf_clone(buf); 2938ea8dc4b6Seschrock } 2939b24ab676SJeff Bonwick 294013506d1eSmaybee } else if (*arc_flags & ARC_PREFETCH && 294113506d1eSmaybee refcount_count(&hdr->b_refcnt) == 0) { 294213506d1eSmaybee hdr->b_flags |= ARC_PREFETCH; 2943fa9e4066Sahrens } 2944fa9e4066Sahrens DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr); 294544eda4d7Smaybee arc_access(hdr, hash_lock); 29463baa08fcSek if (*arc_flags & ARC_L2CACHE) 29473baa08fcSek hdr->b_flags |= ARC_L2CACHE; 2948aad02571SSaso Kiselkov if (*arc_flags & ARC_L2COMPRESS) 2949aad02571SSaso Kiselkov hdr->b_flags |= ARC_L2COMPRESS; 295044eda4d7Smaybee mutex_exit(hash_lock); 295144cb6abcSbmc ARCSTAT_BUMP(arcstat_hits); 295244cb6abcSbmc ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH), 295344cb6abcSbmc demand, prefetch, hdr->b_type != ARC_BUFC_METADATA, 295444cb6abcSbmc data, metadata, hits); 295544cb6abcSbmc 2956fa9e4066Sahrens if (done) 2957fa9e4066Sahrens done(NULL, buf, private); 2958fa9e4066Sahrens } else { 2959fa9e4066Sahrens uint64_t size = BP_GET_LSIZE(bp); 2960fa9e4066Sahrens arc_callback_t *acb; 29613a737e0dSbrendan vdev_t *vd = NULL; 2962d5285caeSGeorge Wilson uint64_t addr = 0; 29635a98e54bSBrendan Gregg - Sun Microsystems boolean_t devw = B_FALSE; 2964fa9e4066Sahrens 2965fa9e4066Sahrens if (hdr == NULL) { 2966fa9e4066Sahrens /* this block is not in the cache */ 2967fa9e4066Sahrens arc_buf_hdr_t *exists; 2968ad23a2dbSjohansen arc_buf_contents_t type = BP_GET_BUFC_TYPE(bp); 2969ad23a2dbSjohansen buf = arc_buf_alloc(spa, size, private, type); 2970fa9e4066Sahrens hdr = buf->b_hdr; 2971fa9e4066Sahrens hdr->b_dva = *BP_IDENTITY(bp); 2972b24ab676SJeff Bonwick 
hdr->b_birth = BP_PHYSICAL_BIRTH(bp); 2973fa9e4066Sahrens hdr->b_cksum0 = bp->blk_cksum.zc_word[0]; 2974fa9e4066Sahrens exists = buf_hash_insert(hdr, &hash_lock); 2975fa9e4066Sahrens if (exists) { 2976fa9e4066Sahrens /* somebody beat us to the hash insert */ 2977fa9e4066Sahrens mutex_exit(hash_lock); 29783f9d6ad7SLin Ling buf_discard_identity(hdr); 2979ea8dc4b6Seschrock (void) arc_buf_remove_ref(buf, private); 2980fa9e4066Sahrens goto top; /* restart the IO request */ 2981fa9e4066Sahrens } 298213506d1eSmaybee /* if this is a prefetch, we don't have a reference */ 298313506d1eSmaybee if (*arc_flags & ARC_PREFETCH) { 298413506d1eSmaybee (void) remove_reference(hdr, hash_lock, 298513506d1eSmaybee private); 298613506d1eSmaybee hdr->b_flags |= ARC_PREFETCH; 298713506d1eSmaybee } 29883baa08fcSek if (*arc_flags & ARC_L2CACHE) 29893baa08fcSek hdr->b_flags |= ARC_L2CACHE; 2990aad02571SSaso Kiselkov if (*arc_flags & ARC_L2COMPRESS) 2991aad02571SSaso Kiselkov hdr->b_flags |= ARC_L2COMPRESS; 299213506d1eSmaybee if (BP_GET_LEVEL(bp) > 0) 299313506d1eSmaybee hdr->b_flags |= ARC_INDIRECT; 2994fa9e4066Sahrens } else { 2995fa9e4066Sahrens /* this block is in the ghost cache */ 2996ea8dc4b6Seschrock ASSERT(GHOST_STATE(hdr->b_state)); 2997ea8dc4b6Seschrock ASSERT(!HDR_IO_IN_PROGRESS(hdr)); 2998fb09f5aaSMadhav Suresh ASSERT0(refcount_count(&hdr->b_refcnt)); 2999ea8dc4b6Seschrock ASSERT(hdr->b_buf == NULL); 300013506d1eSmaybee 300113506d1eSmaybee /* if this is a prefetch, we don't have a reference */ 300213506d1eSmaybee if (*arc_flags & ARC_PREFETCH) 300313506d1eSmaybee hdr->b_flags |= ARC_PREFETCH; 300413506d1eSmaybee else 300513506d1eSmaybee add_reference(hdr, hash_lock, private); 30063baa08fcSek if (*arc_flags & ARC_L2CACHE) 30073baa08fcSek hdr->b_flags |= ARC_L2CACHE; 3008aad02571SSaso Kiselkov if (*arc_flags & ARC_L2COMPRESS) 3009aad02571SSaso Kiselkov hdr->b_flags |= ARC_L2COMPRESS; 30101ab7f2deSmaybee buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE); 3011fa9e4066Sahrens buf->b_hdr 
= hdr; 301244eda4d7Smaybee buf->b_data = NULL; 3013ea8dc4b6Seschrock buf->b_efunc = NULL; 3014ea8dc4b6Seschrock buf->b_private = NULL; 3015fa9e4066Sahrens buf->b_next = NULL; 3016fa9e4066Sahrens hdr->b_buf = buf; 3017ea8dc4b6Seschrock ASSERT(hdr->b_datacnt == 0); 3018ea8dc4b6Seschrock hdr->b_datacnt = 1; 30195614b00aSWilliam Gorrell arc_get_data_buf(buf); 30207e453561SWilliam Gorrell arc_access(hdr, hash_lock); 3021fa9e4066Sahrens } 3022fa9e4066Sahrens 30235614b00aSWilliam Gorrell ASSERT(!GHOST_STATE(hdr->b_state)); 30245614b00aSWilliam Gorrell 3025fa9e4066Sahrens acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP); 3026fa9e4066Sahrens acb->acb_done = done; 3027fa9e4066Sahrens acb->acb_private = private; 3028fa9e4066Sahrens 3029fa9e4066Sahrens ASSERT(hdr->b_acb == NULL); 3030fa9e4066Sahrens hdr->b_acb = acb; 3031fa9e4066Sahrens hdr->b_flags |= ARC_IO_IN_PROGRESS; 3032fa9e4066Sahrens 3033e14bb325SJeff Bonwick if (HDR_L2CACHE(hdr) && hdr->b_l2hdr != NULL && 3034e14bb325SJeff Bonwick (vd = hdr->b_l2hdr->b_dev->l2ad_vdev) != NULL) { 30355a98e54bSBrendan Gregg - Sun Microsystems devw = hdr->b_l2hdr->b_dev->l2ad_writing; 30363a737e0dSbrendan addr = hdr->b_l2hdr->b_daddr; 3037e14bb325SJeff Bonwick /* 3038e14bb325SJeff Bonwick * Lock out device removal. 3039e14bb325SJeff Bonwick */ 3040e14bb325SJeff Bonwick if (vdev_is_dead(vd) || 3041e14bb325SJeff Bonwick !spa_config_tryenter(spa, SCL_L2ARC, vd, RW_READER)) 3042e14bb325SJeff Bonwick vd = NULL; 30433a737e0dSbrendan } 30443a737e0dSbrendan 30453a737e0dSbrendan mutex_exit(hash_lock); 30463a737e0dSbrendan 30473e30c24aSWill Andrews /* 30483e30c24aSWill Andrews * At this point, we have a level 1 cache miss. Try again in 30493e30c24aSWill Andrews * L2ARC if possible. 
30503e30c24aSWill Andrews */ 3051fa9e4066Sahrens ASSERT3U(hdr->b_size, ==, size); 30525c28183bSBrendan Gregg - Sun Microsystems DTRACE_PROBE4(arc__miss, arc_buf_hdr_t *, hdr, blkptr_t *, bp, 30535c28183bSBrendan Gregg - Sun Microsystems uint64_t, size, zbookmark_t *, zb); 305444cb6abcSbmc ARCSTAT_BUMP(arcstat_misses); 305544cb6abcSbmc ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH), 305644cb6abcSbmc demand, prefetch, hdr->b_type != ARC_BUFC_METADATA, 305744cb6abcSbmc data, metadata, misses); 3058ea8dc4b6Seschrock 30595a98e54bSBrendan Gregg - Sun Microsystems if (vd != NULL && l2arc_ndev != 0 && !(l2arc_norw && devw)) { 3060fa94a07fSbrendan /* 3061fa94a07fSbrendan * Read from the L2ARC if the following are true: 30623a737e0dSbrendan * 1. The L2ARC vdev was previously cached. 30633a737e0dSbrendan * 2. This buffer still has L2ARC metadata. 30643a737e0dSbrendan * 3. This buffer isn't currently writing to the L2ARC. 30653a737e0dSbrendan * 4. The L2ARC entry wasn't evicted, which may 30663a737e0dSbrendan * also have invalidated the vdev. 30675a98e54bSBrendan Gregg - Sun Microsystems * 5. This isn't prefetch and l2arc_noprefetch is set. 
3068fa94a07fSbrendan */ 3069e14bb325SJeff Bonwick if (hdr->b_l2hdr != NULL && 30705a98e54bSBrendan Gregg - Sun Microsystems !HDR_L2_WRITING(hdr) && !HDR_L2_EVICTED(hdr) && 30715a98e54bSBrendan Gregg - Sun Microsystems !(l2arc_noprefetch && HDR_PREFETCH(hdr))) { 3072fa94a07fSbrendan l2arc_read_callback_t *cb; 3073fa94a07fSbrendan 3074c5904d13Seschrock DTRACE_PROBE1(l2arc__hit, arc_buf_hdr_t *, hdr); 3075c5904d13Seschrock ARCSTAT_BUMP(arcstat_l2_hits); 3076c5904d13Seschrock 3077fa94a07fSbrendan cb = kmem_zalloc(sizeof (l2arc_read_callback_t), 3078fa94a07fSbrendan KM_SLEEP); 3079fa94a07fSbrendan cb->l2rcb_buf = buf; 3080fa94a07fSbrendan cb->l2rcb_spa = spa; 3081fa94a07fSbrendan cb->l2rcb_bp = *bp; 3082fa94a07fSbrendan cb->l2rcb_zb = *zb; 30833baa08fcSek cb->l2rcb_flags = zio_flags; 3084aad02571SSaso Kiselkov cb->l2rcb_compress = hdr->b_l2hdr->b_compress; 3085fa94a07fSbrendan 3086d5285caeSGeorge Wilson ASSERT(addr >= VDEV_LABEL_START_SIZE && 3087d5285caeSGeorge Wilson addr + size < vd->vdev_psize - 3088d5285caeSGeorge Wilson VDEV_LABEL_END_SIZE); 3089d5285caeSGeorge Wilson 3090fa94a07fSbrendan /* 3091e14bb325SJeff Bonwick * l2arc read. The SCL_L2ARC lock will be 3092e14bb325SJeff Bonwick * released by l2arc_read_done(). 3093aad02571SSaso Kiselkov * Issue a null zio if the underlying buffer 3094aad02571SSaso Kiselkov * was squashed to zero size by compression. 
3095fa94a07fSbrendan */ 3096aad02571SSaso Kiselkov if (hdr->b_l2hdr->b_compress == 3097aad02571SSaso Kiselkov ZIO_COMPRESS_EMPTY) { 3098aad02571SSaso Kiselkov rzio = zio_null(pio, spa, vd, 3099aad02571SSaso Kiselkov l2arc_read_done, cb, 3100aad02571SSaso Kiselkov zio_flags | ZIO_FLAG_DONT_CACHE | 3101aad02571SSaso Kiselkov ZIO_FLAG_CANFAIL | 3102aad02571SSaso Kiselkov ZIO_FLAG_DONT_PROPAGATE | 3103aad02571SSaso Kiselkov ZIO_FLAG_DONT_RETRY); 3104aad02571SSaso Kiselkov } else { 3105aad02571SSaso Kiselkov rzio = zio_read_phys(pio, vd, addr, 3106aad02571SSaso Kiselkov hdr->b_l2hdr->b_asize, 3107aad02571SSaso Kiselkov buf->b_data, ZIO_CHECKSUM_OFF, 3108aad02571SSaso Kiselkov l2arc_read_done, cb, priority, 3109aad02571SSaso Kiselkov zio_flags | ZIO_FLAG_DONT_CACHE | 3110aad02571SSaso Kiselkov ZIO_FLAG_CANFAIL | 3111aad02571SSaso Kiselkov ZIO_FLAG_DONT_PROPAGATE | 3112aad02571SSaso Kiselkov ZIO_FLAG_DONT_RETRY, B_FALSE); 3113aad02571SSaso Kiselkov } 3114fa94a07fSbrendan DTRACE_PROBE2(l2arc__read, vdev_t *, vd, 3115fa94a07fSbrendan zio_t *, rzio); 3116aad02571SSaso Kiselkov ARCSTAT_INCR(arcstat_l2_read_bytes, 3117aad02571SSaso Kiselkov hdr->b_l2hdr->b_asize); 3118fa94a07fSbrendan 31193a737e0dSbrendan if (*arc_flags & ARC_NOWAIT) { 31203a737e0dSbrendan zio_nowait(rzio); 31213a737e0dSbrendan return (0); 31223a737e0dSbrendan } 3123fa94a07fSbrendan 31243a737e0dSbrendan ASSERT(*arc_flags & ARC_WAIT); 31253a737e0dSbrendan if (zio_wait(rzio) == 0) 31263a737e0dSbrendan return (0); 31273a737e0dSbrendan 31283a737e0dSbrendan /* l2arc read error; goto zio_read() */ 3129fa94a07fSbrendan } else { 3130fa94a07fSbrendan DTRACE_PROBE1(l2arc__miss, 3131fa94a07fSbrendan arc_buf_hdr_t *, hdr); 3132fa94a07fSbrendan ARCSTAT_BUMP(arcstat_l2_misses); 3133fa94a07fSbrendan if (HDR_L2_WRITING(hdr)) 3134fa94a07fSbrendan ARCSTAT_BUMP(arcstat_l2_rw_clash); 3135e14bb325SJeff Bonwick spa_config_exit(spa, SCL_L2ARC, vd); 3136fa94a07fSbrendan } 31375a98e54bSBrendan Gregg - Sun Microsystems } else { 
313876a25fafSBill Moore if (vd != NULL) 313976a25fafSBill Moore spa_config_exit(spa, SCL_L2ARC, vd); 31405a98e54bSBrendan Gregg - Sun Microsystems if (l2arc_ndev != 0) { 31415a98e54bSBrendan Gregg - Sun Microsystems DTRACE_PROBE1(l2arc__miss, 31425a98e54bSBrendan Gregg - Sun Microsystems arc_buf_hdr_t *, hdr); 31435a98e54bSBrendan Gregg - Sun Microsystems ARCSTAT_BUMP(arcstat_l2_misses); 31445a98e54bSBrendan Gregg - Sun Microsystems } 3145fa94a07fSbrendan } 3146c5904d13Seschrock 3147fa9e4066Sahrens rzio = zio_read(pio, spa, bp, buf->b_data, size, 31483baa08fcSek arc_read_done, buf, priority, zio_flags, zb); 3149fa9e4066Sahrens 315013506d1eSmaybee if (*arc_flags & ARC_WAIT) 3151fa9e4066Sahrens return (zio_wait(rzio)); 3152fa9e4066Sahrens 315313506d1eSmaybee ASSERT(*arc_flags & ARC_NOWAIT); 3154fa9e4066Sahrens zio_nowait(rzio); 3155fa9e4066Sahrens } 3156fa9e4066Sahrens return (0); 3157fa9e4066Sahrens } 3158fa9e4066Sahrens 3159ea8dc4b6Seschrock void 3160ea8dc4b6Seschrock arc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *private) 3161ea8dc4b6Seschrock { 3162ea8dc4b6Seschrock ASSERT(buf->b_hdr != NULL); 316344cb6abcSbmc ASSERT(buf->b_hdr->b_state != arc_anon); 3164ea8dc4b6Seschrock ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt) || func == NULL); 3165b24ab676SJeff Bonwick ASSERT(buf->b_efunc == NULL); 3166b24ab676SJeff Bonwick ASSERT(!HDR_BUF_AVAILABLE(buf->b_hdr)); 3167b24ab676SJeff Bonwick 3168ea8dc4b6Seschrock buf->b_efunc = func; 3169ea8dc4b6Seschrock buf->b_private = private; 3170ea8dc4b6Seschrock } 3171ea8dc4b6Seschrock 31726e6d5868SMatthew Ahrens /* 31736e6d5868SMatthew Ahrens * Notify the arc that a block was freed, and thus will never be used again. 
31746e6d5868SMatthew Ahrens */ 31756e6d5868SMatthew Ahrens void 31766e6d5868SMatthew Ahrens arc_freed(spa_t *spa, const blkptr_t *bp) 31776e6d5868SMatthew Ahrens { 31786e6d5868SMatthew Ahrens arc_buf_hdr_t *hdr; 31796e6d5868SMatthew Ahrens kmutex_t *hash_lock; 31806e6d5868SMatthew Ahrens uint64_t guid = spa_load_guid(spa); 31816e6d5868SMatthew Ahrens 31826e6d5868SMatthew Ahrens hdr = buf_hash_find(guid, BP_IDENTITY(bp), BP_PHYSICAL_BIRTH(bp), 31836e6d5868SMatthew Ahrens &hash_lock); 31846e6d5868SMatthew Ahrens if (hdr == NULL) 31856e6d5868SMatthew Ahrens return; 31866e6d5868SMatthew Ahrens if (HDR_BUF_AVAILABLE(hdr)) { 31876e6d5868SMatthew Ahrens arc_buf_t *buf = hdr->b_buf; 31886e6d5868SMatthew Ahrens add_reference(hdr, hash_lock, FTAG); 31896e6d5868SMatthew Ahrens hdr->b_flags &= ~ARC_BUF_AVAILABLE; 31906e6d5868SMatthew Ahrens mutex_exit(hash_lock); 31916e6d5868SMatthew Ahrens 31926e6d5868SMatthew Ahrens arc_release(buf, FTAG); 31936e6d5868SMatthew Ahrens (void) arc_buf_remove_ref(buf, FTAG); 31946e6d5868SMatthew Ahrens } else { 31956e6d5868SMatthew Ahrens mutex_exit(hash_lock); 31966e6d5868SMatthew Ahrens } 31976e6d5868SMatthew Ahrens 31986e6d5868SMatthew Ahrens } 31996e6d5868SMatthew Ahrens 3200ea8dc4b6Seschrock /* 3201ea8dc4b6Seschrock * This is used by the DMU to let the ARC know that a buffer is 3202ea8dc4b6Seschrock * being evicted, so the ARC should clean up. If this arc buf 3203ea8dc4b6Seschrock * is not yet in the evicted state, it will be put there. 3204ea8dc4b6Seschrock */ 3205ea8dc4b6Seschrock int 3206ea8dc4b6Seschrock arc_buf_evict(arc_buf_t *buf) 3207ea8dc4b6Seschrock { 320840d7d650Smaybee arc_buf_hdr_t *hdr; 3209ea8dc4b6Seschrock kmutex_t *hash_lock; 3210ea8dc4b6Seschrock arc_buf_t **bufp; 3211ea8dc4b6Seschrock 32123f9d6ad7SLin Ling mutex_enter(&buf->b_evict_lock); 321340d7d650Smaybee hdr = buf->b_hdr; 3214ea8dc4b6Seschrock if (hdr == NULL) { 3215ea8dc4b6Seschrock /* 3216ea8dc4b6Seschrock * We are in arc_do_user_evicts(). 
3217ea8dc4b6Seschrock */ 3218ea8dc4b6Seschrock ASSERT(buf->b_data == NULL); 32193f9d6ad7SLin Ling mutex_exit(&buf->b_evict_lock); 3220ea8dc4b6Seschrock return (0); 32216f83844dSMark Maybee } else if (buf->b_data == NULL) { 32226f83844dSMark Maybee arc_buf_t copy = *buf; /* structure assignment */ 32239b23f181Smaybee /* 32246f83844dSMark Maybee * We are on the eviction list; process this buffer now 32256f83844dSMark Maybee * but let arc_do_user_evicts() do the reaping. 32269b23f181Smaybee */ 32276f83844dSMark Maybee buf->b_efunc = NULL; 32283f9d6ad7SLin Ling mutex_exit(&buf->b_evict_lock); 32296f83844dSMark Maybee VERIFY(copy.b_efunc(©) == 0); 32306f83844dSMark Maybee return (1); 32319b23f181Smaybee } 32326f83844dSMark Maybee hash_lock = HDR_LOCK(hdr); 32336f83844dSMark Maybee mutex_enter(hash_lock); 32343f9d6ad7SLin Ling hdr = buf->b_hdr; 32353f9d6ad7SLin Ling ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); 32369b23f181Smaybee 32379b23f181Smaybee ASSERT3U(refcount_count(&hdr->b_refcnt), <, hdr->b_datacnt); 323844cb6abcSbmc ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu); 3239ea8dc4b6Seschrock 3240ea8dc4b6Seschrock /* 3241ea8dc4b6Seschrock * Pull this buffer off of the hdr 3242ea8dc4b6Seschrock */ 3243ea8dc4b6Seschrock bufp = &hdr->b_buf; 3244ea8dc4b6Seschrock while (*bufp != buf) 3245ea8dc4b6Seschrock bufp = &(*bufp)->b_next; 3246ea8dc4b6Seschrock *bufp = buf->b_next; 3247ea8dc4b6Seschrock 3248ea8dc4b6Seschrock ASSERT(buf->b_data != NULL); 324944eda4d7Smaybee arc_buf_destroy(buf, FALSE, FALSE); 3250ea8dc4b6Seschrock 3251ea8dc4b6Seschrock if (hdr->b_datacnt == 0) { 3252ea8dc4b6Seschrock arc_state_t *old_state = hdr->b_state; 3253ea8dc4b6Seschrock arc_state_t *evicted_state; 3254ea8dc4b6Seschrock 32553f9d6ad7SLin Ling ASSERT(hdr->b_buf == NULL); 3256ea8dc4b6Seschrock ASSERT(refcount_is_zero(&hdr->b_refcnt)); 3257ea8dc4b6Seschrock 3258ea8dc4b6Seschrock evicted_state = 325944cb6abcSbmc (old_state == arc_mru) ? 
	/*
	 * NOTE(review): tail of arc_buf_evict(); the function head lies
	 * before this chunk.  Code left logically unchanged.
	 */
		    arc_mru_ghost : arc_mfu_ghost;

		mutex_enter(&old_state->arcs_mtx);
		mutex_enter(&evicted_state->arcs_mtx);

		arc_change_state(evicted_state, hdr, hash_lock);
		ASSERT(HDR_IN_HASH_TABLE(hdr));
		hdr->b_flags |= ARC_IN_HASH_TABLE;
		hdr->b_flags &= ~ARC_BUF_AVAILABLE;

		mutex_exit(&evicted_state->arcs_mtx);
		mutex_exit(&old_state->arcs_mtx);
	}
	mutex_exit(hash_lock);
	mutex_exit(&buf->b_evict_lock);

	/* Invoke the eviction callback, then free the now-empty arc_buf_t. */
	VERIFY(buf->b_efunc(buf) == 0);
	buf->b_efunc = NULL;
	buf->b_private = NULL;
	buf->b_hdr = NULL;
	buf->b_next = NULL;
	kmem_cache_free(buf_cache, buf);
	return (1);
}

/*
 * Release this buffer from the cache, making it an anonymous buffer.  This
 * must be done after a read and prior to modifying the buffer contents.
 * If the buffer has more than one reference, we must make
 * a new hdr for the buffer.
 */
void
arc_release(arc_buf_t *buf, void *tag)
{
	arc_buf_hdr_t *hdr;
	kmutex_t *hash_lock = NULL;
	l2arc_buf_hdr_t *l2hdr;
	uint64_t buf_size;

	/*
	 * It would be nice to assert that if it's DMU metadata (level >
	 * 0 || it's the dnode file), then it must be syncing context.
	 * But we don't know that information at this level.
	 */

	mutex_enter(&buf->b_evict_lock);
	hdr = buf->b_hdr;

	/* this buffer is not on any list */
	ASSERT(refcount_count(&hdr->b_refcnt) > 0);

	if (hdr->b_state == arc_anon) {
		/* this buffer is already released */
		ASSERT(buf->b_efunc == NULL);
	} else {
		/*
		 * Take the hash lock and re-read b_hdr: the hdr may have
		 * changed before we acquired the lock.
		 */
		hash_lock = HDR_LOCK(hdr);
		mutex_enter(hash_lock);
		hdr = buf->b_hdr;
		ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
	}

	/*
	 * Detach any L2ARC header from the hdr while holding the buflist
	 * lock; the l2hdr itself is freed at the bottom of this function.
	 */
	l2hdr = hdr->b_l2hdr;
	if (l2hdr) {
		mutex_enter(&l2arc_buflist_mtx);
		hdr->b_l2hdr = NULL;
	}
	buf_size = hdr->b_size;

	/*
	 * Do we have more than one buf?
	 */
	if (hdr->b_datacnt > 1) {
		arc_buf_hdr_t *nhdr;
		arc_buf_t **bufp;
		uint64_t blksz = hdr->b_size;
		uint64_t spa = hdr->b_spa;
		arc_buf_contents_t type = hdr->b_type;
		uint32_t flags = hdr->b_flags;

		ASSERT(hdr->b_buf != buf || buf->b_next != NULL);
		/*
		 * Pull the data off of this hdr and attach it to
		 * a new anonymous hdr.
		 */
		(void) remove_reference(hdr, hash_lock, tag);
		/* Unlink buf from the hdr's singly-linked buf list. */
		bufp = &hdr->b_buf;
		while (*bufp != buf)
			bufp = &(*bufp)->b_next;
		*bufp = buf->b_next;
		buf->b_next = NULL;

		/* Charge the buffer's size out of the old state. */
		ASSERT3U(hdr->b_state->arcs_size, >=, hdr->b_size);
		atomic_add_64(&hdr->b_state->arcs_size, -hdr->b_size);
		if (refcount_is_zero(&hdr->b_refcnt)) {
			uint64_t *size = &hdr->b_state->arcs_lsize[hdr->b_type];
			ASSERT3U(*size, >=, hdr->b_size);
			atomic_add_64(size, -hdr->b_size);
		}

		/*
		 * We're releasing a duplicate user data buffer, update
		 * our statistics accordingly.
		 */
		if (hdr->b_type == ARC_BUFC_DATA) {
			ARCSTAT_BUMPDOWN(arcstat_duplicate_buffers);
			ARCSTAT_INCR(arcstat_duplicate_buffers_size,
			    -hdr->b_size);
		}
		hdr->b_datacnt -= 1;
		arc_cksum_verify(buf);
		arc_buf_unwatch(buf);

		mutex_exit(hash_lock);

		/*
		 * Build a fresh anonymous hdr for this buf; only the
		 * ARC_L2_WRITING flag is carried over from the old hdr.
		 */
		nhdr = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE);
		nhdr->b_size = blksz;
		nhdr->b_spa = spa;
		nhdr->b_type = type;
		nhdr->b_buf = buf;
		nhdr->b_state = arc_anon;
		nhdr->b_arc_access = 0;
		nhdr->b_flags = flags & ARC_L2_WRITING;
		nhdr->b_l2hdr = NULL;
		nhdr->b_datacnt = 1;
		nhdr->b_freeze_cksum = NULL;

		(void) refcount_add(&nhdr->b_refcnt, tag);
		buf->b_hdr = nhdr;
		mutex_exit(&buf->b_evict_lock);
		atomic_add_64(&arc_anon->arcs_size, blksz);
	} else {
		/* Sole reference: move the existing hdr to arc_anon. */
		mutex_exit(&buf->b_evict_lock);
		ASSERT(refcount_count(&hdr->b_refcnt) == 1);
		ASSERT(!list_link_active(&hdr->b_arc_node));
		ASSERT(!HDR_IO_IN_PROGRESS(hdr));
		if (hdr->b_state != arc_anon)
			arc_change_state(arc_anon, hdr, hash_lock);
		hdr->b_arc_access = 0;
		if (hash_lock)
			mutex_exit(hash_lock);

		buf_discard_identity(hdr);
		arc_buf_thaw(buf);
	}
	buf->b_efunc = NULL;
	buf->b_private = NULL;

	/* Finish tearing down the detached L2ARC header, if any. */
	if (l2hdr) {
		ARCSTAT_INCR(arcstat_l2_asize, -l2hdr->b_asize);
		list_remove(l2hdr->b_dev->l2ad_buflist, hdr);
		kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t));
		ARCSTAT_INCR(arcstat_l2_size, -buf_size);
		mutex_exit(&l2arc_buflist_mtx);
	}
}

/*
 * Return nonzero if this buffer has been released: it has data and its
 * hdr is in the anonymous state.
 */
int
arc_released(arc_buf_t *buf)
{
	int released;

	mutex_enter(&buf->b_evict_lock);
	released = (buf->b_data != NULL && buf->b_hdr->b_state == arc_anon);
	mutex_exit(&buf->b_evict_lock);
	return (released);
}

/*
 * Return nonzero if this buffer has an eviction callback (b_efunc) set.
 */
int
arc_has_callback(arc_buf_t *buf)
{
	int callback;

	mutex_enter(&buf->b_evict_lock);
	callback = (buf->b_efunc != NULL);
	mutex_exit(&buf->b_evict_lock);
	return (callback);
}

#ifdef ZFS_DEBUG
/*
 * Debug-only: return the current reference count on the buffer's hdr.
 */
int
arc_referenced(arc_buf_t *buf)
{
	int referenced;

	mutex_enter(&buf->b_evict_lock);
	referenced = (refcount_count(&buf->b_hdr->b_refcnt));
	mutex_exit(&buf->b_evict_lock);
	return (referenced);
}
#endif

/*
 * ZIO "ready" callback for ARC writes: invoke the caller's ready callback,
 * recompute the checksum, and mark the hdr as having an I/O in progress.
 */
static void
arc_write_ready(zio_t *zio)
{
	arc_write_callback_t *callback = zio->io_private;
	arc_buf_t *buf = callback->awcb_buf;
	arc_buf_hdr_t *hdr = buf->b_hdr;

	ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt));
	callback->awcb_ready(zio, buf, callback->awcb_private);

	/*
	 * If the IO is already in progress, then this is a re-write
	 * attempt, so we need to thaw and re-compute the cksum.
	 * It is the responsibility of the callback to handle the
	 * accounting for any re-write attempt.
	 */
	if (HDR_IO_IN_PROGRESS(hdr)) {
		mutex_enter(&hdr->b_freeze_lock);
		if (hdr->b_freeze_cksum != NULL) {
			kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t));
			hdr->b_freeze_cksum = NULL;
		}
		mutex_exit(&hdr->b_freeze_lock);
	}
	arc_cksum_compute(buf, B_FALSE);
	hdr->b_flags |= ARC_IO_IN_PROGRESS;
}

/*
 * The SPA calls this callback for each physical write that happens on behalf
 * of a logical write.  See the comment in dbuf_write_physdone() for details.
 */
static void
arc_write_physdone(zio_t *zio)
{
	arc_write_callback_t *cb = zio->io_private;
	if (cb->awcb_physdone != NULL)
		cb->awcb_physdone(zio, cb->awcb_buf, cb->awcb_private);
}

/*
 * ZIO completion callback for ARC writes: on success, record the written
 * block's identity (dva/birth/cksum word) in the hdr and hash it in; on
 * error the hdr keeps no identity and the buffer stays anonymous.
 */
static void
arc_write_done(zio_t *zio)
{
	arc_write_callback_t *callback = zio->io_private;
	arc_buf_t *buf = callback->awcb_buf;
	arc_buf_hdr_t *hdr = buf->b_hdr;

	ASSERT(hdr->b_acb == NULL);

	if (zio->io_error == 0) {
		hdr->b_dva = *BP_IDENTITY(zio->io_bp);
		hdr->b_birth = BP_PHYSICAL_BIRTH(zio->io_bp);
		hdr->b_cksum0 = zio->io_bp->blk_cksum.zc_word[0];
	} else {
		/* A failed write must leave the hdr without identity. */
		ASSERT(BUF_EMPTY(hdr));
	}

	/*
	 * If the block to be written was all-zero, we may have
	 * compressed it away.  In this case no write was performed
	 * so there will be no dva/birth/checksum.  The buffer must
	 * therefore remain anonymous (and uncached).
	 */
	if (!BUF_EMPTY(hdr)) {
		arc_buf_hdr_t *exists;
		kmutex_t *hash_lock;

		/* A non-empty identity implies the write succeeded. */
		ASSERT(zio->io_error == 0);

		arc_cksum_verify(buf);

		exists = buf_hash_insert(hdr, &hash_lock);
		if (exists) {
			/*
			 * This can only happen if we overwrite for
			 * sync-to-convergence, because we remove
			 * buffers from the hash table when we arc_free().
			 */
			if (zio->io_flags & ZIO_FLAG_IO_REWRITE) {
				if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp))
					panic("bad overwrite, hdr=%p exists=%p",
					    (void *)hdr, (void *)exists);
				ASSERT(refcount_is_zero(&exists->b_refcnt));
				/* Evict the stale hdr, then re-insert ours. */
				arc_change_state(arc_anon, exists, hash_lock);
				mutex_exit(hash_lock);
				arc_hdr_destroy(exists);
				exists = buf_hash_insert(hdr, &hash_lock);
				ASSERT3P(exists, ==, NULL);
			} else if (zio->io_flags & ZIO_FLAG_NOPWRITE) {
				/* nopwrite */
				ASSERT(zio->io_prop.zp_nopwrite);
				if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp))
					panic("bad nopwrite, hdr=%p exists=%p",
					    (void *)hdr, (void *)exists);
			} else {
				/* Dedup */
				ASSERT(hdr->b_datacnt == 1);
				ASSERT(hdr->b_state == arc_anon);
				ASSERT(BP_GET_DEDUP(zio->io_bp));
				ASSERT(BP_GET_LEVEL(zio->io_bp) == 0);
			}
		}
		hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
		/* if it's not anon, we are doing a scrub */
		if (!exists && hdr->b_state == arc_anon)
			arc_access(hdr, hash_lock);
		mutex_exit(hash_lock);
	} else {
		hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
	}

	ASSERT(!refcount_is_zero(&hdr->b_refcnt));
	callback->awcb_done(zio, buf, callback->awcb_private);

	kmem_free(callback, sizeof (arc_write_callback_t));
}

/*
 * Issue an ARC write of buf's data to bp.  The caller's ready/physdone/done
 * callbacks are wrapped by arc_write_ready()/arc_write_physdone()/
 * arc_write_done() and handed to zio_write().  Returns the new zio.
 */
zio_t *
arc_write(zio_t *pio, spa_t *spa, uint64_t txg,
    blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc, boolean_t l2arc_compress,
    const zio_prop_t *zp, arc_done_func_t *ready, arc_done_func_t *physdone,
    arc_done_func_t *done, void *private, zio_priority_t priority,
    int zio_flags, const zbookmark_t *zb)
{
	arc_buf_hdr_t *hdr = buf->b_hdr;
	arc_write_callback_t *callback;
	zio_t *zio;

	ASSERT(ready != NULL);
	ASSERT(done != NULL);
	ASSERT(!HDR_IO_ERROR(hdr));
	ASSERT((hdr->b_flags & ARC_IO_IN_PROGRESS) == 0);
	ASSERT(hdr->b_acb == NULL);
	/* Mark the hdr L2ARC-eligible (and compressible) as requested. */
	if (l2arc)
		hdr->b_flags |= ARC_L2CACHE;
	if (l2arc_compress)
		hdr->b_flags |= ARC_L2COMPRESS;
	callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP);
	callback->awcb_ready = ready;
	callback->awcb_physdone = physdone;
	callback->awcb_done = done;
	callback->awcb_private = private;
	callback->awcb_buf = buf;

	zio = zio_write(pio, spa, txg, bp, buf->b_data, hdr->b_size, zp,
	    arc_write_ready, arc_write_physdone, arc_write_done, callback,
	    priority, zio_flags, zb);

	return (zio);
}

/*
 * Decide whether a dirty-data reservation must back off because system
 * memory is scarce.  Returns 0 to proceed, ERESTART or EAGAIN to make the
 * caller retry later.  Outside _KERNEL this is a no-op returning 0.
 */
static int
arc_memory_throttle(uint64_t reserve, uint64_t txg)
{
#ifdef _KERNEL
	uint64_t available_memory = ptob(freemem);
	/* page_load/last_txg persist across calls to pace within one txg */
	static uint64_t page_load = 0;
	static uint64_t last_txg = 0;

#if defined(__i386)
	/* On 32-bit x86, kernel VA can be scarcer than physical memory. */
	available_memory =
	    MIN(available_memory, vmem_size(heap_arena, VMEM_FREE));
#endif

	/* Plenty of free memory: no throttling needed. */
	if (freemem > physmem * arc_lotsfree_percent / 100)
		return (0);

	/* A new txg resets the accumulated page load. */
	if (txg > last_txg) {
		last_txg = txg;
		page_load = 0;
	}
	/*
	 * If we are in pageout, we know that memory is already tight,
	 * the arc is already going to be evicting, so we just want to
	 * continue to let page writes occur as quickly as possible.
	 */
	if (curproc == proc_pageout) {
		if (page_load > MAX(ptob(minfree), available_memory) / 4)
			return (SET_ERROR(ERESTART));
		/* Note: reserve is inflated, so we deflate */
		page_load += reserve / 8;
		return (0);
	} else if (page_load > 0 && arc_reclaim_needed()) {
		/* memory is low, delay before restarting */
		ARCSTAT_INCR(arcstat_memory_throttle_count, 1);
		return (SET_ERROR(EAGAIN));
	}
	page_load = 0;
#endif
	return (0);
}

/*
 * Return a previously taken dirty-data reservation.
 */
void
arc_tempreserve_clear(uint64_t reserve)
{
	atomic_add_64(&arc_tempreserve, -reserve);
	ASSERT((int64_t)arc_tempreserve >= 0);
}

/*
 * Reserve space for in-flight dirty data.  Grows arc_c if permitted,
 * returns ENOMEM if the reservation cannot fit, and defers to
 * arc_memory_throttle() / the dirty-data limit below for back-pressure.
 */
int
arc_tempreserve_space(uint64_t reserve, uint64_t txg)
{
	int error;
	uint64_t anon_size;

	if (reserve > arc_c/4 && !arc_no_grow)
		arc_c = MIN(arc_c_max, reserve * 4);
	if (reserve > arc_c)
		return (SET_ERROR(ENOMEM));

	/*
	 * Don't count loaned bufs as in flight dirty data to prevent long
	 * network delays from blocking transactions that are ready to be
	 * assigned to a txg.
	 */
	anon_size = MAX((int64_t)(arc_anon->arcs_size - arc_loaned_bytes), 0);

	/*
	 * Writes will, almost always, require additional memory allocations
	 * in order to compress/encrypt/etc the data.  We therefore need to
	 * make sure that there is sufficient available memory for this.
	 */
	error = arc_memory_throttle(reserve, txg);
	if (error != 0)
		return (error);

	/*
	 * Throttle writes when the amount of dirty data in the cache
	 * gets too large.  We try to keep the cache less than half full
	 * of dirty blocks so that our sync times don't grow too large.
	 * Note: if two requests come in concurrently, we might let them
	 * both succeed, when one of them should fail.  Not a huge deal.
	 */

	if (reserve + arc_tempreserve + anon_size > arc_c / 2 &&
	    anon_size > arc_c / 4) {
		dprintf("failing, arc_tempreserve=%lluK anon_meta=%lluK "
		    "anon_data=%lluK tempreserve=%lluK arc_c=%lluK\n",
		    arc_tempreserve>>10,
		    arc_anon->arcs_lsize[ARC_BUFC_METADATA]>>10,
		    arc_anon->arcs_lsize[ARC_BUFC_DATA]>>10,
		    reserve>>10, arc_c>>10);
		return (SET_ERROR(ERESTART));
	}
	atomic_add_64(&arc_tempreserve, reserve);
	return (0);
}

/*
 * One-time ARC initialization: compute arc_c/arc_c_min/arc_c_max, apply
 * the zfs_arc_* tunables, create the per-state lists and locks, register
 * the "arcstats" kstat, and start the reclaim thread.
 */
void
arc_init(void)
{
	mutex_init(&arc_reclaim_thr_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&arc_reclaim_thr_cv, NULL, CV_DEFAULT, NULL);

	/* Convert seconds to clock ticks */
	arc_min_prefetch_lifespan = 1 * hz;

	/* Start out with 1/8 of all memory */
	arc_c = physmem * PAGESIZE / 8;

#ifdef _KERNEL
	/*
	 * On architectures where the physical memory can be larger
	 * than the addressable space (intel in 32-bit mode), we may
	 * need to limit the cache to 1/8 of VM size.
	 */
	arc_c = MIN(arc_c, vmem_size(heap_arena, VMEM_ALLOC | VMEM_FREE) / 8);
#endif

	/* set min cache to 1/32 of all memory, or 64MB, whichever is more */
	/* (arc_c is 1/8 of memory here, so arc_c / 4 is 1/32 of memory) */
	arc_c_min = MAX(arc_c / 4, 64<<20);
	/* set max to 3/4 of all memory, or all but 1GB, whichever is more */
	if (arc_c * 8 >= 1<<30)
		arc_c_max = (arc_c * 8) - (1<<30);
	else
		arc_c_max = arc_c_min;
	arc_c_max = MAX(arc_c * 6, arc_c_max);

	/*
	 * Allow the tunables to override our calculations if they are
	 * reasonable (ie. over 64MB)
	 */
	if (zfs_arc_max > 64<<20 && zfs_arc_max < physmem * PAGESIZE)
		arc_c_max = zfs_arc_max;
	if (zfs_arc_min > 64<<20 && zfs_arc_min <= arc_c_max)
		arc_c_min = zfs_arc_min;

	arc_c = arc_c_max;
	arc_p = (arc_c >> 1);

	/* limit meta-data to 1/4 of the arc capacity */
	arc_meta_limit = arc_c_max / 4;

	/* Allow the tunable to override if it is reasonable */
	if (zfs_arc_meta_limit > 0 && zfs_arc_meta_limit <= arc_c_max)
		arc_meta_limit = zfs_arc_meta_limit;

	if (arc_c_min < arc_meta_limit / 2 && zfs_arc_min == 0)
		arc_c_min = arc_meta_limit / 2;

	if (zfs_arc_grow_retry > 0)
		arc_grow_retry = zfs_arc_grow_retry;

	if (zfs_arc_shrink_shift > 0)
		arc_shrink_shift = zfs_arc_shrink_shift;

	if (zfs_arc_p_min_shift > 0)
		arc_p_min_shift = zfs_arc_p_min_shift;

	/* if kmem_flags are set, lets try to use less memory */
	if (kmem_debugging())
		arc_c = arc_c / 2;
	if (arc_c < arc_c_min)
		arc_c = arc_c_min;

	arc_anon = &ARC_anon;
	arc_mru = &ARC_mru;
	arc_mru_ghost = &ARC_mru_ghost;
	arc_mfu = &ARC_mfu;
	arc_mfu_ghost = &ARC_mfu_ghost;
	arc_l2c_only = &ARC_l2c_only;
	arc_size = 0;

	mutex_init(&arc_anon->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&arc_mru->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&arc_mru_ghost->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&arc_mfu->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&arc_mfu_ghost->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&arc_l2c_only->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);

	/* Each state keeps separate metadata and data lists. */
	list_create(&arc_mru->arcs_list[ARC_BUFC_METADATA],
	    sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
	list_create(&arc_mru->arcs_list[ARC_BUFC_DATA],
	    sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
	list_create(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA],
	    sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
	list_create(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA],
	    sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
	list_create(&arc_mfu->arcs_list[ARC_BUFC_METADATA],
	    sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
	list_create(&arc_mfu->arcs_list[ARC_BUFC_DATA],
	    sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
	list_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA],
	    sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
	list_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA],
	    sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
	list_create(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA],
	    sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
	list_create(&arc_l2c_only->arcs_list[ARC_BUFC_DATA],
	    sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));

	buf_init();

	arc_thread_exit = 0;
	arc_eviction_list = NULL;
	mutex_init(&arc_eviction_mtx, NULL, MUTEX_DEFAULT, NULL);
	bzero(&arc_eviction_hdr, sizeof (arc_buf_hdr_t));

	arc_ksp = kstat_create("zfs", 0, "arcstats", "misc", KSTAT_TYPE_NAMED,
	    sizeof (arc_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);

	if (arc_ksp != NULL) {
		arc_ksp->ks_data = &arc_stats;
		kstat_install(arc_ksp);
	}

	(void) thread_create(NULL, 0, arc_reclaim_thread, NULL, 0, &p0,
	    TS_RUN, minclsyspri);

	arc_dead = FALSE;
	arc_warm = B_FALSE;

	/*
	 * Calculate maximum amount of dirty data per pool.
	 *
	 * If it has been set by /etc/system, take that.
	 * Otherwise, use a percentage of physical memory defined by
	 * zfs_dirty_data_max_percent (default 10%) with a cap at
	 * zfs_dirty_data_max_max (default 4GB).
	 */
	if (zfs_dirty_data_max == 0) {
		zfs_dirty_data_max = physmem * PAGESIZE *
		    zfs_dirty_data_max_percent / 100;
		zfs_dirty_data_max = MIN(zfs_dirty_data_max,
		    zfs_dirty_data_max_max);
	}
}

/*
 * Tear down the ARC: stop the reclaim thread, flush all buffers, remove
 * the kstat, and destroy the lists and locks created by arc_init().
 */
void
arc_fini(void)
{
	/* Ask the reclaim thread to exit and wait until it has. */
	mutex_enter(&arc_reclaim_thr_lock);
	arc_thread_exit = 1;
	while (arc_thread_exit != 0)
		cv_wait(&arc_reclaim_thr_cv, &arc_reclaim_thr_lock);
	mutex_exit(&arc_reclaim_thr_lock);

	arc_flush(NULL);

	arc_dead = TRUE;

	if (arc_ksp != NULL) {
		kstat_delete(arc_ksp);
		arc_ksp = NULL;
	}

	mutex_destroy(&arc_eviction_mtx);
	mutex_destroy(&arc_reclaim_thr_lock);
	cv_destroy(&arc_reclaim_thr_cv);

	list_destroy(&arc_mru->arcs_list[ARC_BUFC_METADATA]);
	list_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA]);
	list_destroy(&arc_mfu->arcs_list[ARC_BUFC_METADATA]);
	list_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA]);
	list_destroy(&arc_mru->arcs_list[ARC_BUFC_DATA]);
	list_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA]);
	list_destroy(&arc_mfu->arcs_list[ARC_BUFC_DATA]);
	list_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA]);

	mutex_destroy(&arc_anon->arcs_mtx);
	mutex_destroy(&arc_mru->arcs_mtx);
	mutex_destroy(&arc_mru_ghost->arcs_mtx);
	mutex_destroy(&arc_mfu->arcs_mtx);
	mutex_destroy(&arc_mfu_ghost->arcs_mtx);
	mutex_destroy(&arc_l2c_only->arcs_mtx);

	buf_fini();

	/* All loaned buffers must have been returned before teardown. */
	ASSERT(arc_loaned_bytes == 0);
}

/*
 * Level 2 ARC
 *
 * The level 2 ARC (L2ARC) is a cache layer in-between main memory and disk.
 * It uses dedicated storage devices to hold cached data, which are populated
 * using large infrequent writes.  The main role of this cache is to boost
 * the performance of random read workloads.  The intended L2ARC devices
 * include short-stroked disks, solid state disks, and other media with
 * substantially faster read latency than disk.
3889fa94a07fSbrendan * 3890fa94a07fSbrendan * +-----------------------+ 3891fa94a07fSbrendan * | ARC | 3892fa94a07fSbrendan * +-----------------------+ 3893fa94a07fSbrendan * | ^ ^ 3894fa94a07fSbrendan * | | | 3895fa94a07fSbrendan * l2arc_feed_thread() arc_read() 3896fa94a07fSbrendan * | | | 3897fa94a07fSbrendan * | l2arc read | 3898fa94a07fSbrendan * V | | 3899fa94a07fSbrendan * +---------------+ | 3900fa94a07fSbrendan * | L2ARC | | 3901fa94a07fSbrendan * +---------------+ | 3902fa94a07fSbrendan * | ^ | 3903fa94a07fSbrendan * l2arc_write() | | 3904fa94a07fSbrendan * | | | 3905fa94a07fSbrendan * V | | 3906fa94a07fSbrendan * +-------+ +-------+ 3907fa94a07fSbrendan * | vdev | | vdev | 3908fa94a07fSbrendan * | cache | | cache | 3909fa94a07fSbrendan * +-------+ +-------+ 3910fa94a07fSbrendan * +=========+ .-----. 3911fa94a07fSbrendan * : L2ARC : |-_____-| 3912fa94a07fSbrendan * : devices : | Disks | 3913fa94a07fSbrendan * +=========+ `-_____-' 3914fa94a07fSbrendan * 3915fa94a07fSbrendan * Read requests are satisfied from the following sources, in order: 3916fa94a07fSbrendan * 3917fa94a07fSbrendan * 1) ARC 3918fa94a07fSbrendan * 2) vdev cache of L2ARC devices 3919fa94a07fSbrendan * 3) L2ARC devices 3920fa94a07fSbrendan * 4) vdev cache of disks 3921fa94a07fSbrendan * 5) disks 3922fa94a07fSbrendan * 3923fa94a07fSbrendan * Some L2ARC device types exhibit extremely slow write performance. 3924fa94a07fSbrendan * To accommodate for this there are some significant differences between 3925fa94a07fSbrendan * the L2ARC and traditional cache design: 3926fa94a07fSbrendan * 3927fa94a07fSbrendan * 1. There is no eviction path from the ARC to the L2ARC. Evictions from 3928fa94a07fSbrendan * the ARC behave as usual, freeing buffers and placing headers on ghost 3929fa94a07fSbrendan * lists. The ARC does not send buffers to the L2ARC during eviction as 3930fa94a07fSbrendan * this would add inflated write latencies for all ARC memory pressure. 
3931fa94a07fSbrendan * 3932fa94a07fSbrendan * 2. The L2ARC attempts to cache data from the ARC before it is evicted. 3933fa94a07fSbrendan * It does this by periodically scanning buffers from the eviction-end of 3934fa94a07fSbrendan * the MFU and MRU ARC lists, copying them to the L2ARC devices if they are 3935aad02571SSaso Kiselkov * not already there. It scans until a headroom of buffers is satisfied, 3936aad02571SSaso Kiselkov * which itself is a buffer for ARC eviction. If a compressible buffer is 3937aad02571SSaso Kiselkov * found during scanning and selected for writing to an L2ARC device, we 3938aad02571SSaso Kiselkov * temporarily boost scanning headroom during the next scan cycle to make 3939aad02571SSaso Kiselkov * sure we adapt to compression effects (which might significantly reduce 3940aad02571SSaso Kiselkov * the data volume we write to L2ARC). The thread that does this is 3941fa94a07fSbrendan * l2arc_feed_thread(), illustrated below; example sizes are included to 3942fa94a07fSbrendan * provide a better sense of ratio than this diagram: 3943fa94a07fSbrendan * 3944fa94a07fSbrendan * head --> tail 3945fa94a07fSbrendan * +---------------------+----------+ 3946fa94a07fSbrendan * ARC_mfu |:::::#:::::::::::::::|o#o###o###|-->. # already on L2ARC 3947fa94a07fSbrendan * +---------------------+----------+ | o L2ARC eligible 3948fa94a07fSbrendan * ARC_mru |:#:::::::::::::::::::|#o#ooo####|-->| : ARC buffer 3949fa94a07fSbrendan * +---------------------+----------+ | 3950fa94a07fSbrendan * 15.9 Gbytes ^ 32 Mbytes | 3951fa94a07fSbrendan * headroom | 3952fa94a07fSbrendan * l2arc_feed_thread() 3953fa94a07fSbrendan * | 3954fa94a07fSbrendan * l2arc write hand <--[oooo]--' 3955fa94a07fSbrendan * | 8 Mbyte 3956fa94a07fSbrendan * | write max 3957fa94a07fSbrendan * V 3958fa94a07fSbrendan * +==============================+ 3959fa94a07fSbrendan * L2ARC dev |####|#|###|###| |####| ... 
| 3960fa94a07fSbrendan * +==============================+ 3961fa94a07fSbrendan * 32 Gbytes 3962fa94a07fSbrendan * 3963fa94a07fSbrendan * 3. If an ARC buffer is copied to the L2ARC but then hit instead of 3964fa94a07fSbrendan * evicted, then the L2ARC has cached a buffer much sooner than it probably 3965fa94a07fSbrendan * needed to, potentially wasting L2ARC device bandwidth and storage. It is 3966fa94a07fSbrendan * safe to say that this is an uncommon case, since buffers at the end of 3967fa94a07fSbrendan * the ARC lists have moved there due to inactivity. 3968fa94a07fSbrendan * 3969fa94a07fSbrendan * 4. If the ARC evicts faster than the L2ARC can maintain a headroom, 3970fa94a07fSbrendan * then the L2ARC simply misses copying some buffers. This serves as a 3971fa94a07fSbrendan * pressure valve to prevent heavy read workloads from both stalling the ARC 3972fa94a07fSbrendan * with waits and clogging the L2ARC with writes. This also helps prevent 3973fa94a07fSbrendan * the potential for the L2ARC to churn if it attempts to cache content too 3974fa94a07fSbrendan * quickly, such as during backups of the entire pool. 3975fa94a07fSbrendan * 39763a737e0dSbrendan * 5. After system boot and before the ARC has filled main memory, there are 39773a737e0dSbrendan * no evictions from the ARC and so the tails of the ARC_mfu and ARC_mru 39783a737e0dSbrendan * lists can remain mostly static. Instead of searching from tail of these 39793a737e0dSbrendan * lists as pictured, the l2arc_feed_thread() will search from the list heads 39803a737e0dSbrendan * for eligible buffers, greatly increasing its chance of finding them. 39813a737e0dSbrendan * 39823a737e0dSbrendan * The L2ARC device write speed is also boosted during this time so that 39833a737e0dSbrendan * the L2ARC warms up faster. Since there have been no ARC evictions yet, 39843a737e0dSbrendan * there are no L2ARC reads, and no fear of degrading read performance 39853a737e0dSbrendan * through increased writes. 
39863a737e0dSbrendan * 39873a737e0dSbrendan * 6. Writes to the L2ARC devices are grouped and sent in-sequence, so that 3988fa94a07fSbrendan * the vdev queue can aggregate them into larger and fewer writes. Each 3989fa94a07fSbrendan * device is written to in a rotor fashion, sweeping writes through 3990fa94a07fSbrendan * available space then repeating. 3991fa94a07fSbrendan * 39923a737e0dSbrendan * 7. The L2ARC does not store dirty content. It never needs to flush 3993fa94a07fSbrendan * write buffers back to disk based storage. 3994fa94a07fSbrendan * 39953a737e0dSbrendan * 8. If an ARC buffer is written (and dirtied) which also exists in the 3996fa94a07fSbrendan * L2ARC, the now stale L2ARC buffer is immediately dropped. 3997fa94a07fSbrendan * 3998fa94a07fSbrendan * The performance of the L2ARC can be tweaked by a number of tunables, which 3999fa94a07fSbrendan * may be necessary for different workloads: 4000fa94a07fSbrendan * 4001fa94a07fSbrendan * l2arc_write_max max write bytes per interval 40023a737e0dSbrendan * l2arc_write_boost extra write bytes during device warmup 4003fa94a07fSbrendan * l2arc_noprefetch skip caching prefetched buffers 4004fa94a07fSbrendan * l2arc_headroom number of max device writes to precache 4005aad02571SSaso Kiselkov * l2arc_headroom_boost when we find compressed buffers during ARC 4006aad02571SSaso Kiselkov * scanning, we multiply headroom by this 4007aad02571SSaso Kiselkov * percentage factor for the next scan cycle, 4008aad02571SSaso Kiselkov * since more compressed buffers are likely to 4009aad02571SSaso Kiselkov * be present 4010fa94a07fSbrendan * l2arc_feed_secs seconds between L2ARC writing 4011fa94a07fSbrendan * 4012fa94a07fSbrendan * Tunables may be removed or added as future performance improvements are 4013fa94a07fSbrendan * integrated, and also may become zpool properties. 
40145a98e54bSBrendan Gregg - Sun Microsystems * 40155a98e54bSBrendan Gregg - Sun Microsystems * There are three key functions that control how the L2ARC warms up: 40165a98e54bSBrendan Gregg - Sun Microsystems * 40175a98e54bSBrendan Gregg - Sun Microsystems * l2arc_write_eligible() check if a buffer is eligible to cache 40185a98e54bSBrendan Gregg - Sun Microsystems * l2arc_write_size() calculate how much to write 40195a98e54bSBrendan Gregg - Sun Microsystems * l2arc_write_interval() calculate sleep delay between writes 40205a98e54bSBrendan Gregg - Sun Microsystems * 40215a98e54bSBrendan Gregg - Sun Microsystems * These three functions determine what to write, how much, and how quickly 40225a98e54bSBrendan Gregg - Sun Microsystems * to send writes. 4023fa94a07fSbrendan */ 4024fa94a07fSbrendan 40255a98e54bSBrendan Gregg - Sun Microsystems static boolean_t 4026ac05c741SMark Maybee l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *ab) 40275a98e54bSBrendan Gregg - Sun Microsystems { 40285a98e54bSBrendan Gregg - Sun Microsystems /* 40295a98e54bSBrendan Gregg - Sun Microsystems * A buffer is *not* eligible for the L2ARC if it: 40305a98e54bSBrendan Gregg - Sun Microsystems * 1. belongs to a different spa. 40315ea40c06SBrendan Gregg - Sun Microsystems * 2. is already cached on the L2ARC. 40325ea40c06SBrendan Gregg - Sun Microsystems * 3. has an I/O in progress (it may be an incomplete read). 40335ea40c06SBrendan Gregg - Sun Microsystems * 4. is flagged not eligible (zfs property). 
40345a98e54bSBrendan Gregg - Sun Microsystems */ 40355ea40c06SBrendan Gregg - Sun Microsystems if (ab->b_spa != spa_guid || ab->b_l2hdr != NULL || 40365a98e54bSBrendan Gregg - Sun Microsystems HDR_IO_IN_PROGRESS(ab) || !HDR_L2CACHE(ab)) 40375a98e54bSBrendan Gregg - Sun Microsystems return (B_FALSE); 40385a98e54bSBrendan Gregg - Sun Microsystems 40395a98e54bSBrendan Gregg - Sun Microsystems return (B_TRUE); 40405a98e54bSBrendan Gregg - Sun Microsystems } 40415a98e54bSBrendan Gregg - Sun Microsystems 40425a98e54bSBrendan Gregg - Sun Microsystems static uint64_t 4043aad02571SSaso Kiselkov l2arc_write_size(void) 40445a98e54bSBrendan Gregg - Sun Microsystems { 40455a98e54bSBrendan Gregg - Sun Microsystems uint64_t size; 40465a98e54bSBrendan Gregg - Sun Microsystems 4047aad02571SSaso Kiselkov /* 4048aad02571SSaso Kiselkov * Make sure our globals have meaningful values in case the user 4049aad02571SSaso Kiselkov * altered them. 4050aad02571SSaso Kiselkov */ 4051aad02571SSaso Kiselkov size = l2arc_write_max; 4052aad02571SSaso Kiselkov if (size == 0) { 4053aad02571SSaso Kiselkov cmn_err(CE_NOTE, "Bad value for l2arc_write_max, value must " 4054aad02571SSaso Kiselkov "be greater than zero, resetting it to the default (%d)", 4055aad02571SSaso Kiselkov L2ARC_WRITE_SIZE); 4056aad02571SSaso Kiselkov size = l2arc_write_max = L2ARC_WRITE_SIZE; 4057aad02571SSaso Kiselkov } 40585a98e54bSBrendan Gregg - Sun Microsystems 40595a98e54bSBrendan Gregg - Sun Microsystems if (arc_warm == B_FALSE) 4060aad02571SSaso Kiselkov size += l2arc_write_boost; 40615a98e54bSBrendan Gregg - Sun Microsystems 40625a98e54bSBrendan Gregg - Sun Microsystems return (size); 40635a98e54bSBrendan Gregg - Sun Microsystems 40645a98e54bSBrendan Gregg - Sun Microsystems } 40655a98e54bSBrendan Gregg - Sun Microsystems 40665a98e54bSBrendan Gregg - Sun Microsystems static clock_t 40675a98e54bSBrendan Gregg - Sun Microsystems l2arc_write_interval(clock_t began, uint64_t wanted, uint64_t wrote) 40685a98e54bSBrendan Gregg 
- Sun Microsystems { 4069d3d50737SRafael Vanoni clock_t interval, next, now; 40705a98e54bSBrendan Gregg - Sun Microsystems 40715a98e54bSBrendan Gregg - Sun Microsystems /* 40725a98e54bSBrendan Gregg - Sun Microsystems * If the ARC lists are busy, increase our write rate; if the 40735a98e54bSBrendan Gregg - Sun Microsystems * lists are stale, idle back. This is achieved by checking 40745a98e54bSBrendan Gregg - Sun Microsystems * how much we previously wrote - if it was more than half of 40755a98e54bSBrendan Gregg - Sun Microsystems * what we wanted, schedule the next write much sooner. 40765a98e54bSBrendan Gregg - Sun Microsystems */ 40775a98e54bSBrendan Gregg - Sun Microsystems if (l2arc_feed_again && wrote > (wanted / 2)) 40785a98e54bSBrendan Gregg - Sun Microsystems interval = (hz * l2arc_feed_min_ms) / 1000; 40795a98e54bSBrendan Gregg - Sun Microsystems else 40805a98e54bSBrendan Gregg - Sun Microsystems interval = hz * l2arc_feed_secs; 40815a98e54bSBrendan Gregg - Sun Microsystems 4082d3d50737SRafael Vanoni now = ddi_get_lbolt(); 4083d3d50737SRafael Vanoni next = MAX(now, MIN(now + interval, began + interval)); 40845a98e54bSBrendan Gregg - Sun Microsystems 40855a98e54bSBrendan Gregg - Sun Microsystems return (next); 40865a98e54bSBrendan Gregg - Sun Microsystems } 40875a98e54bSBrendan Gregg - Sun Microsystems 4088fa94a07fSbrendan static void 4089fa94a07fSbrendan l2arc_hdr_stat_add(void) 4090fa94a07fSbrendan { 4091e6c728e1Sbrendan ARCSTAT_INCR(arcstat_l2_hdr_size, HDR_SIZE + L2HDR_SIZE); 4092e6c728e1Sbrendan ARCSTAT_INCR(arcstat_hdr_size, -HDR_SIZE); 4093fa94a07fSbrendan } 4094fa94a07fSbrendan 4095fa94a07fSbrendan static void 4096fa94a07fSbrendan l2arc_hdr_stat_remove(void) 4097fa94a07fSbrendan { 4098e6c728e1Sbrendan ARCSTAT_INCR(arcstat_l2_hdr_size, -(HDR_SIZE + L2HDR_SIZE)); 4099e6c728e1Sbrendan ARCSTAT_INCR(arcstat_hdr_size, HDR_SIZE); 4100fa94a07fSbrendan } 4101fa94a07fSbrendan 4102fa94a07fSbrendan /* 4103fa94a07fSbrendan * Cycle through L2ARC devices. 
This is how L2ARC load balances. 41043a737e0dSbrendan * If a device is returned, this also returns holding the spa config lock. 4105fa94a07fSbrendan */ 4106fa94a07fSbrendan static l2arc_dev_t * 4107fa94a07fSbrendan l2arc_dev_get_next(void) 4108fa94a07fSbrendan { 41093a737e0dSbrendan l2arc_dev_t *first, *next = NULL; 41103a737e0dSbrendan 41113a737e0dSbrendan /* 41123a737e0dSbrendan * Lock out the removal of spas (spa_namespace_lock), then removal 41133a737e0dSbrendan * of cache devices (l2arc_dev_mtx). Once a device has been selected, 41143a737e0dSbrendan * both locks will be dropped and a spa config lock held instead. 41153a737e0dSbrendan */ 41163a737e0dSbrendan mutex_enter(&spa_namespace_lock); 41173a737e0dSbrendan mutex_enter(&l2arc_dev_mtx); 4118fa94a07fSbrendan 4119c5904d13Seschrock /* if there are no vdevs, there is nothing to do */ 4120c5904d13Seschrock if (l2arc_ndev == 0) 41213a737e0dSbrendan goto out; 4122c5904d13Seschrock 4123c5904d13Seschrock first = NULL; 4124c5904d13Seschrock next = l2arc_dev_last; 4125c5904d13Seschrock do { 4126c5904d13Seschrock /* loop around the list looking for a non-faulted vdev */ 4127c5904d13Seschrock if (next == NULL) { 4128fa94a07fSbrendan next = list_head(l2arc_dev_list); 4129c5904d13Seschrock } else { 4130c5904d13Seschrock next = list_next(l2arc_dev_list, next); 4131c5904d13Seschrock if (next == NULL) 4132c5904d13Seschrock next = list_head(l2arc_dev_list); 4133c5904d13Seschrock } 4134c5904d13Seschrock 4135c5904d13Seschrock /* if we have come back to the start, bail out */ 4136c5904d13Seschrock if (first == NULL) 4137c5904d13Seschrock first = next; 4138c5904d13Seschrock else if (next == first) 4139c5904d13Seschrock break; 4140c5904d13Seschrock 4141c5904d13Seschrock } while (vdev_is_dead(next->l2ad_vdev)); 4142c5904d13Seschrock 4143c5904d13Seschrock /* if we were unable to find any usable vdevs, return NULL */ 4144c5904d13Seschrock if (vdev_is_dead(next->l2ad_vdev)) 41453a737e0dSbrendan next = NULL; 4146fa94a07fSbrendan 
4147fa94a07fSbrendan l2arc_dev_last = next; 4148fa94a07fSbrendan 41493a737e0dSbrendan out: 41503a737e0dSbrendan mutex_exit(&l2arc_dev_mtx); 41513a737e0dSbrendan 41523a737e0dSbrendan /* 41533a737e0dSbrendan * Grab the config lock to prevent the 'next' device from being 41543a737e0dSbrendan * removed while we are writing to it. 41553a737e0dSbrendan */ 41563a737e0dSbrendan if (next != NULL) 4157e14bb325SJeff Bonwick spa_config_enter(next->l2ad_spa, SCL_L2ARC, next, RW_READER); 41583a737e0dSbrendan mutex_exit(&spa_namespace_lock); 41593a737e0dSbrendan 4160fa94a07fSbrendan return (next); 4161fa94a07fSbrendan } 4162fa94a07fSbrendan 41633a737e0dSbrendan /* 41643a737e0dSbrendan * Free buffers that were tagged for destruction. 41653a737e0dSbrendan */ 41663a737e0dSbrendan static void 41673a737e0dSbrendan l2arc_do_free_on_write() 41683a737e0dSbrendan { 41693a737e0dSbrendan list_t *buflist; 41703a737e0dSbrendan l2arc_data_free_t *df, *df_prev; 41713a737e0dSbrendan 41723a737e0dSbrendan mutex_enter(&l2arc_free_on_write_mtx); 41733a737e0dSbrendan buflist = l2arc_free_on_write; 41743a737e0dSbrendan 41753a737e0dSbrendan for (df = list_tail(buflist); df; df = df_prev) { 41763a737e0dSbrendan df_prev = list_prev(buflist, df); 41773a737e0dSbrendan ASSERT(df->l2df_data != NULL); 41783a737e0dSbrendan ASSERT(df->l2df_func != NULL); 41793a737e0dSbrendan df->l2df_func(df->l2df_data, df->l2df_size); 41803a737e0dSbrendan list_remove(buflist, df); 41813a737e0dSbrendan kmem_free(df, sizeof (l2arc_data_free_t)); 41823a737e0dSbrendan } 41833a737e0dSbrendan 41843a737e0dSbrendan mutex_exit(&l2arc_free_on_write_mtx); 41853a737e0dSbrendan } 41863a737e0dSbrendan 4187fa94a07fSbrendan /* 4188fa94a07fSbrendan * A write to a cache device has completed. Update all headers to allow 4189fa94a07fSbrendan * reads from these buffers to begin. 
 */
static void
l2arc_write_done(zio_t *zio)
{
	l2arc_write_callback_t *cb;
	l2arc_dev_t *dev;
	list_t *buflist;
	arc_buf_hdr_t *head, *ab, *ab_prev;
	l2arc_buf_hdr_t *abl2;
	kmutex_t *hash_lock;

	cb = zio->io_private;
	ASSERT(cb != NULL);
	dev = cb->l2wcb_dev;
	ASSERT(dev != NULL);
	head = cb->l2wcb_head;
	ASSERT(head != NULL);
	buflist = dev->l2ad_buflist;
	ASSERT(buflist != NULL);
	DTRACE_PROBE2(l2arc__iodone, zio_t *, zio,
	    l2arc_write_callback_t *, cb);

	if (zio->io_error != 0)
		ARCSTAT_BUMP(arcstat_l2_writes_error);

	mutex_enter(&l2arc_buflist_mtx);

	/*
	 * All writes completed, or an error was hit.
	 * Walk back from the dummy write-head marker (inserted by
	 * l2arc_write_buffers()) toward the list tail, visiting exactly
	 * the headers that were part of this write.
	 */
	for (ab = list_prev(buflist, head); ab; ab = ab_prev) {
		ab_prev = list_prev(buflist, ab);

		hash_lock = HDR_LOCK(ab);
		if (!mutex_tryenter(hash_lock)) {
			/*
			 * This buffer misses out.  It may be in a stage
			 * of eviction.  Its ARC_L2_WRITING flag will be
			 * left set, denying reads to this buffer.
			 */
			ARCSTAT_BUMP(arcstat_l2_writes_hdr_miss);
			continue;
		}

		abl2 = ab->b_l2hdr;

		/*
		 * Release the temporary compressed buffer as soon as possible.
		 */
		if (abl2->b_compress != ZIO_COMPRESS_OFF)
			l2arc_release_cdata_buf(ab);

		if (zio->io_error != 0) {
			/*
			 * Error - drop L2ARC entry.  The header stays in
			 * the ARC; only its L2ARC presence is undone.
			 */
			list_remove(buflist, ab);
			ARCSTAT_INCR(arcstat_l2_asize, -abl2->b_asize);
			ab->b_l2hdr = NULL;
			kmem_free(abl2, sizeof (l2arc_buf_hdr_t));
			ARCSTAT_INCR(arcstat_l2_size, -ab->b_size);
		}

		/*
		 * Allow ARC to begin reads to this L2ARC entry.
		 */
		ab->b_flags &= ~ARC_L2_WRITING;

		mutex_exit(hash_lock);
	}

	/* Retire the write-head marker and the callback state. */
	atomic_inc_64(&l2arc_writes_done);
	list_remove(buflist, head);
	kmem_cache_free(hdr_cache, head);
	mutex_exit(&l2arc_buflist_mtx);

	l2arc_do_free_on_write();

	kmem_free(cb, sizeof (l2arc_write_callback_t));
}

/*
 * A read to a cache device completed.  Validate buffer contents before
 * handing over to the regular ARC routines.
 */
static void
l2arc_read_done(zio_t *zio)
{
	l2arc_read_callback_t *cb;
	arc_buf_hdr_t *hdr;
	arc_buf_t *buf;
	kmutex_t *hash_lock;
	int equal;

	ASSERT(zio->io_vd != NULL);
	ASSERT(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE);

	/* Release the config lock taken in l2arc_dev_get_next(). */
	spa_config_exit(zio->io_spa, SCL_L2ARC, zio->io_vd);

	cb = zio->io_private;
	ASSERT(cb != NULL);
	buf = cb->l2rcb_buf;
	ASSERT(buf != NULL);

	hash_lock = HDR_LOCK(buf->b_hdr);
	mutex_enter(hash_lock);
	hdr = buf->b_hdr;
	ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));

	/*
	 * If the buffer was compressed, decompress it first.
	 */
	if (cb->l2rcb_compress != ZIO_COMPRESS_OFF)
		l2arc_decompress_zio(zio, hdr, cb->l2rcb_compress);
	ASSERT(zio->io_data != NULL);

	/*
	 * Check this survived the L2ARC journey.
	 */
	equal = arc_cksum_equal(buf);
	if (equal && zio->io_error == 0 && !HDR_L2_EVICTED(hdr)) {
		mutex_exit(hash_lock);
		zio->io_private = buf;
		zio->io_bp_copy = cb->l2rcb_bp;	/* XXX fix in L2ARC 2.0	*/
		zio->io_bp = &zio->io_bp_copy;	/* XXX fix in L2ARC 2.0	*/
		arc_read_done(zio);
	} else {
		mutex_exit(hash_lock);
		/*
		 * Buffer didn't survive caching.  Increment stats and
		 * reissue to the original storage device.
		 */
		if (zio->io_error != 0) {
			ARCSTAT_BUMP(arcstat_l2_io_error);
		} else {
			/* cksum mismatch or eviction: force an error */
			zio->io_error = SET_ERROR(EIO);
		}
		if (!equal)
			ARCSTAT_BUMP(arcstat_l2_cksum_bad);

		/*
		 * If there's no waiter, issue an async i/o to the primary
		 * storage now.  If there *is* a waiter, the caller must
		 * issue the i/o in a context where it's OK to block.
		 */
		if (zio->io_waiter == NULL) {
			zio_t *pio = zio_unique_parent(zio);

			ASSERT(!pio || pio->io_child_type == ZIO_CHILD_LOGICAL);

			zio_nowait(zio_read(pio, cb->l2rcb_spa, &cb->l2rcb_bp,
			    buf->b_data, zio->io_size, arc_read_done, buf,
			    zio->io_priority, cb->l2rcb_flags, &cb->l2rcb_zb));
		}
	}

	kmem_free(cb, sizeof (l2arc_read_callback_t));
}

/*
 * This is the list priority from which the L2ARC will search for pages to
 * cache.  This is used within loops (0..3) to cycle through lists in the
 * desired order.  This order can have a significant effect on cache
 * performance.
 *
 * Currently the metadata lists are hit first, MFU then MRU, followed by
 * the data lists.  This function returns a locked list, and also returns
 * the lock pointer.
4358fa94a07fSbrendan */ 4359fa94a07fSbrendan static list_t * 4360fa94a07fSbrendan l2arc_list_locked(int list_num, kmutex_t **lock) 4361fa94a07fSbrendan { 4362d5285caeSGeorge Wilson list_t *list = NULL; 4363fa94a07fSbrendan 4364fa94a07fSbrendan ASSERT(list_num >= 0 && list_num <= 3); 4365fa94a07fSbrendan 4366fa94a07fSbrendan switch (list_num) { 4367fa94a07fSbrendan case 0: 4368fa94a07fSbrendan list = &arc_mfu->arcs_list[ARC_BUFC_METADATA]; 4369fa94a07fSbrendan *lock = &arc_mfu->arcs_mtx; 4370fa94a07fSbrendan break; 4371fa94a07fSbrendan case 1: 4372fa94a07fSbrendan list = &arc_mru->arcs_list[ARC_BUFC_METADATA]; 4373fa94a07fSbrendan *lock = &arc_mru->arcs_mtx; 4374fa94a07fSbrendan break; 4375fa94a07fSbrendan case 2: 4376fa94a07fSbrendan list = &arc_mfu->arcs_list[ARC_BUFC_DATA]; 4377fa94a07fSbrendan *lock = &arc_mfu->arcs_mtx; 4378fa94a07fSbrendan break; 4379fa94a07fSbrendan case 3: 4380fa94a07fSbrendan list = &arc_mru->arcs_list[ARC_BUFC_DATA]; 4381fa94a07fSbrendan *lock = &arc_mru->arcs_mtx; 4382fa94a07fSbrendan break; 4383fa94a07fSbrendan } 4384fa94a07fSbrendan 4385fa94a07fSbrendan ASSERT(!(MUTEX_HELD(*lock))); 4386fa94a07fSbrendan mutex_enter(*lock); 4387fa94a07fSbrendan return (list); 4388fa94a07fSbrendan } 4389fa94a07fSbrendan 4390fa94a07fSbrendan /* 4391fa94a07fSbrendan * Evict buffers from the device write hand to the distance specified in 4392fa94a07fSbrendan * bytes. This distance may span populated buffers, it may span nothing. 4393fa94a07fSbrendan * This is clearing a region on the L2ARC device ready for writing. 4394fa94a07fSbrendan * If the 'all' boolean is set, every buffer is evicted. 
 */
static void
l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all)
{
	list_t *buflist;
	l2arc_buf_hdr_t *abl2;
	arc_buf_hdr_t *ab, *ab_prev;
	kmutex_t *hash_lock;
	uint64_t taddr;

	buflist = dev->l2ad_buflist;

	if (buflist == NULL)
		return;

	if (!all && dev->l2ad_first) {
		/*
		 * This is the first sweep through the device.  There is
		 * nothing to evict.
		 */
		return;
	}

	if (dev->l2ad_hand >= (dev->l2ad_end - (2 * distance))) {
		/*
		 * When nearing the end of the device, evict to the end
		 * before the device write hand jumps to the start.
		 */
		taddr = dev->l2ad_end;
	} else {
		taddr = dev->l2ad_hand + distance;
	}
	DTRACE_PROBE4(l2arc__evict, l2arc_dev_t *, dev, list_t *, buflist,
	    uint64_t, taddr, boolean_t, all);

top:
	mutex_enter(&l2arc_buflist_mtx);
	for (ab = list_tail(buflist); ab; ab = ab_prev) {
		ab_prev = list_prev(buflist, ab);

		hash_lock = HDR_LOCK(ab);
		if (!mutex_tryenter(hash_lock)) {
			/*
			 * Missed the hash lock.  Retry: drop the list
			 * mutex first (so we never block on a hash lock
			 * while holding it), wait for the lock holder to
			 * finish, then restart the scan from the tail.
			 */
			ARCSTAT_BUMP(arcstat_l2_evict_lock_retry);
			mutex_exit(&l2arc_buflist_mtx);
			mutex_enter(hash_lock);
			mutex_exit(hash_lock);
			goto top;
		}

		if (HDR_L2_WRITE_HEAD(ab)) {
			/*
			 * We hit a write head node.  Leave it for
			 * l2arc_write_done().
			 */
			list_remove(buflist, ab);
			mutex_exit(hash_lock);
			continue;
		}

		if (!all && ab->b_l2hdr != NULL &&
		    (ab->b_l2hdr->b_daddr > taddr ||
		    ab->b_l2hdr->b_daddr < dev->l2ad_hand)) {
			/*
			 * We've evicted to the target address,
			 * or the end of the device.
			 */
			mutex_exit(hash_lock);
			break;
		}

		if (HDR_FREE_IN_PROGRESS(ab)) {
			/*
			 * Already on the path to destruction.
			 */
			mutex_exit(hash_lock);
			continue;
		}

		if (ab->b_state == arc_l2c_only) {
			ASSERT(!HDR_L2_READING(ab));
			/*
			 * This doesn't exist in the ARC.  Destroy.
			 * arc_hdr_destroy() will call list_remove()
			 * and decrement arcstat_l2_size.
			 */
			arc_change_state(arc_anon, ab, hash_lock);
			arc_hdr_destroy(ab);
		} else {
			/*
			 * Invalidate issued or about to be issued
			 * reads, since we may be about to write
			 * over this location.
			 */
			if (HDR_L2_READING(ab)) {
				ARCSTAT_BUMP(arcstat_l2_evict_reading);
				ab->b_flags |= ARC_L2_EVICTED;
			}

			/*
			 * Tell ARC this no longer exists in L2ARC.
			 */
			if (ab->b_l2hdr != NULL) {
				abl2 = ab->b_l2hdr;
				ARCSTAT_INCR(arcstat_l2_asize, -abl2->b_asize);
				ab->b_l2hdr = NULL;
				kmem_free(abl2, sizeof (l2arc_buf_hdr_t));
				ARCSTAT_INCR(arcstat_l2_size, -ab->b_size);
			}
			list_remove(buflist, ab);

			/*
			 * This may have been leftover after a
			 * failed write.
			 */
			ab->b_flags &= ~ARC_L2_WRITING;
		}
		mutex_exit(hash_lock);
	}
	mutex_exit(&l2arc_buflist_mtx);

	/* Return the evicted region's space to the vdev accounting. */
	vdev_space_update(dev->l2ad_vdev, -(taddr - dev->l2ad_evict), 0, 0);
	dev->l2ad_evict = taddr;
}

/*
 * Find and write ARC buffers to the L2ARC device.
 *
 * An ARC_L2_WRITING flag is set so that the L2ARC buffers are not valid
 * for reading until they have completed writing.
4527aad02571SSaso Kiselkov * The headroom_boost is an in-out parameter used to maintain headroom boost 4528aad02571SSaso Kiselkov * state between calls to this function. 4529aad02571SSaso Kiselkov * 4530aad02571SSaso Kiselkov * Returns the number of bytes actually written (which may be smaller than 4531aad02571SSaso Kiselkov * the delta by which the device hand has changed due to alignment). 4532fa94a07fSbrendan */ 45335a98e54bSBrendan Gregg - Sun Microsystems static uint64_t 4534aad02571SSaso Kiselkov l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz, 4535aad02571SSaso Kiselkov boolean_t *headroom_boost) 4536fa94a07fSbrendan { 4537fa94a07fSbrendan arc_buf_hdr_t *ab, *ab_prev, *head; 4538fa94a07fSbrendan list_t *list; 4539aad02571SSaso Kiselkov uint64_t write_asize, write_psize, write_sz, headroom, 4540aad02571SSaso Kiselkov buf_compress_minsz; 4541fa94a07fSbrendan void *buf_data; 4542aad02571SSaso Kiselkov kmutex_t *list_lock; 4543aad02571SSaso Kiselkov boolean_t full; 4544fa94a07fSbrendan l2arc_write_callback_t *cb; 4545fa94a07fSbrendan zio_t *pio, *wzio; 4546e9103aaeSGarrett D'Amore uint64_t guid = spa_load_guid(spa); 4547aad02571SSaso Kiselkov const boolean_t do_headroom_boost = *headroom_boost; 4548fa94a07fSbrendan 4549fa94a07fSbrendan ASSERT(dev->l2ad_vdev != NULL); 4550fa94a07fSbrendan 4551aad02571SSaso Kiselkov /* Lower the flag now, we might want to raise it again later. */ 4552aad02571SSaso Kiselkov *headroom_boost = B_FALSE; 4553aad02571SSaso Kiselkov 4554fa94a07fSbrendan pio = NULL; 4555aad02571SSaso Kiselkov write_sz = write_asize = write_psize = 0; 4556fa94a07fSbrendan full = B_FALSE; 45571ab7f2deSmaybee head = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE); 4558fa94a07fSbrendan head->b_flags |= ARC_L2_WRITE_HEAD; 4559fa94a07fSbrendan 4560aad02571SSaso Kiselkov /* 4561aad02571SSaso Kiselkov * We will want to try to compress buffers that are at least 2x the 4562aad02571SSaso Kiselkov * device sector size. 
4563aad02571SSaso Kiselkov */ 4564aad02571SSaso Kiselkov buf_compress_minsz = 2 << dev->l2ad_vdev->vdev_ashift; 4565aad02571SSaso Kiselkov 4566fa94a07fSbrendan /* 4567fa94a07fSbrendan * Copy buffers for L2ARC writing. 4568fa94a07fSbrendan */ 4569fa94a07fSbrendan mutex_enter(&l2arc_buflist_mtx); 4570fa94a07fSbrendan for (int try = 0; try <= 3; try++) { 4571aad02571SSaso Kiselkov uint64_t passed_sz = 0; 4572aad02571SSaso Kiselkov 4573fa94a07fSbrendan list = l2arc_list_locked(try, &list_lock); 4574fa94a07fSbrendan 45753a737e0dSbrendan /* 45763a737e0dSbrendan * L2ARC fast warmup. 45773a737e0dSbrendan * 45783a737e0dSbrendan * Until the ARC is warm and starts to evict, read from the 45793a737e0dSbrendan * head of the ARC lists rather than the tail. 45803a737e0dSbrendan */ 45813a737e0dSbrendan if (arc_warm == B_FALSE) 45823a737e0dSbrendan ab = list_head(list); 45833a737e0dSbrendan else 45843a737e0dSbrendan ab = list_tail(list); 45853a737e0dSbrendan 4586aad02571SSaso Kiselkov headroom = target_sz * l2arc_headroom; 4587aad02571SSaso Kiselkov if (do_headroom_boost) 4588aad02571SSaso Kiselkov headroom = (headroom * l2arc_headroom_boost) / 100; 4589aad02571SSaso Kiselkov 45903a737e0dSbrendan for (; ab; ab = ab_prev) { 4591aad02571SSaso Kiselkov l2arc_buf_hdr_t *l2hdr; 4592aad02571SSaso Kiselkov kmutex_t *hash_lock; 4593aad02571SSaso Kiselkov uint64_t buf_sz; 4594aad02571SSaso Kiselkov 45953a737e0dSbrendan if (arc_warm == B_FALSE) 45963a737e0dSbrendan ab_prev = list_next(list, ab); 45973a737e0dSbrendan else 45983a737e0dSbrendan ab_prev = list_prev(list, ab); 4599fa94a07fSbrendan 4600fa94a07fSbrendan hash_lock = HDR_LOCK(ab); 4601aad02571SSaso Kiselkov if (!mutex_tryenter(hash_lock)) { 4602fa94a07fSbrendan /* 4603fa94a07fSbrendan * Skip this buffer rather than waiting. 
4604fa94a07fSbrendan */ 4605fa94a07fSbrendan continue; 4606fa94a07fSbrendan } 4607fa94a07fSbrendan 4608fa94a07fSbrendan passed_sz += ab->b_size; 4609fa94a07fSbrendan if (passed_sz > headroom) { 4610fa94a07fSbrendan /* 4611fa94a07fSbrendan * Searched too far. 4612fa94a07fSbrendan */ 4613fa94a07fSbrendan mutex_exit(hash_lock); 4614fa94a07fSbrendan break; 4615fa94a07fSbrendan } 4616fa94a07fSbrendan 4617ac05c741SMark Maybee if (!l2arc_write_eligible(guid, ab)) { 4618fa94a07fSbrendan mutex_exit(hash_lock); 4619fa94a07fSbrendan continue; 4620fa94a07fSbrendan } 4621fa94a07fSbrendan 4622fa94a07fSbrendan if ((write_sz + ab->b_size) > target_sz) { 4623fa94a07fSbrendan full = B_TRUE; 4624fa94a07fSbrendan mutex_exit(hash_lock); 4625fa94a07fSbrendan break; 4626fa94a07fSbrendan } 4627fa94a07fSbrendan 4628fa94a07fSbrendan if (pio == NULL) { 4629fa94a07fSbrendan /* 4630fa94a07fSbrendan * Insert a dummy header on the buflist so 4631fa94a07fSbrendan * l2arc_write_done() can find where the 4632fa94a07fSbrendan * write buffers begin without searching. 4633fa94a07fSbrendan */ 4634fa94a07fSbrendan list_insert_head(dev->l2ad_buflist, head); 4635fa94a07fSbrendan 4636fa94a07fSbrendan cb = kmem_alloc( 4637fa94a07fSbrendan sizeof (l2arc_write_callback_t), KM_SLEEP); 4638fa94a07fSbrendan cb->l2wcb_dev = dev; 4639fa94a07fSbrendan cb->l2wcb_head = head; 4640fa94a07fSbrendan pio = zio_root(spa, l2arc_write_done, cb, 4641fa94a07fSbrendan ZIO_FLAG_CANFAIL); 4642fa94a07fSbrendan } 4643fa94a07fSbrendan 4644fa94a07fSbrendan /* 4645fa94a07fSbrendan * Create and add a new L2ARC header. 4646fa94a07fSbrendan */ 4647aad02571SSaso Kiselkov l2hdr = kmem_zalloc(sizeof (l2arc_buf_hdr_t), KM_SLEEP); 4648aad02571SSaso Kiselkov l2hdr->b_dev = dev; 4649fa94a07fSbrendan ab->b_flags |= ARC_L2_WRITING; 4650aad02571SSaso Kiselkov 4651aad02571SSaso Kiselkov /* 4652aad02571SSaso Kiselkov * Temporarily stash the data buffer in b_tmp_cdata. 
4653aad02571SSaso Kiselkov * The subsequent write step will pick it up from 4654aad02571SSaso Kiselkov * there. This is because can't access ab->b_buf 4655aad02571SSaso Kiselkov * without holding the hash_lock, which we in turn 4656aad02571SSaso Kiselkov * can't access without holding the ARC list locks 4657aad02571SSaso Kiselkov * (which we want to avoid during compression/writing). 4658aad02571SSaso Kiselkov */ 4659aad02571SSaso Kiselkov l2hdr->b_compress = ZIO_COMPRESS_OFF; 4660aad02571SSaso Kiselkov l2hdr->b_asize = ab->b_size; 4661aad02571SSaso Kiselkov l2hdr->b_tmp_cdata = ab->b_buf->b_data; 4662aad02571SSaso Kiselkov 4663fa94a07fSbrendan buf_sz = ab->b_size; 4664aad02571SSaso Kiselkov ab->b_l2hdr = l2hdr; 4665aad02571SSaso Kiselkov 4666aad02571SSaso Kiselkov list_insert_head(dev->l2ad_buflist, ab); 4667fa94a07fSbrendan 4668fa94a07fSbrendan /* 4669fa94a07fSbrendan * Compute and store the buffer cksum before 4670fa94a07fSbrendan * writing. On debug the cksum is verified first. 4671fa94a07fSbrendan */ 4672fa94a07fSbrendan arc_cksum_verify(ab->b_buf); 4673fa94a07fSbrendan arc_cksum_compute(ab->b_buf, B_TRUE); 4674fa94a07fSbrendan 4675fa94a07fSbrendan mutex_exit(hash_lock); 4676fa94a07fSbrendan 4677aad02571SSaso Kiselkov write_sz += buf_sz; 4678aad02571SSaso Kiselkov } 4679aad02571SSaso Kiselkov 4680aad02571SSaso Kiselkov mutex_exit(list_lock); 4681aad02571SSaso Kiselkov 4682aad02571SSaso Kiselkov if (full == B_TRUE) 4683aad02571SSaso Kiselkov break; 4684aad02571SSaso Kiselkov } 4685aad02571SSaso Kiselkov 4686aad02571SSaso Kiselkov /* No buffers selected for writing? */ 4687aad02571SSaso Kiselkov if (pio == NULL) { 4688aad02571SSaso Kiselkov ASSERT0(write_sz); 4689aad02571SSaso Kiselkov mutex_exit(&l2arc_buflist_mtx); 4690aad02571SSaso Kiselkov kmem_cache_free(hdr_cache, head); 4691aad02571SSaso Kiselkov return (0); 4692aad02571SSaso Kiselkov } 4693aad02571SSaso Kiselkov 4694aad02571SSaso Kiselkov /* 4695aad02571SSaso Kiselkov * Now start writing the buffers. 
We're starting at the write head 4696aad02571SSaso Kiselkov * and work backwards, retracing the course of the buffer selector 4697aad02571SSaso Kiselkov * loop above. 4698aad02571SSaso Kiselkov */ 4699aad02571SSaso Kiselkov for (ab = list_prev(dev->l2ad_buflist, head); ab; 4700aad02571SSaso Kiselkov ab = list_prev(dev->l2ad_buflist, ab)) { 4701aad02571SSaso Kiselkov l2arc_buf_hdr_t *l2hdr; 4702aad02571SSaso Kiselkov uint64_t buf_sz; 4703aad02571SSaso Kiselkov 4704aad02571SSaso Kiselkov /* 4705aad02571SSaso Kiselkov * We shouldn't need to lock the buffer here, since we flagged 4706aad02571SSaso Kiselkov * it as ARC_L2_WRITING in the previous step, but we must take 4707aad02571SSaso Kiselkov * care to only access its L2 cache parameters. In particular, 4708aad02571SSaso Kiselkov * ab->b_buf may be invalid by now due to ARC eviction. 4709aad02571SSaso Kiselkov */ 4710aad02571SSaso Kiselkov l2hdr = ab->b_l2hdr; 4711aad02571SSaso Kiselkov l2hdr->b_daddr = dev->l2ad_hand; 4712aad02571SSaso Kiselkov 4713aad02571SSaso Kiselkov if ((ab->b_flags & ARC_L2COMPRESS) && 4714aad02571SSaso Kiselkov l2hdr->b_asize >= buf_compress_minsz) { 4715aad02571SSaso Kiselkov if (l2arc_compress_buf(l2hdr)) { 4716aad02571SSaso Kiselkov /* 4717aad02571SSaso Kiselkov * If compression succeeded, enable headroom 4718aad02571SSaso Kiselkov * boost on the next scan cycle. 4719aad02571SSaso Kiselkov */ 4720aad02571SSaso Kiselkov *headroom_boost = B_TRUE; 4721aad02571SSaso Kiselkov } 4722aad02571SSaso Kiselkov } 4723aad02571SSaso Kiselkov 4724aad02571SSaso Kiselkov /* 4725aad02571SSaso Kiselkov * Pick up the buffer data we had previously stashed away 4726aad02571SSaso Kiselkov * (and now potentially also compressed). 4727aad02571SSaso Kiselkov */ 4728aad02571SSaso Kiselkov buf_data = l2hdr->b_tmp_cdata; 4729aad02571SSaso Kiselkov buf_sz = l2hdr->b_asize; 4730aad02571SSaso Kiselkov 4731aad02571SSaso Kiselkov /* Compression may have squashed the buffer to zero length. 
*/ 4732aad02571SSaso Kiselkov if (buf_sz != 0) { 4733aad02571SSaso Kiselkov uint64_t buf_p_sz; 4734aad02571SSaso Kiselkov 4735fa94a07fSbrendan wzio = zio_write_phys(pio, dev->l2ad_vdev, 4736fa94a07fSbrendan dev->l2ad_hand, buf_sz, buf_data, ZIO_CHECKSUM_OFF, 4737fa94a07fSbrendan NULL, NULL, ZIO_PRIORITY_ASYNC_WRITE, 4738fa94a07fSbrendan ZIO_FLAG_CANFAIL, B_FALSE); 4739fa94a07fSbrendan 4740fa94a07fSbrendan DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev, 4741fa94a07fSbrendan zio_t *, wzio); 4742fa94a07fSbrendan (void) zio_nowait(wzio); 4743fa94a07fSbrendan 4744aad02571SSaso Kiselkov write_asize += buf_sz; 4745e14bb325SJeff Bonwick /* 4746e14bb325SJeff Bonwick * Keep the clock hand suitably device-aligned. 4747e14bb325SJeff Bonwick */ 4748aad02571SSaso Kiselkov buf_p_sz = vdev_psize_to_asize(dev->l2ad_vdev, buf_sz); 4749aad02571SSaso Kiselkov write_psize += buf_p_sz; 4750aad02571SSaso Kiselkov dev->l2ad_hand += buf_p_sz; 4751fa94a07fSbrendan } 4752fa94a07fSbrendan } 4753fa94a07fSbrendan 4754aad02571SSaso Kiselkov mutex_exit(&l2arc_buflist_mtx); 4755fa94a07fSbrendan 4756aad02571SSaso Kiselkov ASSERT3U(write_asize, <=, target_sz); 4757fa94a07fSbrendan ARCSTAT_BUMP(arcstat_l2_writes_sent); 4758aad02571SSaso Kiselkov ARCSTAT_INCR(arcstat_l2_write_bytes, write_asize); 4759fa94a07fSbrendan ARCSTAT_INCR(arcstat_l2_size, write_sz); 4760aad02571SSaso Kiselkov ARCSTAT_INCR(arcstat_l2_asize, write_asize); 4761aad02571SSaso Kiselkov vdev_space_update(dev->l2ad_vdev, write_psize, 0, 0); 4762fa94a07fSbrendan 4763fa94a07fSbrendan /* 4764fa94a07fSbrendan * Bump device hand to the device start if it is approaching the end. 4765fa94a07fSbrendan * l2arc_evict() will already have evicted ahead for this case. 
4766fa94a07fSbrendan */ 47673a737e0dSbrendan if (dev->l2ad_hand >= (dev->l2ad_end - target_sz)) { 4768b24ab676SJeff Bonwick vdev_space_update(dev->l2ad_vdev, 4769b24ab676SJeff Bonwick dev->l2ad_end - dev->l2ad_hand, 0, 0); 4770fa94a07fSbrendan dev->l2ad_hand = dev->l2ad_start; 4771fa94a07fSbrendan dev->l2ad_evict = dev->l2ad_start; 4772fa94a07fSbrendan dev->l2ad_first = B_FALSE; 4773fa94a07fSbrendan } 4774fa94a07fSbrendan 47755a98e54bSBrendan Gregg - Sun Microsystems dev->l2ad_writing = B_TRUE; 4776fa94a07fSbrendan (void) zio_wait(pio); 47775a98e54bSBrendan Gregg - Sun Microsystems dev->l2ad_writing = B_FALSE; 47785a98e54bSBrendan Gregg - Sun Microsystems 4779aad02571SSaso Kiselkov return (write_asize); 4780aad02571SSaso Kiselkov } 4781aad02571SSaso Kiselkov 4782aad02571SSaso Kiselkov /* 4783aad02571SSaso Kiselkov * Compresses an L2ARC buffer. 4784aad02571SSaso Kiselkov * The data to be compressed must be prefilled in l2hdr->b_tmp_cdata and its 4785aad02571SSaso Kiselkov * size in l2hdr->b_asize. This routine tries to compress the data and 4786aad02571SSaso Kiselkov * depending on the compression result there are three possible outcomes: 4787aad02571SSaso Kiselkov * *) The buffer was incompressible. The original l2hdr contents were left 4788aad02571SSaso Kiselkov * untouched and are ready for writing to an L2 device. 4789aad02571SSaso Kiselkov * *) The buffer was all-zeros, so there is no need to write it to an L2 4790aad02571SSaso Kiselkov * device. To indicate this situation b_tmp_cdata is NULL'ed, b_asize is 4791aad02571SSaso Kiselkov * set to zero and b_compress is set to ZIO_COMPRESS_EMPTY. 4792aad02571SSaso Kiselkov * *) Compression succeeded and b_tmp_cdata was replaced with a temporary 4793aad02571SSaso Kiselkov * data buffer which holds the compressed data to be written, and b_asize 4794aad02571SSaso Kiselkov * tells us how much data there is. b_compress is set to the appropriate 4795aad02571SSaso Kiselkov * compression algorithm. 
Once writing is done, invoke 4796aad02571SSaso Kiselkov * l2arc_release_cdata_buf on this l2hdr to free this temporary buffer. 4797aad02571SSaso Kiselkov * 4798aad02571SSaso Kiselkov * Returns B_TRUE if compression succeeded, or B_FALSE if it didn't (the 4799aad02571SSaso Kiselkov * buffer was incompressible). 4800aad02571SSaso Kiselkov */ 4801aad02571SSaso Kiselkov static boolean_t 4802aad02571SSaso Kiselkov l2arc_compress_buf(l2arc_buf_hdr_t *l2hdr) 4803aad02571SSaso Kiselkov { 4804aad02571SSaso Kiselkov void *cdata; 4805aad02571SSaso Kiselkov size_t csize, len; 4806aad02571SSaso Kiselkov 4807aad02571SSaso Kiselkov ASSERT(l2hdr->b_compress == ZIO_COMPRESS_OFF); 4808aad02571SSaso Kiselkov ASSERT(l2hdr->b_tmp_cdata != NULL); 4809aad02571SSaso Kiselkov 4810aad02571SSaso Kiselkov len = l2hdr->b_asize; 4811aad02571SSaso Kiselkov cdata = zio_data_buf_alloc(len); 4812aad02571SSaso Kiselkov csize = zio_compress_data(ZIO_COMPRESS_LZ4, l2hdr->b_tmp_cdata, 4813aad02571SSaso Kiselkov cdata, l2hdr->b_asize); 4814aad02571SSaso Kiselkov 4815aad02571SSaso Kiselkov if (csize == 0) { 4816aad02571SSaso Kiselkov /* zero block, indicate that there's nothing to write */ 4817aad02571SSaso Kiselkov zio_data_buf_free(cdata, len); 4818aad02571SSaso Kiselkov l2hdr->b_compress = ZIO_COMPRESS_EMPTY; 4819aad02571SSaso Kiselkov l2hdr->b_asize = 0; 4820aad02571SSaso Kiselkov l2hdr->b_tmp_cdata = NULL; 4821aad02571SSaso Kiselkov ARCSTAT_BUMP(arcstat_l2_compress_zeros); 4822aad02571SSaso Kiselkov return (B_TRUE); 4823aad02571SSaso Kiselkov } else if (csize > 0 && csize < len) { 4824aad02571SSaso Kiselkov /* 4825aad02571SSaso Kiselkov * Compression succeeded, we'll keep the cdata around for 4826aad02571SSaso Kiselkov * writing and release it afterwards. 
4827aad02571SSaso Kiselkov */ 4828aad02571SSaso Kiselkov l2hdr->b_compress = ZIO_COMPRESS_LZ4; 4829aad02571SSaso Kiselkov l2hdr->b_asize = csize; 4830aad02571SSaso Kiselkov l2hdr->b_tmp_cdata = cdata; 4831aad02571SSaso Kiselkov ARCSTAT_BUMP(arcstat_l2_compress_successes); 4832aad02571SSaso Kiselkov return (B_TRUE); 4833aad02571SSaso Kiselkov } else { 4834aad02571SSaso Kiselkov /* 4835aad02571SSaso Kiselkov * Compression failed, release the compressed buffer. 4836aad02571SSaso Kiselkov * l2hdr will be left unmodified. 4837aad02571SSaso Kiselkov */ 4838aad02571SSaso Kiselkov zio_data_buf_free(cdata, len); 4839aad02571SSaso Kiselkov ARCSTAT_BUMP(arcstat_l2_compress_failures); 4840aad02571SSaso Kiselkov return (B_FALSE); 4841aad02571SSaso Kiselkov } 4842aad02571SSaso Kiselkov } 4843aad02571SSaso Kiselkov 4844aad02571SSaso Kiselkov /* 4845aad02571SSaso Kiselkov * Decompresses a zio read back from an l2arc device. On success, the 4846aad02571SSaso Kiselkov * underlying zio's io_data buffer is overwritten by the uncompressed 4847aad02571SSaso Kiselkov * version. On decompression error (corrupt compressed stream), the 4848aad02571SSaso Kiselkov * zio->io_error value is set to signal an I/O error. 4849aad02571SSaso Kiselkov * 4850aad02571SSaso Kiselkov * Please note that the compressed data stream is not checksummed, so 4851aad02571SSaso Kiselkov * if the underlying device is experiencing data corruption, we may feed 4852aad02571SSaso Kiselkov * corrupt data to the decompressor, so the decompressor needs to be 4853aad02571SSaso Kiselkov * able to handle this situation (LZ4 does). 
 */
static void
l2arc_decompress_zio(zio_t *zio, arc_buf_hdr_t *hdr, enum zio_compress c)
{
	ASSERT(L2ARC_IS_VALID_COMPRESS(c));

	if (zio->io_error != 0) {
		/*
		 * An io error has occurred, just restore the original io
		 * size in preparation for a main pool read.
		 */
		zio->io_orig_size = zio->io_size = hdr->b_size;
		return;
	}

	if (c == ZIO_COMPRESS_EMPTY) {
		/*
		 * An empty buffer results in a null zio, which means we
		 * need to fill its io_data after we're done restoring the
		 * buffer's contents.  The block was all zeros, so just
		 * bzero the ARC buffer rather than reading anything back.
		 */
		ASSERT(hdr->b_buf != NULL);
		bzero(hdr->b_buf->b_data, hdr->b_size);
		zio->io_data = zio->io_orig_data = hdr->b_buf->b_data;
	} else {
		ASSERT(zio->io_data != NULL);
		/*
		 * We copy the compressed data from the start of the arc buffer
		 * (the zio_read will have pulled in only what we need, the
		 * rest is garbage which we will overwrite at decompression)
		 * and then decompress back to the ARC data buffer.  This way we
		 * can minimize copying by simply decompressing back over the
		 * original compressed data (rather than decompressing to an
		 * aux buffer and then copying back the uncompressed buffer,
		 * which is likely to be much larger).
		 */
		uint64_t csize;
		void *cdata;

		/* Scratch copy of the compressed stream to decompress from. */
		csize = zio->io_size;
		cdata = zio_data_buf_alloc(csize);
		bcopy(zio->io_data, cdata, csize);
		if (zio_decompress_data(c, cdata, zio->io_data, csize,
		    hdr->b_size) != 0)
			zio->io_error = EIO;	/* corrupt compressed stream */
		zio_data_buf_free(cdata, csize);
	}

	/* Restore the expected uncompressed IO size. */
	zio->io_orig_size = zio->io_size = hdr->b_size;
}

/*
 * Releases the temporary b_tmp_cdata buffer in an l2arc header structure.
 * This buffer serves as a temporary holder of compressed data while
 * the buffer entry is being written to an l2arc device.  Once that is
 * done, we can dispose of it.
 */
static void
l2arc_release_cdata_buf(arc_buf_hdr_t *ab)
{
	l2arc_buf_hdr_t *l2hdr = ab->b_l2hdr;

	if (l2hdr->b_compress == ZIO_COMPRESS_LZ4) {
		/*
		 * If the data was compressed, then we've allocated a
		 * temporary buffer for it, so now we need to release it.
		 * l2arc_compress_buf() sized that buffer by the
		 * uncompressed length, hence freeing with ab->b_size.
		 */
		ASSERT(l2hdr->b_tmp_cdata != NULL);
		zio_data_buf_free(l2hdr->b_tmp_cdata, ab->b_size);
	}
	/* OFF/EMPTY headers carry no separately-allocated scratch buffer. */
	l2hdr->b_tmp_cdata = NULL;
}

/*
 * This thread feeds the L2ARC at regular intervals.  This is the beating
 * heart of the L2ARC.
 */
static void
l2arc_feed_thread(void)
{
	callb_cpr_t cpr;
	l2arc_dev_t *dev;
	spa_t *spa;
	uint64_t size, wrote;
	clock_t begin, next = ddi_get_lbolt();
	boolean_t headroom_boost = B_FALSE;

	CALLB_CPR_INIT(&cpr, &l2arc_feed_thr_lock, callb_generic_cpr, FTAG);

	mutex_enter(&l2arc_feed_thr_lock);

	while (l2arc_thread_exit == 0) {
		/* Sleep until the next feed interval (or a wakeup signal). */
		CALLB_CPR_SAFE_BEGIN(&cpr);
		(void) cv_timedwait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock,
		    next);
		CALLB_CPR_SAFE_END(&cpr, &l2arc_feed_thr_lock);
		next = ddi_get_lbolt() + hz;

		/*
		 * Quick check for L2ARC devices.
		 */
		mutex_enter(&l2arc_dev_mtx);
		if (l2arc_ndev == 0) {
			mutex_exit(&l2arc_dev_mtx);
			continue;
		}
		mutex_exit(&l2arc_dev_mtx);
		begin = ddi_get_lbolt();

		/*
		 * This selects the next l2arc device to write to, and in
		 * doing so the next spa to feed from: dev->l2ad_spa.   This
		 * will return NULL if there are now no l2arc devices or if
		 * they are all faulted.
		 *
		 * If a device is returned, its spa's config lock is also
		 * held to prevent device removal.  l2arc_dev_get_next()
		 * will grab and release l2arc_dev_mtx.
		 */
		if ((dev = l2arc_dev_get_next()) == NULL)
			continue;

		spa = dev->l2ad_spa;
		ASSERT(spa != NULL);

		/*
		 * If the pool is read-only then force the feed thread to
		 * sleep a little longer.
		 */
		if (!spa_writeable(spa)) {
			next = ddi_get_lbolt() + 5 * l2arc_feed_secs * hz;
			spa_config_exit(spa, SCL_L2ARC, dev);
			continue;
		}

		/*
		 * Avoid contributing to memory pressure.
		 */
		if (arc_reclaim_needed()) {
			ARCSTAT_BUMP(arcstat_l2_abort_lowmem);
			spa_config_exit(spa, SCL_L2ARC, dev);
			continue;
		}

		ARCSTAT_BUMP(arcstat_l2_feeds);

		size = l2arc_write_size();

		/*
		 * Evict L2ARC buffers that will be overwritten.
		 */
		l2arc_evict(dev, size, B_FALSE);

		/*
		 * Write ARC buffers.
		 */
		wrote = l2arc_write_buffers(spa, dev, size, &headroom_boost);

		/*
		 * Calculate interval between writes.
		 */
		next = l2arc_write_interval(begin, size, wrote);
		spa_config_exit(spa, SCL_L2ARC, dev);
	}

	l2arc_thread_exit = 0;
	cv_broadcast(&l2arc_feed_thr_cv);
	CALLB_CPR_EXIT(&cpr);		/* drops l2arc_feed_thr_lock */
	thread_exit();
}

/*
 * Returns B_TRUE if the vdev is currently registered as an L2ARC device,
 * B_FALSE otherwise.  Walks the global device list under l2arc_dev_mtx.
 */
boolean_t
l2arc_vdev_present(vdev_t *vd)
{
	l2arc_dev_t *dev;

	mutex_enter(&l2arc_dev_mtx);
	for (dev = list_head(l2arc_dev_list); dev != NULL;
	    dev = list_next(l2arc_dev_list, dev)) {
		if (dev->l2ad_vdev == vd)
			break;
	}
	mutex_exit(&l2arc_dev_mtx);

	return (dev != NULL);
}

/*
 * Add a vdev for use by the L2ARC.  By this point the spa has already
 * validated the vdev and opened it.
 */
void
l2arc_add_vdev(spa_t *spa, vdev_t *vd)
{
	l2arc_dev_t *adddev;

	ASSERT(!l2arc_vdev_present(vd));

	/*
	 * Create a new l2arc device entry.
	 */
	adddev = kmem_zalloc(sizeof (l2arc_dev_t), KM_SLEEP);
	adddev->l2ad_spa = spa;
	adddev->l2ad_vdev = vd;
	/* Usable region starts past the vdev labels. */
	adddev->l2ad_start = VDEV_LABEL_START_SIZE;
	adddev->l2ad_end = VDEV_LABEL_START_SIZE + vdev_get_min_asize(vd);
	adddev->l2ad_hand = adddev->l2ad_start;
	adddev->l2ad_evict = adddev->l2ad_start;
	adddev->l2ad_first = B_TRUE;
	adddev->l2ad_writing = B_FALSE;

	/*
	 * This is a list of all ARC buffers that are still valid on the
	 * device.
	 */
	adddev->l2ad_buflist = kmem_zalloc(sizeof (list_t), KM_SLEEP);
	list_create(adddev->l2ad_buflist, sizeof (arc_buf_hdr_t),
	    offsetof(arc_buf_hdr_t, b_l2node));

	vdev_space_update(vd, 0, 0, adddev->l2ad_end - adddev->l2ad_hand);

	/*
	 * Add device to global list
	 */
	mutex_enter(&l2arc_dev_mtx);
	list_insert_head(l2arc_dev_list, adddev);
	atomic_inc_64(&l2arc_ndev);
	mutex_exit(&l2arc_dev_mtx);
}

/*
 * Remove a vdev from the L2ARC.
 */
void
l2arc_remove_vdev(vdev_t *vd)
{
	l2arc_dev_t *dev, *nextdev, *remdev = NULL;

	/*
	 * Find the device by vdev
	 */
	mutex_enter(&l2arc_dev_mtx);
	for (dev = list_head(l2arc_dev_list); dev; dev = nextdev) {
		nextdev = list_next(l2arc_dev_list, dev);
		if (vd == dev->l2ad_vdev) {
			remdev = dev;
			break;
		}
	}
	ASSERT(remdev != NULL);

	/*
	 * Remove device from global list
	 */
	list_remove(l2arc_dev_list, remdev);
	l2arc_dev_last = NULL;		/* may have been invalidated */
	atomic_dec_64(&l2arc_ndev);
	mutex_exit(&l2arc_dev_mtx);

	/*
	 * Clear all buflists and ARC references.  L2ARC device flush.
	 */
	l2arc_evict(remdev, 0, B_TRUE);
	list_destroy(remdev->l2ad_buflist);
	kmem_free(remdev->l2ad_buflist, sizeof (list_t));
	kmem_free(remdev, sizeof (l2arc_dev_t));
}

/*
 * Initialize L2ARC global state: locks, condition variable, and the
 * device / free-on-write lists.  Called once at module setup.
 */
void
l2arc_init(void)
{
	l2arc_thread_exit = 0;
	l2arc_ndev = 0;
	l2arc_writes_sent = 0;
	l2arc_writes_done = 0;

	mutex_init(&l2arc_feed_thr_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&l2arc_feed_thr_cv, NULL, CV_DEFAULT, NULL);
	mutex_init(&l2arc_dev_mtx, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&l2arc_buflist_mtx, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&l2arc_free_on_write_mtx, NULL, MUTEX_DEFAULT, NULL);

	l2arc_dev_list = &L2ARC_dev_list;
	l2arc_free_on_write = &L2ARC_free_on_write;
	list_create(l2arc_dev_list, sizeof (l2arc_dev_t),
	    offsetof(l2arc_dev_t, l2ad_node));
	list_create(l2arc_free_on_write, sizeof (l2arc_data_free_t),
	    offsetof(l2arc_data_free_t, l2df_list_node));
}

/*
 * Tear down L2ARC global state.  Counterpart of l2arc_init().
 */
void
l2arc_fini(void)
{
	/*
	 * This is called from dmu_fini(), which is called from spa_fini();
	 * Because of this, we can assume that all l2arc devices have
	 * already been removed when the pools themselves were removed.
	 */

	l2arc_do_free_on_write();

	mutex_destroy(&l2arc_feed_thr_lock);
	cv_destroy(&l2arc_feed_thr_cv);
	mutex_destroy(&l2arc_dev_mtx);
	mutex_destroy(&l2arc_buflist_mtx);
	mutex_destroy(&l2arc_free_on_write_mtx);

	list_destroy(l2arc_dev_list);
	list_destroy(l2arc_free_on_write);
}

/*
 * Spawn the L2ARC feed thread.  A no-op when the pool is opened
 * read-only, since no feeding can occur.
 */
void
l2arc_start(void)
{
	if (!(spa_mode_global & FWRITE))
		return;

	(void) thread_create(NULL, 0, l2arc_feed_thread, NULL, 0, &p0,
	    TS_RUN, minclsyspri);
}

/*
 * Signal the feed thread to exit and wait for it to acknowledge.
 * Mirrors the read-only check in l2arc_start().
 */
void
l2arc_stop(void)
{
	if (!(spa_mode_global & FWRITE))
		return;

	mutex_enter(&l2arc_feed_thr_lock);
	cv_signal(&l2arc_feed_thr_cv);	/* kick thread out of startup */
	l2arc_thread_exit = 1;
	while (l2arc_thread_exit != 0)
		cv_wait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock);
	mutex_exit(&l2arc_feed_thr_lock);
}