1fa9e4066Sahrens /* 2fa9e4066Sahrens * CDDL HEADER START 3fa9e4066Sahrens * 4fa9e4066Sahrens * The contents of this file are subject to the terms of the 5ea8dc4b6Seschrock * Common Development and Distribution License (the "License"). 6ea8dc4b6Seschrock * You may not use this file except in compliance with the License. 7fa9e4066Sahrens * 8fa9e4066Sahrens * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9fa9e4066Sahrens * or http://www.opensolaris.org/os/licensing. 10fa9e4066Sahrens * See the License for the specific language governing permissions 11fa9e4066Sahrens * and limitations under the License. 12fa9e4066Sahrens * 13fa9e4066Sahrens * When distributing Covered Code, include this CDDL HEADER in each 14fa9e4066Sahrens * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15fa9e4066Sahrens * If applicable, add the following below this CDDL HEADER, with the 16fa9e4066Sahrens * fields enclosed by brackets "[]" replaced with your own identifying 17fa9e4066Sahrens * information: Portions Copyright [yyyy] [name of copyright owner] 18fa9e4066Sahrens * 19fa9e4066Sahrens * CDDL HEADER END 20fa9e4066Sahrens */ 21fa9e4066Sahrens /* 223d7072f8Seschrock * Copyright 2007 Sun Microsystems, Inc. All rights reserved. 23fa9e4066Sahrens * Use is subject to license terms. 24fa9e4066Sahrens */ 25fa9e4066Sahrens 26fa9e4066Sahrens #pragma ident "%Z%%M% %I% %E% SMI" 27fa9e4066Sahrens 28fa9e4066Sahrens #include <sys/zfs_context.h> 29fa9e4066Sahrens #include <sys/spa.h> 30fa9e4066Sahrens #include <sys/vdev_impl.h> 31fa9e4066Sahrens #include <sys/zio.h> 32fa9e4066Sahrens 33fa9e4066Sahrens /* 34fa9e4066Sahrens * Virtual device read-ahead caching. 35fa9e4066Sahrens * 36fa9e4066Sahrens * This file implements a simple LRU read-ahead cache. When the DMU reads 37fa9e4066Sahrens * a given block, it will often want other, nearby blocks soon thereafter. 38fa9e4066Sahrens * We take advantage of this by reading a larger disk region and caching 39fa9e4066Sahrens * the result. In the best case, this can turn 256 back-to-back 512-byte 40fa9e4066Sahrens * reads into a single 128k read followed by 255 cache hits; this reduces 41fa9e4066Sahrens * latency dramatically. In the worst case, it can turn an isolated 512-byte 42fa9e4066Sahrens * read into a 128k read, which doesn't affect latency all that much but is 43fa9e4066Sahrens * terribly wasteful of bandwidth. A more intelligent version of the cache 44fa9e4066Sahrens * could keep track of access patterns and not do read-ahead unless it sees 45*fdb2e906Sek * at least two temporally close I/Os to the same region. Currently, only 46*fdb2e906Sek * metadata I/O is inflated. A futher enhancement could take advantage of 47*fdb2e906Sek * more semantic information about the I/O. And it could use something 48*fdb2e906Sek * faster than an AVL tree; that was chosen solely for convenience. 49fa9e4066Sahrens * 50fa9e4066Sahrens * There are five cache operations: allocate, fill, read, write, evict. 51fa9e4066Sahrens * 52fa9e4066Sahrens * (1) Allocate. This reserves a cache entry for the specified region. 53fa9e4066Sahrens * We separate the allocate and fill operations so that multiple threads 54fa9e4066Sahrens * don't generate I/O for the same cache miss. 55fa9e4066Sahrens * 56fa9e4066Sahrens * (2) Fill. When the I/O for a cache miss completes, the fill routine 57fa9e4066Sahrens * places the data in the previously allocated cache entry. 58fa9e4066Sahrens * 59fa9e4066Sahrens * (3) Read. Read data from the cache. 60fa9e4066Sahrens * 61fa9e4066Sahrens * (4) Write. Update cache contents after write completion. 62fa9e4066Sahrens * 63fa9e4066Sahrens * (5) Evict. When allocating a new entry, we evict the oldest (LRU) entry 64614409b5Sahrens * if the total cache size exceeds zfs_vdev_cache_size. 65fa9e4066Sahrens */ 66fa9e4066Sahrens 67614409b5Sahrens /* 68614409b5Sahrens * These tunables are for performance analysis. 69614409b5Sahrens */ 70614409b5Sahrens /* 71614409b5Sahrens * All i/os smaller than zfs_vdev_cache_max will be turned into 72614409b5Sahrens * 1<<zfs_vdev_cache_bshift byte reads by the vdev_cache (aka software 73614409b5Sahrens * track buffer. At most zfs_vdev_cache_size bytes will be kept in each 74614409b5Sahrens * vdev's vdev_cache. 75614409b5Sahrens */ 76614409b5Sahrens int zfs_vdev_cache_max = 1<<14; 77614409b5Sahrens int zfs_vdev_cache_size = 10ULL << 20; 78614409b5Sahrens int zfs_vdev_cache_bshift = 16; 79614409b5Sahrens 80614409b5Sahrens #define VCBS (1 << zfs_vdev_cache_bshift) 81614409b5Sahrens 82fa9e4066Sahrens static int 83fa9e4066Sahrens vdev_cache_offset_compare(const void *a1, const void *a2) 84fa9e4066Sahrens { 85fa9e4066Sahrens const vdev_cache_entry_t *ve1 = a1; 86fa9e4066Sahrens const vdev_cache_entry_t *ve2 = a2; 87fa9e4066Sahrens 88fa9e4066Sahrens if (ve1->ve_offset < ve2->ve_offset) 89fa9e4066Sahrens return (-1); 90fa9e4066Sahrens if (ve1->ve_offset > ve2->ve_offset) 91fa9e4066Sahrens return (1); 92fa9e4066Sahrens return (0); 93fa9e4066Sahrens } 94fa9e4066Sahrens 95fa9e4066Sahrens static int 96fa9e4066Sahrens vdev_cache_lastused_compare(const void *a1, const void *a2) 97fa9e4066Sahrens { 98fa9e4066Sahrens const vdev_cache_entry_t *ve1 = a1; 99fa9e4066Sahrens const vdev_cache_entry_t *ve2 = a2; 100fa9e4066Sahrens 101fa9e4066Sahrens if (ve1->ve_lastused < ve2->ve_lastused) 102fa9e4066Sahrens return (-1); 103fa9e4066Sahrens if (ve1->ve_lastused > ve2->ve_lastused) 104fa9e4066Sahrens return (1); 105fa9e4066Sahrens 106fa9e4066Sahrens /* 107fa9e4066Sahrens * Among equally old entries, sort by offset to ensure uniqueness. 108fa9e4066Sahrens */ 109fa9e4066Sahrens return (vdev_cache_offset_compare(a1, a2)); 110fa9e4066Sahrens } 111fa9e4066Sahrens 112fa9e4066Sahrens /* 113fa9e4066Sahrens * Evict the specified entry from the cache. 114fa9e4066Sahrens */ 115fa9e4066Sahrens static void 116fa9e4066Sahrens vdev_cache_evict(vdev_cache_t *vc, vdev_cache_entry_t *ve) 117fa9e4066Sahrens { 118fa9e4066Sahrens ASSERT(MUTEX_HELD(&vc->vc_lock)); 119fa9e4066Sahrens ASSERT(ve->ve_fill_io == NULL); 120fa9e4066Sahrens ASSERT(ve->ve_data != NULL); 121fa9e4066Sahrens 122fa9e4066Sahrens dprintf("evicting %p, off %llx, LRU %llu, age %lu, hits %u, stale %u\n", 123fa9e4066Sahrens vc, ve->ve_offset, ve->ve_lastused, lbolt - ve->ve_lastused, 124fa9e4066Sahrens ve->ve_hits, ve->ve_missed_update); 125fa9e4066Sahrens 126fa9e4066Sahrens avl_remove(&vc->vc_lastused_tree, ve); 127fa9e4066Sahrens avl_remove(&vc->vc_offset_tree, ve); 128614409b5Sahrens zio_buf_free(ve->ve_data, VCBS); 129fa9e4066Sahrens kmem_free(ve, sizeof (vdev_cache_entry_t)); 130fa9e4066Sahrens } 131fa9e4066Sahrens 132fa9e4066Sahrens /* 133fa9e4066Sahrens * Allocate an entry in the cache. At the point we don't have the data, 134fa9e4066Sahrens * we're just creating a placeholder so that multiple threads don't all 135fa9e4066Sahrens * go off and read the same blocks. 136fa9e4066Sahrens */ 137fa9e4066Sahrens static vdev_cache_entry_t * 138fa9e4066Sahrens vdev_cache_allocate(zio_t *zio) 139fa9e4066Sahrens { 140fa9e4066Sahrens vdev_cache_t *vc = &zio->io_vd->vdev_cache; 141614409b5Sahrens uint64_t offset = P2ALIGN(zio->io_offset, VCBS); 142fa9e4066Sahrens vdev_cache_entry_t *ve; 143fa9e4066Sahrens 144fa9e4066Sahrens ASSERT(MUTEX_HELD(&vc->vc_lock)); 145fa9e4066Sahrens 146614409b5Sahrens if (zfs_vdev_cache_size == 0) 147fa9e4066Sahrens return (NULL); 148fa9e4066Sahrens 149fa9e4066Sahrens /* 150fa9e4066Sahrens * If adding a new entry would exceed the cache size, 151fa9e4066Sahrens * evict the oldest entry (LRU). 152fa9e4066Sahrens */ 153614409b5Sahrens if ((avl_numnodes(&vc->vc_lastused_tree) << zfs_vdev_cache_bshift) > 154614409b5Sahrens zfs_vdev_cache_size) { 155fa9e4066Sahrens ve = avl_first(&vc->vc_lastused_tree); 156fa9e4066Sahrens if (ve->ve_fill_io != NULL) { 157fa9e4066Sahrens dprintf("can't evict in %p, still filling\n", vc); 158fa9e4066Sahrens return (NULL); 159fa9e4066Sahrens } 160fa9e4066Sahrens ASSERT(ve->ve_hits != 0); 161fa9e4066Sahrens vdev_cache_evict(vc, ve); 162fa9e4066Sahrens } 163fa9e4066Sahrens 164fa9e4066Sahrens ve = kmem_zalloc(sizeof (vdev_cache_entry_t), KM_SLEEP); 165fa9e4066Sahrens ve->ve_offset = offset; 166fa9e4066Sahrens ve->ve_lastused = lbolt; 167614409b5Sahrens ve->ve_data = zio_buf_alloc(VCBS); 168fa9e4066Sahrens 169fa9e4066Sahrens avl_add(&vc->vc_offset_tree, ve); 170fa9e4066Sahrens avl_add(&vc->vc_lastused_tree, ve); 171fa9e4066Sahrens 172fa9e4066Sahrens return (ve); 173fa9e4066Sahrens } 174fa9e4066Sahrens 175fa9e4066Sahrens static void 176fa9e4066Sahrens vdev_cache_hit(vdev_cache_t *vc, vdev_cache_entry_t *ve, zio_t *zio) 177fa9e4066Sahrens { 178614409b5Sahrens uint64_t cache_phase = P2PHASE(zio->io_offset, VCBS); 179fa9e4066Sahrens 180fa9e4066Sahrens ASSERT(MUTEX_HELD(&vc->vc_lock)); 181fa9e4066Sahrens ASSERT(ve->ve_fill_io == NULL); 182fa9e4066Sahrens 183fa9e4066Sahrens if (ve->ve_lastused != lbolt) { 184fa9e4066Sahrens avl_remove(&vc->vc_lastused_tree, ve); 185fa9e4066Sahrens ve->ve_lastused = lbolt; 186fa9e4066Sahrens avl_add(&vc->vc_lastused_tree, ve); 187fa9e4066Sahrens } 188fa9e4066Sahrens 189fa9e4066Sahrens ve->ve_hits++; 190fa9e4066Sahrens bcopy(ve->ve_data + cache_phase, zio->io_data, zio->io_size); 191fa9e4066Sahrens } 192fa9e4066Sahrens 193fa9e4066Sahrens /* 194fa9e4066Sahrens * Fill a previously allocated cache entry with data. 195fa9e4066Sahrens */ 196fa9e4066Sahrens static void 197fa9e4066Sahrens vdev_cache_fill(zio_t *zio) 198fa9e4066Sahrens { 199fa9e4066Sahrens vdev_t *vd = zio->io_vd; 200fa9e4066Sahrens vdev_cache_t *vc = &vd->vdev_cache; 201fa9e4066Sahrens vdev_cache_entry_t *ve = zio->io_private; 202fa9e4066Sahrens zio_t *dio; 203fa9e4066Sahrens 204614409b5Sahrens ASSERT(zio->io_size == VCBS); 205fa9e4066Sahrens 206fa9e4066Sahrens /* 207fa9e4066Sahrens * Add data to the cache. 208fa9e4066Sahrens */ 209fa9e4066Sahrens mutex_enter(&vc->vc_lock); 210fa9e4066Sahrens 211fa9e4066Sahrens ASSERT(ve->ve_fill_io == zio); 212fa9e4066Sahrens ASSERT(ve->ve_offset == zio->io_offset); 213fa9e4066Sahrens ASSERT(ve->ve_data == zio->io_data); 214fa9e4066Sahrens 215fa9e4066Sahrens ve->ve_fill_io = NULL; 216fa9e4066Sahrens 217fa9e4066Sahrens /* 218fa9e4066Sahrens * Even if this cache line was invalidated by a missed write update, 219fa9e4066Sahrens * any reads that were queued up before the missed update are still 220fa9e4066Sahrens * valid, so we can satisfy them from this line before we evict it. 221fa9e4066Sahrens */ 222fa9e4066Sahrens for (dio = zio->io_delegate_list; dio; dio = dio->io_delegate_next) 223fa9e4066Sahrens vdev_cache_hit(vc, ve, dio); 224fa9e4066Sahrens 225fa9e4066Sahrens if (zio->io_error || ve->ve_missed_update) 226fa9e4066Sahrens vdev_cache_evict(vc, ve); 227fa9e4066Sahrens 228fa9e4066Sahrens mutex_exit(&vc->vc_lock); 229fa9e4066Sahrens 230fa9e4066Sahrens while ((dio = zio->io_delegate_list) != NULL) { 231fa9e4066Sahrens zio->io_delegate_list = dio->io_delegate_next; 232fa9e4066Sahrens dio->io_delegate_next = NULL; 233fa9e4066Sahrens dio->io_error = zio->io_error; 234fa9e4066Sahrens zio_next_stage(dio); 235fa9e4066Sahrens } 236fa9e4066Sahrens } 237fa9e4066Sahrens 238fa9e4066Sahrens /* 239fa9e4066Sahrens * Read data from the cache. Returns 0 on cache hit, errno on a miss. 240fa9e4066Sahrens */ 241fa9e4066Sahrens int 242fa9e4066Sahrens vdev_cache_read(zio_t *zio) 243fa9e4066Sahrens { 244fa9e4066Sahrens vdev_cache_t *vc = &zio->io_vd->vdev_cache; 245fa9e4066Sahrens vdev_cache_entry_t *ve, ve_search; 246614409b5Sahrens uint64_t cache_offset = P2ALIGN(zio->io_offset, VCBS); 247614409b5Sahrens uint64_t cache_phase = P2PHASE(zio->io_offset, VCBS); 248fa9e4066Sahrens zio_t *fio; 249fa9e4066Sahrens 250fa9e4066Sahrens ASSERT(zio->io_type == ZIO_TYPE_READ); 251fa9e4066Sahrens 252fa9e4066Sahrens if (zio->io_flags & ZIO_FLAG_DONT_CACHE) 253fa9e4066Sahrens return (EINVAL); 254fa9e4066Sahrens 255614409b5Sahrens if (zio->io_size > zfs_vdev_cache_max) 256fa9e4066Sahrens return (EOVERFLOW); 257fa9e4066Sahrens 258fa9e4066Sahrens /* 259fa9e4066Sahrens * If the I/O straddles two or more cache blocks, don't cache it. 260fa9e4066Sahrens */ 261614409b5Sahrens if (P2CROSS(zio->io_offset, zio->io_offset + zio->io_size - 1, VCBS)) 262fa9e4066Sahrens return (EXDEV); 263fa9e4066Sahrens 264614409b5Sahrens ASSERT(cache_phase + zio->io_size <= VCBS); 265fa9e4066Sahrens 266fa9e4066Sahrens mutex_enter(&vc->vc_lock); 267fa9e4066Sahrens 268fa9e4066Sahrens ve_search.ve_offset = cache_offset; 269fa9e4066Sahrens ve = avl_find(&vc->vc_offset_tree, &ve_search, NULL); 270fa9e4066Sahrens 271fa9e4066Sahrens if (ve != NULL) { 272fa9e4066Sahrens if (ve->ve_missed_update) { 273fa9e4066Sahrens mutex_exit(&vc->vc_lock); 274fa9e4066Sahrens return (ESTALE); 275fa9e4066Sahrens } 276fa9e4066Sahrens 277fa9e4066Sahrens if ((fio = ve->ve_fill_io) != NULL) { 278fa9e4066Sahrens zio->io_delegate_next = fio->io_delegate_list; 279fa9e4066Sahrens fio->io_delegate_list = zio; 280fa9e4066Sahrens zio_vdev_io_bypass(zio); 281fa9e4066Sahrens mutex_exit(&vc->vc_lock); 282fa9e4066Sahrens return (0); 283fa9e4066Sahrens } 284fa9e4066Sahrens 285fa9e4066Sahrens vdev_cache_hit(vc, ve, zio); 286fa9e4066Sahrens zio_vdev_io_bypass(zio); 287fa9e4066Sahrens 288fa9e4066Sahrens mutex_exit(&vc->vc_lock); 289fa9e4066Sahrens zio_next_stage(zio); 290fa9e4066Sahrens return (0); 291fa9e4066Sahrens } 292fa9e4066Sahrens 293*fdb2e906Sek if (!(zio->io_flags & ZIO_FLAG_METADATA)) { 294*fdb2e906Sek mutex_exit(&vc->vc_lock); 295*fdb2e906Sek return (EINVAL); 296*fdb2e906Sek } 297*fdb2e906Sek 298fa9e4066Sahrens ve = vdev_cache_allocate(zio); 299fa9e4066Sahrens 300fa9e4066Sahrens if (ve == NULL) { 301fa9e4066Sahrens mutex_exit(&vc->vc_lock); 302fa9e4066Sahrens return (ENOMEM); 303fa9e4066Sahrens } 304fa9e4066Sahrens 305fa9e4066Sahrens fio = zio_vdev_child_io(zio, NULL, zio->io_vd, cache_offset, 306614409b5Sahrens ve->ve_data, VCBS, ZIO_TYPE_READ, ZIO_PRIORITY_CACHE_FILL, 307ea8dc4b6Seschrock ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_PROPAGATE | 308ea8dc4b6Seschrock ZIO_FLAG_DONT_RETRY | ZIO_FLAG_NOBOOKMARK, 309fa9e4066Sahrens vdev_cache_fill, ve); 310fa9e4066Sahrens 311fa9e4066Sahrens ve->ve_fill_io = fio; 312fa9e4066Sahrens fio->io_delegate_list = zio; 313fa9e4066Sahrens zio_vdev_io_bypass(zio); 314fa9e4066Sahrens 315fa9e4066Sahrens mutex_exit(&vc->vc_lock); 316fa9e4066Sahrens zio_nowait(fio); 317fa9e4066Sahrens 318fa9e4066Sahrens return (0); 319fa9e4066Sahrens } 320fa9e4066Sahrens 321fa9e4066Sahrens /* 322fa9e4066Sahrens * Update cache contents upon write completion. 323fa9e4066Sahrens */ 324fa9e4066Sahrens void 325fa9e4066Sahrens vdev_cache_write(zio_t *zio) 326fa9e4066Sahrens { 327fa9e4066Sahrens vdev_cache_t *vc = &zio->io_vd->vdev_cache; 328fa9e4066Sahrens vdev_cache_entry_t *ve, ve_search; 329fa9e4066Sahrens uint64_t io_start = zio->io_offset; 330fa9e4066Sahrens uint64_t io_end = io_start + zio->io_size; 331614409b5Sahrens uint64_t min_offset = P2ALIGN(io_start, VCBS); 332614409b5Sahrens uint64_t max_offset = P2ROUNDUP(io_end, VCBS); 333fa9e4066Sahrens avl_index_t where; 334fa9e4066Sahrens 335fa9e4066Sahrens ASSERT(zio->io_type == ZIO_TYPE_WRITE); 336fa9e4066Sahrens 337fa9e4066Sahrens mutex_enter(&vc->vc_lock); 338fa9e4066Sahrens 339fa9e4066Sahrens ve_search.ve_offset = min_offset; 340fa9e4066Sahrens ve = avl_find(&vc->vc_offset_tree, &ve_search, &where); 341fa9e4066Sahrens 342fa9e4066Sahrens if (ve == NULL) 343fa9e4066Sahrens ve = avl_nearest(&vc->vc_offset_tree, where, AVL_AFTER); 344fa9e4066Sahrens 345fa9e4066Sahrens while (ve != NULL && ve->ve_offset < max_offset) { 346fa9e4066Sahrens uint64_t start = MAX(ve->ve_offset, io_start); 347614409b5Sahrens uint64_t end = MIN(ve->ve_offset + VCBS, io_end); 348fa9e4066Sahrens 349fa9e4066Sahrens if (ve->ve_fill_io != NULL) { 350fa9e4066Sahrens ve->ve_missed_update = 1; 351fa9e4066Sahrens } else { 352fa9e4066Sahrens bcopy((char *)zio->io_data + start - io_start, 353fa9e4066Sahrens ve->ve_data + start - ve->ve_offset, end - start); 354fa9e4066Sahrens } 355fa9e4066Sahrens ve = AVL_NEXT(&vc->vc_offset_tree, ve); 356fa9e4066Sahrens } 357fa9e4066Sahrens mutex_exit(&vc->vc_lock); 358fa9e4066Sahrens } 359fa9e4066Sahrens 3603d7072f8Seschrock void 3613d7072f8Seschrock vdev_cache_purge(vdev_t *vd) 3623d7072f8Seschrock { 3633d7072f8Seschrock vdev_cache_t *vc = &vd->vdev_cache; 3643d7072f8Seschrock vdev_cache_entry_t *ve; 3653d7072f8Seschrock 3663d7072f8Seschrock mutex_enter(&vc->vc_lock); 3673d7072f8Seschrock while ((ve = avl_first(&vc->vc_offset_tree)) != NULL) 3683d7072f8Seschrock vdev_cache_evict(vc, ve); 3693d7072f8Seschrock mutex_exit(&vc->vc_lock); 3703d7072f8Seschrock } 3713d7072f8Seschrock 372fa9e4066Sahrens void 373fa9e4066Sahrens vdev_cache_init(vdev_t *vd) 374fa9e4066Sahrens { 375fa9e4066Sahrens vdev_cache_t *vc = &vd->vdev_cache; 376fa9e4066Sahrens 377fa9e4066Sahrens mutex_init(&vc->vc_lock, NULL, MUTEX_DEFAULT, NULL); 378fa9e4066Sahrens 379fa9e4066Sahrens avl_create(&vc->vc_offset_tree, vdev_cache_offset_compare, 380fa9e4066Sahrens sizeof (vdev_cache_entry_t), 381fa9e4066Sahrens offsetof(struct vdev_cache_entry, ve_offset_node)); 382fa9e4066Sahrens 383fa9e4066Sahrens avl_create(&vc->vc_lastused_tree, vdev_cache_lastused_compare, 384fa9e4066Sahrens sizeof (vdev_cache_entry_t), 385fa9e4066Sahrens offsetof(struct vdev_cache_entry, ve_lastused_node)); 386fa9e4066Sahrens } 387fa9e4066Sahrens 388fa9e4066Sahrens void 389fa9e4066Sahrens vdev_cache_fini(vdev_t *vd) 390fa9e4066Sahrens { 391fa9e4066Sahrens vdev_cache_t *vc = &vd->vdev_cache; 392fa9e4066Sahrens 3933d7072f8Seschrock vdev_cache_purge(vd); 394fa9e4066Sahrens 395fa9e4066Sahrens avl_destroy(&vc->vc_offset_tree); 396fa9e4066Sahrens avl_destroy(&vc->vc_lastused_tree); 397fa9e4066Sahrens 398fa9e4066Sahrens mutex_destroy(&vc->vc_lock); 399fa9e4066Sahrens } 400