1fa9e4066Sahrens /* 2fa9e4066Sahrens * CDDL HEADER START 3fa9e4066Sahrens * 4fa9e4066Sahrens * The contents of this file are subject to the terms of the 5ea8dc4b6Seschrock * Common Development and Distribution License (the "License"). 6ea8dc4b6Seschrock * You may not use this file except in compliance with the License. 7fa9e4066Sahrens * 8fa9e4066Sahrens * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9fa9e4066Sahrens * or http://www.opensolaris.org/os/licensing. 10fa9e4066Sahrens * See the License for the specific language governing permissions 11fa9e4066Sahrens * and limitations under the License. 12fa9e4066Sahrens * 13fa9e4066Sahrens * When distributing Covered Code, include this CDDL HEADER in each 14fa9e4066Sahrens * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15fa9e4066Sahrens * If applicable, add the following below this CDDL HEADER, with the 16fa9e4066Sahrens * fields enclosed by brackets "[]" replaced with your own identifying 17fa9e4066Sahrens * information: Portions Copyright [yyyy] [name of copyright owner] 18fa9e4066Sahrens * 19fa9e4066Sahrens * CDDL HEADER END 20fa9e4066Sahrens */ 21fa9e4066Sahrens /* 22ea8dc4b6Seschrock * Copyright 2006 Sun Microsystems, Inc. All rights reserved. 23fa9e4066Sahrens * Use is subject to license terms. 24fa9e4066Sahrens */ 25fa9e4066Sahrens 26fa9e4066Sahrens #pragma ident "%Z%%M% %I% %E% SMI" 27fa9e4066Sahrens 28fa9e4066Sahrens #include <sys/zfs_context.h> 29ea8dc4b6Seschrock #include <sys/fm/fs/zfs.h> 30fa9e4066Sahrens #include <sys/spa.h> 31fa9e4066Sahrens #include <sys/txg.h> 32fa9e4066Sahrens #include <sys/spa_impl.h> 33fa9e4066Sahrens #include <sys/vdev_impl.h> 34fa9e4066Sahrens #include <sys/zio_impl.h> 35fa9e4066Sahrens #include <sys/zio_compress.h> 36fa9e4066Sahrens #include <sys/zio_checksum.h> 37fa9e4066Sahrens 38fa9e4066Sahrens /* 39fa9e4066Sahrens * ========================================================================== 40fa9e4066Sahrens * I/O priority table 41fa9e4066Sahrens * ========================================================================== 42fa9e4066Sahrens */ 43fa9e4066Sahrens uint8_t zio_priority_table[ZIO_PRIORITY_TABLE_SIZE] = { 44fa9e4066Sahrens 0, /* ZIO_PRIORITY_NOW */ 45fa9e4066Sahrens 0, /* ZIO_PRIORITY_SYNC_READ */ 46fa9e4066Sahrens 0, /* ZIO_PRIORITY_SYNC_WRITE */ 47fa9e4066Sahrens 6, /* ZIO_PRIORITY_ASYNC_READ */ 48fa9e4066Sahrens 4, /* ZIO_PRIORITY_ASYNC_WRITE */ 49fa9e4066Sahrens 4, /* ZIO_PRIORITY_FREE */ 50fa9e4066Sahrens 0, /* ZIO_PRIORITY_CACHE_FILL */ 51fa9e4066Sahrens 0, /* ZIO_PRIORITY_LOG_WRITE */ 52fa9e4066Sahrens 10, /* ZIO_PRIORITY_RESILVER */ 53fa9e4066Sahrens 20, /* ZIO_PRIORITY_SCRUB */ 54fa9e4066Sahrens }; 55fa9e4066Sahrens 56fa9e4066Sahrens /* 57fa9e4066Sahrens * ========================================================================== 58fa9e4066Sahrens * I/O type descriptions 59fa9e4066Sahrens * ========================================================================== 60fa9e4066Sahrens */ 61fa9e4066Sahrens char *zio_type_name[ZIO_TYPES] = { 62fa9e4066Sahrens "null", "read", "write", "free", "claim", "ioctl" }; 63fa9e4066Sahrens 64fa9e4066Sahrens /* At or above this size, force gang blocking - for testing */ 65fa9e4066Sahrens uint64_t zio_gang_bang = SPA_MAXBLOCKSIZE + 1; 66fa9e4066Sahrens 67fa9e4066Sahrens typedef struct zio_sync_pass { 68fa9e4066Sahrens int zp_defer_free; /* defer frees after this pass */ 69fa9e4066Sahrens int zp_dontcompress; /* don't compress after this pass */ 70fa9e4066Sahrens int zp_rewrite; /* rewrite new bps after this pass */ 71fa9e4066Sahrens } zio_sync_pass_t; 72fa9e4066Sahrens 73fa9e4066Sahrens zio_sync_pass_t zio_sync_pass = { 74fa9e4066Sahrens 1, /* zp_defer_free */ 75fa9e4066Sahrens 4, /* zp_dontcompress */ 76fa9e4066Sahrens 1, /* zp_rewrite */ 77fa9e4066Sahrens }; 78fa9e4066Sahrens 79fa9e4066Sahrens /* 80fa9e4066Sahrens * ========================================================================== 81fa9e4066Sahrens * I/O kmem caches 82fa9e4066Sahrens * ========================================================================== 83fa9e4066Sahrens */ 84fa9e4066Sahrens kmem_cache_t *zio_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT]; 85fa9e4066Sahrens 86fa9e4066Sahrens void 87fa9e4066Sahrens zio_init(void) 88fa9e4066Sahrens { 89fa9e4066Sahrens size_t c; 90fa9e4066Sahrens 91fa9e4066Sahrens /* 92fa9e4066Sahrens * For small buffers, we want a cache for each multiple of 93fa9e4066Sahrens * SPA_MINBLOCKSIZE. For medium-size buffers, we want a cache 94fa9e4066Sahrens * for each quarter-power of 2. For large buffers, we want 95fa9e4066Sahrens * a cache for each multiple of PAGESIZE. 96fa9e4066Sahrens */ 97fa9e4066Sahrens for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) { 98fa9e4066Sahrens size_t size = (c + 1) << SPA_MINBLOCKSHIFT; 99fa9e4066Sahrens size_t p2 = size; 100fa9e4066Sahrens size_t align = 0; 101fa9e4066Sahrens 102fa9e4066Sahrens while (p2 & (p2 - 1)) 103fa9e4066Sahrens p2 &= p2 - 1; 104fa9e4066Sahrens 105fa9e4066Sahrens if (size <= 4 * SPA_MINBLOCKSIZE) { 106fa9e4066Sahrens align = SPA_MINBLOCKSIZE; 107fa9e4066Sahrens } else if (P2PHASE(size, PAGESIZE) == 0) { 108fa9e4066Sahrens align = PAGESIZE; 109fa9e4066Sahrens } else if (P2PHASE(size, p2 >> 2) == 0) { 110fa9e4066Sahrens align = p2 >> 2; 111fa9e4066Sahrens } 112fa9e4066Sahrens 113fa9e4066Sahrens if (align != 0) { 114fa9e4066Sahrens char name[30]; 115*5ad82045Snd (void) sprintf(name, "zio_buf_%lu", (ulong_t)size); 116fa9e4066Sahrens zio_buf_cache[c] = kmem_cache_create(name, size, 117a0965f35Sbonwick align, NULL, NULL, NULL, NULL, NULL, KMC_NODEBUG); 118fa9e4066Sahrens dprintf("creating cache for size %5lx align %5lx\n", 119fa9e4066Sahrens size, align); 120fa9e4066Sahrens } 121fa9e4066Sahrens } 122fa9e4066Sahrens 123fa9e4066Sahrens while (--c != 0) { 124fa9e4066Sahrens ASSERT(zio_buf_cache[c] != NULL); 125fa9e4066Sahrens if (zio_buf_cache[c - 1] == NULL) 126fa9e4066Sahrens zio_buf_cache[c - 1] = zio_buf_cache[c]; 127fa9e4066Sahrens } 128ea8dc4b6Seschrock 129ea8dc4b6Seschrock zio_inject_init(); 130fa9e4066Sahrens } 131fa9e4066Sahrens 132fa9e4066Sahrens void 133fa9e4066Sahrens zio_fini(void) 134fa9e4066Sahrens { 135fa9e4066Sahrens size_t c; 136fa9e4066Sahrens kmem_cache_t *last_cache = NULL; 137fa9e4066Sahrens 138fa9e4066Sahrens for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) { 139fa9e4066Sahrens if (zio_buf_cache[c] != last_cache) { 140fa9e4066Sahrens last_cache = zio_buf_cache[c]; 141fa9e4066Sahrens kmem_cache_destroy(zio_buf_cache[c]); 142fa9e4066Sahrens } 143fa9e4066Sahrens zio_buf_cache[c] = NULL; 144fa9e4066Sahrens } 145ea8dc4b6Seschrock 146ea8dc4b6Seschrock zio_inject_fini(); 147fa9e4066Sahrens } 148fa9e4066Sahrens 149fa9e4066Sahrens /* 150fa9e4066Sahrens * ========================================================================== 151fa9e4066Sahrens * Allocate and free I/O buffers 152fa9e4066Sahrens * ========================================================================== 153fa9e4066Sahrens */ 154fa9e4066Sahrens void * 155fa9e4066Sahrens zio_buf_alloc(size_t size) 156fa9e4066Sahrens { 157fa9e4066Sahrens size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; 158fa9e4066Sahrens 159fa9e4066Sahrens ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); 160fa9e4066Sahrens 161fa9e4066Sahrens return (kmem_cache_alloc(zio_buf_cache[c], KM_SLEEP)); 162fa9e4066Sahrens } 163fa9e4066Sahrens 164fa9e4066Sahrens void 165fa9e4066Sahrens zio_buf_free(void *buf, size_t size) 166fa9e4066Sahrens { 167fa9e4066Sahrens size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; 168fa9e4066Sahrens 169fa9e4066Sahrens ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); 170fa9e4066Sahrens 171fa9e4066Sahrens kmem_cache_free(zio_buf_cache[c], buf); 172fa9e4066Sahrens } 173fa9e4066Sahrens 174fa9e4066Sahrens /* 175fa9e4066Sahrens * ========================================================================== 176fa9e4066Sahrens * Push and pop I/O transform buffers 177fa9e4066Sahrens * ========================================================================== 178fa9e4066Sahrens */ 179fa9e4066Sahrens static void 180fa9e4066Sahrens zio_push_transform(zio_t *zio, void *data, uint64_t size, uint64_t bufsize) 181fa9e4066Sahrens { 182fa9e4066Sahrens zio_transform_t *zt = kmem_alloc(sizeof (zio_transform_t), KM_SLEEP); 183fa9e4066Sahrens 184fa9e4066Sahrens zt->zt_data = data; 185fa9e4066Sahrens zt->zt_size = size; 186fa9e4066Sahrens zt->zt_bufsize = bufsize; 187fa9e4066Sahrens 188fa9e4066Sahrens zt->zt_next = zio->io_transform_stack; 189fa9e4066Sahrens zio->io_transform_stack = zt; 190fa9e4066Sahrens 191fa9e4066Sahrens zio->io_data = data; 192fa9e4066Sahrens zio->io_size = size; 193fa9e4066Sahrens } 194fa9e4066Sahrens 195fa9e4066Sahrens static void 196fa9e4066Sahrens zio_pop_transform(zio_t *zio, void **data, uint64_t *size, uint64_t *bufsize) 197fa9e4066Sahrens { 198fa9e4066Sahrens zio_transform_t *zt = zio->io_transform_stack; 199fa9e4066Sahrens 200fa9e4066Sahrens *data = zt->zt_data; 201fa9e4066Sahrens *size = zt->zt_size; 202fa9e4066Sahrens *bufsize = zt->zt_bufsize; 203fa9e4066Sahrens 204fa9e4066Sahrens zio->io_transform_stack = zt->zt_next; 205fa9e4066Sahrens kmem_free(zt, sizeof (zio_transform_t)); 206fa9e4066Sahrens 207fa9e4066Sahrens if ((zt = zio->io_transform_stack) != NULL) { 208fa9e4066Sahrens zio->io_data = zt->zt_data; 209fa9e4066Sahrens zio->io_size = zt->zt_size; 210fa9e4066Sahrens } 211fa9e4066Sahrens } 212fa9e4066Sahrens 213fa9e4066Sahrens static void 214fa9e4066Sahrens zio_clear_transform_stack(zio_t *zio) 215fa9e4066Sahrens { 216fa9e4066Sahrens void *data; 217fa9e4066Sahrens uint64_t size, bufsize; 218fa9e4066Sahrens 219fa9e4066Sahrens ASSERT(zio->io_transform_stack != NULL); 220fa9e4066Sahrens 221fa9e4066Sahrens zio_pop_transform(zio, &data, &size, &bufsize); 222fa9e4066Sahrens while (zio->io_transform_stack != NULL) { 223fa9e4066Sahrens zio_buf_free(data, bufsize); 224fa9e4066Sahrens zio_pop_transform(zio, &data, &size, &bufsize); 225fa9e4066Sahrens } 226fa9e4066Sahrens } 227fa9e4066Sahrens 228fa9e4066Sahrens /* 229fa9e4066Sahrens * ========================================================================== 230fa9e4066Sahrens * Create the various types of I/O (read, write, free) 231fa9e4066Sahrens * ========================================================================== 232fa9e4066Sahrens */ 233fa9e4066Sahrens static zio_t * 234fa9e4066Sahrens zio_create(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, 235fa9e4066Sahrens void *data, uint64_t size, zio_done_func_t *done, void *private, 236fa9e4066Sahrens zio_type_t type, int priority, int flags, uint8_t stage, uint32_t pipeline) 237fa9e4066Sahrens { 238fa9e4066Sahrens zio_t *zio; 239fa9e4066Sahrens 240fa9e4066Sahrens ASSERT3U(size, <=, SPA_MAXBLOCKSIZE); 241fa9e4066Sahrens ASSERT(P2PHASE(size, SPA_MINBLOCKSIZE) == 0); 242fa9e4066Sahrens 243fa9e4066Sahrens zio = kmem_zalloc(sizeof (zio_t), KM_SLEEP); 244fa9e4066Sahrens zio->io_parent = pio; 245fa9e4066Sahrens zio->io_spa = spa; 246fa9e4066Sahrens zio->io_txg = txg; 247fa9e4066Sahrens if (bp != NULL) { 248fa9e4066Sahrens zio->io_bp = bp; 249fa9e4066Sahrens zio->io_bp_copy = *bp; 250fa9e4066Sahrens zio->io_bp_orig = *bp; 251fa9e4066Sahrens } 252fa9e4066Sahrens zio->io_done = done; 253fa9e4066Sahrens zio->io_private = private; 254fa9e4066Sahrens zio->io_type = type; 255fa9e4066Sahrens zio->io_priority = priority; 256fa9e4066Sahrens zio->io_stage = stage; 257fa9e4066Sahrens zio->io_pipeline = pipeline; 258fa9e4066Sahrens zio->io_async_stages = ZIO_ASYNC_PIPELINE_STAGES; 259fa9e4066Sahrens zio->io_timestamp = lbolt64; 260fa9e4066Sahrens zio->io_flags = flags; 261*5ad82045Snd mutex_init(&zio->io_lock, NULL, MUTEX_DEFAULT, NULL); 262fa9e4066Sahrens zio_push_transform(zio, data, size, size); 263fa9e4066Sahrens 264fa9e4066Sahrens if (pio == NULL) { 265fa9e4066Sahrens if (!(flags & ZIO_FLAG_CONFIG_HELD)) 266ea8dc4b6Seschrock spa_config_enter(zio->io_spa, RW_READER, zio); 267fa9e4066Sahrens zio->io_root = zio; 268fa9e4066Sahrens } else { 269fa9e4066Sahrens zio->io_root = pio->io_root; 270ea8dc4b6Seschrock if (!(flags & ZIO_FLAG_NOBOOKMARK)) 271ea8dc4b6Seschrock zio->io_logical = pio->io_logical; 272fa9e4066Sahrens mutex_enter(&pio->io_lock); 273fa9e4066Sahrens if (stage < ZIO_STAGE_READY) 274fa9e4066Sahrens pio->io_children_notready++; 275fa9e4066Sahrens pio->io_children_notdone++; 276fa9e4066Sahrens zio->io_sibling_next = pio->io_child; 277fa9e4066Sahrens zio->io_sibling_prev = NULL; 278fa9e4066Sahrens if (pio->io_child != NULL) 279fa9e4066Sahrens pio->io_child->io_sibling_prev = zio; 280fa9e4066Sahrens pio->io_child = zio; 28144cd46caSbillm zio->io_ndvas = pio->io_ndvas; 282fa9e4066Sahrens mutex_exit(&pio->io_lock); 283fa9e4066Sahrens } 284fa9e4066Sahrens 285fa9e4066Sahrens return (zio); 286fa9e4066Sahrens } 287fa9e4066Sahrens 288fa9e4066Sahrens zio_t * 289fa9e4066Sahrens zio_null(zio_t *pio, spa_t *spa, zio_done_func_t *done, void *private, 290fa9e4066Sahrens int flags) 291fa9e4066Sahrens { 292fa9e4066Sahrens zio_t *zio; 293fa9e4066Sahrens 294fa9e4066Sahrens zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private, 295fa9e4066Sahrens ZIO_TYPE_NULL, ZIO_PRIORITY_NOW, flags, ZIO_STAGE_OPEN, 296fa9e4066Sahrens ZIO_WAIT_FOR_CHILDREN_PIPELINE); 297fa9e4066Sahrens 298fa9e4066Sahrens return (zio); 299fa9e4066Sahrens } 300fa9e4066Sahrens 301fa9e4066Sahrens zio_t * 302fa9e4066Sahrens zio_root(spa_t *spa, zio_done_func_t *done, void *private, int flags) 303fa9e4066Sahrens { 304fa9e4066Sahrens return (zio_null(NULL, spa, done, private, flags)); 305fa9e4066Sahrens } 306fa9e4066Sahrens 307fa9e4066Sahrens zio_t * 308fa9e4066Sahrens zio_read(zio_t *pio, spa_t *spa, blkptr_t *bp, void *data, 309fa9e4066Sahrens uint64_t size, zio_done_func_t *done, void *private, 310ea8dc4b6Seschrock int priority, int flags, zbookmark_t *zb) 311fa9e4066Sahrens { 312fa9e4066Sahrens zio_t *zio; 313fa9e4066Sahrens 314fa9e4066Sahrens ASSERT3U(size, ==, BP_GET_LSIZE(bp)); 315fa9e4066Sahrens 316fa9e4066Sahrens zio = zio_create(pio, spa, bp->blk_birth, bp, data, size, done, private, 317fa9e4066Sahrens ZIO_TYPE_READ, priority, flags, ZIO_STAGE_OPEN, ZIO_READ_PIPELINE); 318ea8dc4b6Seschrock zio->io_bookmark = *zb; 319ea8dc4b6Seschrock 320ea8dc4b6Seschrock zio->io_logical = zio; 321fa9e4066Sahrens 322fa9e4066Sahrens /* 323fa9e4066Sahrens * Work off our copy of the bp so the caller can free it. 324fa9e4066Sahrens */ 325fa9e4066Sahrens zio->io_bp = &zio->io_bp_copy; 326fa9e4066Sahrens 327fa9e4066Sahrens if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF) { 328fa9e4066Sahrens uint64_t csize = BP_GET_PSIZE(bp); 329fa9e4066Sahrens void *cbuf = zio_buf_alloc(csize); 330fa9e4066Sahrens 331fa9e4066Sahrens zio_push_transform(zio, cbuf, csize, csize); 332fa9e4066Sahrens zio->io_pipeline |= 1U << ZIO_STAGE_READ_DECOMPRESS; 333fa9e4066Sahrens } 334fa9e4066Sahrens 33544cd46caSbillm if (BP_IS_GANG(bp)) { 336fa9e4066Sahrens uint64_t gsize = SPA_GANGBLOCKSIZE; 337fa9e4066Sahrens void *gbuf = zio_buf_alloc(gsize); 338fa9e4066Sahrens 339fa9e4066Sahrens zio_push_transform(zio, gbuf, gsize, gsize); 340fa9e4066Sahrens zio->io_pipeline |= 1U << ZIO_STAGE_READ_GANG_MEMBERS; 341fa9e4066Sahrens } 342fa9e4066Sahrens 343fa9e4066Sahrens return (zio); 344fa9e4066Sahrens } 345fa9e4066Sahrens 346fa9e4066Sahrens zio_t * 34744cd46caSbillm zio_write(zio_t *pio, spa_t *spa, int checksum, int compress, int ncopies, 348fa9e4066Sahrens uint64_t txg, blkptr_t *bp, void *data, uint64_t size, 349ea8dc4b6Seschrock zio_done_func_t *done, void *private, int priority, int flags, 350ea8dc4b6Seschrock zbookmark_t *zb) 351fa9e4066Sahrens { 352fa9e4066Sahrens zio_t *zio; 353fa9e4066Sahrens 354fa9e4066Sahrens ASSERT(checksum >= ZIO_CHECKSUM_OFF && 355fa9e4066Sahrens checksum < ZIO_CHECKSUM_FUNCTIONS); 356fa9e4066Sahrens 357fa9e4066Sahrens ASSERT(compress >= ZIO_COMPRESS_OFF && 358fa9e4066Sahrens compress < ZIO_COMPRESS_FUNCTIONS); 359fa9e4066Sahrens 360fa9e4066Sahrens zio = zio_create(pio, spa, txg, bp, data, size, done, private, 361fa9e4066Sahrens ZIO_TYPE_WRITE, priority, flags, 362fa9e4066Sahrens ZIO_STAGE_OPEN, ZIO_WRITE_PIPELINE); 363fa9e4066Sahrens 364ea8dc4b6Seschrock zio->io_bookmark = *zb; 365ea8dc4b6Seschrock 366ea8dc4b6Seschrock zio->io_logical = zio; 367ea8dc4b6Seschrock 368fa9e4066Sahrens zio->io_checksum = checksum; 369fa9e4066Sahrens zio->io_compress = compress; 37044cd46caSbillm zio->io_ndvas = ncopies; 371fa9e4066Sahrens 372fa9e4066Sahrens if (compress != ZIO_COMPRESS_OFF) 373fa9e4066Sahrens zio->io_async_stages |= 1U << ZIO_STAGE_WRITE_COMPRESS; 374fa9e4066Sahrens 375fa9e4066Sahrens if (bp->blk_birth != txg) { 376fa9e4066Sahrens /* XXX the bp usually (always?) gets re-zeroed later */ 377fa9e4066Sahrens BP_ZERO(bp); 378fa9e4066Sahrens BP_SET_LSIZE(bp, size); 379fa9e4066Sahrens BP_SET_PSIZE(bp, size); 38044cd46caSbillm } else { 38144cd46caSbillm /* Make sure someone doesn't change their mind on overwrites */ 38244cd46caSbillm ASSERT(MIN(zio->io_ndvas + BP_IS_GANG(bp), 38344cd46caSbillm spa_max_replication(spa)) == BP_GET_NDVAS(bp)); 384fa9e4066Sahrens } 385fa9e4066Sahrens 386fa9e4066Sahrens return (zio); 387fa9e4066Sahrens } 388fa9e4066Sahrens 389fa9e4066Sahrens zio_t * 390fa9e4066Sahrens zio_rewrite(zio_t *pio, spa_t *spa, int checksum, 391fa9e4066Sahrens uint64_t txg, blkptr_t *bp, void *data, uint64_t size, 392ea8dc4b6Seschrock zio_done_func_t *done, void *private, int priority, int flags, 393ea8dc4b6Seschrock zbookmark_t *zb) 394fa9e4066Sahrens { 395fa9e4066Sahrens zio_t *zio; 396fa9e4066Sahrens 397fa9e4066Sahrens zio = zio_create(pio, spa, txg, bp, data, size, done, private, 398fa9e4066Sahrens ZIO_TYPE_WRITE, priority, flags, 399fa9e4066Sahrens ZIO_STAGE_OPEN, ZIO_REWRITE_PIPELINE); 400fa9e4066Sahrens 401ea8dc4b6Seschrock zio->io_bookmark = *zb; 402fa9e4066Sahrens zio->io_checksum = checksum; 403fa9e4066Sahrens zio->io_compress = ZIO_COMPRESS_OFF; 404fa9e4066Sahrens 40544cd46caSbillm if (pio != NULL) 40644cd46caSbillm ASSERT3U(zio->io_ndvas, <=, BP_GET_NDVAS(bp)); 40744cd46caSbillm 408fa9e4066Sahrens return (zio); 409fa9e4066Sahrens } 410fa9e4066Sahrens 411fa9e4066Sahrens static zio_t * 412fa9e4066Sahrens zio_write_allocate(zio_t *pio, spa_t *spa, int checksum, 413fa9e4066Sahrens uint64_t txg, blkptr_t *bp, void *data, uint64_t size, 414fa9e4066Sahrens zio_done_func_t *done, void *private, int priority, int flags) 415fa9e4066Sahrens { 416fa9e4066Sahrens zio_t *zio; 417fa9e4066Sahrens 418fa9e4066Sahrens BP_ZERO(bp); 419fa9e4066Sahrens BP_SET_LSIZE(bp, size); 420fa9e4066Sahrens BP_SET_PSIZE(bp, size); 421fa9e4066Sahrens BP_SET_COMPRESS(bp, ZIO_COMPRESS_OFF); 422fa9e4066Sahrens 423fa9e4066Sahrens zio = zio_create(pio, spa, txg, bp, data, size, done, private, 424fa9e4066Sahrens ZIO_TYPE_WRITE, priority, flags, 425fa9e4066Sahrens ZIO_STAGE_OPEN, ZIO_WRITE_ALLOCATE_PIPELINE); 426fa9e4066Sahrens 427fa9e4066Sahrens zio->io_checksum = checksum; 428fa9e4066Sahrens zio->io_compress = ZIO_COMPRESS_OFF; 429fa9e4066Sahrens 430fa9e4066Sahrens return (zio); 431fa9e4066Sahrens } 432fa9e4066Sahrens 433fa9e4066Sahrens zio_t * 434fa9e4066Sahrens zio_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, 435fa9e4066Sahrens zio_done_func_t *done, void *private) 436fa9e4066Sahrens { 437fa9e4066Sahrens zio_t *zio; 438fa9e4066Sahrens 439fa9e4066Sahrens ASSERT(!BP_IS_HOLE(bp)); 440fa9e4066Sahrens 441fa9e4066Sahrens if (txg == spa->spa_syncing_txg && 442fa9e4066Sahrens spa->spa_sync_pass > zio_sync_pass.zp_defer_free) { 443fa9e4066Sahrens bplist_enqueue_deferred(&spa->spa_sync_bplist, bp); 444fa9e4066Sahrens return (zio_null(pio, spa, NULL, NULL, 0)); 445fa9e4066Sahrens } 446fa9e4066Sahrens 447fa9e4066Sahrens zio = zio_create(pio, spa, txg, bp, NULL, 0, done, private, 448fa9e4066Sahrens ZIO_TYPE_FREE, ZIO_PRIORITY_FREE, 0, 449fa9e4066Sahrens ZIO_STAGE_OPEN, ZIO_FREE_PIPELINE); 450fa9e4066Sahrens 451fa9e4066Sahrens zio->io_bp = &zio->io_bp_copy; 452fa9e4066Sahrens 453fa9e4066Sahrens return (zio); 454fa9e4066Sahrens } 455fa9e4066Sahrens 456fa9e4066Sahrens zio_t * 457fa9e4066Sahrens zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, 458fa9e4066Sahrens zio_done_func_t *done, void *private) 459fa9e4066Sahrens { 460fa9e4066Sahrens zio_t *zio; 461fa9e4066Sahrens 462fa9e4066Sahrens /* 463fa9e4066Sahrens * A claim is an allocation of a specific block. Claims are needed 464fa9e4066Sahrens * to support immediate writes in the intent log. The issue is that 465fa9e4066Sahrens * immediate writes contain committed data, but in a txg that was 466fa9e4066Sahrens * *not* committed. Upon opening the pool after an unclean shutdown, 467fa9e4066Sahrens * the intent log claims all blocks that contain immediate write data 468fa9e4066Sahrens * so that the SPA knows they're in use. 469fa9e4066Sahrens * 470fa9e4066Sahrens * All claims *must* be resolved in the first txg -- before the SPA 471fa9e4066Sahrens * starts allocating blocks -- so that nothing is allocated twice. 472fa9e4066Sahrens */ 473fa9e4066Sahrens ASSERT3U(spa->spa_uberblock.ub_rootbp.blk_birth, <, spa_first_txg(spa)); 474fa9e4066Sahrens ASSERT3U(spa_first_txg(spa), <=, txg); 475fa9e4066Sahrens 476fa9e4066Sahrens zio = zio_create(pio, spa, txg, bp, NULL, 0, done, private, 477fa9e4066Sahrens ZIO_TYPE_CLAIM, ZIO_PRIORITY_NOW, 0, 478fa9e4066Sahrens ZIO_STAGE_OPEN, ZIO_CLAIM_PIPELINE); 479fa9e4066Sahrens 480fa9e4066Sahrens zio->io_bp = &zio->io_bp_copy; 481fa9e4066Sahrens 482fa9e4066Sahrens return (zio); 483fa9e4066Sahrens } 484fa9e4066Sahrens 485fa9e4066Sahrens zio_t * 486fa9e4066Sahrens zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd, 487fa9e4066Sahrens zio_done_func_t *done, void *private, int priority, int flags) 488fa9e4066Sahrens { 489fa9e4066Sahrens zio_t *zio; 490fa9e4066Sahrens int c; 491fa9e4066Sahrens 492fa9e4066Sahrens if (vd->vdev_children == 0) { 493fa9e4066Sahrens zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private, 494fa9e4066Sahrens ZIO_TYPE_IOCTL, priority, flags, 495fa9e4066Sahrens ZIO_STAGE_OPEN, ZIO_IOCTL_PIPELINE); 496fa9e4066Sahrens 497fa9e4066Sahrens zio->io_vd = vd; 498fa9e4066Sahrens zio->io_cmd = cmd; 499fa9e4066Sahrens } else { 500fa9e4066Sahrens zio = zio_null(pio, spa, NULL, NULL, flags); 501fa9e4066Sahrens 502fa9e4066Sahrens for (c = 0; c < vd->vdev_children; c++) 503fa9e4066Sahrens zio_nowait(zio_ioctl(zio, spa, vd->vdev_child[c], cmd, 504fa9e4066Sahrens done, private, priority, flags)); 505fa9e4066Sahrens } 506fa9e4066Sahrens 507fa9e4066Sahrens return (zio); 508fa9e4066Sahrens } 509fa9e4066Sahrens 510fa9e4066Sahrens static void 511fa9e4066Sahrens zio_phys_bp_init(vdev_t *vd, blkptr_t *bp, uint64_t offset, uint64_t size, 512fa9e4066Sahrens int checksum) 513fa9e4066Sahrens { 514fa9e4066Sahrens ASSERT(vd->vdev_children == 0); 515fa9e4066Sahrens 516fa9e4066Sahrens ASSERT(size <= SPA_MAXBLOCKSIZE); 517fa9e4066Sahrens ASSERT(P2PHASE(size, SPA_MINBLOCKSIZE) == 0); 518fa9e4066Sahrens ASSERT(P2PHASE(offset, SPA_MINBLOCKSIZE) == 0); 519fa9e4066Sahrens 520fa9e4066Sahrens ASSERT(offset + size <= VDEV_LABEL_START_SIZE || 521fa9e4066Sahrens offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE); 522fa9e4066Sahrens ASSERT3U(offset + size, <=, vd->vdev_psize); 523fa9e4066Sahrens 524fa9e4066Sahrens BP_ZERO(bp); 525fa9e4066Sahrens 526fa9e4066Sahrens BP_SET_LSIZE(bp, size); 527fa9e4066Sahrens BP_SET_PSIZE(bp, size); 528fa9e4066Sahrens 529fa9e4066Sahrens BP_SET_CHECKSUM(bp, checksum); 530fa9e4066Sahrens BP_SET_COMPRESS(bp, ZIO_COMPRESS_OFF); 531fa9e4066Sahrens BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER); 532fa9e4066Sahrens 533fa9e4066Sahrens if (checksum != ZIO_CHECKSUM_OFF) 534fa9e4066Sahrens ZIO_SET_CHECKSUM(&bp->blk_cksum, offset, 0, 0, 0); 535fa9e4066Sahrens } 536fa9e4066Sahrens 537fa9e4066Sahrens zio_t * 538fa9e4066Sahrens zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, 539fa9e4066Sahrens void *data, int checksum, zio_done_func_t *done, void *private, 540fa9e4066Sahrens int priority, int flags) 541fa9e4066Sahrens { 542fa9e4066Sahrens zio_t *zio; 543fa9e4066Sahrens blkptr_t blk; 544fa9e4066Sahrens 545fa9e4066Sahrens zio_phys_bp_init(vd, &blk, offset, size, checksum); 546fa9e4066Sahrens 547fa9e4066Sahrens zio = zio_create(pio, vd->vdev_spa, 0, &blk, data, size, done, private, 548fa9e4066Sahrens ZIO_TYPE_READ, priority, flags | ZIO_FLAG_PHYSICAL, 549fa9e4066Sahrens ZIO_STAGE_OPEN, ZIO_READ_PHYS_PIPELINE); 550fa9e4066Sahrens 551fa9e4066Sahrens zio->io_vd = vd; 552fa9e4066Sahrens zio->io_offset = offset; 553fa9e4066Sahrens 554fa9e4066Sahrens /* 555fa9e4066Sahrens * Work off our copy of the bp so the caller can free it. 556fa9e4066Sahrens */ 557fa9e4066Sahrens zio->io_bp = &zio->io_bp_copy; 558fa9e4066Sahrens 559fa9e4066Sahrens return (zio); 560fa9e4066Sahrens } 561fa9e4066Sahrens 562fa9e4066Sahrens zio_t * 563fa9e4066Sahrens zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, 564fa9e4066Sahrens void *data, int checksum, zio_done_func_t *done, void *private, 565fa9e4066Sahrens int priority, int flags) 566fa9e4066Sahrens { 567fa9e4066Sahrens zio_block_tail_t *zbt; 568fa9e4066Sahrens void *wbuf; 569fa9e4066Sahrens zio_t *zio; 570fa9e4066Sahrens blkptr_t blk; 571fa9e4066Sahrens 572fa9e4066Sahrens zio_phys_bp_init(vd, &blk, offset, size, checksum); 573fa9e4066Sahrens 574fa9e4066Sahrens zio = zio_create(pio, vd->vdev_spa, 0, &blk, data, size, done, private, 575fa9e4066Sahrens ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_PHYSICAL, 576fa9e4066Sahrens ZIO_STAGE_OPEN, ZIO_WRITE_PHYS_PIPELINE); 577fa9e4066Sahrens 578fa9e4066Sahrens zio->io_vd = vd; 579fa9e4066Sahrens zio->io_offset = offset; 580fa9e4066Sahrens 581fa9e4066Sahrens zio->io_bp = &zio->io_bp_copy; 582fa9e4066Sahrens zio->io_checksum = checksum; 583fa9e4066Sahrens 584fa9e4066Sahrens if (zio_checksum_table[checksum].ci_zbt) { 585fa9e4066Sahrens /* 586fa9e4066Sahrens * zbt checksums are necessarily destructive -- they modify 587fa9e4066Sahrens * one word of the write buffer to hold the verifier/checksum. 588fa9e4066Sahrens * Therefore, we must make a local copy in case the data is 589fa9e4066Sahrens * being written to multiple places. 590fa9e4066Sahrens */ 591fa9e4066Sahrens wbuf = zio_buf_alloc(size); 592fa9e4066Sahrens bcopy(data, wbuf, size); 593fa9e4066Sahrens zio_push_transform(zio, wbuf, size, size); 594fa9e4066Sahrens 595fa9e4066Sahrens zbt = (zio_block_tail_t *)((char *)wbuf + size) - 1; 596fa9e4066Sahrens zbt->zbt_cksum = blk.blk_cksum; 597fa9e4066Sahrens } 598fa9e4066Sahrens 599fa9e4066Sahrens return (zio); 600fa9e4066Sahrens } 601fa9e4066Sahrens 602fa9e4066Sahrens /* 603fa9e4066Sahrens * Create a child I/O to do some work for us. It has no associated bp. 604fa9e4066Sahrens */ 605fa9e4066Sahrens zio_t * 606fa9e4066Sahrens zio_vdev_child_io(zio_t *zio, blkptr_t *bp, vdev_t *vd, uint64_t offset, 607fa9e4066Sahrens void *data, uint64_t size, int type, int priority, int flags, 608fa9e4066Sahrens zio_done_func_t *done, void *private) 609fa9e4066Sahrens { 610fa9e4066Sahrens uint32_t pipeline = ZIO_VDEV_CHILD_PIPELINE; 611fa9e4066Sahrens zio_t *cio; 612fa9e4066Sahrens 613fa9e4066Sahrens if (type == ZIO_TYPE_READ && bp != NULL) { 614fa9e4066Sahrens /* 615fa9e4066Sahrens * If we have the bp, then the child should perform the 616fa9e4066Sahrens * checksum and the parent need not. This pushes error 617fa9e4066Sahrens * detection as close to the leaves as possible and 618fa9e4066Sahrens * eliminates redundant checksums in the interior nodes. 619fa9e4066Sahrens */ 620fa9e4066Sahrens pipeline |= 1U << ZIO_STAGE_CHECKSUM_VERIFY; 621fa9e4066Sahrens zio->io_pipeline &= ~(1U << ZIO_STAGE_CHECKSUM_VERIFY); 622fa9e4066Sahrens } 623fa9e4066Sahrens 624fa9e4066Sahrens cio = zio_create(zio, zio->io_spa, zio->io_txg, bp, data, size, 625fa9e4066Sahrens done, private, type, priority, 626fa9e4066Sahrens (zio->io_flags & ZIO_FLAG_VDEV_INHERIT) | ZIO_FLAG_CANFAIL | flags, 62744cd46caSbillm ZIO_STAGE_VDEV_IO_START - 1, pipeline); 628fa9e4066Sahrens 629fa9e4066Sahrens cio->io_vd = vd; 630fa9e4066Sahrens cio->io_offset = offset; 631fa9e4066Sahrens 632fa9e4066Sahrens return (cio); 633fa9e4066Sahrens } 634fa9e4066Sahrens 635fa9e4066Sahrens /* 636fa9e4066Sahrens * ========================================================================== 637fa9e4066Sahrens * Initiate I/O, either sync or async 638fa9e4066Sahrens * ========================================================================== 639fa9e4066Sahrens */ 640fa9e4066Sahrens int 641fa9e4066Sahrens zio_wait(zio_t *zio) 642fa9e4066Sahrens { 643fa9e4066Sahrens int error; 644fa9e4066Sahrens 645fa9e4066Sahrens ASSERT(zio->io_stage == ZIO_STAGE_OPEN); 646fa9e4066Sahrens 647fa9e4066Sahrens zio->io_waiter = curthread; 648fa9e4066Sahrens 649fa9e4066Sahrens zio_next_stage_async(zio); 650fa9e4066Sahrens 651fa9e4066Sahrens mutex_enter(&zio->io_lock); 652fa9e4066Sahrens while (zio->io_stalled != ZIO_STAGE_DONE) 653fa9e4066Sahrens cv_wait(&zio->io_cv, &zio->io_lock); 654fa9e4066Sahrens mutex_exit(&zio->io_lock); 655fa9e4066Sahrens 656fa9e4066Sahrens error = zio->io_error; 657*5ad82045Snd mutex_destroy(&zio->io_lock); 658fa9e4066Sahrens kmem_free(zio, sizeof (zio_t)); 659fa9e4066Sahrens 660fa9e4066Sahrens return (error); 661fa9e4066Sahrens } 662fa9e4066Sahrens 663fa9e4066Sahrens void 664fa9e4066Sahrens zio_nowait(zio_t *zio) 665fa9e4066Sahrens { 666fa9e4066Sahrens zio_next_stage_async(zio); 667fa9e4066Sahrens } 668fa9e4066Sahrens 669fa9e4066Sahrens /* 670fa9e4066Sahrens * ========================================================================== 671fa9e4066Sahrens * I/O pipeline interlocks: parent/child dependency scoreboarding 672fa9e4066Sahrens * ========================================================================== 673fa9e4066Sahrens */ 674fa9e4066Sahrens static void 675fa9e4066Sahrens zio_wait_for_children(zio_t *zio, uint32_t stage, uint64_t *countp) 676fa9e4066Sahrens { 677fa9e4066Sahrens mutex_enter(&zio->io_lock); 678fa9e4066Sahrens if (*countp == 0) { 679fa9e4066Sahrens ASSERT(zio->io_stalled == 0); 680fa9e4066Sahrens mutex_exit(&zio->io_lock); 681fa9e4066Sahrens zio_next_stage(zio); 682fa9e4066Sahrens } else { 683fa9e4066Sahrens zio->io_stalled = stage; 684fa9e4066Sahrens mutex_exit(&zio->io_lock); 685fa9e4066Sahrens } 686fa9e4066Sahrens } 687fa9e4066Sahrens 688fa9e4066Sahrens static void 689fa9e4066Sahrens zio_notify_parent(zio_t *zio, uint32_t stage, uint64_t *countp) 690fa9e4066Sahrens { 691fa9e4066Sahrens zio_t *pio = zio->io_parent; 692fa9e4066Sahrens 693fa9e4066Sahrens mutex_enter(&pio->io_lock); 694fa9e4066Sahrens if (pio->io_error == 0 && !(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE)) 695fa9e4066Sahrens pio->io_error = zio->io_error; 696fa9e4066Sahrens if (--*countp == 0 && pio->io_stalled == stage) { 697fa9e4066Sahrens pio->io_stalled = 0; 698fa9e4066Sahrens mutex_exit(&pio->io_lock); 699fa9e4066Sahrens zio_next_stage_async(pio); 700fa9e4066Sahrens } else { 701fa9e4066Sahrens mutex_exit(&pio->io_lock); 702fa9e4066Sahrens } 703fa9e4066Sahrens } 704fa9e4066Sahrens 705fa9e4066Sahrens static void 706fa9e4066Sahrens zio_wait_children_ready(zio_t *zio) 707fa9e4066Sahrens { 708fa9e4066Sahrens zio_wait_for_children(zio, ZIO_STAGE_WAIT_CHILDREN_READY, 709fa9e4066Sahrens &zio->io_children_notready); 710fa9e4066Sahrens } 711fa9e4066Sahrens 712fa9e4066Sahrens void 713fa9e4066Sahrens zio_wait_children_done(zio_t *zio) 714fa9e4066Sahrens { 715fa9e4066Sahrens zio_wait_for_children(zio, ZIO_STAGE_WAIT_CHILDREN_DONE, 716fa9e4066Sahrens &zio->io_children_notdone); 717fa9e4066Sahrens } 718fa9e4066Sahrens 719fa9e4066Sahrens static void 720fa9e4066Sahrens zio_ready(zio_t *zio) 721fa9e4066Sahrens { 722fa9e4066Sahrens zio_t *pio = zio->io_parent; 723fa9e4066Sahrens 724fa9e4066Sahrens if (pio != NULL) 725fa9e4066Sahrens zio_notify_parent(zio, ZIO_STAGE_WAIT_CHILDREN_READY, 726fa9e4066Sahrens &pio->io_children_notready); 727fa9e4066Sahrens 728fa9e4066Sahrens if (zio->io_bp) 729fa9e4066Sahrens zio->io_bp_copy = *zio->io_bp; 730fa9e4066Sahrens 731fa9e4066Sahrens zio_next_stage(zio); 732fa9e4066Sahrens } 733fa9e4066Sahrens 734fa9e4066Sahrens static void 735fa9e4066Sahrens zio_done(zio_t *zio) 736fa9e4066Sahrens { 737fa9e4066Sahrens zio_t *pio = zio->io_parent; 738fa9e4066Sahrens spa_t *spa = zio->io_spa; 739fa9e4066Sahrens blkptr_t *bp = zio->io_bp; 740fa9e4066Sahrens vdev_t *vd = zio->io_vd; 741fbabab8fSmaybee char blkbuf[BP_SPRINTF_LEN]; 742fa9e4066Sahrens 743fa9e4066Sahrens ASSERT(zio->io_children_notready == 0); 744fa9e4066Sahrens ASSERT(zio->io_children_notdone == 0); 745fa9e4066Sahrens 746fa9e4066Sahrens if (bp != NULL) { 747fa9e4066Sahrens ASSERT(bp->blk_pad[0] == 0); 748fa9e4066Sahrens ASSERT(bp->blk_pad[1] == 0); 749fa9e4066Sahrens ASSERT(bp->blk_pad[2] == 0); 750fa9e4066Sahrens ASSERT(bcmp(bp, &zio->io_bp_copy, sizeof (blkptr_t)) == 0); 751fa9e4066Sahrens if (zio->io_type == ZIO_TYPE_WRITE && !BP_IS_HOLE(bp) && 75244cd46caSbillm !(zio->io_flags & ZIO_FLAG_IO_REPAIR)) { 753fa9e4066Sahrens ASSERT(!BP_SHOULD_BYTESWAP(bp)); 75444cd46caSbillm if (zio->io_ndvas != 0) 75544cd46caSbillm ASSERT3U(zio->io_ndvas, <=, BP_GET_NDVAS(bp)); 75644cd46caSbillm ASSERT(BP_COUNT_GANG(bp) == 0 || 75744cd46caSbillm (BP_COUNT_GANG(bp) == BP_GET_NDVAS(bp))); 75844cd46caSbillm } 759fa9e4066Sahrens } 760fa9e4066Sahrens 761fa9e4066Sahrens if (vd != NULL) 762fa9e4066Sahrens vdev_stat_update(zio); 763fa9e4066Sahrens 764fa9e4066Sahrens if (zio->io_error) { 765ea8dc4b6Seschrock /* 766ea8dc4b6Seschrock * If this I/O is attached to a particular vdev, 767ea8dc4b6Seschrock * generate an error message describing the I/O failure 768ea8dc4b6Seschrock * at the block level. We ignore these errors if the 769ea8dc4b6Seschrock * device is currently unavailable. 770ea8dc4b6Seschrock */ 771ecc2d604Sbonwick if (zio->io_error != ECKSUM && vd != NULL && !vdev_is_dead(vd)) 772ea8dc4b6Seschrock zfs_ereport_post(FM_EREPORT_ZFS_IO, 773ecc2d604Sbonwick zio->io_spa, vd, zio, 0, 0); 774ea8dc4b6Seschrock 775ea8dc4b6Seschrock if ((zio->io_error == EIO || 776ea8dc4b6Seschrock !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) && 777ea8dc4b6Seschrock zio->io_logical == zio) { 778ea8dc4b6Seschrock /* 779ea8dc4b6Seschrock * For root I/O requests, tell the SPA to log the error 780ea8dc4b6Seschrock * appropriately. Also, generate a logical data 781ea8dc4b6Seschrock * ereport. 782ea8dc4b6Seschrock */ 783ea8dc4b6Seschrock spa_log_error(zio->io_spa, zio); 784ea8dc4b6Seschrock 785ea8dc4b6Seschrock zfs_ereport_post(FM_EREPORT_ZFS_DATA, 786ea8dc4b6Seschrock zio->io_spa, NULL, zio, 0, 0); 787ea8dc4b6Seschrock } 788fa9e4066Sahrens 789ea8dc4b6Seschrock /* 790ea8dc4b6Seschrock * For I/O requests that cannot fail, panic appropriately. 791ea8dc4b6Seschrock */ 792ea8dc4b6Seschrock if (!(zio->io_flags & ZIO_FLAG_CANFAIL)) { 793ea8dc4b6Seschrock sprintf_blkptr(blkbuf, BP_SPRINTF_LEN, 794ea8dc4b6Seschrock bp ? bp : &zio->io_bp_copy); 795ea8dc4b6Seschrock panic("ZFS: %s (%s on %s off %llx: zio %p %s): error " 796ea8dc4b6Seschrock "%d", zio->io_error == ECKSUM ? 797ea8dc4b6Seschrock "bad checksum" : "I/O failure", 798ea8dc4b6Seschrock zio_type_name[zio->io_type], 799ea8dc4b6Seschrock vdev_description(vd), 800ea8dc4b6Seschrock (u_longlong_t)zio->io_offset, 801ea8dc4b6Seschrock zio, blkbuf, zio->io_error); 802ea8dc4b6Seschrock } 803fa9e4066Sahrens } 804fa9e4066Sahrens 805fa9e4066Sahrens zio_clear_transform_stack(zio); 806fa9e4066Sahrens 807fa9e4066Sahrens if (zio->io_done) 808fa9e4066Sahrens zio->io_done(zio); 809fa9e4066Sahrens 810fa9e4066Sahrens ASSERT(zio->io_delegate_list == NULL); 811fa9e4066Sahrens ASSERT(zio->io_delegate_next == NULL); 812fa9e4066Sahrens 813fa9e4066Sahrens if (pio != NULL) { 814fa9e4066Sahrens zio_t *next, *prev; 815fa9e4066Sahrens 816fa9e4066Sahrens mutex_enter(&pio->io_lock); 817fa9e4066Sahrens next = zio->io_sibling_next; 818fa9e4066Sahrens prev = zio->io_sibling_prev; 819fa9e4066Sahrens if (next != NULL) 820fa9e4066Sahrens next->io_sibling_prev = prev; 821fa9e4066Sahrens if (prev != NULL) 822fa9e4066Sahrens prev->io_sibling_next = next; 823fa9e4066Sahrens if (pio->io_child == zio) 824fa9e4066Sahrens pio->io_child = next; 825fa9e4066Sahrens mutex_exit(&pio->io_lock); 826fa9e4066Sahrens 827fa9e4066Sahrens zio_notify_parent(zio, ZIO_STAGE_WAIT_CHILDREN_DONE, 828fa9e4066Sahrens &pio->io_children_notdone); 829fa9e4066Sahrens } 830fa9e4066Sahrens 831fa9e4066Sahrens if (pio == NULL && !(zio->io_flags & ZIO_FLAG_CONFIG_HELD)) 832ea8dc4b6Seschrock spa_config_exit(spa, zio); 833fa9e4066Sahrens 834fa9e4066Sahrens if (zio->io_waiter != NULL) { 835fa9e4066Sahrens mutex_enter(&zio->io_lock); 836fa9e4066Sahrens ASSERT(zio->io_stage == ZIO_STAGE_DONE); 837fa9e4066Sahrens zio->io_stalled = zio->io_stage; 838fa9e4066Sahrens cv_broadcast(&zio->io_cv); 839fa9e4066Sahrens mutex_exit(&zio->io_lock); 840fa9e4066Sahrens } else { 841fa9e4066Sahrens kmem_free(zio, sizeof (zio_t)); 842fa9e4066Sahrens } 843fa9e4066Sahrens } 844fa9e4066Sahrens 845fa9e4066Sahrens /* 846fa9e4066Sahrens * ========================================================================== 847fa9e4066Sahrens * Compression support 848fa9e4066Sahrens * ========================================================================== 849fa9e4066Sahrens */ 850fa9e4066Sahrens static void 851fa9e4066Sahrens zio_write_compress(zio_t *zio) 852fa9e4066Sahrens { 853fa9e4066Sahrens int compress = zio->io_compress; 854fa9e4066Sahrens blkptr_t *bp = zio->io_bp; 855fa9e4066Sahrens void *cbuf; 856fa9e4066Sahrens uint64_t lsize = zio->io_size; 857fa9e4066Sahrens uint64_t csize = lsize; 858fa9e4066Sahrens uint64_t cbufsize = 0; 859fa9e4066Sahrens int pass; 860fa9e4066Sahrens 861fa9e4066Sahrens if (bp->blk_birth == zio->io_txg) { 862fa9e4066Sahrens /* 863fa9e4066Sahrens * We're rewriting an existing block, which means we're 864fa9e4066Sahrens * working on behalf of spa_sync(). For spa_sync() to 865fa9e4066Sahrens * converge, it must eventually be the case that we don't 866fa9e4066Sahrens * have to allocate new blocks. But compression changes 867fa9e4066Sahrens * the blocksize, which forces a reallocate, and makes 868fa9e4066Sahrens * convergence take longer. Therefore, after the first 869fa9e4066Sahrens * few passes, stop compressing to ensure convergence. 870fa9e4066Sahrens */ 871fa9e4066Sahrens pass = spa_sync_pass(zio->io_spa); 872fa9e4066Sahrens if (pass > zio_sync_pass.zp_dontcompress) 873fa9e4066Sahrens compress = ZIO_COMPRESS_OFF; 874fa9e4066Sahrens } else { 875fa9e4066Sahrens ASSERT(BP_IS_HOLE(bp)); 876fa9e4066Sahrens pass = 1; 877fa9e4066Sahrens } 878fa9e4066Sahrens 879fa9e4066Sahrens if (compress != ZIO_COMPRESS_OFF) 880fa9e4066Sahrens if (!zio_compress_data(compress, zio->io_data, zio->io_size, 881fa9e4066Sahrens &cbuf, &csize, &cbufsize)) 882fa9e4066Sahrens compress = ZIO_COMPRESS_OFF; 883fa9e4066Sahrens 884fa9e4066Sahrens if (compress != ZIO_COMPRESS_OFF && csize != 0) 885fa9e4066Sahrens zio_push_transform(zio, cbuf, csize, cbufsize); 886fa9e4066Sahrens 887fa9e4066Sahrens /* 888fa9e4066Sahrens * The final pass of spa_sync() must be all rewrites, but the first 889fa9e4066Sahrens * few passes offer a trade-off: allocating blocks defers convergence, 890fa9e4066Sahrens * but newly allocated blocks are sequential, so they can be written 891fa9e4066Sahrens * to disk faster. Therefore, we allow the first few passes of 892fa9e4066Sahrens * spa_sync() to reallocate new blocks, but force rewrites after that. 893fa9e4066Sahrens * There should only be a handful of blocks after pass 1 in any case. 894fa9e4066Sahrens */ 895fa9e4066Sahrens if (bp->blk_birth == zio->io_txg && BP_GET_PSIZE(bp) == csize && 896fa9e4066Sahrens pass > zio_sync_pass.zp_rewrite) { 897fa9e4066Sahrens ASSERT(csize != 0); 898fa9e4066Sahrens ASSERT3U(BP_GET_COMPRESS(bp), ==, compress); 899fa9e4066Sahrens ASSERT3U(BP_GET_LSIZE(bp), ==, lsize); 900fa9e4066Sahrens 901fa9e4066Sahrens zio->io_pipeline = ZIO_REWRITE_PIPELINE; 902fa9e4066Sahrens } else { 903fa9e4066Sahrens if (bp->blk_birth == zio->io_txg) { 904fa9e4066Sahrens ASSERT3U(BP_GET_LSIZE(bp), ==, lsize); 905fa9e4066Sahrens bzero(bp, sizeof (blkptr_t)); 906fa9e4066Sahrens } 907fa9e4066Sahrens if (csize == 0) { 908fa9e4066Sahrens BP_ZERO(bp); 909fa9e4066Sahrens zio->io_pipeline = ZIO_WAIT_FOR_CHILDREN_PIPELINE; 910fa9e4066Sahrens } else { 91144cd46caSbillm ASSERT3U(BP_GET_NDVAS(bp), ==, 0); 912fa9e4066Sahrens BP_SET_LSIZE(bp, lsize); 913fa9e4066Sahrens BP_SET_PSIZE(bp, csize); 914fa9e4066Sahrens BP_SET_COMPRESS(bp, compress); 915fa9e4066Sahrens zio->io_pipeline = ZIO_WRITE_ALLOCATE_PIPELINE; 916fa9e4066Sahrens } 917fa9e4066Sahrens } 918fa9e4066Sahrens 919fa9e4066Sahrens zio_next_stage(zio); 920fa9e4066Sahrens } 921fa9e4066Sahrens 922fa9e4066Sahrens static void 923fa9e4066Sahrens zio_read_decompress(zio_t *zio) 924fa9e4066Sahrens { 925fa9e4066Sahrens blkptr_t *bp = zio->io_bp; 926fa9e4066Sahrens void *data; 927fa9e4066Sahrens uint64_t size; 928fa9e4066Sahrens uint64_t bufsize; 929fa9e4066Sahrens int compress = BP_GET_COMPRESS(bp); 930fa9e4066Sahrens 931fa9e4066Sahrens ASSERT(compress != ZIO_COMPRESS_OFF); 932fa9e4066Sahrens 933fa9e4066Sahrens zio_pop_transform(zio, &data, &size, &bufsize); 934fa9e4066Sahrens 935fa9e4066Sahrens if (zio_decompress_data(compress, data, size, 936fa9e4066Sahrens zio->io_data, zio->io_size)) 937fa9e4066Sahrens zio->io_error = EIO; 938fa9e4066Sahrens 939fa9e4066Sahrens zio_buf_free(data, bufsize); 940fa9e4066Sahrens 941fa9e4066Sahrens zio_next_stage(zio); 942fa9e4066Sahrens } 943fa9e4066Sahrens 944fa9e4066Sahrens /* 945fa9e4066Sahrens * ========================================================================== 946fa9e4066Sahrens * Gang block support 947fa9e4066Sahrens * ========================================================================== 948fa9e4066Sahrens */ 949fa9e4066Sahrens static void 950fa9e4066Sahrens zio_gang_pipeline(zio_t *zio) 951fa9e4066Sahrens { 952fa9e4066Sahrens /* 953fa9e4066Sahrens * By default, the pipeline assumes that we're dealing with a gang 954fa9e4066Sahrens * block. If we're not, strip out any gang-specific stages. 955fa9e4066Sahrens */ 95644cd46caSbillm if (!BP_IS_GANG(zio->io_bp)) 957fa9e4066Sahrens zio->io_pipeline &= ~ZIO_GANG_STAGES; 958fa9e4066Sahrens 959fa9e4066Sahrens zio_next_stage(zio); 960fa9e4066Sahrens } 961fa9e4066Sahrens 962fa9e4066Sahrens static void 963fa9e4066Sahrens zio_gang_byteswap(zio_t *zio) 964fa9e4066Sahrens { 965fa9e4066Sahrens ASSERT(zio->io_size == SPA_GANGBLOCKSIZE); 966fa9e4066Sahrens 967fa9e4066Sahrens if (BP_SHOULD_BYTESWAP(zio->io_bp)) 968fa9e4066Sahrens byteswap_uint64_array(zio->io_data, zio->io_size); 969fa9e4066Sahrens } 970fa9e4066Sahrens 971fa9e4066Sahrens static void 972fa9e4066Sahrens zio_get_gang_header(zio_t *zio) 973fa9e4066Sahrens { 974fa9e4066Sahrens blkptr_t *bp = zio->io_bp; 975fa9e4066Sahrens uint64_t gsize = SPA_GANGBLOCKSIZE; 976fa9e4066Sahrens void *gbuf = zio_buf_alloc(gsize); 977fa9e4066Sahrens 97844cd46caSbillm ASSERT(BP_IS_GANG(bp)); 979fa9e4066Sahrens 980fa9e4066Sahrens zio_push_transform(zio, gbuf, gsize, gsize); 981fa9e4066Sahrens 982fa9e4066Sahrens zio_nowait(zio_create(zio, zio->io_spa, bp->blk_birth, bp, gbuf, gsize, 983fa9e4066Sahrens NULL, NULL, ZIO_TYPE_READ, zio->io_priority, 984fa9e4066Sahrens zio->io_flags & ZIO_FLAG_GANG_INHERIT, 985fa9e4066Sahrens ZIO_STAGE_OPEN, ZIO_READ_PIPELINE)); 986fa9e4066Sahrens 987fa9e4066Sahrens zio_wait_children_done(zio); 988fa9e4066Sahrens } 989fa9e4066Sahrens 990fa9e4066Sahrens static void 991fa9e4066Sahrens zio_read_gang_members(zio_t *zio) 992fa9e4066Sahrens { 993fa9e4066Sahrens zio_gbh_phys_t *gbh; 994fa9e4066Sahrens uint64_t gsize, gbufsize, loff, lsize; 995fa9e4066Sahrens int i; 996fa9e4066Sahrens 99744cd46caSbillm ASSERT(BP_IS_GANG(zio->io_bp)); 998fa9e4066Sahrens 999fa9e4066Sahrens zio_gang_byteswap(zio); 1000fa9e4066Sahrens zio_pop_transform(zio, (void **)&gbh, &gsize, &gbufsize); 1001fa9e4066Sahrens 1002fa9e4066Sahrens for (loff = 0, i = 0; loff != zio->io_size; loff += lsize, i++) { 1003fa9e4066Sahrens blkptr_t *gbp = &gbh->zg_blkptr[i]; 1004fa9e4066Sahrens lsize = BP_GET_PSIZE(gbp); 1005fa9e4066Sahrens 1006fa9e4066Sahrens ASSERT(BP_GET_COMPRESS(gbp) == ZIO_COMPRESS_OFF); 1007fa9e4066Sahrens ASSERT3U(lsize, ==, BP_GET_LSIZE(gbp)); 1008fa9e4066Sahrens ASSERT3U(loff + lsize, <=, zio->io_size); 1009fa9e4066Sahrens ASSERT(i < SPA_GBH_NBLKPTRS); 1010fa9e4066Sahrens ASSERT(!BP_IS_HOLE(gbp)); 1011fa9e4066Sahrens 1012fa9e4066Sahrens zio_nowait(zio_read(zio, zio->io_spa, gbp, 1013fa9e4066Sahrens (char *)zio->io_data + loff, lsize, NULL, NULL, 1014ea8dc4b6Seschrock zio->io_priority, zio->io_flags & ZIO_FLAG_GANG_INHERIT, 1015ea8dc4b6Seschrock &zio->io_bookmark)); 1016fa9e4066Sahrens } 1017fa9e4066Sahrens 1018fa9e4066Sahrens zio_buf_free(gbh, gbufsize); 1019fa9e4066Sahrens zio_wait_children_done(zio); 1020fa9e4066Sahrens } 1021fa9e4066Sahrens 1022fa9e4066Sahrens static void 1023fa9e4066Sahrens zio_rewrite_gang_members(zio_t *zio) 1024fa9e4066Sahrens { 1025fa9e4066Sahrens zio_gbh_phys_t *gbh; 1026fa9e4066Sahrens uint64_t gsize, gbufsize, loff, lsize; 1027fa9e4066Sahrens int i; 1028fa9e4066Sahrens 102944cd46caSbillm ASSERT(BP_IS_GANG(zio->io_bp)); 1030fa9e4066Sahrens ASSERT3U(zio->io_size, ==, SPA_GANGBLOCKSIZE); 1031fa9e4066Sahrens 1032fa9e4066Sahrens zio_gang_byteswap(zio); 1033fa9e4066Sahrens zio_pop_transform(zio, (void **)&gbh, &gsize, &gbufsize); 1034fa9e4066Sahrens 1035fa9e4066Sahrens ASSERT(gsize == gbufsize); 1036fa9e4066Sahrens 1037fa9e4066Sahrens for (loff = 0, i = 0; loff != zio->io_size; loff += lsize, i++) { 1038fa9e4066Sahrens blkptr_t *gbp = &gbh->zg_blkptr[i]; 1039fa9e4066Sahrens lsize = BP_GET_PSIZE(gbp); 1040fa9e4066Sahrens 1041fa9e4066Sahrens ASSERT(BP_GET_COMPRESS(gbp) == ZIO_COMPRESS_OFF); 1042fa9e4066Sahrens ASSERT3U(lsize, ==, BP_GET_LSIZE(gbp)); 1043fa9e4066Sahrens ASSERT3U(loff + lsize, <=, zio->io_size); 1044fa9e4066Sahrens ASSERT(i < SPA_GBH_NBLKPTRS); 1045fa9e4066Sahrens ASSERT(!BP_IS_HOLE(gbp)); 1046fa9e4066Sahrens 1047fa9e4066Sahrens zio_nowait(zio_rewrite(zio, zio->io_spa, zio->io_checksum, 1048fa9e4066Sahrens zio->io_txg, gbp, (char *)zio->io_data + loff, lsize, 1049ea8dc4b6Seschrock NULL, NULL, zio->io_priority, zio->io_flags, 1050ea8dc4b6Seschrock &zio->io_bookmark)); 1051fa9e4066Sahrens } 1052fa9e4066Sahrens 1053fa9e4066Sahrens zio_push_transform(zio, gbh, gsize, gbufsize); 1054fa9e4066Sahrens zio_wait_children_ready(zio); 1055fa9e4066Sahrens } 1056fa9e4066Sahrens 1057fa9e4066Sahrens static void 1058fa9e4066Sahrens zio_free_gang_members(zio_t *zio) 1059fa9e4066Sahrens { 1060fa9e4066Sahrens zio_gbh_phys_t *gbh; 1061fa9e4066Sahrens uint64_t gsize, gbufsize; 1062fa9e4066Sahrens int i; 1063fa9e4066Sahrens 106444cd46caSbillm ASSERT(BP_IS_GANG(zio->io_bp)); 1065fa9e4066Sahrens 1066fa9e4066Sahrens zio_gang_byteswap(zio); 1067fa9e4066Sahrens zio_pop_transform(zio, (void **)&gbh, &gsize, &gbufsize); 1068fa9e4066Sahrens 1069fa9e4066Sahrens for (i = 0; i < SPA_GBH_NBLKPTRS; i++) { 1070fa9e4066Sahrens blkptr_t *gbp = &gbh->zg_blkptr[i]; 1071fa9e4066Sahrens 1072fa9e4066Sahrens if (BP_IS_HOLE(gbp)) 1073fa9e4066Sahrens continue; 1074fa9e4066Sahrens zio_nowait(zio_free(zio, zio->io_spa, zio->io_txg, 1075fa9e4066Sahrens gbp, NULL, NULL)); 1076fa9e4066Sahrens } 1077fa9e4066Sahrens 1078fa9e4066Sahrens zio_buf_free(gbh, gbufsize); 1079fa9e4066Sahrens zio_next_stage(zio); 1080fa9e4066Sahrens } 1081fa9e4066Sahrens 1082fa9e4066Sahrens static void 1083fa9e4066Sahrens zio_claim_gang_members(zio_t *zio) 1084fa9e4066Sahrens { 1085fa9e4066Sahrens zio_gbh_phys_t *gbh; 1086fa9e4066Sahrens uint64_t gsize, gbufsize; 1087fa9e4066Sahrens int i; 1088fa9e4066Sahrens 108944cd46caSbillm ASSERT(BP_IS_GANG(zio->io_bp)); 1090fa9e4066Sahrens 1091fa9e4066Sahrens zio_gang_byteswap(zio); 1092fa9e4066Sahrens zio_pop_transform(zio, (void **)&gbh, &gsize, &gbufsize); 1093fa9e4066Sahrens 1094fa9e4066Sahrens for (i = 0; i < SPA_GBH_NBLKPTRS; i++) { 1095fa9e4066Sahrens blkptr_t *gbp = &gbh->zg_blkptr[i]; 1096fa9e4066Sahrens if (BP_IS_HOLE(gbp)) 1097fa9e4066Sahrens continue; 1098fa9e4066Sahrens zio_nowait(zio_claim(zio, zio->io_spa, zio->io_txg, 1099fa9e4066Sahrens gbp, NULL, NULL)); 1100fa9e4066Sahrens } 1101fa9e4066Sahrens 1102fa9e4066Sahrens zio_buf_free(gbh, gbufsize); 1103fa9e4066Sahrens zio_next_stage(zio); 1104fa9e4066Sahrens } 1105fa9e4066Sahrens 1106fa9e4066Sahrens static void 1107fa9e4066Sahrens zio_write_allocate_gang_member_done(zio_t *zio) 1108fa9e4066Sahrens { 1109fa9e4066Sahrens zio_t *pio = zio->io_parent; 111044cd46caSbillm dva_t *cdva = zio->io_bp->blk_dva; 111144cd46caSbillm dva_t *pdva = pio->io_bp->blk_dva; 1112fa9e4066Sahrens uint64_t asize; 111344cd46caSbillm int d; 1114fa9e4066Sahrens 111544cd46caSbillm ASSERT3U(pio->io_ndvas, ==, zio->io_ndvas); 111644cd46caSbillm ASSERT3U(BP_GET_NDVAS(zio->io_bp), <=, BP_GET_NDVAS(pio->io_bp)); 111744cd46caSbillm ASSERT3U(zio->io_ndvas, <=, BP_GET_NDVAS(zio->io_bp)); 111844cd46caSbillm ASSERT3U(pio->io_ndvas, <=, BP_GET_NDVAS(pio->io_bp)); 1119fa9e4066Sahrens 1120fa9e4066Sahrens mutex_enter(&pio->io_lock); 112144cd46caSbillm for (d = 0; d < BP_GET_NDVAS(pio->io_bp); d++) { 112244cd46caSbillm ASSERT(DVA_GET_GANG(&pdva[d])); 112344cd46caSbillm asize = DVA_GET_ASIZE(&pdva[d]); 112444cd46caSbillm asize += DVA_GET_ASIZE(&cdva[d]); 112544cd46caSbillm DVA_SET_ASIZE(&pdva[d], asize); 112644cd46caSbillm } 1127fa9e4066Sahrens mutex_exit(&pio->io_lock); 1128fa9e4066Sahrens } 1129fa9e4066Sahrens 1130fa9e4066Sahrens static void 1131fa9e4066Sahrens zio_write_allocate_gang_members(zio_t *zio) 1132fa9e4066Sahrens { 1133fa9e4066Sahrens blkptr_t *bp = zio->io_bp; 113444cd46caSbillm dva_t *dva = bp->blk_dva; 113544cd46caSbillm spa_t *spa = zio->io_spa; 1136fa9e4066Sahrens zio_gbh_phys_t *gbh; 113744cd46caSbillm uint64_t txg = zio->io_txg; 1138fa9e4066Sahrens uint64_t resid = zio->io_size; 1139fa9e4066Sahrens uint64_t maxalloc = P2ROUNDUP(zio->io_size >> 1, SPA_MINBLOCKSIZE); 1140fa9e4066Sahrens uint64_t gsize, loff, lsize; 1141fa9e4066Sahrens uint32_t gbps_left; 114244cd46caSbillm int ndvas = zio->io_ndvas; 114344cd46caSbillm int gbh_ndvas = MIN(ndvas + 1, spa_max_replication(spa)); 1144fa9e4066Sahrens int error; 114544cd46caSbillm int i, d; 1146fa9e4066Sahrens 1147fa9e4066Sahrens gsize = SPA_GANGBLOCKSIZE; 1148fa9e4066Sahrens gbps_left = SPA_GBH_NBLKPTRS; 1149fa9e4066Sahrens 115044cd46caSbillm error = metaslab_alloc(spa, gsize, bp, gbh_ndvas, txg, NULL); 1151fa9e4066Sahrens if (error == ENOSPC) 1152fa9e4066Sahrens panic("can't allocate gang block header"); 1153fa9e4066Sahrens ASSERT(error == 0); 1154fa9e4066Sahrens 115544cd46caSbillm for (d = 0; d < gbh_ndvas; d++) 115644cd46caSbillm DVA_SET_GANG(&dva[d], 1); 1157fa9e4066Sahrens 115844cd46caSbillm bp->blk_birth = txg; 1159fa9e4066Sahrens 1160fa9e4066Sahrens gbh = zio_buf_alloc(gsize); 1161fa9e4066Sahrens bzero(gbh, gsize); 1162fa9e4066Sahrens 116344cd46caSbillm /* We need to test multi-level gang blocks */ 116444cd46caSbillm if (maxalloc >= zio_gang_bang && (lbolt & 0x1) == 0) 116544cd46caSbillm maxalloc = MAX(maxalloc >> 2, SPA_MINBLOCKSIZE); 116644cd46caSbillm 1167fa9e4066Sahrens for (loff = 0, i = 0; loff != zio->io_size; 1168fa9e4066Sahrens loff += lsize, resid -= lsize, gbps_left--, i++) { 1169fa9e4066Sahrens blkptr_t *gbp = &gbh->zg_blkptr[i]; 117044cd46caSbillm dva = gbp->blk_dva; 1171fa9e4066Sahrens 1172fa9e4066Sahrens ASSERT(gbps_left != 0); 1173fa9e4066Sahrens maxalloc = MIN(maxalloc, resid); 1174fa9e4066Sahrens 1175fa9e4066Sahrens while (resid <= maxalloc * gbps_left) { 117644cd46caSbillm error = metaslab_alloc(spa, maxalloc, gbp, ndvas, 117744cd46caSbillm txg, bp); 1178fa9e4066Sahrens if (error == 0) 1179fa9e4066Sahrens break; 1180fa9e4066Sahrens ASSERT3U(error, ==, ENOSPC); 1181fa9e4066Sahrens if (maxalloc == SPA_MINBLOCKSIZE) 1182fa9e4066Sahrens panic("really out of space"); 1183fa9e4066Sahrens maxalloc = P2ROUNDUP(maxalloc >> 1, SPA_MINBLOCKSIZE); 1184fa9e4066Sahrens } 1185fa9e4066Sahrens 1186fa9e4066Sahrens if (resid <= maxalloc * gbps_left) { 1187fa9e4066Sahrens lsize = maxalloc; 1188fa9e4066Sahrens BP_SET_LSIZE(gbp, lsize); 1189fa9e4066Sahrens BP_SET_PSIZE(gbp, lsize); 1190fa9e4066Sahrens BP_SET_COMPRESS(gbp, ZIO_COMPRESS_OFF); 119144cd46caSbillm gbp->blk_birth = txg; 119244cd46caSbillm zio_nowait(zio_rewrite(zio, spa, 119344cd46caSbillm zio->io_checksum, txg, gbp, 1194fa9e4066Sahrens (char *)zio->io_data + loff, lsize, 1195fa9e4066Sahrens zio_write_allocate_gang_member_done, NULL, 1196ea8dc4b6Seschrock zio->io_priority, zio->io_flags, 1197ea8dc4b6Seschrock &zio->io_bookmark)); 1198fa9e4066Sahrens } else { 1199fa9e4066Sahrens lsize = P2ROUNDUP(resid / gbps_left, SPA_MINBLOCKSIZE); 1200fa9e4066Sahrens ASSERT(lsize != SPA_MINBLOCKSIZE); 120144cd46caSbillm zio_nowait(zio_write_allocate(zio, spa, 120244cd46caSbillm zio->io_checksum, txg, gbp, 1203fa9e4066Sahrens (char *)zio->io_data + loff, lsize, 1204fa9e4066Sahrens zio_write_allocate_gang_member_done, NULL, 1205fa9e4066Sahrens zio->io_priority, zio->io_flags)); 1206fa9e4066Sahrens } 1207fa9e4066Sahrens } 1208fa9e4066Sahrens 1209fa9e4066Sahrens ASSERT(resid == 0 && loff == zio->io_size); 1210fa9e4066Sahrens 1211fa9e4066Sahrens zio->io_pipeline |= 1U << ZIO_STAGE_GANG_CHECKSUM_GENERATE; 1212fa9e4066Sahrens 1213fa9e4066Sahrens zio_push_transform(zio, gbh, gsize, gsize); 121444cd46caSbillm /* 121544cd46caSbillm * As much as we'd like this to be zio_wait_children_ready(), 121644cd46caSbillm * updating our ASIZE doesn't happen until the io_done callback, 121744cd46caSbillm * so we have to wait for that to finish in order for our BP 121844cd46caSbillm * to be stable. 121944cd46caSbillm */ 1220fa9e4066Sahrens zio_wait_children_done(zio); 1221fa9e4066Sahrens } 1222fa9e4066Sahrens 1223fa9e4066Sahrens /* 1224fa9e4066Sahrens * ========================================================================== 1225fa9e4066Sahrens * Allocate and free blocks 1226fa9e4066Sahrens * ========================================================================== 1227fa9e4066Sahrens */ 1228fa9e4066Sahrens static void 1229fa9e4066Sahrens zio_dva_allocate(zio_t *zio) 1230fa9e4066Sahrens { 1231fa9e4066Sahrens blkptr_t *bp = zio->io_bp; 1232fa9e4066Sahrens int error; 1233fa9e4066Sahrens 1234fa9e4066Sahrens ASSERT(BP_IS_HOLE(bp)); 123544cd46caSbillm ASSERT3U(BP_GET_NDVAS(bp), ==, 0); 123644cd46caSbillm ASSERT3U(zio->io_ndvas, >, 0); 123744cd46caSbillm ASSERT3U(zio->io_ndvas, <=, spa_max_replication(zio->io_spa)); 1238fa9e4066Sahrens 1239fa9e4066Sahrens /* For testing, make some blocks above a certain size be gang blocks */ 1240fa9e4066Sahrens if (zio->io_size >= zio_gang_bang && (lbolt & 0x3) == 0) { 1241fa9e4066Sahrens zio_write_allocate_gang_members(zio); 1242fa9e4066Sahrens return; 1243fa9e4066Sahrens } 1244fa9e4066Sahrens 1245fa9e4066Sahrens ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp)); 1246fa9e4066Sahrens 124744cd46caSbillm error = metaslab_alloc(zio->io_spa, zio->io_size, bp, zio->io_ndvas, 124844cd46caSbillm zio->io_txg, NULL); 1249fa9e4066Sahrens 1250fa9e4066Sahrens if (error == 0) { 1251fa9e4066Sahrens bp->blk_birth = zio->io_txg; 1252fa9e4066Sahrens } else if (error == ENOSPC) { 1253fa9e4066Sahrens if (zio->io_size == SPA_MINBLOCKSIZE) 1254fa9e4066Sahrens panic("really, truly out of space"); 1255fa9e4066Sahrens zio_write_allocate_gang_members(zio); 1256fa9e4066Sahrens return; 1257fa9e4066Sahrens } else { 1258fa9e4066Sahrens zio->io_error = error; 1259fa9e4066Sahrens } 1260fa9e4066Sahrens zio_next_stage(zio); 1261fa9e4066Sahrens } 1262fa9e4066Sahrens 1263fa9e4066Sahrens static void 1264fa9e4066Sahrens zio_dva_free(zio_t *zio) 1265fa9e4066Sahrens { 1266fa9e4066Sahrens blkptr_t *bp = zio->io_bp; 1267fa9e4066Sahrens 1268d80c45e0Sbonwick metaslab_free(zio->io_spa, bp, zio->io_txg, B_FALSE); 1269fa9e4066Sahrens 1270fa9e4066Sahrens BP_ZERO(bp); 1271fa9e4066Sahrens 1272fa9e4066Sahrens zio_next_stage(zio); 1273fa9e4066Sahrens } 1274fa9e4066Sahrens 1275fa9e4066Sahrens static void 1276fa9e4066Sahrens zio_dva_claim(zio_t *zio) 1277fa9e4066Sahrens { 1278d80c45e0Sbonwick zio->io_error = metaslab_claim(zio->io_spa, zio->io_bp, zio->io_txg); 1279fa9e4066Sahrens 1280fa9e4066Sahrens zio_next_stage(zio); 1281fa9e4066Sahrens } 1282fa9e4066Sahrens 1283fa9e4066Sahrens /* 1284fa9e4066Sahrens * ========================================================================== 1285fa9e4066Sahrens * Read and write to physical devices 1286fa9e4066Sahrens * ========================================================================== 1287fa9e4066Sahrens */ 1288fa9e4066Sahrens 1289fa9e4066Sahrens static void 129044cd46caSbillm zio_vdev_io_start(zio_t *zio) 1291fa9e4066Sahrens { 1292fa9e4066Sahrens vdev_t *vd = zio->io_vd; 129344cd46caSbillm vdev_t *tvd = vd ? vd->vdev_top : NULL; 129444cd46caSbillm blkptr_t *bp = zio->io_bp; 129544cd46caSbillm uint64_t align; 129644cd46caSbillm 129744cd46caSbillm if (vd == NULL) { 129844cd46caSbillm /* The mirror_ops handle multiple DVAs in a single BP */ 129944cd46caSbillm vdev_mirror_ops.vdev_op_io_start(zio); 130044cd46caSbillm return; 130144cd46caSbillm } 130244cd46caSbillm 130344cd46caSbillm align = 1ULL << tvd->vdev_ashift; 1304fa9e4066Sahrens 1305ecc2d604Sbonwick if (zio->io_retries == 0 && vd == tvd) 1306fa9e4066Sahrens zio->io_flags |= ZIO_FLAG_FAILFAST; 1307fa9e4066Sahrens 130844cd46caSbillm if (!(zio->io_flags & ZIO_FLAG_PHYSICAL) && 130944cd46caSbillm vd->vdev_children == 0) { 1310fa9e4066Sahrens zio->io_flags |= ZIO_FLAG_PHYSICAL; 1311fa9e4066Sahrens zio->io_offset += VDEV_LABEL_START_SIZE; 1312fa9e4066Sahrens } 1313fa9e4066Sahrens 1314ecc2d604Sbonwick if (P2PHASE(zio->io_size, align) != 0) { 1315ecc2d604Sbonwick uint64_t asize = P2ROUNDUP(zio->io_size, align); 1316ecc2d604Sbonwick char *abuf = zio_buf_alloc(asize); 1317ecc2d604Sbonwick ASSERT(vd == tvd); 1318ecc2d604Sbonwick if (zio->io_type == ZIO_TYPE_WRITE) { 1319ecc2d604Sbonwick bcopy(zio->io_data, abuf, zio->io_size); 1320ecc2d604Sbonwick bzero(abuf + zio->io_size, asize - zio->io_size); 1321ecc2d604Sbonwick } 1322ecc2d604Sbonwick zio_push_transform(zio, abuf, asize, asize); 1323ecc2d604Sbonwick ASSERT(!(zio->io_flags & ZIO_FLAG_SUBBLOCK)); 1324ecc2d604Sbonwick zio->io_flags |= ZIO_FLAG_SUBBLOCK; 1325ecc2d604Sbonwick } 1326ecc2d604Sbonwick 1327ecc2d604Sbonwick ASSERT(P2PHASE(zio->io_offset, align) == 0); 1328ecc2d604Sbonwick ASSERT(P2PHASE(zio->io_size, align) == 0); 1329ecc2d604Sbonwick ASSERT(bp == NULL || 1330ecc2d604Sbonwick P2ROUNDUP(ZIO_GET_IOSIZE(zio), align) == zio->io_size); 1331fa9e4066Sahrens ASSERT(zio->io_type != ZIO_TYPE_WRITE || (spa_mode & FWRITE)); 1332fa9e4066Sahrens 1333fa9e4066Sahrens vdev_io_start(zio); 1334fa9e4066Sahrens 1335fa9e4066Sahrens /* zio_next_stage_async() gets called from io completion interrupt */ 1336fa9e4066Sahrens } 1337fa9e4066Sahrens 1338fa9e4066Sahrens static void 1339fa9e4066Sahrens zio_vdev_io_done(zio_t *zio) 1340fa9e4066Sahrens { 134144cd46caSbillm if (zio->io_vd == NULL) 134244cd46caSbillm /* The mirror_ops handle multiple DVAs in a single BP */ 134344cd46caSbillm vdev_mirror_ops.vdev_op_io_done(zio); 134444cd46caSbillm else 134544cd46caSbillm vdev_io_done(zio); 1346fa9e4066Sahrens } 1347fa9e4066Sahrens 1348fa9e4066Sahrens /* XXPOLICY */ 1349ea8dc4b6Seschrock boolean_t 1350fa9e4066Sahrens zio_should_retry(zio_t *zio) 1351fa9e4066Sahrens { 1352fa9e4066Sahrens vdev_t *vd = zio->io_vd; 1353fa9e4066Sahrens 1354fa9e4066Sahrens if (zio->io_error == 0) 1355fa9e4066Sahrens return (B_FALSE); 1356fa9e4066Sahrens if (zio->io_delegate_list != NULL) 1357fa9e4066Sahrens return (B_FALSE); 135844cd46caSbillm if (vd && vd != vd->vdev_top) 1359fa9e4066Sahrens return (B_FALSE); 1360fa9e4066Sahrens if (zio->io_flags & ZIO_FLAG_DONT_RETRY) 1361fa9e4066Sahrens return (B_FALSE); 1362ea8dc4b6Seschrock if (zio->io_retries > 0) 1363fa9e4066Sahrens return (B_FALSE); 1364fa9e4066Sahrens 1365fa9e4066Sahrens return (B_TRUE); 1366fa9e4066Sahrens } 1367fa9e4066Sahrens 1368fa9e4066Sahrens static void 1369fa9e4066Sahrens zio_vdev_io_assess(zio_t *zio) 1370fa9e4066Sahrens { 1371fa9e4066Sahrens vdev_t *vd = zio->io_vd; 137244cd46caSbillm vdev_t *tvd = vd ? vd->vdev_top : NULL; 1373fa9e4066Sahrens 1374fa9e4066Sahrens ASSERT(zio->io_vsd == NULL); 1375fa9e4066Sahrens 1376ecc2d604Sbonwick if (zio->io_flags & ZIO_FLAG_SUBBLOCK) { 1377ecc2d604Sbonwick void *abuf; 1378ecc2d604Sbonwick uint64_t asize; 1379ecc2d604Sbonwick ASSERT(vd == tvd); 1380ecc2d604Sbonwick zio_pop_transform(zio, &abuf, &asize, &asize); 1381ecc2d604Sbonwick if (zio->io_type == ZIO_TYPE_READ) 1382ecc2d604Sbonwick bcopy(abuf, zio->io_data, zio->io_size); 1383ecc2d604Sbonwick zio_buf_free(abuf, asize); 1384ecc2d604Sbonwick zio->io_flags &= ~ZIO_FLAG_SUBBLOCK; 1385ecc2d604Sbonwick } 1386ecc2d604Sbonwick 1387ea8dc4b6Seschrock if (zio_injection_enabled && !zio->io_error) 1388ea8dc4b6Seschrock zio->io_error = zio_handle_fault_injection(zio, EIO); 1389ea8dc4b6Seschrock 1390fa9e4066Sahrens /* 1391fa9e4066Sahrens * If the I/O failed, determine whether we should attempt to retry it. 1392fa9e4066Sahrens */ 1393fa9e4066Sahrens /* XXPOLICY */ 1394fa9e4066Sahrens if (zio_should_retry(zio)) { 1395fa9e4066Sahrens ASSERT(tvd == vd); 1396fa9e4066Sahrens 1397fa9e4066Sahrens zio->io_retries++; 1398fa9e4066Sahrens zio->io_error = 0; 1399fa9e4066Sahrens zio->io_flags &= ZIO_FLAG_VDEV_INHERIT; 1400fa9e4066Sahrens /* XXPOLICY */ 1401fa9e4066Sahrens zio->io_flags &= ~ZIO_FLAG_FAILFAST; 1402fa9e4066Sahrens zio->io_flags |= ZIO_FLAG_DONT_CACHE; 140344cd46caSbillm zio->io_stage = ZIO_STAGE_VDEV_IO_START - 1; 1404fa9e4066Sahrens 1405fa9e4066Sahrens dprintf("retry #%d for %s to %s offset %llx\n", 1406fa9e4066Sahrens zio->io_retries, zio_type_name[zio->io_type], 1407fa9e4066Sahrens vdev_description(vd), zio->io_offset); 1408fa9e4066Sahrens 1409ea8dc4b6Seschrock zio_next_stage_async(zio); 1410ea8dc4b6Seschrock return; 1411ea8dc4b6Seschrock } 1412fa9e4066Sahrens 141344cd46caSbillm if (zio->io_error != 0 && zio->io_error != ECKSUM && 141444cd46caSbillm !(zio->io_flags & ZIO_FLAG_SPECULATIVE) && vd) { 1415fa9e4066Sahrens /* 1416ea8dc4b6Seschrock * Poor man's hotplug support. Even if we're done retrying this 1417ea8dc4b6Seschrock * I/O, try to reopen the vdev to see if it's still attached. 1418ea8dc4b6Seschrock * To avoid excessive thrashing, we only try it once a minute. 1419ea8dc4b6Seschrock * This also has the effect of detecting when missing devices 1420ea8dc4b6Seschrock * have come back, by polling the device once a minute. 1421ea8dc4b6Seschrock * 1422ea8dc4b6Seschrock * We need to do this asynchronously because we can't grab 1423ea8dc4b6Seschrock * all the necessary locks way down here. 1424fa9e4066Sahrens */ 1425ea8dc4b6Seschrock if (gethrtime() - vd->vdev_last_try > 60ULL * NANOSEC) { 1426ea8dc4b6Seschrock vd->vdev_last_try = gethrtime(); 1427ea8dc4b6Seschrock tvd->vdev_reopen_wanted = 1; 1428ea8dc4b6Seschrock spa_async_request(vd->vdev_spa, SPA_ASYNC_REOPEN); 1429ea8dc4b6Seschrock } 1430fa9e4066Sahrens } 1431fa9e4066Sahrens 1432fa9e4066Sahrens zio_next_stage(zio); 1433fa9e4066Sahrens } 1434fa9e4066Sahrens 1435fa9e4066Sahrens void 1436fa9e4066Sahrens zio_vdev_io_reissue(zio_t *zio) 1437fa9e4066Sahrens { 1438fa9e4066Sahrens ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START); 1439fa9e4066Sahrens ASSERT(zio->io_error == 0); 1440fa9e4066Sahrens 1441fa9e4066Sahrens zio->io_stage--; 1442fa9e4066Sahrens } 1443fa9e4066Sahrens 1444fa9e4066Sahrens void 1445fa9e4066Sahrens zio_vdev_io_redone(zio_t *zio) 1446fa9e4066Sahrens { 1447fa9e4066Sahrens ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_DONE); 1448fa9e4066Sahrens 1449fa9e4066Sahrens zio->io_stage--; 1450fa9e4066Sahrens } 1451fa9e4066Sahrens 1452fa9e4066Sahrens void 1453fa9e4066Sahrens zio_vdev_io_bypass(zio_t *zio) 1454fa9e4066Sahrens { 1455fa9e4066Sahrens ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START); 1456fa9e4066Sahrens ASSERT(zio->io_error == 0); 1457fa9e4066Sahrens 1458fa9e4066Sahrens zio->io_flags |= ZIO_FLAG_IO_BYPASS; 1459fa9e4066Sahrens zio->io_stage = ZIO_STAGE_VDEV_IO_ASSESS - 1; 1460fa9e4066Sahrens } 1461fa9e4066Sahrens 1462fa9e4066Sahrens /* 1463fa9e4066Sahrens * ========================================================================== 1464fa9e4066Sahrens * Generate and verify checksums 1465fa9e4066Sahrens * ========================================================================== 1466fa9e4066Sahrens */ 1467fa9e4066Sahrens static void 1468fa9e4066Sahrens zio_checksum_generate(zio_t *zio) 1469fa9e4066Sahrens { 1470fa9e4066Sahrens int checksum = zio->io_checksum; 1471fa9e4066Sahrens blkptr_t *bp = zio->io_bp; 1472fa9e4066Sahrens 1473fa9e4066Sahrens ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp)); 1474fa9e4066Sahrens 1475fa9e4066Sahrens BP_SET_CHECKSUM(bp, checksum); 1476fa9e4066Sahrens BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER); 1477fa9e4066Sahrens 1478fa9e4066Sahrens zio_checksum(checksum, &bp->blk_cksum, zio->io_data, zio->io_size); 1479fa9e4066Sahrens 1480fa9e4066Sahrens zio_next_stage(zio); 1481fa9e4066Sahrens } 1482fa9e4066Sahrens 1483fa9e4066Sahrens static void 1484fa9e4066Sahrens zio_gang_checksum_generate(zio_t *zio) 1485fa9e4066Sahrens { 1486fa9e4066Sahrens zio_cksum_t zc; 1487fa9e4066Sahrens zio_gbh_phys_t *gbh = zio->io_data; 1488fa9e4066Sahrens 148944cd46caSbillm ASSERT(BP_IS_GANG(zio->io_bp)); 1490fa9e4066Sahrens ASSERT3U(zio->io_size, ==, SPA_GANGBLOCKSIZE); 1491fa9e4066Sahrens 1492fa9e4066Sahrens zio_set_gang_verifier(zio, &gbh->zg_tail.zbt_cksum); 1493fa9e4066Sahrens 1494fa9e4066Sahrens zio_checksum(ZIO_CHECKSUM_GANG_HEADER, &zc, zio->io_data, zio->io_size); 1495fa9e4066Sahrens 1496fa9e4066Sahrens zio_next_stage(zio); 1497fa9e4066Sahrens } 1498fa9e4066Sahrens 1499fa9e4066Sahrens static void 1500fa9e4066Sahrens zio_checksum_verify(zio_t *zio) 1501fa9e4066Sahrens { 1502fa9e4066Sahrens if (zio->io_bp != NULL) { 1503fa9e4066Sahrens zio->io_error = zio_checksum_error(zio); 1504ea8dc4b6Seschrock if (zio->io_error && !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) 1505ea8dc4b6Seschrock zfs_ereport_post(FM_EREPORT_ZFS_CHECKSUM, 1506ea8dc4b6Seschrock zio->io_spa, zio->io_vd, zio, 0, 0); 1507fa9e4066Sahrens } 1508fa9e4066Sahrens 1509fa9e4066Sahrens zio_next_stage(zio); 1510fa9e4066Sahrens } 1511fa9e4066Sahrens 1512fa9e4066Sahrens /* 1513fa9e4066Sahrens * Called by RAID-Z to ensure we don't compute the checksum twice. 1514fa9e4066Sahrens */ 1515fa9e4066Sahrens void 1516fa9e4066Sahrens zio_checksum_verified(zio_t *zio) 1517fa9e4066Sahrens { 1518fa9e4066Sahrens zio->io_pipeline &= ~(1U << ZIO_STAGE_CHECKSUM_VERIFY); 1519fa9e4066Sahrens } 1520fa9e4066Sahrens 1521fa9e4066Sahrens /* 1522fa9e4066Sahrens * Set the external verifier for a gang block based on stuff in the bp 1523fa9e4066Sahrens */ 1524fa9e4066Sahrens void 1525fa9e4066Sahrens zio_set_gang_verifier(zio_t *zio, zio_cksum_t *zcp) 1526fa9e4066Sahrens { 152744cd46caSbillm blkptr_t *bp = zio->io_bp; 152844cd46caSbillm 152944cd46caSbillm zcp->zc_word[0] = DVA_GET_VDEV(BP_IDENTITY(bp)); 153044cd46caSbillm zcp->zc_word[1] = DVA_GET_OFFSET(BP_IDENTITY(bp)); 153144cd46caSbillm zcp->zc_word[2] = bp->blk_birth; 1532fa9e4066Sahrens zcp->zc_word[3] = 0; 1533fa9e4066Sahrens } 1534fa9e4066Sahrens 1535fa9e4066Sahrens /* 1536fa9e4066Sahrens * ========================================================================== 1537fa9e4066Sahrens * Define the pipeline 1538fa9e4066Sahrens * ========================================================================== 1539fa9e4066Sahrens */ 1540fa9e4066Sahrens typedef void zio_pipe_stage_t(zio_t *zio); 1541fa9e4066Sahrens 1542fa9e4066Sahrens static void 1543fa9e4066Sahrens zio_badop(zio_t *zio) 1544fa9e4066Sahrens { 1545fa9e4066Sahrens panic("Invalid I/O pipeline stage %u for zio %p", zio->io_stage, zio); 1546fa9e4066Sahrens } 1547fa9e4066Sahrens 1548fa9e4066Sahrens zio_pipe_stage_t *zio_pipeline[ZIO_STAGE_DONE + 2] = { 1549fa9e4066Sahrens zio_badop, 1550fa9e4066Sahrens zio_wait_children_ready, 1551fa9e4066Sahrens zio_write_compress, 1552fa9e4066Sahrens zio_checksum_generate, 1553fa9e4066Sahrens zio_gang_pipeline, 1554fa9e4066Sahrens zio_get_gang_header, 1555fa9e4066Sahrens zio_rewrite_gang_members, 1556fa9e4066Sahrens zio_free_gang_members, 1557fa9e4066Sahrens zio_claim_gang_members, 1558fa9e4066Sahrens zio_dva_allocate, 1559fa9e4066Sahrens zio_dva_free, 1560fa9e4066Sahrens zio_dva_claim, 1561fa9e4066Sahrens zio_gang_checksum_generate, 1562fa9e4066Sahrens zio_ready, 1563fa9e4066Sahrens zio_vdev_io_start, 1564fa9e4066Sahrens zio_vdev_io_done, 1565fa9e4066Sahrens zio_vdev_io_assess, 1566fa9e4066Sahrens zio_wait_children_done, 1567fa9e4066Sahrens zio_checksum_verify, 1568fa9e4066Sahrens zio_read_gang_members, 1569fa9e4066Sahrens zio_read_decompress, 1570fa9e4066Sahrens zio_done, 1571fa9e4066Sahrens zio_badop 1572fa9e4066Sahrens }; 1573fa9e4066Sahrens 1574fa9e4066Sahrens /* 1575fa9e4066Sahrens * Move an I/O to the next stage of the pipeline and execute that stage. 1576fa9e4066Sahrens * There's no locking on io_stage because there's no legitimate way for 1577fa9e4066Sahrens * multiple threads to be attempting to process the same I/O. 1578fa9e4066Sahrens */ 1579fa9e4066Sahrens void 1580fa9e4066Sahrens zio_next_stage(zio_t *zio) 1581fa9e4066Sahrens { 1582fa9e4066Sahrens uint32_t pipeline = zio->io_pipeline; 1583fa9e4066Sahrens 1584fa9e4066Sahrens ASSERT(!MUTEX_HELD(&zio->io_lock)); 1585fa9e4066Sahrens 1586fa9e4066Sahrens if (zio->io_error) { 1587fa9e4066Sahrens dprintf("zio %p vdev %s offset %llx stage %d error %d\n", 1588fa9e4066Sahrens zio, vdev_description(zio->io_vd), 1589fa9e4066Sahrens zio->io_offset, zio->io_stage, zio->io_error); 1590fa9e4066Sahrens if (((1U << zio->io_stage) & ZIO_VDEV_IO_PIPELINE) == 0) 1591fa9e4066Sahrens pipeline &= ZIO_ERROR_PIPELINE_MASK; 1592fa9e4066Sahrens } 1593fa9e4066Sahrens 1594fa9e4066Sahrens while (((1U << ++zio->io_stage) & pipeline) == 0) 1595fa9e4066Sahrens continue; 1596fa9e4066Sahrens 1597fa9e4066Sahrens ASSERT(zio->io_stage <= ZIO_STAGE_DONE); 1598fa9e4066Sahrens ASSERT(zio->io_stalled == 0); 1599fa9e4066Sahrens 1600fa9e4066Sahrens zio_pipeline[zio->io_stage](zio); 1601fa9e4066Sahrens } 1602fa9e4066Sahrens 1603fa9e4066Sahrens void 1604fa9e4066Sahrens zio_next_stage_async(zio_t *zio) 1605fa9e4066Sahrens { 1606fa9e4066Sahrens taskq_t *tq; 1607fa9e4066Sahrens uint32_t pipeline = zio->io_pipeline; 1608fa9e4066Sahrens 1609fa9e4066Sahrens ASSERT(!MUTEX_HELD(&zio->io_lock)); 1610fa9e4066Sahrens 1611fa9e4066Sahrens if (zio->io_error) { 1612fa9e4066Sahrens dprintf("zio %p vdev %s offset %llx stage %d error %d\n", 1613fa9e4066Sahrens zio, vdev_description(zio->io_vd), 1614fa9e4066Sahrens zio->io_offset, zio->io_stage, zio->io_error); 1615fa9e4066Sahrens if (((1U << zio->io_stage) & ZIO_VDEV_IO_PIPELINE) == 0) 1616fa9e4066Sahrens pipeline &= ZIO_ERROR_PIPELINE_MASK; 1617fa9e4066Sahrens } 1618fa9e4066Sahrens 1619fa9e4066Sahrens while (((1U << ++zio->io_stage) & pipeline) == 0) 1620fa9e4066Sahrens continue; 1621fa9e4066Sahrens 1622fa9e4066Sahrens ASSERT(zio->io_stage <= ZIO_STAGE_DONE); 1623fa9e4066Sahrens ASSERT(zio->io_stalled == 0); 1624fa9e4066Sahrens 1625fa9e4066Sahrens /* 1626fa9e4066Sahrens * For performance, we'll probably want two sets of task queues: 1627fa9e4066Sahrens * per-CPU issue taskqs and per-CPU completion taskqs. The per-CPU 1628fa9e4066Sahrens * part is for read performance: since we have to make a pass over 1629fa9e4066Sahrens * the data to checksum it anyway, we want to do this on the same CPU 1630fa9e4066Sahrens * that issued the read, because (assuming CPU scheduling affinity) 1631fa9e4066Sahrens * that thread is probably still there. Getting this optimization 1632fa9e4066Sahrens * right avoids performance-hostile cache-to-cache transfers. 1633fa9e4066Sahrens * 1634fa9e4066Sahrens * Note that having two sets of task queues is also necessary for 1635fa9e4066Sahrens * correctness: if all of the issue threads get bogged down waiting 1636fa9e4066Sahrens * for dependent reads (e.g. metaslab freelist) to complete, then 1637fa9e4066Sahrens * there won't be any threads available to service I/O completion 1638fa9e4066Sahrens * interrupts. 1639fa9e4066Sahrens */ 1640fa9e4066Sahrens if ((1U << zio->io_stage) & zio->io_async_stages) { 1641fa9e4066Sahrens if (zio->io_stage < ZIO_STAGE_VDEV_IO_DONE) 1642fa9e4066Sahrens tq = zio->io_spa->spa_zio_issue_taskq[zio->io_type]; 1643fa9e4066Sahrens else 1644fa9e4066Sahrens tq = zio->io_spa->spa_zio_intr_taskq[zio->io_type]; 1645fa9e4066Sahrens (void) taskq_dispatch(tq, 1646fa9e4066Sahrens (task_func_t *)zio_pipeline[zio->io_stage], zio, TQ_SLEEP); 1647fa9e4066Sahrens } else { 1648fa9e4066Sahrens zio_pipeline[zio->io_stage](zio); 1649fa9e4066Sahrens } 1650fa9e4066Sahrens } 1651fa9e4066Sahrens 1652fa9e4066Sahrens /* 1653fa9e4066Sahrens * Try to allocate an intent log block. Return 0 on success, errno on failure. 1654fa9e4066Sahrens */ 1655fa9e4066Sahrens int 1656d80c45e0Sbonwick zio_alloc_blk(spa_t *spa, uint64_t size, blkptr_t *bp, uint64_t txg) 1657fa9e4066Sahrens { 1658fa9e4066Sahrens int error; 1659fa9e4066Sahrens 1660ea8dc4b6Seschrock spa_config_enter(spa, RW_READER, FTAG); 1661fa9e4066Sahrens 1662fa9e4066Sahrens BP_ZERO(bp); 1663fa9e4066Sahrens 166444cd46caSbillm error = metaslab_alloc(spa, size, bp, 1, txg, NULL); 1665fa9e4066Sahrens 1666fa9e4066Sahrens if (error == 0) { 1667fa9e4066Sahrens BP_SET_LSIZE(bp, size); 1668fa9e4066Sahrens BP_SET_PSIZE(bp, size); 1669fa9e4066Sahrens BP_SET_COMPRESS(bp, ZIO_COMPRESS_OFF); 1670d80c45e0Sbonwick BP_SET_CHECKSUM(bp, ZIO_CHECKSUM_ZILOG); 1671fa9e4066Sahrens BP_SET_TYPE(bp, DMU_OT_INTENT_LOG); 1672fa9e4066Sahrens BP_SET_LEVEL(bp, 0); 1673fa9e4066Sahrens BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER); 1674fa9e4066Sahrens bp->blk_birth = txg; 1675fa9e4066Sahrens } 1676fa9e4066Sahrens 1677ea8dc4b6Seschrock spa_config_exit(spa, FTAG); 1678fa9e4066Sahrens 1679fa9e4066Sahrens return (error); 1680fa9e4066Sahrens } 1681fa9e4066Sahrens 1682fa9e4066Sahrens /* 1683fa9e4066Sahrens * Free an intent log block. We know it can't be a gang block, so there's 1684fa9e4066Sahrens * nothing to do except metaslab_free() it. 1685fa9e4066Sahrens */ 1686fa9e4066Sahrens void 1687fa9e4066Sahrens zio_free_blk(spa_t *spa, blkptr_t *bp, uint64_t txg) 1688fa9e4066Sahrens { 168944cd46caSbillm ASSERT(!BP_IS_GANG(bp)); 1690fa9e4066Sahrens 1691ea8dc4b6Seschrock spa_config_enter(spa, RW_READER, FTAG); 1692fa9e4066Sahrens 1693d80c45e0Sbonwick metaslab_free(spa, bp, txg, B_FALSE); 1694fa9e4066Sahrens 1695ea8dc4b6Seschrock spa_config_exit(spa, FTAG); 1696fa9e4066Sahrens } 1697