1fa9e4066Sahrens /* 2fa9e4066Sahrens * CDDL HEADER START 3fa9e4066Sahrens * 4fa9e4066Sahrens * The contents of this file are subject to the terms of the 5*ea8dc4b6Seschrock * Common Development and Distribution License (the "License"). 6*ea8dc4b6Seschrock * You may not use this file except in compliance with the License. 7fa9e4066Sahrens * 8fa9e4066Sahrens * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9fa9e4066Sahrens * or http://www.opensolaris.org/os/licensing. 10fa9e4066Sahrens * See the License for the specific language governing permissions 11fa9e4066Sahrens * and limitations under the License. 12fa9e4066Sahrens * 13fa9e4066Sahrens * When distributing Covered Code, include this CDDL HEADER in each 14fa9e4066Sahrens * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15fa9e4066Sahrens * If applicable, add the following below this CDDL HEADER, with the 16fa9e4066Sahrens * fields enclosed by brackets "[]" replaced with your own identifying 17fa9e4066Sahrens * information: Portions Copyright [yyyy] [name of copyright owner] 18fa9e4066Sahrens * 19fa9e4066Sahrens * CDDL HEADER END 20fa9e4066Sahrens */ 21fa9e4066Sahrens /* 22*ea8dc4b6Seschrock * Copyright 2006 Sun Microsystems, Inc. All rights reserved. 23fa9e4066Sahrens * Use is subject to license terms. 
24fa9e4066Sahrens */ 25fa9e4066Sahrens 26fa9e4066Sahrens #pragma ident "%Z%%M% %I% %E% SMI" 27fa9e4066Sahrens 28fa9e4066Sahrens #include <sys/zfs_context.h> 29*ea8dc4b6Seschrock #include <sys/fm/fs/zfs.h> 30fa9e4066Sahrens #include <sys/spa.h> 31fa9e4066Sahrens #include <sys/txg.h> 32fa9e4066Sahrens #include <sys/spa_impl.h> 33fa9e4066Sahrens #include <sys/vdev_impl.h> 34fa9e4066Sahrens #include <sys/zio_impl.h> 35fa9e4066Sahrens #include <sys/zio_compress.h> 36fa9e4066Sahrens #include <sys/zio_checksum.h> 37fa9e4066Sahrens 38fa9e4066Sahrens /* 39fa9e4066Sahrens * ========================================================================== 40fa9e4066Sahrens * I/O priority table 41fa9e4066Sahrens * ========================================================================== 42fa9e4066Sahrens */ 43fa9e4066Sahrens uint8_t zio_priority_table[ZIO_PRIORITY_TABLE_SIZE] = { 44fa9e4066Sahrens 0, /* ZIO_PRIORITY_NOW */ 45fa9e4066Sahrens 0, /* ZIO_PRIORITY_SYNC_READ */ 46fa9e4066Sahrens 0, /* ZIO_PRIORITY_SYNC_WRITE */ 47fa9e4066Sahrens 6, /* ZIO_PRIORITY_ASYNC_READ */ 48fa9e4066Sahrens 4, /* ZIO_PRIORITY_ASYNC_WRITE */ 49fa9e4066Sahrens 4, /* ZIO_PRIORITY_FREE */ 50fa9e4066Sahrens 0, /* ZIO_PRIORITY_CACHE_FILL */ 51fa9e4066Sahrens 0, /* ZIO_PRIORITY_LOG_WRITE */ 52fa9e4066Sahrens 10, /* ZIO_PRIORITY_RESILVER */ 53fa9e4066Sahrens 20, /* ZIO_PRIORITY_SCRUB */ 54fa9e4066Sahrens }; 55fa9e4066Sahrens 56fa9e4066Sahrens /* 57fa9e4066Sahrens * ========================================================================== 58fa9e4066Sahrens * I/O type descriptions 59fa9e4066Sahrens * ========================================================================== 60fa9e4066Sahrens */ 61fa9e4066Sahrens char *zio_type_name[ZIO_TYPES] = { 62fa9e4066Sahrens "null", "read", "write", "free", "claim", "ioctl" }; 63fa9e4066Sahrens 64fa9e4066Sahrens /* At or above this size, force gang blocking - for testing */ 65fa9e4066Sahrens uint64_t zio_gang_bang = SPA_MAXBLOCKSIZE + 1; 66fa9e4066Sahrens 
67fa9e4066Sahrens typedef struct zio_sync_pass { 68fa9e4066Sahrens int zp_defer_free; /* defer frees after this pass */ 69fa9e4066Sahrens int zp_dontcompress; /* don't compress after this pass */ 70fa9e4066Sahrens int zp_rewrite; /* rewrite new bps after this pass */ 71fa9e4066Sahrens } zio_sync_pass_t; 72fa9e4066Sahrens 73fa9e4066Sahrens zio_sync_pass_t zio_sync_pass = { 74fa9e4066Sahrens 1, /* zp_defer_free */ 75fa9e4066Sahrens 4, /* zp_dontcompress */ 76fa9e4066Sahrens 1, /* zp_rewrite */ 77fa9e4066Sahrens }; 78fa9e4066Sahrens 79fa9e4066Sahrens /* 80fa9e4066Sahrens * ========================================================================== 81fa9e4066Sahrens * I/O kmem caches 82fa9e4066Sahrens * ========================================================================== 83fa9e4066Sahrens */ 84fa9e4066Sahrens kmem_cache_t *zio_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT]; 85fa9e4066Sahrens 86fa9e4066Sahrens void 87fa9e4066Sahrens zio_init(void) 88fa9e4066Sahrens { 89fa9e4066Sahrens size_t c; 90fa9e4066Sahrens 91fa9e4066Sahrens /* 92fa9e4066Sahrens * For small buffers, we want a cache for each multiple of 93fa9e4066Sahrens * SPA_MINBLOCKSIZE. For medium-size buffers, we want a cache 94fa9e4066Sahrens * for each quarter-power of 2. For large buffers, we want 95fa9e4066Sahrens * a cache for each multiple of PAGESIZE. 
96fa9e4066Sahrens */ 97fa9e4066Sahrens for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) { 98fa9e4066Sahrens size_t size = (c + 1) << SPA_MINBLOCKSHIFT; 99fa9e4066Sahrens size_t p2 = size; 100fa9e4066Sahrens size_t align = 0; 101fa9e4066Sahrens 102fa9e4066Sahrens while (p2 & (p2 - 1)) 103fa9e4066Sahrens p2 &= p2 - 1; 104fa9e4066Sahrens 105fa9e4066Sahrens if (size <= 4 * SPA_MINBLOCKSIZE) { 106fa9e4066Sahrens align = SPA_MINBLOCKSIZE; 107fa9e4066Sahrens } else if (P2PHASE(size, PAGESIZE) == 0) { 108fa9e4066Sahrens align = PAGESIZE; 109fa9e4066Sahrens } else if (P2PHASE(size, p2 >> 2) == 0) { 110fa9e4066Sahrens align = p2 >> 2; 111fa9e4066Sahrens } 112fa9e4066Sahrens 113fa9e4066Sahrens if (align != 0) { 114fa9e4066Sahrens char name[30]; 115fa9e4066Sahrens (void) sprintf(name, "zio_buf_%lu", size); 116fa9e4066Sahrens zio_buf_cache[c] = kmem_cache_create(name, size, 117a0965f35Sbonwick align, NULL, NULL, NULL, NULL, NULL, KMC_NODEBUG); 118fa9e4066Sahrens dprintf("creating cache for size %5lx align %5lx\n", 119fa9e4066Sahrens size, align); 120fa9e4066Sahrens } 121fa9e4066Sahrens } 122fa9e4066Sahrens 123fa9e4066Sahrens while (--c != 0) { 124fa9e4066Sahrens ASSERT(zio_buf_cache[c] != NULL); 125fa9e4066Sahrens if (zio_buf_cache[c - 1] == NULL) 126fa9e4066Sahrens zio_buf_cache[c - 1] = zio_buf_cache[c]; 127fa9e4066Sahrens } 128*ea8dc4b6Seschrock 129*ea8dc4b6Seschrock zio_inject_init(); 130fa9e4066Sahrens } 131fa9e4066Sahrens 132fa9e4066Sahrens void 133fa9e4066Sahrens zio_fini(void) 134fa9e4066Sahrens { 135fa9e4066Sahrens size_t c; 136fa9e4066Sahrens kmem_cache_t *last_cache = NULL; 137fa9e4066Sahrens 138fa9e4066Sahrens for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) { 139fa9e4066Sahrens if (zio_buf_cache[c] != last_cache) { 140fa9e4066Sahrens last_cache = zio_buf_cache[c]; 141fa9e4066Sahrens kmem_cache_destroy(zio_buf_cache[c]); 142fa9e4066Sahrens } 143fa9e4066Sahrens zio_buf_cache[c] = NULL; 144fa9e4066Sahrens } 145*ea8dc4b6Seschrock 
146*ea8dc4b6Seschrock zio_inject_fini(); 147fa9e4066Sahrens } 148fa9e4066Sahrens 149fa9e4066Sahrens /* 150fa9e4066Sahrens * ========================================================================== 151fa9e4066Sahrens * Allocate and free I/O buffers 152fa9e4066Sahrens * ========================================================================== 153fa9e4066Sahrens */ 154fa9e4066Sahrens void * 155fa9e4066Sahrens zio_buf_alloc(size_t size) 156fa9e4066Sahrens { 157fa9e4066Sahrens size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; 158fa9e4066Sahrens 159fa9e4066Sahrens ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); 160fa9e4066Sahrens 161fa9e4066Sahrens return (kmem_cache_alloc(zio_buf_cache[c], KM_SLEEP)); 162fa9e4066Sahrens } 163fa9e4066Sahrens 164fa9e4066Sahrens void 165fa9e4066Sahrens zio_buf_free(void *buf, size_t size) 166fa9e4066Sahrens { 167fa9e4066Sahrens size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; 168fa9e4066Sahrens 169fa9e4066Sahrens ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); 170fa9e4066Sahrens 171fa9e4066Sahrens kmem_cache_free(zio_buf_cache[c], buf); 172fa9e4066Sahrens } 173fa9e4066Sahrens 174fa9e4066Sahrens /* 175fa9e4066Sahrens * ========================================================================== 176fa9e4066Sahrens * Push and pop I/O transform buffers 177fa9e4066Sahrens * ========================================================================== 178fa9e4066Sahrens */ 179fa9e4066Sahrens static void 180fa9e4066Sahrens zio_push_transform(zio_t *zio, void *data, uint64_t size, uint64_t bufsize) 181fa9e4066Sahrens { 182fa9e4066Sahrens zio_transform_t *zt = kmem_alloc(sizeof (zio_transform_t), KM_SLEEP); 183fa9e4066Sahrens 184fa9e4066Sahrens zt->zt_data = data; 185fa9e4066Sahrens zt->zt_size = size; 186fa9e4066Sahrens zt->zt_bufsize = bufsize; 187fa9e4066Sahrens 188fa9e4066Sahrens zt->zt_next = zio->io_transform_stack; 189fa9e4066Sahrens zio->io_transform_stack = zt; 190fa9e4066Sahrens 191fa9e4066Sahrens zio->io_data = data; 192fa9e4066Sahrens 
zio->io_size = size; 193fa9e4066Sahrens } 194fa9e4066Sahrens 195fa9e4066Sahrens static void 196fa9e4066Sahrens zio_pop_transform(zio_t *zio, void **data, uint64_t *size, uint64_t *bufsize) 197fa9e4066Sahrens { 198fa9e4066Sahrens zio_transform_t *zt = zio->io_transform_stack; 199fa9e4066Sahrens 200fa9e4066Sahrens *data = zt->zt_data; 201fa9e4066Sahrens *size = zt->zt_size; 202fa9e4066Sahrens *bufsize = zt->zt_bufsize; 203fa9e4066Sahrens 204fa9e4066Sahrens zio->io_transform_stack = zt->zt_next; 205fa9e4066Sahrens kmem_free(zt, sizeof (zio_transform_t)); 206fa9e4066Sahrens 207fa9e4066Sahrens if ((zt = zio->io_transform_stack) != NULL) { 208fa9e4066Sahrens zio->io_data = zt->zt_data; 209fa9e4066Sahrens zio->io_size = zt->zt_size; 210fa9e4066Sahrens } 211fa9e4066Sahrens } 212fa9e4066Sahrens 213fa9e4066Sahrens static void 214fa9e4066Sahrens zio_clear_transform_stack(zio_t *zio) 215fa9e4066Sahrens { 216fa9e4066Sahrens void *data; 217fa9e4066Sahrens uint64_t size, bufsize; 218fa9e4066Sahrens 219fa9e4066Sahrens ASSERT(zio->io_transform_stack != NULL); 220fa9e4066Sahrens 221fa9e4066Sahrens zio_pop_transform(zio, &data, &size, &bufsize); 222fa9e4066Sahrens while (zio->io_transform_stack != NULL) { 223fa9e4066Sahrens zio_buf_free(data, bufsize); 224fa9e4066Sahrens zio_pop_transform(zio, &data, &size, &bufsize); 225fa9e4066Sahrens } 226fa9e4066Sahrens } 227fa9e4066Sahrens 228fa9e4066Sahrens /* 229fa9e4066Sahrens * ========================================================================== 230fa9e4066Sahrens * Create the various types of I/O (read, write, free) 231fa9e4066Sahrens * ========================================================================== 232fa9e4066Sahrens */ 233fa9e4066Sahrens static zio_t * 234fa9e4066Sahrens zio_create(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, 235fa9e4066Sahrens void *data, uint64_t size, zio_done_func_t *done, void *private, 236fa9e4066Sahrens zio_type_t type, int priority, int flags, uint8_t stage, uint32_t pipeline) 
237fa9e4066Sahrens { 238fa9e4066Sahrens zio_t *zio; 239fa9e4066Sahrens 240fa9e4066Sahrens ASSERT3U(size, <=, SPA_MAXBLOCKSIZE); 241fa9e4066Sahrens ASSERT(P2PHASE(size, SPA_MINBLOCKSIZE) == 0); 242fa9e4066Sahrens 243fa9e4066Sahrens zio = kmem_zalloc(sizeof (zio_t), KM_SLEEP); 244fa9e4066Sahrens zio->io_parent = pio; 245fa9e4066Sahrens zio->io_spa = spa; 246fa9e4066Sahrens zio->io_txg = txg; 247fa9e4066Sahrens if (bp != NULL) { 248fa9e4066Sahrens zio->io_bp = bp; 249fa9e4066Sahrens zio->io_bp_copy = *bp; 250fa9e4066Sahrens zio->io_bp_orig = *bp; 251fa9e4066Sahrens /* XXBP - Need to inherit this when it matters */ 252fa9e4066Sahrens zio->io_dva_index = 0; 253fa9e4066Sahrens } 254fa9e4066Sahrens zio->io_done = done; 255fa9e4066Sahrens zio->io_private = private; 256fa9e4066Sahrens zio->io_type = type; 257fa9e4066Sahrens zio->io_priority = priority; 258fa9e4066Sahrens zio->io_stage = stage; 259fa9e4066Sahrens zio->io_pipeline = pipeline; 260fa9e4066Sahrens zio->io_async_stages = ZIO_ASYNC_PIPELINE_STAGES; 261fa9e4066Sahrens zio->io_timestamp = lbolt64; 262fa9e4066Sahrens zio->io_flags = flags; 263fa9e4066Sahrens zio_push_transform(zio, data, size, size); 264fa9e4066Sahrens 265fa9e4066Sahrens if (pio == NULL) { 266fa9e4066Sahrens if (!(flags & ZIO_FLAG_CONFIG_HELD)) 267*ea8dc4b6Seschrock spa_config_enter(zio->io_spa, RW_READER, zio); 268fa9e4066Sahrens zio->io_root = zio; 269fa9e4066Sahrens } else { 270fa9e4066Sahrens zio->io_root = pio->io_root; 271*ea8dc4b6Seschrock if (!(flags & ZIO_FLAG_NOBOOKMARK)) 272*ea8dc4b6Seschrock zio->io_logical = pio->io_logical; 273fa9e4066Sahrens mutex_enter(&pio->io_lock); 274fa9e4066Sahrens if (stage < ZIO_STAGE_READY) 275fa9e4066Sahrens pio->io_children_notready++; 276fa9e4066Sahrens pio->io_children_notdone++; 277fa9e4066Sahrens zio->io_sibling_next = pio->io_child; 278fa9e4066Sahrens zio->io_sibling_prev = NULL; 279fa9e4066Sahrens if (pio->io_child != NULL) 280fa9e4066Sahrens pio->io_child->io_sibling_prev = zio; 281fa9e4066Sahrens 
pio->io_child = zio; 282fa9e4066Sahrens mutex_exit(&pio->io_lock); 283fa9e4066Sahrens } 284fa9e4066Sahrens 285fa9e4066Sahrens return (zio); 286fa9e4066Sahrens } 287fa9e4066Sahrens 288fa9e4066Sahrens zio_t * 289fa9e4066Sahrens zio_null(zio_t *pio, spa_t *spa, zio_done_func_t *done, void *private, 290fa9e4066Sahrens int flags) 291fa9e4066Sahrens { 292fa9e4066Sahrens zio_t *zio; 293fa9e4066Sahrens 294fa9e4066Sahrens zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private, 295fa9e4066Sahrens ZIO_TYPE_NULL, ZIO_PRIORITY_NOW, flags, ZIO_STAGE_OPEN, 296fa9e4066Sahrens ZIO_WAIT_FOR_CHILDREN_PIPELINE); 297fa9e4066Sahrens 298fa9e4066Sahrens return (zio); 299fa9e4066Sahrens } 300fa9e4066Sahrens 301fa9e4066Sahrens zio_t * 302fa9e4066Sahrens zio_root(spa_t *spa, zio_done_func_t *done, void *private, int flags) 303fa9e4066Sahrens { 304fa9e4066Sahrens return (zio_null(NULL, spa, done, private, flags)); 305fa9e4066Sahrens } 306fa9e4066Sahrens 307fa9e4066Sahrens zio_t * 308fa9e4066Sahrens zio_read(zio_t *pio, spa_t *spa, blkptr_t *bp, void *data, 309fa9e4066Sahrens uint64_t size, zio_done_func_t *done, void *private, 310*ea8dc4b6Seschrock int priority, int flags, zbookmark_t *zb) 311fa9e4066Sahrens { 312fa9e4066Sahrens zio_t *zio; 313fa9e4066Sahrens dva_t *dva; 314fa9e4066Sahrens 315fa9e4066Sahrens ASSERT3U(size, ==, BP_GET_LSIZE(bp)); 316fa9e4066Sahrens 317fa9e4066Sahrens zio = zio_create(pio, spa, bp->blk_birth, bp, data, size, done, private, 318fa9e4066Sahrens ZIO_TYPE_READ, priority, flags, ZIO_STAGE_OPEN, ZIO_READ_PIPELINE); 319*ea8dc4b6Seschrock zio->io_bookmark = *zb; 320*ea8dc4b6Seschrock 321*ea8dc4b6Seschrock zio->io_logical = zio; 322fa9e4066Sahrens 323fa9e4066Sahrens /* 324fa9e4066Sahrens * Work off our copy of the bp so the caller can free it. 
325fa9e4066Sahrens */ 326fa9e4066Sahrens zio->io_bp = &zio->io_bp_copy; 327fa9e4066Sahrens 328fa9e4066Sahrens bp = zio->io_bp; 329fa9e4066Sahrens dva = ZIO_GET_DVA(zio); 330fa9e4066Sahrens 331fa9e4066Sahrens if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF) { 332fa9e4066Sahrens uint64_t csize = BP_GET_PSIZE(bp); 333fa9e4066Sahrens void *cbuf = zio_buf_alloc(csize); 334fa9e4066Sahrens 335fa9e4066Sahrens zio_push_transform(zio, cbuf, csize, csize); 336fa9e4066Sahrens zio->io_pipeline |= 1U << ZIO_STAGE_READ_DECOMPRESS; 337fa9e4066Sahrens } 338fa9e4066Sahrens 339fa9e4066Sahrens if (DVA_GET_GANG(dva)) { 340fa9e4066Sahrens uint64_t gsize = SPA_GANGBLOCKSIZE; 341fa9e4066Sahrens void *gbuf = zio_buf_alloc(gsize); 342fa9e4066Sahrens 343fa9e4066Sahrens zio_push_transform(zio, gbuf, gsize, gsize); 344fa9e4066Sahrens zio->io_pipeline |= 1U << ZIO_STAGE_READ_GANG_MEMBERS; 345fa9e4066Sahrens } 346fa9e4066Sahrens 347fa9e4066Sahrens return (zio); 348fa9e4066Sahrens } 349fa9e4066Sahrens 350fa9e4066Sahrens zio_t * 351fa9e4066Sahrens zio_write(zio_t *pio, spa_t *spa, int checksum, int compress, 352fa9e4066Sahrens uint64_t txg, blkptr_t *bp, void *data, uint64_t size, 353*ea8dc4b6Seschrock zio_done_func_t *done, void *private, int priority, int flags, 354*ea8dc4b6Seschrock zbookmark_t *zb) 355fa9e4066Sahrens { 356fa9e4066Sahrens zio_t *zio; 357fa9e4066Sahrens 358fa9e4066Sahrens ASSERT(checksum >= ZIO_CHECKSUM_OFF && 359fa9e4066Sahrens checksum < ZIO_CHECKSUM_FUNCTIONS); 360fa9e4066Sahrens 361fa9e4066Sahrens ASSERT(compress >= ZIO_COMPRESS_OFF && 362fa9e4066Sahrens compress < ZIO_COMPRESS_FUNCTIONS); 363fa9e4066Sahrens 364fa9e4066Sahrens zio = zio_create(pio, spa, txg, bp, data, size, done, private, 365fa9e4066Sahrens ZIO_TYPE_WRITE, priority, flags, 366fa9e4066Sahrens ZIO_STAGE_OPEN, ZIO_WRITE_PIPELINE); 367fa9e4066Sahrens 368*ea8dc4b6Seschrock zio->io_bookmark = *zb; 369*ea8dc4b6Seschrock 370*ea8dc4b6Seschrock zio->io_logical = zio; 371*ea8dc4b6Seschrock 372fa9e4066Sahrens 
zio->io_checksum = checksum; 373fa9e4066Sahrens zio->io_compress = compress; 374fa9e4066Sahrens 375fa9e4066Sahrens if (compress != ZIO_COMPRESS_OFF) 376fa9e4066Sahrens zio->io_async_stages |= 1U << ZIO_STAGE_WRITE_COMPRESS; 377fa9e4066Sahrens 378fa9e4066Sahrens if (bp->blk_birth != txg) { 379fa9e4066Sahrens /* XXX the bp usually (always?) gets re-zeroed later */ 380fa9e4066Sahrens BP_ZERO(bp); 381fa9e4066Sahrens BP_SET_LSIZE(bp, size); 382fa9e4066Sahrens BP_SET_PSIZE(bp, size); 383fa9e4066Sahrens } 384fa9e4066Sahrens 385fa9e4066Sahrens return (zio); 386fa9e4066Sahrens } 387fa9e4066Sahrens 388fa9e4066Sahrens zio_t * 389fa9e4066Sahrens zio_rewrite(zio_t *pio, spa_t *spa, int checksum, 390fa9e4066Sahrens uint64_t txg, blkptr_t *bp, void *data, uint64_t size, 391*ea8dc4b6Seschrock zio_done_func_t *done, void *private, int priority, int flags, 392*ea8dc4b6Seschrock zbookmark_t *zb) 393fa9e4066Sahrens { 394fa9e4066Sahrens zio_t *zio; 395fa9e4066Sahrens 396fa9e4066Sahrens /* XXBP - We need to re-evaluate when to insert pipeline stages */ 397fa9e4066Sahrens zio = zio_create(pio, spa, txg, bp, data, size, done, private, 398fa9e4066Sahrens ZIO_TYPE_WRITE, priority, flags, 399fa9e4066Sahrens ZIO_STAGE_OPEN, ZIO_REWRITE_PIPELINE); 400fa9e4066Sahrens 401*ea8dc4b6Seschrock zio->io_bookmark = *zb; 402fa9e4066Sahrens zio->io_checksum = checksum; 403fa9e4066Sahrens zio->io_compress = ZIO_COMPRESS_OFF; 404fa9e4066Sahrens 405fa9e4066Sahrens return (zio); 406fa9e4066Sahrens } 407fa9e4066Sahrens 408fa9e4066Sahrens static zio_t * 409fa9e4066Sahrens zio_write_allocate(zio_t *pio, spa_t *spa, int checksum, 410fa9e4066Sahrens uint64_t txg, blkptr_t *bp, void *data, uint64_t size, 411fa9e4066Sahrens zio_done_func_t *done, void *private, int priority, int flags) 412fa9e4066Sahrens { 413fa9e4066Sahrens zio_t *zio; 414fa9e4066Sahrens 415fa9e4066Sahrens BP_ZERO(bp); 416fa9e4066Sahrens BP_SET_LSIZE(bp, size); 417fa9e4066Sahrens BP_SET_PSIZE(bp, size); 418fa9e4066Sahrens BP_SET_COMPRESS(bp, 
ZIO_COMPRESS_OFF); 419fa9e4066Sahrens 420fa9e4066Sahrens zio = zio_create(pio, spa, txg, bp, data, size, done, private, 421fa9e4066Sahrens ZIO_TYPE_WRITE, priority, flags, 422fa9e4066Sahrens ZIO_STAGE_OPEN, ZIO_WRITE_ALLOCATE_PIPELINE); 423fa9e4066Sahrens 424fa9e4066Sahrens zio->io_checksum = checksum; 425fa9e4066Sahrens zio->io_compress = ZIO_COMPRESS_OFF; 426fa9e4066Sahrens 427fa9e4066Sahrens return (zio); 428fa9e4066Sahrens } 429fa9e4066Sahrens 430fa9e4066Sahrens zio_t * 431fa9e4066Sahrens zio_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, 432fa9e4066Sahrens zio_done_func_t *done, void *private) 433fa9e4066Sahrens { 434fa9e4066Sahrens zio_t *zio; 435fa9e4066Sahrens 436fa9e4066Sahrens ASSERT(!BP_IS_HOLE(bp)); 437fa9e4066Sahrens 438fa9e4066Sahrens if (txg == spa->spa_syncing_txg && 439fa9e4066Sahrens spa->spa_sync_pass > zio_sync_pass.zp_defer_free) { 440fa9e4066Sahrens bplist_enqueue_deferred(&spa->spa_sync_bplist, bp); 441fa9e4066Sahrens return (zio_null(pio, spa, NULL, NULL, 0)); 442fa9e4066Sahrens } 443fa9e4066Sahrens 444fa9e4066Sahrens /* XXBP - We need to re-evaluate when to insert pipeline stages */ 445fa9e4066Sahrens zio = zio_create(pio, spa, txg, bp, NULL, 0, done, private, 446fa9e4066Sahrens ZIO_TYPE_FREE, ZIO_PRIORITY_FREE, 0, 447fa9e4066Sahrens ZIO_STAGE_OPEN, ZIO_FREE_PIPELINE); 448fa9e4066Sahrens 449fa9e4066Sahrens zio->io_bp = &zio->io_bp_copy; 450fa9e4066Sahrens 451fa9e4066Sahrens return (zio); 452fa9e4066Sahrens } 453fa9e4066Sahrens 454fa9e4066Sahrens zio_t * 455fa9e4066Sahrens zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, 456fa9e4066Sahrens zio_done_func_t *done, void *private) 457fa9e4066Sahrens { 458fa9e4066Sahrens zio_t *zio; 459fa9e4066Sahrens 460fa9e4066Sahrens /* 461fa9e4066Sahrens * A claim is an allocation of a specific block. Claims are needed 462fa9e4066Sahrens * to support immediate writes in the intent log. 
The issue is that 463fa9e4066Sahrens * immediate writes contain committed data, but in a txg that was 464fa9e4066Sahrens * *not* committed. Upon opening the pool after an unclean shutdown, 465fa9e4066Sahrens * the intent log claims all blocks that contain immediate write data 466fa9e4066Sahrens * so that the SPA knows they're in use. 467fa9e4066Sahrens * 468fa9e4066Sahrens * All claims *must* be resolved in the first txg -- before the SPA 469fa9e4066Sahrens * starts allocating blocks -- so that nothing is allocated twice. 470fa9e4066Sahrens */ 471fa9e4066Sahrens ASSERT3U(spa->spa_uberblock.ub_rootbp.blk_birth, <, spa_first_txg(spa)); 472fa9e4066Sahrens ASSERT3U(spa_first_txg(spa), <=, txg); 473fa9e4066Sahrens 474fa9e4066Sahrens /* XXBP - We need to re-evaluate when to insert pipeline stages */ 475fa9e4066Sahrens zio = zio_create(pio, spa, txg, bp, NULL, 0, done, private, 476fa9e4066Sahrens ZIO_TYPE_CLAIM, ZIO_PRIORITY_NOW, 0, 477fa9e4066Sahrens ZIO_STAGE_OPEN, ZIO_CLAIM_PIPELINE); 478fa9e4066Sahrens 479fa9e4066Sahrens zio->io_bp = &zio->io_bp_copy; 480fa9e4066Sahrens 481fa9e4066Sahrens return (zio); 482fa9e4066Sahrens } 483fa9e4066Sahrens 484fa9e4066Sahrens zio_t * 485fa9e4066Sahrens zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd, 486fa9e4066Sahrens zio_done_func_t *done, void *private, int priority, int flags) 487fa9e4066Sahrens { 488fa9e4066Sahrens zio_t *zio; 489fa9e4066Sahrens int c; 490fa9e4066Sahrens 491fa9e4066Sahrens if (vd->vdev_children == 0) { 492fa9e4066Sahrens zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private, 493fa9e4066Sahrens ZIO_TYPE_IOCTL, priority, flags, 494fa9e4066Sahrens ZIO_STAGE_OPEN, ZIO_IOCTL_PIPELINE); 495fa9e4066Sahrens 496fa9e4066Sahrens zio->io_vd = vd; 497fa9e4066Sahrens zio->io_cmd = cmd; 498fa9e4066Sahrens } else { 499fa9e4066Sahrens zio = zio_null(pio, spa, NULL, NULL, flags); 500fa9e4066Sahrens 501fa9e4066Sahrens for (c = 0; c < vd->vdev_children; c++) 502fa9e4066Sahrens zio_nowait(zio_ioctl(zio, spa, 
vd->vdev_child[c], cmd, 503fa9e4066Sahrens done, private, priority, flags)); 504fa9e4066Sahrens } 505fa9e4066Sahrens 506fa9e4066Sahrens return (zio); 507fa9e4066Sahrens } 508fa9e4066Sahrens 509fa9e4066Sahrens static void 510fa9e4066Sahrens zio_phys_bp_init(vdev_t *vd, blkptr_t *bp, uint64_t offset, uint64_t size, 511fa9e4066Sahrens int checksum) 512fa9e4066Sahrens { 513fa9e4066Sahrens ASSERT(vd->vdev_children == 0); 514fa9e4066Sahrens 515fa9e4066Sahrens ASSERT(size <= SPA_MAXBLOCKSIZE); 516fa9e4066Sahrens ASSERT(P2PHASE(size, SPA_MINBLOCKSIZE) == 0); 517fa9e4066Sahrens ASSERT(P2PHASE(offset, SPA_MINBLOCKSIZE) == 0); 518fa9e4066Sahrens 519fa9e4066Sahrens ASSERT(offset + size <= VDEV_LABEL_START_SIZE || 520fa9e4066Sahrens offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE); 521fa9e4066Sahrens ASSERT3U(offset + size, <=, vd->vdev_psize); 522fa9e4066Sahrens 523fa9e4066Sahrens BP_ZERO(bp); 524fa9e4066Sahrens 525fa9e4066Sahrens BP_SET_LSIZE(bp, size); 526fa9e4066Sahrens BP_SET_PSIZE(bp, size); 527fa9e4066Sahrens 528fa9e4066Sahrens BP_SET_CHECKSUM(bp, checksum); 529fa9e4066Sahrens BP_SET_COMPRESS(bp, ZIO_COMPRESS_OFF); 530fa9e4066Sahrens BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER); 531fa9e4066Sahrens 532fa9e4066Sahrens if (checksum != ZIO_CHECKSUM_OFF) 533fa9e4066Sahrens ZIO_SET_CHECKSUM(&bp->blk_cksum, offset, 0, 0, 0); 534fa9e4066Sahrens } 535fa9e4066Sahrens 536fa9e4066Sahrens zio_t * 537fa9e4066Sahrens zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, 538fa9e4066Sahrens void *data, int checksum, zio_done_func_t *done, void *private, 539fa9e4066Sahrens int priority, int flags) 540fa9e4066Sahrens { 541fa9e4066Sahrens zio_t *zio; 542fa9e4066Sahrens blkptr_t blk; 543fa9e4066Sahrens 544fa9e4066Sahrens zio_phys_bp_init(vd, &blk, offset, size, checksum); 545fa9e4066Sahrens 546fa9e4066Sahrens zio = zio_create(pio, vd->vdev_spa, 0, &blk, data, size, done, private, 547fa9e4066Sahrens ZIO_TYPE_READ, priority, flags | ZIO_FLAG_PHYSICAL, 548fa9e4066Sahrens 
ZIO_STAGE_OPEN, ZIO_READ_PHYS_PIPELINE); 549fa9e4066Sahrens 550fa9e4066Sahrens zio->io_vd = vd; 551fa9e4066Sahrens zio->io_offset = offset; 552fa9e4066Sahrens 553fa9e4066Sahrens /* 554fa9e4066Sahrens * Work off our copy of the bp so the caller can free it. 555fa9e4066Sahrens */ 556fa9e4066Sahrens zio->io_bp = &zio->io_bp_copy; 557fa9e4066Sahrens 558fa9e4066Sahrens return (zio); 559fa9e4066Sahrens } 560fa9e4066Sahrens 561fa9e4066Sahrens zio_t * 562fa9e4066Sahrens zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, 563fa9e4066Sahrens void *data, int checksum, zio_done_func_t *done, void *private, 564fa9e4066Sahrens int priority, int flags) 565fa9e4066Sahrens { 566fa9e4066Sahrens zio_block_tail_t *zbt; 567fa9e4066Sahrens void *wbuf; 568fa9e4066Sahrens zio_t *zio; 569fa9e4066Sahrens blkptr_t blk; 570fa9e4066Sahrens 571fa9e4066Sahrens zio_phys_bp_init(vd, &blk, offset, size, checksum); 572fa9e4066Sahrens 573fa9e4066Sahrens zio = zio_create(pio, vd->vdev_spa, 0, &blk, data, size, done, private, 574fa9e4066Sahrens ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_PHYSICAL, 575fa9e4066Sahrens ZIO_STAGE_OPEN, ZIO_WRITE_PHYS_PIPELINE); 576fa9e4066Sahrens 577fa9e4066Sahrens zio->io_vd = vd; 578fa9e4066Sahrens zio->io_offset = offset; 579fa9e4066Sahrens 580fa9e4066Sahrens zio->io_bp = &zio->io_bp_copy; 581fa9e4066Sahrens zio->io_checksum = checksum; 582fa9e4066Sahrens 583fa9e4066Sahrens if (zio_checksum_table[checksum].ci_zbt) { 584fa9e4066Sahrens /* 585fa9e4066Sahrens * zbt checksums are necessarily destructive -- they modify 586fa9e4066Sahrens * one word of the write buffer to hold the verifier/checksum. 587fa9e4066Sahrens * Therefore, we must make a local copy in case the data is 588fa9e4066Sahrens * being written to multiple places. 
589fa9e4066Sahrens */ 590fa9e4066Sahrens wbuf = zio_buf_alloc(size); 591fa9e4066Sahrens bcopy(data, wbuf, size); 592fa9e4066Sahrens zio_push_transform(zio, wbuf, size, size); 593fa9e4066Sahrens 594fa9e4066Sahrens zbt = (zio_block_tail_t *)((char *)wbuf + size) - 1; 595fa9e4066Sahrens zbt->zbt_cksum = blk.blk_cksum; 596fa9e4066Sahrens } 597fa9e4066Sahrens 598fa9e4066Sahrens return (zio); 599fa9e4066Sahrens } 600fa9e4066Sahrens 601fa9e4066Sahrens /* 602fa9e4066Sahrens * Create a child I/O to do some work for us. It has no associated bp. 603fa9e4066Sahrens */ 604fa9e4066Sahrens zio_t * 605fa9e4066Sahrens zio_vdev_child_io(zio_t *zio, blkptr_t *bp, vdev_t *vd, uint64_t offset, 606fa9e4066Sahrens void *data, uint64_t size, int type, int priority, int flags, 607fa9e4066Sahrens zio_done_func_t *done, void *private) 608fa9e4066Sahrens { 609fa9e4066Sahrens uint32_t pipeline = ZIO_VDEV_CHILD_PIPELINE; 610fa9e4066Sahrens zio_t *cio; 611fa9e4066Sahrens 612fa9e4066Sahrens if (type == ZIO_TYPE_READ && bp != NULL) { 613fa9e4066Sahrens /* 614fa9e4066Sahrens * If we have the bp, then the child should perform the 615fa9e4066Sahrens * checksum and the parent need not. This pushes error 616fa9e4066Sahrens * detection as close to the leaves as possible and 617fa9e4066Sahrens * eliminates redundant checksums in the interior nodes. 
618fa9e4066Sahrens */ 619fa9e4066Sahrens pipeline |= 1U << ZIO_STAGE_CHECKSUM_VERIFY; 620fa9e4066Sahrens zio->io_pipeline &= ~(1U << ZIO_STAGE_CHECKSUM_VERIFY); 621fa9e4066Sahrens } 622fa9e4066Sahrens 623fa9e4066Sahrens cio = zio_create(zio, zio->io_spa, zio->io_txg, bp, data, size, 624fa9e4066Sahrens done, private, type, priority, 625fa9e4066Sahrens (zio->io_flags & ZIO_FLAG_VDEV_INHERIT) | ZIO_FLAG_CANFAIL | flags, 626fa9e4066Sahrens ZIO_STAGE_VDEV_IO_SETUP - 1, pipeline); 627fa9e4066Sahrens 628fa9e4066Sahrens cio->io_vd = vd; 629fa9e4066Sahrens cio->io_offset = offset; 630fa9e4066Sahrens 631fa9e4066Sahrens return (cio); 632fa9e4066Sahrens } 633fa9e4066Sahrens 634fa9e4066Sahrens /* 635fa9e4066Sahrens * ========================================================================== 636fa9e4066Sahrens * Initiate I/O, either sync or async 637fa9e4066Sahrens * ========================================================================== 638fa9e4066Sahrens */ 639fa9e4066Sahrens int 640fa9e4066Sahrens zio_wait(zio_t *zio) 641fa9e4066Sahrens { 642fa9e4066Sahrens int error; 643fa9e4066Sahrens 644fa9e4066Sahrens ASSERT(zio->io_stage == ZIO_STAGE_OPEN); 645fa9e4066Sahrens 646fa9e4066Sahrens zio->io_waiter = curthread; 647fa9e4066Sahrens 648fa9e4066Sahrens zio_next_stage_async(zio); 649fa9e4066Sahrens 650fa9e4066Sahrens mutex_enter(&zio->io_lock); 651fa9e4066Sahrens while (zio->io_stalled != ZIO_STAGE_DONE) 652fa9e4066Sahrens cv_wait(&zio->io_cv, &zio->io_lock); 653fa9e4066Sahrens mutex_exit(&zio->io_lock); 654fa9e4066Sahrens 655fa9e4066Sahrens error = zio->io_error; 656fa9e4066Sahrens 657fa9e4066Sahrens kmem_free(zio, sizeof (zio_t)); 658fa9e4066Sahrens 659fa9e4066Sahrens return (error); 660fa9e4066Sahrens } 661fa9e4066Sahrens 662fa9e4066Sahrens void 663fa9e4066Sahrens zio_nowait(zio_t *zio) 664fa9e4066Sahrens { 665fa9e4066Sahrens zio_next_stage_async(zio); 666fa9e4066Sahrens } 667fa9e4066Sahrens 668fa9e4066Sahrens /* 669fa9e4066Sahrens * 
 * ==========================================================================
 * I/O pipeline interlocks: parent/child dependency scoreboarding
 * ==========================================================================
 */

/*
 * Stall this zio at the given pipeline stage until the counter *countp
 * (one of io_children_notready / io_children_notdone) drains to zero.
 * If it is already zero, advance to the next stage immediately; otherwise
 * record the stage in io_stalled so that zio_notify_parent() can restart
 * the pipeline when the last child checks in.
 */
static void
zio_wait_for_children(zio_t *zio, uint32_t stage, uint64_t *countp)
{
	mutex_enter(&zio->io_lock);
	if (*countp == 0) {
		ASSERT(zio->io_stalled == 0);
		mutex_exit(&zio->io_lock);
		zio_next_stage(zio);
	} else {
		zio->io_stalled = stage;
		mutex_exit(&zio->io_lock);
	}
}

/*
 * Child-side counterpart of zio_wait_for_children(): propagate this child's
 * error to the parent (unless ZIO_FLAG_DONT_PROPAGATE, or the parent already
 * has an error), decrement the parent's outstanding-children counter, and if
 * this was the last child and the parent is stalled at 'stage', resume the
 * parent's pipeline asynchronously.
 */
static void
zio_notify_parent(zio_t *zio, uint32_t stage, uint64_t *countp)
{
	zio_t *pio = zio->io_parent;

	mutex_enter(&pio->io_lock);
	if (pio->io_error == 0 && !(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE))
		pio->io_error = zio->io_error;
	if (--*countp == 0 && pio->io_stalled == stage) {
		pio->io_stalled = 0;
		mutex_exit(&pio->io_lock);
		zio_next_stage_async(pio);
	} else {
		mutex_exit(&pio->io_lock);
	}
}

/* Pipeline stage: wait until all children have reached their ready stage. */
static void
zio_wait_children_ready(zio_t *zio)
{
	zio_wait_for_children(zio, ZIO_STAGE_WAIT_CHILDREN_READY,
	    &zio->io_children_notready);
}

/* Pipeline stage: wait until all children have completed. */
void
zio_wait_children_done(zio_t *zio)
{
	zio_wait_for_children(zio, ZIO_STAGE_WAIT_CHILDREN_DONE,
	    &zio->io_children_notdone);
}

/*
 * Pipeline stage: this zio is "ready" — notify the parent (draining its
 * io_children_notready count) and snapshot the block pointer into
 * io_bp_copy, which zio_done() later uses to assert the bp was not
 * modified behind the pipeline's back.
 */
static void
zio_ready(zio_t *zio)
{
	zio_t *pio = zio->io_parent;

	if (pio != NULL)
		zio_notify_parent(zio, ZIO_STAGE_WAIT_CHILDREN_READY,
		    &pio->io_children_notready);

	if (zio->io_bp)
		zio->io_bp_copy = *zio->io_bp;

	zio_next_stage(zio);
}

/*
 * Final pipeline stage: sanity-check the block pointer, update vdev stats,
 * post FMA ereports / log errors on failure (panicking if the I/O was not
 * allowed to fail), run the io_done callback, unlink this zio from its
 * parent's child list, and either wake a synchronous waiter or free the zio.
 */
static void
zio_done(zio_t *zio)
{
	zio_t *pio = zio->io_parent;
	spa_t *spa = zio->io_spa;
	blkptr_t *bp = zio->io_bp;
	vdev_t *vd = zio->io_vd;
	char blkbuf[BP_SPRINTF_LEN];

	/* All children must have passed both interlock stages by now. */
	ASSERT(zio->io_children_notready == 0);
	ASSERT(zio->io_children_notdone == 0);

	if (bp != NULL) {
		ASSERT(bp->blk_pad[0] == 0);
		ASSERT(bp->blk_pad[1] == 0);
		ASSERT(bp->blk_pad[2] == 0);
		/* The bp must match the snapshot taken in zio_ready(). */
		ASSERT(bcmp(bp, &zio->io_bp_copy, sizeof (blkptr_t)) == 0);
		if (zio->io_type == ZIO_TYPE_WRITE && !BP_IS_HOLE(bp) &&
		    !(zio->io_flags & ZIO_FLAG_IO_REPAIR))
			ASSERT(!BP_SHOULD_BYTESWAP(bp));
	}

	if (vd != NULL)
		vdev_stat_update(zio);

	if (zio->io_error) {
		/*
		 * If this I/O is attached to a particular vdev,
		 * generate an error message describing the I/O failure
		 * at the block level.  We ignore these errors if the
		 * device is currently unavailable.
		 */
		if (zio->io_error != ECKSUM && zio->io_vd &&
		    !vdev_is_dead(zio->io_vd))
			zfs_ereport_post(FM_EREPORT_ZFS_IO,
			    zio->io_spa, zio->io_vd, zio, 0, 0);

		if ((zio->io_error == EIO ||
		    !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) &&
		    zio->io_logical == zio) {
			/*
			 * For root I/O requests, tell the SPA to log the error
			 * appropriately.  Also, generate a logical data
			 * ereport.
			 */
			spa_log_error(zio->io_spa, zio);

			zfs_ereport_post(FM_EREPORT_ZFS_DATA,
			    zio->io_spa, NULL, zio, 0, 0);
		}

		/*
		 * For I/O requests that cannot fail, panic appropriately.
		 */
		if (!(zio->io_flags & ZIO_FLAG_CANFAIL)) {
			sprintf_blkptr(blkbuf, BP_SPRINTF_LEN,
			    bp ? bp : &zio->io_bp_copy);
			panic("ZFS: %s (%s on %s off %llx: zio %p %s): error "
			    "%d", zio->io_error == ECKSUM ?
			    "bad checksum" : "I/O failure",
			    zio_type_name[zio->io_type],
			    vdev_description(vd),
			    (u_longlong_t)zio->io_offset,
			    zio, blkbuf, zio->io_error);
		}
	}

	zio_clear_transform_stack(zio);

	if (zio->io_done)
		zio->io_done(zio);

	ASSERT(zio->io_delegate_list == NULL);
	ASSERT(zio->io_delegate_next == NULL);

	if (pio != NULL) {
		zio_t *next, *prev;

		/* Unlink this zio from the parent's doubly-linked child list. */
		mutex_enter(&pio->io_lock);
		next = zio->io_sibling_next;
		prev = zio->io_sibling_prev;
		if (next != NULL)
			next->io_sibling_prev = prev;
		if (prev != NULL)
			prev->io_sibling_next = next;
		if (pio->io_child == zio)
			pio->io_child = next;
		mutex_exit(&pio->io_lock);

		zio_notify_parent(zio, ZIO_STAGE_WAIT_CHILDREN_DONE,
		    &pio->io_children_notdone);
	}

	/*
	 * Root zios drop the SPA config lock here.
	 * NOTE(review): assumes the matching spa_config_enter() was taken
	 * when the root zio was created — confirm against zio_create()/
	 * zio_wait(), which are outside this view.
	 */
	if (pio == NULL && !(zio->io_flags & ZIO_FLAG_CONFIG_HELD))
		spa_config_exit(spa, zio);

	if (zio->io_waiter != NULL) {
		/*
		 * A thread is blocked on this zio: record the final stage in
		 * io_stalled and wake it; the waiter owns the zio from here.
		 */
		mutex_enter(&zio->io_lock);
		ASSERT(zio->io_stage == ZIO_STAGE_DONE);
		zio->io_stalled = zio->io_stage;
		cv_broadcast(&zio->io_cv);
		mutex_exit(&zio->io_lock);
	} else {
		kmem_free(zio, sizeof (zio_t));
	}
}

/*
 * ==========================================================================
 * Compression support
 * ==========================================================================
 */

/*
 * Write-side compression stage: optionally compress io_data, fill in the
 * block pointer's logical/physical sizes and compression function, and
 * select the remainder of the pipeline — a pure rewrite of an existing
 * block, a no-op allocation for all-zero (fully compressed away) data,
 * or a fresh allocation.
 */
static void
zio_write_compress(zio_t *zio)
{
	int compress = zio->io_compress;
	blkptr_t *bp = zio->io_bp;
	void *cbuf;
	uint64_t lsize = zio->io_size;
	uint64_t csize = lsize;
	uint64_t cbufsize = 0;
	int pass;

	if (bp->blk_birth == zio->io_txg) {
		/*
		 * We're rewriting an existing block, which means we're
		 * working on behalf of spa_sync().  For spa_sync() to
		 * converge, it must eventually be the case that we don't
		 * have to allocate new blocks.  But compression changes
		 * the blocksize, which forces a reallocate, and makes
		 * convergence take longer.  Therefore, after the first
		 * few passes, stop compressing to ensure convergence.
		 */
		pass = spa_sync_pass(zio->io_spa);
		if (pass > zio_sync_pass.zp_dontcompress)
			compress = ZIO_COMPRESS_OFF;
	} else {
		ASSERT(BP_IS_HOLE(bp));
		pass = 1;
	}

	/* zio_compress_data() returns false when compression didn't help. */
	if (compress != ZIO_COMPRESS_OFF)
		if (!zio_compress_data(compress, zio->io_data, zio->io_size,
		    &cbuf, &csize, &cbufsize))
			compress = ZIO_COMPRESS_OFF;

	/* csize == 0 means the data was all zeroes — no buffer to push. */
	if (compress != ZIO_COMPRESS_OFF && csize != 0)
		zio_push_transform(zio, cbuf, csize, cbufsize);

	/*
	 * The final pass of spa_sync() must be all rewrites, but the first
	 * few passes offer a trade-off: allocating blocks defers convergence,
	 * but newly allocated blocks are sequential, so they can be written
	 * to disk faster.  Therefore, we allow the first few passes of
	 * spa_sync() to reallocate new blocks, but force rewrites after that.
	 * There should only be a handful of blocks after pass 1 in any case.
	 */
	if (bp->blk_birth == zio->io_txg && BP_GET_PSIZE(bp) == csize &&
	    pass > zio_sync_pass.zp_rewrite) {
		ASSERT(csize != 0);
		ASSERT3U(BP_GET_COMPRESS(bp), ==, compress);
		ASSERT3U(BP_GET_LSIZE(bp), ==, lsize);

		zio->io_pipeline = ZIO_REWRITE_PIPELINE;
	} else {
		if (bp->blk_birth == zio->io_txg) {
			ASSERT3U(BP_GET_LSIZE(bp), ==, lsize);
			bzero(bp, sizeof (blkptr_t));
		}
		if (csize == 0) {
			/* All-zero data: emit a hole, skip allocation/IO. */
			BP_ZERO(bp);
			zio->io_pipeline = ZIO_WAIT_FOR_CHILDREN_PIPELINE;
		} else {
			BP_SET_LSIZE(bp, lsize);
			BP_SET_PSIZE(bp, csize);
			BP_SET_COMPRESS(bp, compress);
			zio->io_pipeline = ZIO_WRITE_ALLOCATE_PIPELINE;
		}
	}

	zio_next_stage(zio);
}

/*
 * Read-side decompression stage: pop the (compressed) transform buffer and
 * decompress it into io_data; a decompression failure surfaces as EIO.
 */
static void
zio_read_decompress(zio_t *zio)
{
	blkptr_t *bp = zio->io_bp;
	void *data;
	uint64_t size;
	uint64_t bufsize;
	int compress = BP_GET_COMPRESS(bp);

	ASSERT(compress != ZIO_COMPRESS_OFF);

	zio_pop_transform(zio, &data, &size, &bufsize);

	if (zio_decompress_data(compress, data, size,
	    zio->io_data, zio->io_size))
		zio->io_error = EIO;

	zio_buf_free(data, bufsize);

	zio_next_stage(zio);
}

/*
 * ==========================================================================
 * Gang block support
 * ==========================================================================
 */

/*
 * Pipeline stage: the pipeline is built assuming a gang block; if this
 * DVA is not actually a gang block, strip the gang-specific stages out.
 */
static void
zio_gang_pipeline(zio_t *zio)
{
	/*
	 * By default, the pipeline assumes that we're dealing with a gang
	 * block.  If we're not, strip out any gang-specific stages.
	 */
	if (!DVA_GET_GANG(ZIO_GET_DVA(zio)))
		zio->io_pipeline &= ~ZIO_GANG_STAGES;

	zio_next_stage(zio);
}

/*
 * Byteswap the gang header in place if the bp says it was written in the
 * other byte order.  Helper, not a pipeline stage — does not advance.
 */
static void
zio_gang_byteswap(zio_t *zio)
{
	ASSERT(zio->io_size == SPA_GANGBLOCKSIZE);

	if (BP_SHOULD_BYTESWAP(zio->io_bp))
		byteswap_uint64_array(zio->io_data, zio->io_size);
}

/*
 * Pipeline stage: issue a child read for the gang block header, pushing a
 * transform buffer to receive it, then wait for the child to complete.
 */
static void
zio_get_gang_header(zio_t *zio)
{
	blkptr_t *bp = zio->io_bp;
	uint64_t gsize = SPA_GANGBLOCKSIZE;
	void *gbuf = zio_buf_alloc(gsize);

	ASSERT(DVA_GET_GANG(ZIO_GET_DVA(zio)));

	zio_push_transform(zio, gbuf, gsize, gsize);

	zio_nowait(zio_create(zio, zio->io_spa, bp->blk_birth, bp, gbuf, gsize,
	    NULL, NULL, ZIO_TYPE_READ, zio->io_priority,
	    zio->io_flags & ZIO_FLAG_GANG_INHERIT,
	    ZIO_STAGE_OPEN, ZIO_READ_PIPELINE));

	zio_wait_children_done(zio);
}

/*
 * Pipeline stage: walk the gang header just read and issue a child read for
 * each member, scattering the members into io_data at their logical offsets.
 */
static void
zio_read_gang_members(zio_t *zio)
{
	zio_gbh_phys_t *gbh;
	uint64_t gsize, gbufsize, loff, lsize;
	int i;

	ASSERT(DVA_GET_GANG(ZIO_GET_DVA(zio)));

	zio_gang_byteswap(zio);
	zio_pop_transform(zio, (void **)&gbh, &gsize, &gbufsize);

	for (loff = 0, i = 0; loff != zio->io_size; loff += lsize, i++) {
		blkptr_t *gbp = &gbh->zg_blkptr[i];
		lsize = BP_GET_PSIZE(gbp);

		/* Gang members are never compressed, so lsize == psize. */
		ASSERT(BP_GET_COMPRESS(gbp) == ZIO_COMPRESS_OFF);
		ASSERT3U(lsize, ==, BP_GET_LSIZE(gbp));
		ASSERT3U(loff + lsize, <=, zio->io_size);
		ASSERT(i < SPA_GBH_NBLKPTRS);
		ASSERT(!BP_IS_HOLE(gbp));

		zio_nowait(zio_read(zio, zio->io_spa, gbp,
		    (char *)zio->io_data + loff, lsize, NULL, NULL,
		    zio->io_priority, zio->io_flags & ZIO_FLAG_GANG_INHERIT,
		    &zio->io_bookmark));
	}

	zio_buf_free(gbh, gbufsize);
	zio_wait_children_done(zio);
}

/*
 * Pipeline stage: rewrite each gang member in place, then push the gang
 * header back as a transform (it gets rewritten too) and wait for the
 * children to become ready.
 */
static void
zio_rewrite_gang_members(zio_t *zio)
{
	zio_gbh_phys_t *gbh;
	uint64_t gsize, gbufsize, loff, lsize;
	int i;

	ASSERT(DVA_GET_GANG(ZIO_GET_DVA(zio)));
	ASSERT3U(zio->io_size, ==, SPA_GANGBLOCKSIZE);

	zio_gang_byteswap(zio);
	zio_pop_transform(zio, (void **)&gbh, &gsize, &gbufsize);

	ASSERT(gsize == gbufsize);

	for (loff = 0, i = 0; loff != zio->io_size; loff += lsize, i++) {
		blkptr_t *gbp = &gbh->zg_blkptr[i];
		lsize = BP_GET_PSIZE(gbp);

		ASSERT(BP_GET_COMPRESS(gbp) == ZIO_COMPRESS_OFF);
		ASSERT3U(lsize, ==, BP_GET_LSIZE(gbp));
		ASSERT3U(loff + lsize, <=, zio->io_size);
		ASSERT(i < SPA_GBH_NBLKPTRS);
		ASSERT(!BP_IS_HOLE(gbp));

		zio_nowait(zio_rewrite(zio, zio->io_spa, zio->io_checksum,
		    zio->io_txg, gbp, (char *)zio->io_data + loff, lsize,
		    NULL, NULL, zio->io_priority, zio->io_flags,
		    &zio->io_bookmark));
	}

	zio_push_transform(zio, gbh, gsize, gbufsize);
	zio_wait_children_ready(zio);
}

/*
 * Pipeline stage: free every non-hole member listed in the gang header.
 */
static void
zio_free_gang_members(zio_t *zio)
{
	zio_gbh_phys_t *gbh;
	uint64_t gsize, gbufsize;
	int i;

	ASSERT(DVA_GET_GANG(ZIO_GET_DVA(zio)));

	zio_gang_byteswap(zio);
	zio_pop_transform(zio, (void **)&gbh, &gsize, &gbufsize);

	for (i = 0; i < SPA_GBH_NBLKPTRS; i++) {
		blkptr_t *gbp = &gbh->zg_blkptr[i];

		if (BP_IS_HOLE(gbp))
			continue;
		zio_nowait(zio_free(zio, zio->io_spa, zio->io_txg,
		    gbp, NULL, NULL));
	}

	zio_buf_free(gbh, gbufsize);
	zio_next_stage(zio);
}

/*
 * Pipeline stage: claim every non-hole member listed in the gang header.
 */
static void
zio_claim_gang_members(zio_t *zio)
{
	zio_gbh_phys_t *gbh;
	uint64_t gsize, gbufsize;
	int i;

	ASSERT(DVA_GET_GANG(ZIO_GET_DVA(zio)));

	zio_gang_byteswap(zio);
	zio_pop_transform(zio, (void **)&gbh, &gsize, &gbufsize);

	for (i = 0; i < SPA_GBH_NBLKPTRS; i++) {
		blkptr_t *gbp = &gbh->zg_blkptr[i];
		if (BP_IS_HOLE(gbp))
			continue;
		zio_nowait(zio_claim(zio, zio->io_spa, zio->io_txg,
		    gbp, NULL, NULL));
	}

	zio_buf_free(gbh, gbufsize);
	zio_next_stage(zio);
}

/*
 * Child-done callback for gang member writes: fold the member's allocated
 * size into the parent (gang) DVA's asize under the parent's lock.
 */
static void
zio_write_allocate_gang_member_done(zio_t *zio)
{
	zio_t *pio = zio->io_parent;
	dva_t *cdva = ZIO_GET_DVA(zio);
	dva_t *pdva = ZIO_GET_DVA(pio);
	uint64_t asize;

	ASSERT(DVA_GET_GANG(pdva));

	/* XXBP - Need to be careful here with multiple DVAs */
	mutex_enter(&pio->io_lock);
	asize = DVA_GET_ASIZE(pdva);
	asize += DVA_GET_ASIZE(cdva);
	DVA_SET_ASIZE(pdva, asize);
	mutex_exit(&pio->io_lock);
}

/*
 * Allocate a gang block: allocate and zero a gang header, then carve the
 * I/O into members.  For each member, try progressively smaller allocations
 * (halving maxalloc down to SPA_MINBLOCKSIZE) until one fits; if a member
 * still can't be allocated whole, recurse via zio_write_allocate() so that
 * member becomes a gang block itself.
 */
static void
zio_write_allocate_gang_members(zio_t *zio)
{
	blkptr_t *bp = zio->io_bp;
	dva_t *dva = ZIO_GET_DVA(zio);
	zio_gbh_phys_t *gbh;
	uint64_t resid = zio->io_size;
	uint64_t maxalloc = P2ROUNDUP(zio->io_size >> 1, SPA_MINBLOCKSIZE);
	uint64_t gsize, loff, lsize;
	uint32_t gbps_left;
	int error;
	int i;

	gsize = SPA_GANGBLOCKSIZE;
	gbps_left = SPA_GBH_NBLKPTRS;

	error = metaslab_alloc(zio->io_spa, gsize, dva, zio->io_txg);
	if (error == ENOSPC)
		panic("can't allocate gang block header");
	ASSERT(error == 0);

	DVA_SET_GANG(dva, 1);

	bp->blk_birth = zio->io_txg;

	gbh = zio_buf_alloc(gsize);
	bzero(gbh, gsize);

	for (loff = 0, i = 0; loff != zio->io_size;
	    loff += lsize, resid -= lsize, gbps_left--, i++) {
		blkptr_t *gbp = &gbh->zg_blkptr[i];
		dva = &gbp->blk_dva[0];

		ASSERT(gbps_left != 0);
		maxalloc = MIN(maxalloc, resid);

		/*
		 * Shrink maxalloc until an allocation succeeds, as long as
		 * the remaining bp slots could still cover resid at this size.
		 */
		while (resid <= maxalloc * gbps_left) {
			error = metaslab_alloc(zio->io_spa, maxalloc, dva,
			    zio->io_txg);
			if (error == 0)
				break;
			ASSERT3U(error, ==, ENOSPC);
			if (maxalloc == SPA_MINBLOCKSIZE)
				panic("really out of space");
			maxalloc = P2ROUNDUP(maxalloc >> 1, SPA_MINBLOCKSIZE);
		}

		if (resid <= maxalloc * gbps_left) {
			/* Allocation succeeded: write this member directly. */
			lsize = maxalloc;
			BP_SET_LSIZE(gbp, lsize);
			BP_SET_PSIZE(gbp, lsize);
			BP_SET_COMPRESS(gbp, ZIO_COMPRESS_OFF);
			gbp->blk_birth = zio->io_txg;
			zio_nowait(zio_rewrite(zio, zio->io_spa,
			    zio->io_checksum, zio->io_txg, gbp,
			    (char *)zio->io_data + loff, lsize,
			    zio_write_allocate_gang_member_done, NULL,
			    zio->io_priority, zio->io_flags,
			    &zio->io_bookmark));
		} else {
			/* Couldn't fit: make this member a gang block too. */
			lsize = P2ROUNDUP(resid / gbps_left, SPA_MINBLOCKSIZE);
			ASSERT(lsize != SPA_MINBLOCKSIZE);
			zio_nowait(zio_write_allocate(zio, zio->io_spa,
			    zio->io_checksum, zio->io_txg, gbp,
			    (char *)zio->io_data + loff, lsize,
			    zio_write_allocate_gang_member_done, NULL,
			    zio->io_priority, zio->io_flags));
		}
	}

	ASSERT(resid == 0 && loff == zio->io_size);

	zio->io_pipeline |= 1U << ZIO_STAGE_GANG_CHECKSUM_GENERATE;

	zio_push_transform(zio, gbh, gsize, gsize);
	zio_wait_children_done(zio);
}

/*
 * ==========================================================================
 * Allocate and free blocks
 * ==========================================================================
 */

/*
 * Pipeline stage: allocate a DVA for this write.  On ENOSPC, fall back to
 * gang-block allocation; other errors are recorded in io_error.
 */
static void
zio_dva_allocate(zio_t *zio)
{
	blkptr_t *bp = zio->io_bp;
	dva_t *dva = ZIO_GET_DVA(zio);
	int error;

	ASSERT(BP_IS_HOLE(bp));

	/* For testing, make some blocks above a certain size be gang blocks */
	if (zio->io_size >= zio_gang_bang && (lbolt & 0x3) == 0) {
		zio_write_allocate_gang_members(zio);
		return;
	}

	ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp));

	error = metaslab_alloc(zio->io_spa, zio->io_size, dva, zio->io_txg);

	if (error == 0) {
		bp->blk_birth = zio->io_txg;
	} else if (error == ENOSPC) {
		if (zio->io_size == SPA_MINBLOCKSIZE)
			panic("really, truly out of space");
		zio_write_allocate_gang_members(zio);
		return;
	} else {
		zio->io_error = error;
	}
	zio_next_stage(zio);
}

/*
 * Pipeline stage: free this block's DVA and zero out the block pointer.
 */
static void
zio_dva_free(zio_t *zio)
{
	blkptr_t *bp = zio->io_bp;
	dva_t *dva = ZIO_GET_DVA(zio);

	ASSERT(!BP_IS_HOLE(bp));

	metaslab_free(zio->io_spa, dva, zio->io_txg);

	BP_ZERO(bp);

	zio_next_stage(zio);
}

/*
 * Pipeline stage: claim this block's DVA; any error is recorded in io_error.
 */
static void
zio_dva_claim(zio_t *zio)
{
	blkptr_t *bp = zio->io_bp;
	dva_t *dva = ZIO_GET_DVA(zio);

	ASSERT(!BP_IS_HOLE(bp));

	zio->io_error = metaslab_claim(zio->io_spa, dva, zio->io_txg);

	zio_next_stage(zio);
}

/*
 * Pipeline stage: translate the DVA into a (vdev, offset) pair, recording
 * ENXIO for an unknown top-level vdev and EOVERFLOW for an out-of-range
 * offset.
 */
static void
zio_dva_translate(zio_t *zio)
{
	spa_t *spa = zio->io_spa;
	dva_t *dva = ZIO_GET_DVA(zio);
	uint64_t vdev = DVA_GET_VDEV(dva);
	uint64_t offset = DVA_GET_OFFSET(dva);

	ASSERT3U(zio->io_size, ==, ZIO_GET_IOSIZE(zio));

	zio->io_offset = offset;

	if ((zio->io_vd = vdev_lookup_top(spa, vdev)) == NULL)
		zio->io_error = ENXIO;
	else if (offset + zio->io_size > zio->io_vd->vdev_asize)
		zio->io_error = EOVERFLOW;

	zio_next_stage(zio);
}

/*
 * ==========================================================================
 * Read and write to physical devices
 * ==========================================================================
 */

/*
 * Pipeline stage: set per-attempt I/O policy — fail fast on the first try
 * against a top-level vdev; for leaf vdevs, shift the offset past the vdev
 * labels and mark the I/O physical.
 */
static void
zio_vdev_io_setup(zio_t *zio)
{
	vdev_t *vd = zio->io_vd;

	/* XXPOLICY */
	if (zio->io_retries == 0 && vd == vd->vdev_top)
		zio->io_flags |= ZIO_FLAG_FAILFAST;

	if (!(zio->io_flags & ZIO_FLAG_PHYSICAL) && vd->vdev_children == 0) {
		zio->io_flags |= ZIO_FLAG_PHYSICAL;
		zio->io_offset += VDEV_LABEL_START_SIZE;
	}

	zio_next_stage(zio);
}

/*
 * Pipeline stage: hand the I/O to the vdev layer.  This stage does NOT
 * advance the pipeline itself; the I/O completion path does that.
 */
static void
zio_vdev_io_start(zio_t *zio)
{
	blkptr_t *bp = zio->io_bp;

	/* Offset and size must be aligned to the vdev's allocation shift. */
	ASSERT(P2PHASE(zio->io_offset, 1ULL << zio->io_vd->vdev_ashift) == 0);
	ASSERT(P2PHASE(zio->io_size, 1ULL << zio->io_vd->vdev_ashift) == 0);
	ASSERT(bp == NULL || ZIO_GET_IOSIZE(zio) == zio->io_size);
	ASSERT(zio->io_type != ZIO_TYPE_WRITE || (spa_mode & FWRITE));

	vdev_io_start(zio);

	/* zio_next_stage_async() gets called from io completion interrupt */
}

/* Pipeline stage: let the vdev layer finish up its part of the I/O. */
static void
zio_vdev_io_done(zio_t *zio)
{
	vdev_io_done(zio);
}

/*
 * Retry policy: retry a failed I/O only if it was issued directly to a
 * top-level vdev, wasn't delegated, isn't flagged DONT_RETRY, and hasn't
 * been retried before (at most one retry).
 */
/* XXPOLICY */
boolean_t
zio_should_retry(zio_t *zio)
{
	vdev_t *vd = zio->io_vd;

	if (zio->io_error == 0)
		return (B_FALSE);
	if (zio->io_delegate_list != NULL)
		return (B_FALSE);
	if (vd != vd->vdev_top)
		return (B_FALSE);
	if (zio->io_flags & ZIO_FLAG_DONT_RETRY)
		return (B_FALSE);
	if (zio->io_retries > 0)
		return (B_FALSE);

	return (B_TRUE);
}

/*
 * Pipeline stage: assess the result of the vdev I/O.  Applies fault
 * injection, retries the I/O if policy allows (rewinding the stage counter
 * to re-enter zio_vdev_io_setup()), and otherwise requests an asynchronous
 * vdev reopen for persistent non-checksum errors.
 */
static void
zio_vdev_io_assess(zio_t *zio)
{
	vdev_t *vd = zio->io_vd;
	vdev_t *tvd = vd->vdev_top;

	ASSERT(zio->io_vsd == NULL);

	if (zio_injection_enabled && !zio->io_error)
		zio->io_error = zio_handle_fault_injection(zio, EIO);

	/*
	 * If the I/O failed, determine whether we should attempt to retry it.
	 */
	/* XXPOLICY */
	if (zio_should_retry(zio)) {
		ASSERT(tvd == vd);
		ASSERT(!(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE));

		zio->io_retries++;
		zio->io_error = 0;
		zio->io_flags &= ZIO_FLAG_VDEV_INHERIT;
		/* XXPOLICY */
		zio->io_flags &= ~ZIO_FLAG_FAILFAST;
		zio->io_flags |= ZIO_FLAG_DONT_CACHE;
		/* Rewind so the next stage is ZIO_STAGE_VDEV_IO_SETUP. */
		zio->io_stage = ZIO_STAGE_VDEV_IO_SETUP - 1;

		dprintf("retry #%d for %s to %s offset %llx\n",
		    zio->io_retries, zio_type_name[zio->io_type],
		    vdev_description(vd), zio->io_offset);

		zio_next_stage_async(zio);
		return;
	}

	if (zio->io_error != 0 && !(zio->io_flags & ZIO_FLAG_SPECULATIVE) &&
	    zio->io_error != ECKSUM) {
		/*
		 * Poor man's hotplug support.  Even if we're done retrying this
		 * I/O, try to reopen the vdev to see if it's still attached.
		 * To avoid excessive thrashing, we only try it once a minute.
		 * This also has the effect of detecting when missing devices
		 * have come back, by polling the device once a minute.
		 *
		 * We need to do this asynchronously because we can't grab
		 * all the necessary locks way down here.
		 */
		if (gethrtime() - vd->vdev_last_try > 60ULL * NANOSEC) {
			vd->vdev_last_try = gethrtime();
			tvd->vdev_reopen_wanted = 1;
			spa_async_request(vd->vdev_spa, SPA_ASYNC_REOPEN);
		}
	}

	zio_next_stage(zio);
}

/*
 * Rewind the pipeline so the vdev-start stage runs again.  Intended for
 * callers sitting in the VDEV_IO_START stage (e.g. vdev implementations).
 */
void
zio_vdev_io_reissue(zio_t *zio)
{
	ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START);
	ASSERT(zio->io_error == 0);

	zio->io_stage--;
}

/* Rewind the pipeline so the vdev-done stage runs again. */
void
zio_vdev_io_redone(zio_t *zio)
{
	ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_DONE);

	zio->io_stage--;
}

/*
 * Skip the remaining vdev I/O stages, jumping straight to assessment,
 * and mark the I/O as bypassed.
 */
void
zio_vdev_io_bypass(zio_t *zio)
{
	ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START);
	ASSERT(zio->io_error == 0);

	zio->io_flags |= ZIO_FLAG_IO_BYPASS;
	zio->io_stage = ZIO_STAGE_VDEV_IO_ASSESS - 1;
}

/*
 * ==========================================================================
 * Generate and verify checksums
 * ==========================================================================
 */

/*
 * Pipeline stage: record the checksum function and byte order in the bp and
 * compute the checksum of io_data into the bp's embedded cksum.
 */
static void
zio_checksum_generate(zio_t *zio)
{
	int checksum = zio->io_checksum;
	blkptr_t *bp = zio->io_bp;

	ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp));

	BP_SET_CHECKSUM(bp, checksum);
	BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);

	zio_checksum(checksum, &bp->blk_cksum, zio->io_data, zio->io_size);

	zio_next_stage(zio);
}

/*
 * Pipeline stage: checksum a gang header.  The verifier is seeded into the
 * header's tail first.
 * NOTE(review): 'zc' looks like scratch output — presumably the
 * ZIO_CHECKSUM_GANG_HEADER path embeds the checksum in the header's zbt
 * tail; confirm against zio_checksum().
 */
static void
zio_gang_checksum_generate(zio_t *zio)
{
	zio_cksum_t zc;
	zio_gbh_phys_t *gbh = zio->io_data;

	ASSERT3U(zio->io_size, ==, SPA_GANGBLOCKSIZE);
	ASSERT(DVA_GET_GANG(ZIO_GET_DVA(zio)));

	zio_set_gang_verifier(zio, &gbh->zg_tail.zbt_cksum);

	zio_checksum(ZIO_CHECKSUM_GANG_HEADER, &zc, zio->io_data, zio->io_size);

	zio_next_stage(zio);
}

/*
 * Pipeline stage: verify the read data's checksum; on mismatch, record the
 * error and post a checksum ereport unless the I/O was speculative.
 */
static void
zio_checksum_verify(zio_t *zio)
{
	if (zio->io_bp != NULL) {
		zio->io_error = zio_checksum_error(zio);
		if (zio->io_error && !(zio->io_flags & ZIO_FLAG_SPECULATIVE))
			zfs_ereport_post(FM_EREPORT_ZFS_CHECKSUM,
			    zio->io_spa, zio->io_vd, zio, 0, 0);
	}

	zio_next_stage(zio);
}

/*
 * Called by RAID-Z to ensure we don't compute the checksum twice.
 */
void
zio_checksum_verified(zio_t *zio)
{
	/* Drop the verify stage from the remaining pipeline. */
	zio->io_pipeline &= ~(1U << ZIO_STAGE_CHECKSUM_VERIFY);
}

/*
 * Set the external verifier for a gang block based on stuff in the bp
 */
void
zio_set_gang_verifier(zio_t *zio, zio_cksum_t *zcp)
{
	zcp->zc_word[0] = DVA_GET_VDEV(ZIO_GET_DVA(zio));
	zcp->zc_word[1] = DVA_GET_OFFSET(ZIO_GET_DVA(zio));
	zcp->zc_word[2] = zio->io_bp->blk_birth;
	zcp->zc_word[3] = 0;
}

/*
 * ==========================================================================
 * Define the pipeline
 * ==========================================================================
 */
typedef void zio_pipe_stage_t(zio_t *zio);

/* Sentinel for pipeline slots that must never execute. */
static void
zio_badop(zio_t *zio)
{
	panic("Invalid I/O pipeline stage %u for zio %p", zio->io_stage, zio);
}

/*
 * Stage table, indexed by io_stage.  The order here must match the
 * ZIO_STAGE_* enumeration; zio_badop guards both ends.
 */
zio_pipe_stage_t *zio_pipeline[ZIO_STAGE_DONE + 2] = {
	zio_badop,
	zio_wait_children_ready,
	zio_write_compress,
	zio_checksum_generate,
	zio_gang_pipeline,
	zio_get_gang_header,
	zio_rewrite_gang_members,
	zio_free_gang_members,
	zio_claim_gang_members,
	zio_dva_allocate,
	zio_dva_free,
	zio_dva_claim,
	zio_gang_checksum_generate,
	zio_ready,
	zio_dva_translate,
	zio_vdev_io_setup,
	zio_vdev_io_start,
	zio_vdev_io_done,
	zio_vdev_io_assess,
	zio_wait_children_done,
	zio_checksum_verify,
	zio_read_gang_members,
	zio_read_decompress,
	zio_done,
	zio_badop
};

/*
 * Move an I/O to the next stage of the pipeline and execute that stage.
 * There's no locking on io_stage because there's no legitimate way for
 * multiple threads to be attempting to process the same I/O.
 */
void
zio_next_stage(zio_t *zio)
{
	uint32_t pipeline = zio->io_pipeline;

	ASSERT(!MUTEX_HELD(&zio->io_lock));

	if (zio->io_error) {
		dprintf("zio %p vdev %s offset %llx stage %d error %d\n",
		    zio, vdev_description(zio->io_vd),
		    zio->io_offset, zio->io_stage, zio->io_error);
		/* On error, skip to the error pipeline (unless in vdev I/O). */
		if (((1U << zio->io_stage) & ZIO_VDEV_IO_PIPELINE) == 0)
			pipeline &= ZIO_ERROR_PIPELINE_MASK;
	}

	/* Advance io_stage to the next stage whose bit is set. */
	while (((1U << ++zio->io_stage) & pipeline) == 0)
		continue;

	ASSERT(zio->io_stage <= ZIO_STAGE_DONE);
	ASSERT(zio->io_stalled == 0);

	zio_pipeline[zio->io_stage](zio);
}

void
zio_next_stage_async(zio_t *zio)
{
	taskq_t *tq;
	uint32_t pipeline = zio->io_pipeline;
1575fa9e4066Sahrens 1576fa9e4066Sahrens ASSERT(!MUTEX_HELD(&zio->io_lock)); 1577fa9e4066Sahrens 1578fa9e4066Sahrens if (zio->io_error) { 1579fa9e4066Sahrens dprintf("zio %p vdev %s offset %llx stage %d error %d\n", 1580fa9e4066Sahrens zio, vdev_description(zio->io_vd), 1581fa9e4066Sahrens zio->io_offset, zio->io_stage, zio->io_error); 1582fa9e4066Sahrens if (((1U << zio->io_stage) & ZIO_VDEV_IO_PIPELINE) == 0) 1583fa9e4066Sahrens pipeline &= ZIO_ERROR_PIPELINE_MASK; 1584fa9e4066Sahrens } 1585fa9e4066Sahrens 1586fa9e4066Sahrens while (((1U << ++zio->io_stage) & pipeline) == 0) 1587fa9e4066Sahrens continue; 1588fa9e4066Sahrens 1589fa9e4066Sahrens ASSERT(zio->io_stage <= ZIO_STAGE_DONE); 1590fa9e4066Sahrens ASSERT(zio->io_stalled == 0); 1591fa9e4066Sahrens 1592fa9e4066Sahrens /* 1593fa9e4066Sahrens * For performance, we'll probably want two sets of task queues: 1594fa9e4066Sahrens * per-CPU issue taskqs and per-CPU completion taskqs. The per-CPU 1595fa9e4066Sahrens * part is for read performance: since we have to make a pass over 1596fa9e4066Sahrens * the data to checksum it anyway, we want to do this on the same CPU 1597fa9e4066Sahrens * that issued the read, because (assuming CPU scheduling affinity) 1598fa9e4066Sahrens * that thread is probably still there. Getting this optimization 1599fa9e4066Sahrens * right avoids performance-hostile cache-to-cache transfers. 1600fa9e4066Sahrens * 1601fa9e4066Sahrens * Note that having two sets of task queues is also necessary for 1602fa9e4066Sahrens * correctness: if all of the issue threads get bogged down waiting 1603fa9e4066Sahrens * for dependent reads (e.g. metaslab freelist) to complete, then 1604fa9e4066Sahrens * there won't be any threads available to service I/O completion 1605fa9e4066Sahrens * interrupts. 
1606fa9e4066Sahrens */ 1607fa9e4066Sahrens if ((1U << zio->io_stage) & zio->io_async_stages) { 1608fa9e4066Sahrens if (zio->io_stage < ZIO_STAGE_VDEV_IO_DONE) 1609fa9e4066Sahrens tq = zio->io_spa->spa_zio_issue_taskq[zio->io_type]; 1610fa9e4066Sahrens else 1611fa9e4066Sahrens tq = zio->io_spa->spa_zio_intr_taskq[zio->io_type]; 1612fa9e4066Sahrens (void) taskq_dispatch(tq, 1613fa9e4066Sahrens (task_func_t *)zio_pipeline[zio->io_stage], zio, TQ_SLEEP); 1614fa9e4066Sahrens } else { 1615fa9e4066Sahrens zio_pipeline[zio->io_stage](zio); 1616fa9e4066Sahrens } 1617fa9e4066Sahrens } 1618fa9e4066Sahrens 1619fa9e4066Sahrens /* 1620fa9e4066Sahrens * Try to allocate an intent log block. Return 0 on success, errno on failure. 1621fa9e4066Sahrens */ 1622fa9e4066Sahrens int 1623fa9e4066Sahrens zio_alloc_blk(spa_t *spa, int checksum, uint64_t size, blkptr_t *bp, 1624fa9e4066Sahrens uint64_t txg) 1625fa9e4066Sahrens { 1626fa9e4066Sahrens int error; 1627fa9e4066Sahrens 1628*ea8dc4b6Seschrock spa_config_enter(spa, RW_READER, FTAG); 1629fa9e4066Sahrens 1630fa9e4066Sahrens BP_ZERO(bp); 1631fa9e4066Sahrens 1632fa9e4066Sahrens error = metaslab_alloc(spa, size, BP_IDENTITY(bp), txg); 1633fa9e4066Sahrens 1634fa9e4066Sahrens if (error == 0) { 1635fa9e4066Sahrens BP_SET_CHECKSUM(bp, checksum); 1636fa9e4066Sahrens BP_SET_LSIZE(bp, size); 1637fa9e4066Sahrens BP_SET_PSIZE(bp, size); 1638fa9e4066Sahrens BP_SET_COMPRESS(bp, ZIO_COMPRESS_OFF); 1639fa9e4066Sahrens BP_SET_TYPE(bp, DMU_OT_INTENT_LOG); 1640fa9e4066Sahrens BP_SET_LEVEL(bp, 0); 1641fa9e4066Sahrens BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER); 1642fa9e4066Sahrens bp->blk_birth = txg; 1643fa9e4066Sahrens } 1644fa9e4066Sahrens 1645*ea8dc4b6Seschrock spa_config_exit(spa, FTAG); 1646fa9e4066Sahrens 1647fa9e4066Sahrens return (error); 1648fa9e4066Sahrens } 1649fa9e4066Sahrens 1650fa9e4066Sahrens /* 1651fa9e4066Sahrens * Free an intent log block. 
We know it can't be a gang block, so there's 1652fa9e4066Sahrens * nothing to do except metaslab_free() it. 1653fa9e4066Sahrens */ 1654fa9e4066Sahrens void 1655fa9e4066Sahrens zio_free_blk(spa_t *spa, blkptr_t *bp, uint64_t txg) 1656fa9e4066Sahrens { 1657fa9e4066Sahrens ASSERT(DVA_GET_GANG(BP_IDENTITY(bp)) == 0); 1658fa9e4066Sahrens 1659fa9e4066Sahrens dprintf_bp(bp, "txg %llu: ", txg); 1660fa9e4066Sahrens 1661*ea8dc4b6Seschrock spa_config_enter(spa, RW_READER, FTAG); 1662fa9e4066Sahrens 1663fa9e4066Sahrens metaslab_free(spa, BP_IDENTITY(bp), txg); 1664fa9e4066Sahrens 1665*ea8dc4b6Seschrock spa_config_exit(spa, FTAG); 1666fa9e4066Sahrens } 1667