1*fa9e4066Sahrens /* 2*fa9e4066Sahrens * CDDL HEADER START 3*fa9e4066Sahrens * 4*fa9e4066Sahrens * The contents of this file are subject to the terms of the 5*fa9e4066Sahrens * Common Development and Distribution License, Version 1.0 only 6*fa9e4066Sahrens * (the "License"). You may not use this file except in compliance 7*fa9e4066Sahrens * with the License. 8*fa9e4066Sahrens * 9*fa9e4066Sahrens * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 10*fa9e4066Sahrens * or http://www.opensolaris.org/os/licensing. 11*fa9e4066Sahrens * See the License for the specific language governing permissions 12*fa9e4066Sahrens * and limitations under the License. 13*fa9e4066Sahrens * 14*fa9e4066Sahrens * When distributing Covered Code, include this CDDL HEADER in each 15*fa9e4066Sahrens * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 16*fa9e4066Sahrens * If applicable, add the following below this CDDL HEADER, with the 17*fa9e4066Sahrens * fields enclosed by brackets "[]" replaced with your own identifying 18*fa9e4066Sahrens * information: Portions Copyright [yyyy] [name of copyright owner] 19*fa9e4066Sahrens * 20*fa9e4066Sahrens * CDDL HEADER END 21*fa9e4066Sahrens */ 22*fa9e4066Sahrens /* 23*fa9e4066Sahrens * Copyright 2005 Sun Microsystems, Inc. All rights reserved. 24*fa9e4066Sahrens * Use is subject to license terms. 
25*fa9e4066Sahrens */ 26*fa9e4066Sahrens 27*fa9e4066Sahrens #pragma ident "%Z%%M% %I% %E% SMI" 28*fa9e4066Sahrens 29*fa9e4066Sahrens #include <sys/zfs_context.h> 30*fa9e4066Sahrens #include <sys/spa.h> 31*fa9e4066Sahrens #include <sys/txg.h> 32*fa9e4066Sahrens #include <sys/spa_impl.h> 33*fa9e4066Sahrens #include <sys/vdev_impl.h> 34*fa9e4066Sahrens #include <sys/zio_impl.h> 35*fa9e4066Sahrens #include <sys/zio_compress.h> 36*fa9e4066Sahrens #include <sys/zio_checksum.h> 37*fa9e4066Sahrens 38*fa9e4066Sahrens static void zio_vdev_io_enter(zio_t *zio); 39*fa9e4066Sahrens static void zio_vdev_io_exit(zio_t *zio); 40*fa9e4066Sahrens 41*fa9e4066Sahrens /* 42*fa9e4066Sahrens * ========================================================================== 43*fa9e4066Sahrens * I/O priority table 44*fa9e4066Sahrens * ========================================================================== 45*fa9e4066Sahrens */ 46*fa9e4066Sahrens uint8_t zio_priority_table[ZIO_PRIORITY_TABLE_SIZE] = { 47*fa9e4066Sahrens 0, /* ZIO_PRIORITY_NOW */ 48*fa9e4066Sahrens 0, /* ZIO_PRIORITY_SYNC_READ */ 49*fa9e4066Sahrens 0, /* ZIO_PRIORITY_SYNC_WRITE */ 50*fa9e4066Sahrens 6, /* ZIO_PRIORITY_ASYNC_READ */ 51*fa9e4066Sahrens 4, /* ZIO_PRIORITY_ASYNC_WRITE */ 52*fa9e4066Sahrens 4, /* ZIO_PRIORITY_FREE */ 53*fa9e4066Sahrens 0, /* ZIO_PRIORITY_CACHE_FILL */ 54*fa9e4066Sahrens 0, /* ZIO_PRIORITY_LOG_WRITE */ 55*fa9e4066Sahrens 10, /* ZIO_PRIORITY_RESILVER */ 56*fa9e4066Sahrens 20, /* ZIO_PRIORITY_SCRUB */ 57*fa9e4066Sahrens }; 58*fa9e4066Sahrens 59*fa9e4066Sahrens /* 60*fa9e4066Sahrens * ========================================================================== 61*fa9e4066Sahrens * I/O type descriptions 62*fa9e4066Sahrens * ========================================================================== 63*fa9e4066Sahrens */ 64*fa9e4066Sahrens char *zio_type_name[ZIO_TYPES] = { 65*fa9e4066Sahrens "null", "read", "write", "free", "claim", "ioctl" }; 66*fa9e4066Sahrens 67*fa9e4066Sahrens /* At or above this size, 
force gang blocking - for testing */ 68*fa9e4066Sahrens uint64_t zio_gang_bang = SPA_MAXBLOCKSIZE + 1; 69*fa9e4066Sahrens 70*fa9e4066Sahrens typedef struct zio_sync_pass { 71*fa9e4066Sahrens int zp_defer_free; /* defer frees after this pass */ 72*fa9e4066Sahrens int zp_dontcompress; /* don't compress after this pass */ 73*fa9e4066Sahrens int zp_rewrite; /* rewrite new bps after this pass */ 74*fa9e4066Sahrens } zio_sync_pass_t; 75*fa9e4066Sahrens 76*fa9e4066Sahrens zio_sync_pass_t zio_sync_pass = { 77*fa9e4066Sahrens 1, /* zp_defer_free */ 78*fa9e4066Sahrens 4, /* zp_dontcompress */ 79*fa9e4066Sahrens 1, /* zp_rewrite */ 80*fa9e4066Sahrens }; 81*fa9e4066Sahrens 82*fa9e4066Sahrens /* 83*fa9e4066Sahrens * ========================================================================== 84*fa9e4066Sahrens * I/O kmem caches 85*fa9e4066Sahrens * ========================================================================== 86*fa9e4066Sahrens */ 87*fa9e4066Sahrens kmem_cache_t *zio_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT]; 88*fa9e4066Sahrens 89*fa9e4066Sahrens void 90*fa9e4066Sahrens zio_init(void) 91*fa9e4066Sahrens { 92*fa9e4066Sahrens size_t c; 93*fa9e4066Sahrens 94*fa9e4066Sahrens /* 95*fa9e4066Sahrens * For small buffers, we want a cache for each multiple of 96*fa9e4066Sahrens * SPA_MINBLOCKSIZE. For medium-size buffers, we want a cache 97*fa9e4066Sahrens * for each quarter-power of 2. For large buffers, we want 98*fa9e4066Sahrens * a cache for each multiple of PAGESIZE. 
99*fa9e4066Sahrens */ 100*fa9e4066Sahrens for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) { 101*fa9e4066Sahrens size_t size = (c + 1) << SPA_MINBLOCKSHIFT; 102*fa9e4066Sahrens size_t p2 = size; 103*fa9e4066Sahrens size_t align = 0; 104*fa9e4066Sahrens 105*fa9e4066Sahrens while (p2 & (p2 - 1)) 106*fa9e4066Sahrens p2 &= p2 - 1; 107*fa9e4066Sahrens 108*fa9e4066Sahrens if (size <= 4 * SPA_MINBLOCKSIZE) { 109*fa9e4066Sahrens align = SPA_MINBLOCKSIZE; 110*fa9e4066Sahrens } else if (P2PHASE(size, PAGESIZE) == 0) { 111*fa9e4066Sahrens align = PAGESIZE; 112*fa9e4066Sahrens } else if (P2PHASE(size, p2 >> 2) == 0) { 113*fa9e4066Sahrens align = p2 >> 2; 114*fa9e4066Sahrens } 115*fa9e4066Sahrens 116*fa9e4066Sahrens if (align != 0) { 117*fa9e4066Sahrens char name[30]; 118*fa9e4066Sahrens (void) sprintf(name, "zio_buf_%lu", size); 119*fa9e4066Sahrens zio_buf_cache[c] = kmem_cache_create(name, size, 120*fa9e4066Sahrens align, NULL, NULL, NULL, NULL, NULL, 0); 121*fa9e4066Sahrens dprintf("creating cache for size %5lx align %5lx\n", 122*fa9e4066Sahrens size, align); 123*fa9e4066Sahrens } 124*fa9e4066Sahrens } 125*fa9e4066Sahrens 126*fa9e4066Sahrens while (--c != 0) { 127*fa9e4066Sahrens ASSERT(zio_buf_cache[c] != NULL); 128*fa9e4066Sahrens if (zio_buf_cache[c - 1] == NULL) 129*fa9e4066Sahrens zio_buf_cache[c - 1] = zio_buf_cache[c]; 130*fa9e4066Sahrens } 131*fa9e4066Sahrens } 132*fa9e4066Sahrens 133*fa9e4066Sahrens void 134*fa9e4066Sahrens zio_fini(void) 135*fa9e4066Sahrens { 136*fa9e4066Sahrens size_t c; 137*fa9e4066Sahrens kmem_cache_t *last_cache = NULL; 138*fa9e4066Sahrens 139*fa9e4066Sahrens for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) { 140*fa9e4066Sahrens if (zio_buf_cache[c] != last_cache) { 141*fa9e4066Sahrens last_cache = zio_buf_cache[c]; 142*fa9e4066Sahrens kmem_cache_destroy(zio_buf_cache[c]); 143*fa9e4066Sahrens } 144*fa9e4066Sahrens zio_buf_cache[c] = NULL; 145*fa9e4066Sahrens } 146*fa9e4066Sahrens } 147*fa9e4066Sahrens 148*fa9e4066Sahrens 
/* 149*fa9e4066Sahrens * ========================================================================== 150*fa9e4066Sahrens * Allocate and free I/O buffers 151*fa9e4066Sahrens * ========================================================================== 152*fa9e4066Sahrens */ 153*fa9e4066Sahrens void * 154*fa9e4066Sahrens zio_buf_alloc(size_t size) 155*fa9e4066Sahrens { 156*fa9e4066Sahrens size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; 157*fa9e4066Sahrens 158*fa9e4066Sahrens ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); 159*fa9e4066Sahrens 160*fa9e4066Sahrens return (kmem_cache_alloc(zio_buf_cache[c], KM_SLEEP)); 161*fa9e4066Sahrens } 162*fa9e4066Sahrens 163*fa9e4066Sahrens void 164*fa9e4066Sahrens zio_buf_free(void *buf, size_t size) 165*fa9e4066Sahrens { 166*fa9e4066Sahrens size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; 167*fa9e4066Sahrens 168*fa9e4066Sahrens ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); 169*fa9e4066Sahrens 170*fa9e4066Sahrens kmem_cache_free(zio_buf_cache[c], buf); 171*fa9e4066Sahrens } 172*fa9e4066Sahrens 173*fa9e4066Sahrens /* 174*fa9e4066Sahrens * ========================================================================== 175*fa9e4066Sahrens * Push and pop I/O transform buffers 176*fa9e4066Sahrens * ========================================================================== 177*fa9e4066Sahrens */ 178*fa9e4066Sahrens static void 179*fa9e4066Sahrens zio_push_transform(zio_t *zio, void *data, uint64_t size, uint64_t bufsize) 180*fa9e4066Sahrens { 181*fa9e4066Sahrens zio_transform_t *zt = kmem_alloc(sizeof (zio_transform_t), KM_SLEEP); 182*fa9e4066Sahrens 183*fa9e4066Sahrens zt->zt_data = data; 184*fa9e4066Sahrens zt->zt_size = size; 185*fa9e4066Sahrens zt->zt_bufsize = bufsize; 186*fa9e4066Sahrens 187*fa9e4066Sahrens zt->zt_next = zio->io_transform_stack; 188*fa9e4066Sahrens zio->io_transform_stack = zt; 189*fa9e4066Sahrens 190*fa9e4066Sahrens zio->io_data = data; 191*fa9e4066Sahrens zio->io_size = size; 192*fa9e4066Sahrens } 193*fa9e4066Sahrens 
/*
 * Pop the top transform frame, returning its data/size/bufsize to the
 * caller (who owns freeing the buffer).  The next frame down, if any,
 * becomes the zio's active buffer.
 */
static void
zio_pop_transform(zio_t *zio, void **data, uint64_t *size, uint64_t *bufsize)
{
	zio_transform_t *zt = zio->io_transform_stack;

	*data = zt->zt_data;
	*size = zt->zt_size;
	*bufsize = zt->zt_bufsize;

	zio->io_transform_stack = zt->zt_next;
	kmem_free(zt, sizeof (zio_transform_t));

	if ((zt = zio->io_transform_stack) != NULL) {
		zio->io_data = zt->zt_data;
		zio->io_size = zt->zt_size;
	}
}

/*
 * Unwind the entire transform stack.  The bottom-most buffer (the
 * caller's original data, pushed by zio_create()) is popped but NOT
 * freed; every buffer above it was allocated by the pipeline and is
 * freed here.
 */
static void
zio_clear_transform_stack(zio_t *zio)
{
	void *data;
	uint64_t size, bufsize;

	ASSERT(zio->io_transform_stack != NULL);

	zio_pop_transform(zio, &data, &size, &bufsize);
	while (zio->io_transform_stack != NULL) {
		zio_buf_free(data, bufsize);
		zio_pop_transform(zio, &data, &size, &bufsize);
	}
}

/*
 * ==========================================================================
 * Create the various types of I/O (read, write, free)
 * ==========================================================================
 */

/*
 * Common zio constructor.  Allocates and initializes a zio_t, pushes the
 * caller's data buffer as the base transform frame, and links the new zio
 * into its parent's child list (or takes the spa config lock if it is a
 * root zio).  'stage'/'pipeline' select where the zio starts and which
 * pipeline stages it will execute.
 */
static zio_t *
zio_create(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
    void *data, uint64_t size, zio_done_func_t *done, void *private,
    zio_type_t type, int priority, int flags, uint8_t stage, uint32_t pipeline)
{
	zio_t *zio;

	ASSERT3U(size, <=, SPA_MAXBLOCKSIZE);
	ASSERT(P2PHASE(size, SPA_MINBLOCKSIZE) == 0);

	zio = kmem_zalloc(sizeof (zio_t), KM_SLEEP);
	zio->io_parent = pio;
	zio->io_spa = spa;
	zio->io_txg = txg;
	if (bp != NULL) {
		zio->io_bp = bp;
		zio->io_bp_copy = *bp;
		zio->io_bp_orig = *bp;
		/* XXBP - Need to inherit this when it matters */
		zio->io_dva_index = 0;
	}
	zio->io_done = done;
	zio->io_private = private;
	zio->io_type = type;
	zio->io_priority = priority;
	zio->io_stage = stage;
	zio->io_pipeline = pipeline;
	zio->io_async_stages = ZIO_ASYNC_PIPELINE_STAGES;
	zio->io_timestamp = lbolt64;
	zio->io_flags = flags;
	zio_push_transform(zio, data, size, size);

	if (pio == NULL) {
		/*
		 * Root zio: hold the spa config lock for the life of the
		 * I/O tree unless the caller already holds it.
		 */
		if (!(flags & ZIO_FLAG_CONFIG_HELD))
			spa_config_enter(zio->io_spa, RW_READER);
		zio->io_root = zio;
	} else {
		zio->io_root = pio->io_root;

		/*
		 * Link into the parent's child list and bump the parent's
		 * outstanding-children counts under the parent's lock.
		 */
		mutex_enter(&pio->io_lock);
		if (stage < ZIO_STAGE_READY)
			pio->io_children_notready++;
		pio->io_children_notdone++;
		zio->io_sibling_next = pio->io_child;
		zio->io_sibling_prev = NULL;
		if (pio->io_child != NULL)
			pio->io_child->io_sibling_prev = zio;
		pio->io_child = zio;
		mutex_exit(&pio->io_lock);
	}

	return (zio);
}

/*
 * Create a no-op zio: no data, no bp.  Used as an interlock point that
 * simply waits for its children (see ZIO_WAIT_FOR_CHILDREN_PIPELINE).
 */
zio_t *
zio_null(zio_t *pio, spa_t *spa, zio_done_func_t *done, void *private,
    int flags)
{
	zio_t *zio;

	zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private,
	    ZIO_TYPE_NULL, ZIO_PRIORITY_NOW, flags, ZIO_STAGE_OPEN,
	    ZIO_WAIT_FOR_CHILDREN_PIPELINE);

	return (zio);
}

/*
 * Create a parentless null zio to serve as the root of an I/O tree.
 */
zio_t *
zio_root(spa_t *spa, zio_done_func_t *done, void *private, int flags)
{
	return (zio_null(NULL, spa, done, private, flags));
}

/*
 * Create a logical read of block pointer 'bp' into 'data'.  Adds the
 * decompress and/or gang-assembly stages (with staging buffers) to the
 * pipeline as required by the bp.
 */
zio_t *
zio_read(zio_t *pio, spa_t *spa, blkptr_t *bp, void *data,
    uint64_t size, zio_done_func_t *done, void *private,
    int priority, int flags)
{
	zio_t *zio;
	dva_t *dva;

	ASSERT3U(size, ==, BP_GET_LSIZE(bp));

	zio = zio_create(pio, spa, bp->blk_birth, bp, data, size, done, private,
	    ZIO_TYPE_READ, priority, flags, ZIO_STAGE_OPEN, ZIO_READ_PIPELINE);

	/*
	 * Work off our copy of the bp so the caller can free it.
	 */
	zio->io_bp = &zio->io_bp_copy;

	bp = zio->io_bp;
	dva = ZIO_GET_DVA(zio);

	if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF) {
		/* Read the physical (compressed) size, decompress later. */
		uint64_t csize = BP_GET_PSIZE(bp);
		void *cbuf = zio_buf_alloc(csize);

		zio_push_transform(zio, cbuf, csize, csize);
		zio->io_pipeline |= 1U << ZIO_STAGE_READ_DECOMPRESS;
	}

	if (DVA_GET_GANG(dva)) {
		/* Gang block: read the gang header first, then members. */
		uint64_t gsize = SPA_GANGBLOCKSIZE;
		void *gbuf = zio_buf_alloc(gsize);

		zio_push_transform(zio, gbuf, gsize, gsize);
		zio->io_pipeline |= 1U << ZIO_STAGE_READ_GANG_MEMBERS;
	}

	return (zio);
}

/*
 * Create a write of 'data', allocating a new block in transaction 'txg'.
 * 'checksum' and 'compress' select the checksum/compression functions;
 * compression, if enabled, runs as an asynchronous pipeline stage.
 */
zio_t *
zio_write(zio_t *pio, spa_t *spa, int checksum, int compress,
    uint64_t txg, blkptr_t *bp, void *data, uint64_t size,
    zio_done_func_t *done, void *private, int priority, int flags)
{
	zio_t *zio;

	ASSERT(checksum >= ZIO_CHECKSUM_OFF &&
	    checksum < ZIO_CHECKSUM_FUNCTIONS);

	ASSERT(compress >= ZIO_COMPRESS_OFF &&
	    compress < ZIO_COMPRESS_FUNCTIONS);

	zio = zio_create(pio, spa, txg, bp, data, size, done, private,
	    ZIO_TYPE_WRITE, priority, flags,
	    ZIO_STAGE_OPEN, ZIO_WRITE_PIPELINE);

	zio->io_checksum = checksum;
	zio->io_compress = compress;

	if (compress != ZIO_COMPRESS_OFF)
		zio->io_async_stages |= 1U << ZIO_STAGE_WRITE_COMPRESS;

	if (bp->blk_birth != txg) {
		/* XXX the bp usually (always?) gets re-zeroed later */
		BP_ZERO(bp);
		BP_SET_LSIZE(bp, size);
		BP_SET_PSIZE(bp, size);
	}

	return (zio);
}

/*
 * Create a rewrite of an already-allocated block: the bp's existing DVAs
 * are reused, so there is no allocation stage and no compression.
 */
zio_t *
zio_rewrite(zio_t *pio, spa_t *spa, int checksum,
    uint64_t txg, blkptr_t *bp, void *data, uint64_t size,
    zio_done_func_t *done, void *private, int priority, int flags)
{
	zio_t *zio;

	/* XXBP - We need to re-evaluate when to insert pipeline stages */
	zio = zio_create(pio, spa, txg, bp, data, size, done, private,
	    ZIO_TYPE_WRITE, priority, flags,
	    ZIO_STAGE_OPEN, ZIO_REWRITE_PIPELINE);

	zio->io_checksum = checksum;
	zio->io_compress = ZIO_COMPRESS_OFF;

	return (zio);
}

/*
 * Create an uncompressed write that performs its own block allocation
 * (ZIO_WRITE_ALLOCATE_PIPELINE).  The bp is reinitialized here since a
 * fresh block will be allocated for it.
 */
static zio_t *
zio_write_allocate(zio_t *pio, spa_t *spa, int checksum,
    uint64_t txg, blkptr_t *bp, void *data, uint64_t size,
    zio_done_func_t *done, void *private, int priority, int flags)
{
	zio_t *zio;

	BP_ZERO(bp);
	BP_SET_LSIZE(bp, size);
	BP_SET_PSIZE(bp, size);
	BP_SET_COMPRESS(bp, ZIO_COMPRESS_OFF);

	zio = zio_create(pio, spa, txg, bp, data, size, done, private,
	    ZIO_TYPE_WRITE, priority, flags,
	    ZIO_STAGE_OPEN, ZIO_WRITE_ALLOCATE_PIPELINE);

	zio->io_checksum = checksum;
	zio->io_compress = ZIO_COMPRESS_OFF;

	return (zio);
}

/*
 * Create a free of block pointer 'bp' in transaction 'txg'.  If we are
 * past the defer-free sync pass of the syncing txg, the free is queued
 * on the spa's deferred bplist instead and a null zio is returned.
 */
zio_t *
zio_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
    zio_done_func_t *done, void *private)
{
	zio_t *zio;

	ASSERT(!BP_IS_HOLE(bp));

	if (txg == spa->spa_syncing_txg &&
	    spa->spa_sync_pass > zio_sync_pass.zp_defer_free) {
		bplist_enqueue_deferred(&spa->spa_sync_bplist, bp);
		return (zio_null(pio, spa, NULL, NULL, 0));
	}

	/* XXBP - We need to re-evaluate when to insert pipeline stages */
	zio = zio_create(pio, spa, txg, bp, NULL, 0, done, private,
	    ZIO_TYPE_FREE, ZIO_PRIORITY_FREE, 0,
	    ZIO_STAGE_OPEN, ZIO_FREE_PIPELINE);

	/* Work off our private copy so the caller's bp stays untouched. */
	zio->io_bp = &zio->io_bp_copy;

	return (zio);
}

zio_t *
zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
    zio_done_func_t *done, void *private)
{
	zio_t *zio;

	/*
	 * A claim is an allocation of a specific block.  Claims are needed
	 * to support immediate writes in the intent log.  The issue is that
	 * immediate writes contain committed data, but in a txg that was
	 * *not* committed.  Upon opening the pool after an unclean shutdown,
	 * the intent log claims all blocks that contain immediate write data
	 * so that the SPA knows they're in use.
	 *
	 * All claims *must* be resolved in the first txg -- before the SPA
	 * starts allocating blocks -- so that nothing is allocated twice.
	 */
	ASSERT3U(spa->spa_uberblock.ub_rootbp.blk_birth, <, spa_first_txg(spa));
	ASSERT3U(spa_first_txg(spa), <=, txg);

	/* XXBP - We need to re-evaluate when to insert pipeline stages */
	zio = zio_create(pio, spa, txg, bp, NULL, 0, done, private,
	    ZIO_TYPE_CLAIM, ZIO_PRIORITY_NOW, 0,
	    ZIO_STAGE_OPEN, ZIO_CLAIM_PIPELINE);

	zio->io_bp = &zio->io_bp_copy;

	return (zio);
}

/*
 * Create an ioctl zio for vdev 'vd'.  For an interior vdev, fan the
 * ioctl out recursively to every child under a null parent zio.
 */
zio_t *
zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd,
    zio_done_func_t *done, void *private, int priority, int flags)
{
	zio_t *zio;
	int c;

	if (vd->vdev_children == 0) {
		zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private,
		    ZIO_TYPE_IOCTL, priority, flags,
		    ZIO_STAGE_OPEN, ZIO_IOCTL_PIPELINE);

		zio->io_vd = vd;
		zio->io_cmd = cmd;
	} else {
		zio = zio_null(pio, spa, NULL, NULL, flags);

		for (c = 0; c < vd->vdev_children; c++)
			zio_nowait(zio_ioctl(zio, spa, vd->vdev_child[c], cmd,
			    done, private, priority, flags));
	}

	return (zio);
}

/*
 * Initialize a synthetic blkptr describing a physical extent on a leaf
 * vdev (used for label and log I/O outside the normal block namespace).
 * The checksum verifier is seeded with the device offset so misdirected
 * reads are detected.
 */
static void
zio_phys_bp_init(vdev_t *vd, blkptr_t *bp, uint64_t offset, uint64_t size,
    int checksum)
{
	ASSERT(vd->vdev_children == 0);

	ASSERT(size <= SPA_MAXBLOCKSIZE);
	ASSERT(P2PHASE(size, SPA_MINBLOCKSIZE) == 0);
	ASSERT(P2PHASE(offset, SPA_MINBLOCKSIZE) == 0);

	/* The extent must lie entirely within the label areas or the data. */
	ASSERT(offset + size <= VDEV_LABEL_START_SIZE ||
	    offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE);
	ASSERT3U(offset + size, <=, vd->vdev_psize);

	BP_ZERO(bp);

	BP_SET_LSIZE(bp, size);
	BP_SET_PSIZE(bp, size);

	BP_SET_CHECKSUM(bp, checksum);
	BP_SET_COMPRESS(bp, ZIO_COMPRESS_OFF);
	BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);

	if (checksum != ZIO_CHECKSUM_OFF)
		ZIO_SET_CHECKSUM(&bp->blk_cksum, offset, 0, 0, 0);
}

/*
 * Create a physical read of 'size' bytes at 'offset' on leaf vdev 'vd'.
 */
zio_t *
zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
    void *data, int checksum, zio_done_func_t *done, void *private,
    int priority, int flags)
{
	zio_t *zio;
	blkptr_t blk;

	zio_phys_bp_init(vd, &blk, offset, size, checksum);

	zio = zio_create(pio, vd->vdev_spa, 0, &blk, data, size, done, private,
	    ZIO_TYPE_READ, priority, flags | ZIO_FLAG_PHYSICAL,
	    ZIO_STAGE_OPEN, ZIO_READ_PHYS_PIPELINE);

	zio->io_vd = vd;
	zio->io_offset = offset;

	/*
	 * Work off our copy of the bp so the caller can free it.
	 */
	zio->io_bp = &zio->io_bp_copy;

	return (zio);
}

/*
 * Create a physical write of 'size' bytes at 'offset' on leaf vdev 'vd'.
 * For embedded-tail (zbt) checksums, a private copy of the data is made
 * because the checksum stage modifies the buffer in place.
 */
zio_t *
zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
    void *data, int checksum, zio_done_func_t *done, void *private,
    int priority, int flags)
{
	zio_block_tail_t *zbt;
	void *wbuf;
	zio_t *zio;
	blkptr_t blk;

	zio_phys_bp_init(vd, &blk, offset, size, checksum);

	zio = zio_create(pio, vd->vdev_spa, 0, &blk, data, size, done, private,
	    ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_PHYSICAL,
	    ZIO_STAGE_OPEN, ZIO_WRITE_PHYS_PIPELINE);

	zio->io_vd = vd;
	zio->io_offset = offset;

	zio->io_bp = &zio->io_bp_copy;
	zio->io_checksum = checksum;

	if (zio_checksum_table[checksum].ci_zbt) {
		/*
		 * zbt checksums are necessarily destructive -- they modify
		 * one word of the write buffer to hold the verifier/checksum.
		 * Therefore, we must make a local copy in case the data is
		 * being written to multiple places.
		 */
		wbuf = zio_buf_alloc(size);
		bcopy(data, wbuf, size);
		zio_push_transform(zio, wbuf, size, size);

		/* Seed the block tail with the verifier from the bp. */
		zbt = (zio_block_tail_t *)((char *)wbuf + size) - 1;
		zbt->zbt_cksum = blk.blk_cksum;
	}

	return (zio);
}

/*
 * Create a child I/O to do some work for us.  It has no associated bp.
 */
zio_t *
zio_vdev_child_io(zio_t *zio, blkptr_t *bp, vdev_t *vd, uint64_t offset,
    void *data, uint64_t size, int type, int priority, int flags,
    zio_done_func_t *done, void *private)
{
	uint32_t pipeline = ZIO_VDEV_CHILD_PIPELINE;
	zio_t *cio;

	if (type == ZIO_TYPE_READ && bp != NULL) {
		/*
		 * If we have the bp, then the child should perform the
		 * checksum and the parent need not.  This pushes error
		 * detection as close to the leaves as possible and
		 * eliminates redundant checksums in the interior nodes.
		 */
		pipeline |= 1U << ZIO_STAGE_CHECKSUM_VERIFY;
		zio->io_pipeline &= ~(1U << ZIO_STAGE_CHECKSUM_VERIFY);
	}

	/* Start the child just before VDEV_IO_SETUP. */
	cio = zio_create(zio, zio->io_spa, zio->io_txg, bp, data, size,
	    done, private, type, priority,
	    (zio->io_flags & ZIO_FLAG_VDEV_INHERIT) | ZIO_FLAG_CANFAIL | flags,
	    ZIO_STAGE_VDEV_IO_SETUP - 1, pipeline);

	cio->io_vd = vd;
	cio->io_offset = offset;

	return (cio);
}

/*
 * ==========================================================================
 * Initiate I/O, either sync or async
 * ==========================================================================
 */

/*
 * Kick off 'zio' and block until it completes; returns its error and
 * frees the zio.  Must be called on a freshly created (OPEN-stage) zio.
 */
int
zio_wait(zio_t *zio)
{
	int error;

	ASSERT(zio->io_stage == ZIO_STAGE_OPEN);

	zio->io_waiter = curthread;

	zio_next_stage_async(zio);

	/* zio_done() sets io_stalled to ZIO_STAGE_DONE and broadcasts. */
	mutex_enter(&zio->io_lock);
	while (zio->io_stalled != ZIO_STAGE_DONE)
		cv_wait(&zio->io_cv, &zio->io_lock);
	mutex_exit(&zio->io_lock);

	error = zio->io_error;

	kmem_free(zio, sizeof (zio_t));

	return (error);
}

/*
 * Kick off 'zio' without waiting; it frees itself upon completion.
 */
void
zio_nowait(zio_t *zio)
{
	zio_next_stage_async(zio);
}

/*
 * ==========================================================================
 * I/O pipeline interlocks: parent/child dependency scoreboarding
 * ==========================================================================
 */

/*
 * If the children tracked by *countp (notready or notdone) have all
 * checked in, advance to the next pipeline stage immediately; otherwise
 * record 'stage' in io_stalled so the last child to finish can restart
 * us via zio_notify_parent().
 */
static void
zio_wait_for_children(zio_t *zio, uint32_t stage, uint64_t *countp)
{
	mutex_enter(&zio->io_lock);
	if (*countp == 0) {
		ASSERT(zio->io_stalled == 0);
		mutex_exit(&zio->io_lock);
		zio_next_stage(zio);
	} else {
		if (zio->io_stage == ZIO_STAGE_VDEV_IO_START)
			zio_vdev_io_exit(zio);
		zio->io_stalled = stage;
		mutex_exit(&zio->io_lock);
	}
}

/*
 * Called by a child to report progress to its parent: propagate the
 * child's error (unless DONT_PROPAGATE), decrement the parent's
 * outstanding count for 'stage', and if we are the last child and the
 * parent is stalled at that stage, restart the parent asynchronously.
 */
static void
zio_notify_parent(zio_t *zio, uint32_t stage, uint64_t *countp)
{
	zio_t *pio = zio->io_parent;

	mutex_enter(&pio->io_lock);
	if (pio->io_error == 0 && !(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE))
		pio->io_error = zio->io_error;
	if (--*countp == 0 && pio->io_stalled == stage) {
		if (pio->io_stage == ZIO_STAGE_VDEV_IO_START)
			zio_vdev_io_enter(pio);
		pio->io_stalled = 0;
		mutex_exit(&pio->io_lock);
		zio_next_stage_async(pio);
	} else {
		mutex_exit(&pio->io_lock);
	}
}

/* Pipeline stage: stall until all children have reached READY. */
static void
zio_wait_children_ready(zio_t *zio)
{
	zio_wait_for_children(zio, ZIO_STAGE_WAIT_CHILDREN_READY,
	    &zio->io_children_notready);
}

/* Pipeline stage: stall until all children have reached DONE. */
void
zio_wait_children_done(zio_t *zio)
{
	zio_wait_for_children(zio, ZIO_STAGE_WAIT_CHILDREN_DONE,
	    &zio->io_children_notdone);
}

/*
 * Pipeline stage: this zio is READY.  Notify the parent, snapshot the
 * current bp into io_bp_copy, and continue.
 */
static void
zio_ready(zio_t *zio)
{
	zio_t *pio = zio->io_parent;

	if (pio != NULL)
		zio_notify_parent(zio, ZIO_STAGE_WAIT_CHILDREN_READY,
		    &pio->io_children_notready);

	if (zio->io_bp)
		zio->io_bp_copy = *zio->io_bp;

	zio_next_stage(zio);
}

/*
 * Final pipeline stage: sanity-check the bp, update vdev stats, report
 * errors (panicking if the zio was not marked CANFAIL), tear down the
 * transform stack, run the caller's done callback, unlink from the
 * parent, and either wake a synchronous waiter or free the zio.
 */
static void
zio_done(zio_t *zio)
{
	zio_t *pio = zio->io_parent;
	spa_t *spa = zio->io_spa;
	blkptr_t *bp = zio->io_bp;
	vdev_t *vd = zio->io_vd;
	char blkbuf[300];

	ASSERT(zio->io_children_notready == 0);
	ASSERT(zio->io_children_notdone == 0);

	if (bp != NULL) {
		ASSERT(bp->blk_pad[0] == 0);
		ASSERT(bp->blk_pad[1] == 0);
		ASSERT(bp->blk_pad[2] == 0);
		/* The bp must not have changed since zio_ready(). */
		ASSERT(bcmp(bp, &zio->io_bp_copy, sizeof (blkptr_t)) == 0);
		if (zio->io_type == ZIO_TYPE_WRITE && !BP_IS_HOLE(bp) &&
		    !(zio->io_flags & ZIO_FLAG_IO_REPAIR))
			ASSERT(!BP_SHOULD_BYTESWAP(bp));
	}

	if (vd != NULL)
		vdev_stat_update(zio);

	if (zio->io_error) {
		sprintf_blkptr(blkbuf, bp ? bp : &zio->io_bp_copy);
		dprintf("ZFS: %s (%s on %s off %llx: zio %p %s): error %d\n",
		    zio->io_error == ECKSUM ? "bad checksum" : "I/O failure",
		    zio_type_name[zio->io_type],
		    vdev_description(vd),
		    (u_longlong_t)zio->io_offset,
		    zio, blkbuf, zio->io_error);
	}

	if (zio->io_numerrors != 0 && zio->io_type == ZIO_TYPE_WRITE) {
		sprintf_blkptr(blkbuf, bp ? bp : &zio->io_bp_copy);
		dprintf("ZFS: %s (%s on %s off %llx: zio %p %s): %d errors\n",
		    "partial write",
		    zio_type_name[zio->io_type],
		    vdev_description(vd),
		    (u_longlong_t)zio->io_offset,
		    zio, blkbuf, zio->io_numerrors);
	}

	if (zio->io_error && !(zio->io_flags & ZIO_FLAG_CANFAIL)) {
		/* A non-CANFAIL zio that failed is fatal. */
		sprintf_blkptr(blkbuf, bp ? bp : &zio->io_bp_copy);
		panic("ZFS: %s (%s on %s off %llx: zio %p %s): error %d",
		    zio->io_error == ECKSUM ? "bad checksum" : "I/O failure",
		    zio_type_name[zio->io_type],
		    vdev_description(vd),
		    (u_longlong_t)zio->io_offset,
		    zio, blkbuf, zio->io_error);
	}

	zio_clear_transform_stack(zio);

	if (zio->io_done)
		zio->io_done(zio);

	ASSERT(zio->io_delegate_list == NULL);
	ASSERT(zio->io_delegate_next == NULL);

	if (pio != NULL) {
		zio_t *next, *prev;

		/* Unlink ourselves from the parent's child list. */
		mutex_enter(&pio->io_lock);
		next = zio->io_sibling_next;
		prev = zio->io_sibling_prev;
		if (next != NULL)
			next->io_sibling_prev = prev;
		if (prev != NULL)
			prev->io_sibling_next = next;
		if (pio->io_child == zio)
			pio->io_child = next;
		mutex_exit(&pio->io_lock);

		zio_notify_parent(zio, ZIO_STAGE_WAIT_CHILDREN_DONE,
		    &pio->io_children_notdone);
	}

	/* Root zios drop the spa config lock taken in zio_create(). */
	if (pio == NULL && !(zio->io_flags & ZIO_FLAG_CONFIG_HELD))
		spa_config_exit(spa);

	if (zio->io_waiter != NULL) {
		/* zio_wait() frees the zio after we wake it. */
		mutex_enter(&zio->io_lock);
		ASSERT(zio->io_stage == ZIO_STAGE_DONE);
		zio->io_stalled = zio->io_stage;
		cv_broadcast(&zio->io_cv);
		mutex_exit(&zio->io_lock);
	} else {
		kmem_free(zio, sizeof (zio_t));
	}
}

/*
 *
==========================================================================
 * Compression support
 * ==========================================================================
 */

/*
 * Pipeline stage: compress the write payload (if enabled) and choose the
 * rest of the write pipeline — rewrite-in-place, allocate-and-write, or
 * (for an all-zero block) nothing beyond waiting for children.
 */
static void
zio_write_compress(zio_t *zio)
{
	int compress = zio->io_compress;
	blkptr_t *bp = zio->io_bp;
	void *cbuf;
	uint64_t lsize = zio->io_size;
	uint64_t csize = lsize;
	uint64_t cbufsize = 0;
	int pass;

	if (bp->blk_birth == zio->io_txg) {
		/*
		 * We're rewriting an existing block, which means we're
		 * working on behalf of spa_sync().  For spa_sync() to
		 * converge, it must eventually be the case that we don't
		 * have to allocate new blocks.  But compression changes
		 * the blocksize, which forces a reallocate, and makes
		 * convergence take longer.  Therefore, after the first
		 * few passes, stop compressing to ensure convergence.
		 */
		pass = spa_sync_pass(zio->io_spa);
		if (pass > zio_sync_pass.zp_dontcompress)
			compress = ZIO_COMPRESS_OFF;
	} else {
		ASSERT(BP_IS_HOLE(bp));
		pass = 1;
	}

	/* zio_compress_data() returns false when compression didn't help. */
	if (compress != ZIO_COMPRESS_OFF)
		if (!zio_compress_data(compress, zio->io_data, zio->io_size,
		    &cbuf, &csize, &cbufsize))
			compress = ZIO_COMPRESS_OFF;

	/* csize == 0 means the data was all zeroes — see BP_ZERO below. */
	if (compress != ZIO_COMPRESS_OFF && csize != 0)
		zio_push_transform(zio, cbuf, csize, cbufsize);

	/*
	 * The final pass of spa_sync() must be all rewrites, but the first
	 * few passes offer a trade-off: allocating blocks defers convergence,
	 * but newly allocated blocks are sequential, so they can be written
	 * to disk faster.  Therefore, we allow the first few passes of
	 * spa_sync() to reallocate new blocks, but force rewrites after that.
	 * There should only be a handful of blocks after pass 1 in any case.
	 */
	if (bp->blk_birth == zio->io_txg && BP_GET_PSIZE(bp) == csize &&
	    pass > zio_sync_pass.zp_rewrite) {
		ASSERT(csize != 0);
		ASSERT3U(BP_GET_COMPRESS(bp), ==, compress);
		ASSERT3U(BP_GET_LSIZE(bp), ==, lsize);

		zio->io_pipeline = ZIO_REWRITE_PIPELINE;
	} else {
		if (bp->blk_birth == zio->io_txg) {
			ASSERT3U(BP_GET_LSIZE(bp), ==, lsize);
			bzero(bp, sizeof (blkptr_t));
		}
		if (csize == 0) {
			/* All-zero block: record a hole, allocate nothing. */
			BP_ZERO(bp);
			zio->io_pipeline = ZIO_WAIT_FOR_CHILDREN_PIPELINE;
		} else {
			BP_SET_LSIZE(bp, lsize);
			BP_SET_PSIZE(bp, csize);
			BP_SET_COMPRESS(bp, compress);
			zio->io_pipeline = ZIO_WRITE_ALLOCATE_PIPELINE;
		}
	}

	zio_next_stage(zio);
}

/*
 * Pipeline stage: pop the compressed buffer off the transform stack and
 * decompress it into the caller's io_data; EIO on decompression failure.
 */
static void
zio_read_decompress(zio_t *zio)
{
	blkptr_t *bp = zio->io_bp;
	void *data;
	uint64_t size;
	uint64_t bufsize;
	int compress = BP_GET_COMPRESS(bp);

	ASSERT(compress != ZIO_COMPRESS_OFF);

	zio_pop_transform(zio, &data, &size, &bufsize);

	if (zio_decompress_data(compress, data, size,
	    zio->io_data, zio->io_size))
		zio->io_error = EIO;

	zio_buf_free(data, bufsize);

	zio_next_stage(zio);
}

/* 919*fa9e4066Sahrens * ========================================================================== 920*fa9e4066Sahrens * Gang block support 921*fa9e4066Sahrens * ========================================================================== 922*fa9e4066Sahrens */ 923*fa9e4066Sahrens static void 924*fa9e4066Sahrens zio_gang_pipeline(zio_t *zio) 925*fa9e4066Sahrens { 926*fa9e4066Sahrens /* 927*fa9e4066Sahrens * By default, the pipeline assumes that we're dealing with a gang 928*fa9e4066Sahrens * block. If we're not, strip out any gang-specific stages. 929*fa9e4066Sahrens */ 930*fa9e4066Sahrens if (!DVA_GET_GANG(ZIO_GET_DVA(zio))) 931*fa9e4066Sahrens zio->io_pipeline &= ~ZIO_GANG_STAGES; 932*fa9e4066Sahrens 933*fa9e4066Sahrens zio_next_stage(zio); 934*fa9e4066Sahrens } 935*fa9e4066Sahrens 936*fa9e4066Sahrens static void 937*fa9e4066Sahrens zio_gang_byteswap(zio_t *zio) 938*fa9e4066Sahrens { 939*fa9e4066Sahrens ASSERT(zio->io_size == SPA_GANGBLOCKSIZE); 940*fa9e4066Sahrens 941*fa9e4066Sahrens if (BP_SHOULD_BYTESWAP(zio->io_bp)) 942*fa9e4066Sahrens byteswap_uint64_array(zio->io_data, zio->io_size); 943*fa9e4066Sahrens } 944*fa9e4066Sahrens 945*fa9e4066Sahrens static void 946*fa9e4066Sahrens zio_get_gang_header(zio_t *zio) 947*fa9e4066Sahrens { 948*fa9e4066Sahrens blkptr_t *bp = zio->io_bp; 949*fa9e4066Sahrens uint64_t gsize = SPA_GANGBLOCKSIZE; 950*fa9e4066Sahrens void *gbuf = zio_buf_alloc(gsize); 951*fa9e4066Sahrens 952*fa9e4066Sahrens ASSERT(DVA_GET_GANG(ZIO_GET_DVA(zio))); 953*fa9e4066Sahrens 954*fa9e4066Sahrens zio_push_transform(zio, gbuf, gsize, gsize); 955*fa9e4066Sahrens 956*fa9e4066Sahrens zio_nowait(zio_create(zio, zio->io_spa, bp->blk_birth, bp, gbuf, gsize, 957*fa9e4066Sahrens NULL, NULL, ZIO_TYPE_READ, zio->io_priority, 958*fa9e4066Sahrens zio->io_flags & ZIO_FLAG_GANG_INHERIT, 959*fa9e4066Sahrens ZIO_STAGE_OPEN, ZIO_READ_PIPELINE)); 960*fa9e4066Sahrens 961*fa9e4066Sahrens zio_wait_children_done(zio); 962*fa9e4066Sahrens } 963*fa9e4066Sahrens 
/*
 * Pipeline stage: walk the gang header and issue one child read per
 * member, each landing at its logical offset within io_data.
 */
static void
zio_read_gang_members(zio_t *zio)
{
	zio_gbh_phys_t *gbh;
	uint64_t gsize, gbufsize, loff, lsize;
	int i;

	ASSERT(DVA_GET_GANG(ZIO_GET_DVA(zio)));

	zio_gang_byteswap(zio);
	zio_pop_transform(zio, (void **)&gbh, &gsize, &gbufsize);

	/* Member sizes must tile io_data exactly; loop ends at io_size. */
	for (loff = 0, i = 0; loff != zio->io_size; loff += lsize, i++) {
		blkptr_t *gbp = &gbh->zg_blkptr[i];
		lsize = BP_GET_PSIZE(gbp);

		ASSERT(BP_GET_COMPRESS(gbp) == ZIO_COMPRESS_OFF);
		ASSERT3U(lsize, ==, BP_GET_LSIZE(gbp));
		ASSERT3U(loff + lsize, <=, zio->io_size);
		ASSERT(i < SPA_GBH_NBLKPTRS);
		ASSERT(!BP_IS_HOLE(gbp));

		zio_nowait(zio_read(zio, zio->io_spa, gbp,
		    (char *)zio->io_data + loff, lsize, NULL, NULL,
		    zio->io_priority, zio->io_flags & ZIO_FLAG_GANG_INHERIT));
	}

	/* Children read into io_data, not gbh, so the header can go now. */
	zio_buf_free(gbh, gbufsize);
	zio_wait_children_done(zio);
}

/*
 * Pipeline stage: rewrite every gang member in place, then push the
 * header back onto the transform stack so it too gets rewritten.
 */
static void
zio_rewrite_gang_members(zio_t *zio)
{
	zio_gbh_phys_t *gbh;
	uint64_t gsize, gbufsize, loff, lsize;
	int i;

	ASSERT(DVA_GET_GANG(ZIO_GET_DVA(zio)));
	ASSERT3U(zio->io_size, ==, SPA_GANGBLOCKSIZE);

	zio_gang_byteswap(zio);
	zio_pop_transform(zio, (void **)&gbh, &gsize, &gbufsize);

	ASSERT(gsize == gbufsize);

	for (loff = 0, i = 0; loff != zio->io_size; loff += lsize, i++) {
		blkptr_t *gbp = &gbh->zg_blkptr[i];
		lsize = BP_GET_PSIZE(gbp);

		ASSERT(BP_GET_COMPRESS(gbp) == ZIO_COMPRESS_OFF);
		ASSERT3U(lsize, ==, BP_GET_LSIZE(gbp));
		ASSERT3U(loff + lsize, <=, zio->io_size);
		ASSERT(i < SPA_GBH_NBLKPTRS);
		ASSERT(!BP_IS_HOLE(gbp));

		zio_nowait(zio_rewrite(zio, zio->io_spa, zio->io_checksum,
		    zio->io_txg, gbp, (char *)zio->io_data + loff, lsize,
		    NULL, NULL, zio->io_priority, zio->io_flags));
	}

	zio_push_transform(zio, gbh, gsize, gbufsize);
	zio_wait_children_ready(zio);
}

/*
 * Pipeline stage: free every non-hole member named by the gang header.
 */
static void
zio_free_gang_members(zio_t *zio)
{
	zio_gbh_phys_t *gbh;
	uint64_t gsize, gbufsize;
	int i;

	ASSERT(DVA_GET_GANG(ZIO_GET_DVA(zio)));

	zio_gang_byteswap(zio);
	zio_pop_transform(zio, (void **)&gbh, &gsize, &gbufsize);

	for (i = 0; i < SPA_GBH_NBLKPTRS; i++) {
		blkptr_t *gbp = &gbh->zg_blkptr[i];

		if (BP_IS_HOLE(gbp))
			continue;
		zio_nowait(zio_free(zio, zio->io_spa, zio->io_txg,
		    gbp, NULL, NULL));
	}

	zio_buf_free(gbh, gbufsize);
	zio_next_stage(zio);
}

static void
zio_claim_gang_members(zio_t *zio)
{
	/*
	 * Pipeline stage: issue a claim for every non-hole member named
	 * by the gang header (mirrors zio_free_gang_members above).
	 */
	zio_gbh_phys_t *gbh;
	uint64_t gsize, gbufsize;
	int i;

	ASSERT(DVA_GET_GANG(ZIO_GET_DVA(zio)));

	zio_gang_byteswap(zio);
	zio_pop_transform(zio, (void **)&gbh, &gsize, &gbufsize);

	for (i = 0; i < SPA_GBH_NBLKPTRS; i++) {
		blkptr_t *gbp = &gbh->zg_blkptr[i];
		if (BP_IS_HOLE(gbp))
			continue;
		zio_nowait(zio_claim(zio, zio->io_spa, zio->io_txg,
		    gbp, NULL, NULL));
	}

	zio_buf_free(gbh, gbufsize);
	zio_next_stage(zio);
}

/*
 * Child-done callback: fold this member's allocated size into the
 * parent gang DVA's asize, under the parent's lock since siblings
 * complete concurrently.
 */
static void
zio_write_allocate_gang_member_done(zio_t *zio)
{
	zio_t *pio = zio->io_parent;
	dva_t *cdva = ZIO_GET_DVA(zio);
	dva_t *pdva = ZIO_GET_DVA(pio);
	uint64_t asize;

	ASSERT(DVA_GET_GANG(pdva));

	/* XXBP - Need to be careful here with multiple DVAs */
	mutex_enter(&pio->io_lock);
	asize = DVA_GET_ASIZE(pdva);
	asize += DVA_GET_ASIZE(cdva);
	DVA_SET_ASIZE(pdva, asize);
	mutex_exit(&pio->io_lock);
}

/*
 * Split a write that could not be allocated whole into gang members:
 * allocate a gang header, then carve the payload into members of at
 * most `maxalloc` bytes, halving maxalloc whenever the metaslab layer
 * reports ENOSPC.  A member that still cannot fit recurses via
 * zio_write_allocate() (which may gang again).
 */
static void
zio_write_allocate_gang_members(zio_t *zio)
{
	blkptr_t *bp = zio->io_bp;
	dva_t *dva = ZIO_GET_DVA(zio);
	zio_gbh_phys_t *gbh;
	uint64_t resid = zio->io_size;
	/* Start with half the original size, rounded to the block grid. */
	uint64_t maxalloc = P2ROUNDUP(zio->io_size >> 1, SPA_MINBLOCKSIZE);
	uint64_t gsize, loff, lsize;
	uint32_t gbps_left;
	int error;
	int i;

	gsize = SPA_GANGBLOCKSIZE;
	gbps_left = SPA_GBH_NBLKPTRS;

	error = metaslab_alloc(zio->io_spa, gsize, dva, zio->io_txg);
	if (error == ENOSPC)
		panic("can't allocate gang block header");
	ASSERT(error == 0);

	DVA_SET_GANG(dva, 1);

	bp->blk_birth = zio->io_txg;

	gbh = zio_buf_alloc(gsize);
	bzero(gbh, gsize);

	for (loff = 0, i = 0; loff != zio->io_size;
	    loff += lsize, resid -= lsize, gbps_left--, i++) {
		blkptr_t *gbp = &gbh->zg_blkptr[i];
		dva = &gbp->blk_dva[0];

		ASSERT(gbps_left != 0);
		maxalloc = MIN(maxalloc, resid);

		/*
		 * Shrink maxalloc until either an allocation succeeds or
		 * the remaining members can no longer cover resid.
		 */
		while (resid <= maxalloc * gbps_left) {
			error = metaslab_alloc(zio->io_spa, maxalloc, dva,
			    zio->io_txg);
			if (error == 0)
				break;
			ASSERT3U(error, ==, ENOSPC);
			if (maxalloc == SPA_MINBLOCKSIZE)
				panic("really out of space");
			maxalloc = P2ROUNDUP(maxalloc >> 1, SPA_MINBLOCKSIZE);
		}

		if (resid <= maxalloc * gbps_left) {
			/* Allocation succeeded: rewrite into the new DVA. */
			lsize = maxalloc;
			BP_SET_LSIZE(gbp, lsize);
			BP_SET_PSIZE(gbp, lsize);
			BP_SET_COMPRESS(gbp, ZIO_COMPRESS_OFF);
			gbp->blk_birth = zio->io_txg;
			zio_nowait(zio_rewrite(zio, zio->io_spa,
			    zio->io_checksum, zio->io_txg, gbp,
			    (char *)zio->io_data + loff, lsize,
			    zio_write_allocate_gang_member_done, NULL,
			    zio->io_priority, zio->io_flags));
		} else {
			/* Couldn't fit: let the member gang recursively. */
			lsize = P2ROUNDUP(resid / gbps_left, SPA_MINBLOCKSIZE);
			ASSERT(lsize != SPA_MINBLOCKSIZE);
			zio_nowait(zio_write_allocate(zio, zio->io_spa,
			    zio->io_checksum, zio->io_txg, gbp,
			    (char *)zio->io_data + loff, lsize,
			    zio_write_allocate_gang_member_done, NULL,
			    zio->io_priority, zio->io_flags));
		}
	}

	ASSERT(resid == 0 && loff == zio->io_size);

	zio->io_pipeline |= 1U << ZIO_STAGE_GANG_CHECKSUM_GENERATE;

	zio_push_transform(zio, gbh, gsize, gsize);
	zio_wait_children_done(zio);
}

/*
 * ==========================================================================
 * Allocate and free blocks
 * ==========================================================================
 */

/*
 * Pipeline stage: allocate a DVA for a freshly written block, falling
 * back to gang allocation on ENOSPC.
 */
static void
zio_dva_allocate(zio_t *zio)
{
	blkptr_t *bp = zio->io_bp;
	dva_t *dva = ZIO_GET_DVA(zio);
	int error;

	ASSERT(BP_IS_HOLE(bp));

	/* For testing, make some blocks above a certain size be gang blocks */
if (zio->io_size >= zio_gang_bang && (lbolt & 0x3) == 0) {
		zio_write_allocate_gang_members(zio);
		return;
	}

	ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp));

	error = metaslab_alloc(zio->io_spa, zio->io_size, dva, zio->io_txg);

	if (error == 0) {
		bp->blk_birth = zio->io_txg;
	} else if (error == ENOSPC) {
		/* Can't gang below the minimum block size — give up hard. */
		if (zio->io_size == SPA_MINBLOCKSIZE)
			panic("really, truly out of space");
		zio_write_allocate_gang_members(zio);
		return;
	} else {
		zio->io_error = error;
	}
	zio_next_stage(zio);
}

/*
 * Pipeline stage: return this block's DVA to the metaslab layer and
 * zero the block pointer.
 */
static void
zio_dva_free(zio_t *zio)
{
	blkptr_t *bp = zio->io_bp;
	dva_t *dva = ZIO_GET_DVA(zio);

	ASSERT(!BP_IS_HOLE(bp));

	metaslab_free(zio->io_spa, dva, zio->io_txg);

	BP_ZERO(bp);

	zio_next_stage(zio);
}

/*
 * Pipeline stage: claim an already-allocated DVA (error propagates via
 * io_error).
 */
static void
zio_dva_claim(zio_t *zio)
{
	blkptr_t *bp = zio->io_bp;
	dva_t *dva = ZIO_GET_DVA(zio);

	ASSERT(!BP_IS_HOLE(bp));

	zio->io_error = metaslab_claim(zio->io_spa, dva, zio->io_txg);

	zio_next_stage(zio);
}

/*
 * Pipeline stage: translate the DVA into a top-level vdev and offset.
 * ENXIO if the vdev no longer exists, EOVERFLOW if the I/O runs past
 * the vdev's allocatable size.
 */
static void
zio_dva_translate(zio_t *zio)
{
	spa_t *spa = zio->io_spa;
	dva_t *dva = ZIO_GET_DVA(zio);
	uint64_t vdev = DVA_GET_VDEV(dva);
	uint64_t offset = DVA_GET_OFFSET(dva);

	ASSERT3U(zio->io_size, ==, ZIO_GET_IOSIZE(zio));

	zio->io_offset = offset;

	if ((zio->io_vd = vdev_lookup_top(spa, vdev)) == NULL)
		zio->io_error = ENXIO;
	else if (offset + zio->io_size > zio->io_vd->vdev_asize)
		zio->io_error = EOVERFLOW;

	zio_next_stage(zio);
}

/*
 * ==========================================================================
 * Read and write to physical devices
 * ==========================================================================
 */

/*
 * Register this zio on its top-level vdev's pending-I/O list.
 */
static void
zio_vdev_io_enter(zio_t *zio)
{
	vdev_t *tvd = zio->io_vd->vdev_top;

	mutex_enter(&tvd->vdev_io_lock);
	ASSERT(zio->io_pending.list_next == NULL);
	list_insert_tail(&tvd->vdev_io_pending, zio);
	mutex_exit(&tvd->vdev_io_lock);
}

/*
 * Remove this zio from the pending list; wake anyone waiting for the
 * list to drain once it empties.
 */
static void
zio_vdev_io_exit(zio_t *zio)
{
	vdev_t *tvd = zio->io_vd->vdev_top;

	mutex_enter(&tvd->vdev_io_lock);
	ASSERT(zio->io_pending.list_next != NULL);
	list_remove(&tvd->vdev_io_pending, zio);
	if (list_head(&tvd->vdev_io_pending) == NULL)
cv_broadcast(&tvd->vdev_io_cv);
	mutex_exit(&tvd->vdev_io_lock);
}

/*
 * Taskq callback: after a short delay, reopen the vdev and re-dispatch
 * every zio that queued itself on vdev_io_retry (see zio_vdev_io_assess).
 */
static void
zio_vdev_io_retry(void *vdarg)
{
	vdev_t *vd = vdarg;
	zio_t *zio, *zq;

	ASSERT(vd == vd->vdev_top);

	/* XXPOLICY */
	delay(hz);

	/* vdev_reopen() hands back the retry list through zq. */
	vdev_reopen(vd, &zq);

	while ((zio = zq) != NULL) {
		zq = zio->io_retry_next;
		zio->io_retry_next = NULL;
		dprintf("async retry #%d for I/O to %s offset %llx\n",
		    zio->io_retries, vdev_description(vd), zio->io_offset);
		zio_next_stage_async(zio);
	}
}

/*
 * Pipeline stage: set per-attempt flags, convert a logical offset to a
 * physical one for leaf vdevs, and register on the pending list.
 */
static void
zio_vdev_io_setup(zio_t *zio)
{
	vdev_t *vd = zio->io_vd;

	/* XXPOLICY */
	if (zio->io_retries == 0 && vd == vd->vdev_top)
		zio->io_flags |= ZIO_FLAG_FAILFAST;

	/* Leaf vdevs: skip past the front label area exactly once. */
	if (!(zio->io_flags & ZIO_FLAG_PHYSICAL) && vd->vdev_children == 0) {
		zio->io_flags |= ZIO_FLAG_PHYSICAL;
		zio->io_offset += VDEV_LABEL_START_SIZE;
	}

	zio_vdev_io_enter(zio);

	zio_next_stage(zio);
}

/*
 * Pipeline stage: hand the I/O to the vdev layer.  The pipeline resumes
 * asynchronously when the device completes.
 */
static void
zio_vdev_io_start(zio_t *zio)
{
	blkptr_t *bp = zio->io_bp;

	/* Offset and size must be aligned to the vdev's sector size. */
	ASSERT(P2PHASE(zio->io_offset, 1ULL << zio->io_vd->vdev_ashift) == 0);
	ASSERT(P2PHASE(zio->io_size, 1ULL << zio->io_vd->vdev_ashift) == 0);
	ASSERT(bp == NULL || ZIO_GET_IOSIZE(zio) == zio->io_size);
	ASSERT(zio->io_type != ZIO_TYPE_WRITE || (spa_mode & FWRITE));

	vdev_io_start(zio);

	/* zio_next_stage_async() gets called from io completion interrupt */
}

/* Pipeline stage: vdev-layer completion processing. */
static void
zio_vdev_io_done(zio_t *zio)
{
	vdev_io_done(zio);
}

/* XXPOLICY */
/*
 * Decide whether a failed I/O is worth retrying: only top-level,
 * non-delegated zios without DONT_RETRY, and only within the retry
 * budget for speculative/can-fail and checksum/ENXIO cases.
 */
static boolean_t
zio_should_retry(zio_t *zio)
{
	vdev_t *vd = zio->io_vd;

	if (zio->io_error == 0)
		return (B_FALSE);
	if (zio->io_delegate_list != NULL)
		return (B_FALSE);
	if (vd != vd->vdev_top)
		return (B_FALSE);
	if (zio->io_flags & ZIO_FLAG_DONT_RETRY)
		return (B_FALSE);
	if (zio->io_retries > 300 &&
	    (zio->io_flags & (ZIO_FLAG_SPECULATIVE | ZIO_FLAG_CANFAIL)))
		return (B_FALSE);
	if (zio->io_retries > 1 &&
	    (zio->io_error == ECKSUM || zio->io_error == ENXIO))
		return (B_FALSE);

	return (B_TRUE);
}

/*
 * Pipeline stage: leave the pending list, then either retry a failed
 * I/O (immediately on first failure, via the delayed taskq path after
 * that) or advance the pipeline.
 */
static void
zio_vdev_io_assess(zio_t *zio)
{
	vdev_t *vd = zio->io_vd;
	vdev_t *tvd = vd->vdev_top;

	zio_vdev_io_exit(zio);

	ASSERT(zio->io_vsd == NULL);

	/*
	 * If the I/O failed, determine whether we should attempt to retry it.
	 */
	/* XXPOLICY */
	if (zio_should_retry(zio)) {
		zio_t *zq;

		ASSERT(tvd == vd);
		ASSERT(!(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE));

		zio->io_retries++;
		zio->io_error = 0;
		zio->io_flags &= ZIO_FLAG_VDEV_INHERIT;
		/* XXPOLICY */
		zio->io_flags &= ~ZIO_FLAG_FAILFAST;
		zio->io_flags |= ZIO_FLAG_DONT_CACHE;
		/* Rewind so the next stage executed is VDEV_IO_SETUP. */
		zio->io_stage = ZIO_STAGE_VDEV_IO_SETUP - 1;

		dprintf("retry #%d for %s to %s offset %llx\n",
		    zio->io_retries, zio_type_name[zio->io_type],
		    vdev_description(vd), zio->io_offset);

		/*
		 * If this is the first retry, do it immediately.
		 */
		/* XXPOLICY */
		if (zio->io_retries == 1) {
			zio_next_stage_async(zio);
			return;
		}

		/*
		 * This was not the first retry, so go through the
		 * longer enqueue/delay/vdev_reopen() process.
		 */
		mutex_enter(&tvd->vdev_io_lock);
		ASSERT(zio->io_retry_next == NULL);
		zio->io_retry_next = zq = tvd->vdev_io_retry;
		tvd->vdev_io_retry = zio;
		mutex_exit(&tvd->vdev_io_lock);
		/* Only the first enqueuer dispatches the retry task. */
		if (zq == NULL)
			(void) taskq_dispatch(
			    tvd->vdev_spa->spa_vdev_retry_taskq,
			    zio_vdev_io_retry, tvd, TQ_SLEEP);
		return;
	}

	zio_next_stage(zio);
}

/* Rewind a zio so VDEV_IO_START executes again (caller-driven reissue). */
void
zio_vdev_io_reissue(zio_t *zio)
{
	ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START);
	ASSERT(zio->io_error == 0);

	zio->io_stage--;
}

/* Rewind a zio so VDEV_IO_DONE executes again. */
void
zio_vdev_io_redone(zio_t *zio)
{
	ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_DONE);

	zio->io_stage--;
}

/* Skip device I/O entirely; resume the pipeline at VDEV_IO_ASSESS. */
void
zio_vdev_io_bypass(zio_t *zio)
{
	ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START);
	ASSERT(zio->io_error == 0);

	zio->io_flags |= ZIO_FLAG_IO_BYPASS;
	zio->io_stage = ZIO_STAGE_VDEV_IO_ASSESS - 1;
}

/*
 * ==========================================================================
 * Generate and verify checksums
 * ==========================================================================
 */
static void
zio_checksum_generate(zio_t *zio)
{
	/*
	 * Pipeline stage: record the checksum algorithm and host byte
	 * order in the bp, then compute the checksum into blk_cksum.
	 */
	int checksum = zio->io_checksum;
	blkptr_t *bp = zio->io_bp;

	ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp));

	BP_SET_CHECKSUM(bp, checksum);
	BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);

	zio_checksum(checksum, &bp->blk_cksum, zio->io_data, zio->io_size);

	zio_next_stage(zio);
}

/*
 * Pipeline stage: checksum a gang header.  The verifier is seeded into
 * the header's tail first; the GANG_HEADER algorithm is then run over
 * the whole buffer.
 * NOTE(review): the result lands in the local `zc`, which is otherwise
 * unused — presumably the gang-header checksum is self-embedding via
 * the zbt tail; confirm against zio_checksum().
 */
static void
zio_gang_checksum_generate(zio_t *zio)
{
	zio_cksum_t zc;
	zio_gbh_phys_t *gbh = zio->io_data;

	ASSERT3U(zio->io_size, ==, SPA_GANGBLOCKSIZE);
	ASSERT(DVA_GET_GANG(ZIO_GET_DVA(zio)));

	zio_set_gang_verifier(zio, &gbh->zg_tail.zbt_cksum);

	zio_checksum(ZIO_CHECKSUM_GANG_HEADER, &zc, zio->io_data, zio->io_size);

	zio_next_stage(zio);
}

/*
 * Pipeline stage: verify the checksum of a block we just read; the
 * error (if any) is recorded in io_error for the assess/retry logic.
 */
static void
zio_checksum_verify(zio_t *zio)
{
	if (zio->io_bp != NULL) {
		zio->io_error = zio_checksum_error(zio);
		if (zio->io_error) {
			dprintf("bad checksum on vdev %s\n",
			    vdev_description(zio->io_vd));
		}
	}

	zio_next_stage(zio);
}

/*
 * Called by RAID-Z to ensure we don't compute the checksum twice.
 */
void
zio_checksum_verified(zio_t *zio)
{
	zio->io_pipeline &= ~(1U << ZIO_STAGE_CHECKSUM_VERIFY);
}

/*
 * Set the external verifier for a gang block based on stuff in the bp
 * (vdev, offset, and birth txg uniquely identify the gang header).
 */
void
zio_set_gang_verifier(zio_t *zio, zio_cksum_t *zcp)
{
	zcp->zc_word[0] = DVA_GET_VDEV(ZIO_GET_DVA(zio));
	zcp->zc_word[1] = DVA_GET_OFFSET(ZIO_GET_DVA(zio));
	zcp->zc_word[2] = zio->io_bp->blk_birth;
	zcp->zc_word[3] = 0;
}

/*
 * ==========================================================================
 * Define the pipeline
 * ==========================================================================
 */
typedef void zio_pipe_stage_t(zio_t *zio);

/* Sentinel for pipeline slots that must never execute. */
static void
zio_badop(zio_t *zio)
{
	panic("Invalid I/O pipeline stage %u for zio %p", zio->io_stage, zio);
}

/*
 * Stage dispatch table, indexed by io_stage.  The order here must match
 * the ZIO_STAGE_* enumeration exactly; zio_badop guards both ends.
 */
zio_pipe_stage_t *zio_pipeline[ZIO_STAGE_DONE + 2] = {
	zio_badop,
	zio_wait_children_ready,
	zio_write_compress,
	zio_checksum_generate,
	zio_gang_pipeline,
	zio_get_gang_header,
	zio_rewrite_gang_members,
	zio_free_gang_members,
	zio_claim_gang_members,
	zio_dva_allocate,
	zio_dva_free,
	zio_dva_claim,
	zio_gang_checksum_generate,
	zio_ready,
	zio_dva_translate,
	zio_vdev_io_setup,
	zio_vdev_io_start,
	zio_vdev_io_done,
	zio_vdev_io_assess,
	zio_wait_children_done,
	zio_checksum_verify,
	zio_read_gang_members,
	zio_read_decompress,
	zio_done,
	zio_badop
};

/*
 * Move an I/O to the next stage of the pipeline and execute that stage.
 * There's no locking on io_stage because there's no legitimate way for
 * multiple threads to be attempting to process the same I/O.
 */
void
zio_next_stage(zio_t *zio)
{
	uint32_t pipeline = zio->io_pipeline;

	ASSERT(!MUTEX_HELD(&zio->io_lock));

	if (zio->io_error) {
		dprintf("zio %p vdev %s offset %llx stage %d error %d\n",
		    zio, vdev_description(zio->io_vd),
		    zio->io_offset, zio->io_stage, zio->io_error);
		/* On error, skip ahead to the error-handling stages only. */
		if (((1U << zio->io_stage) & ZIO_VDEV_IO_PIPELINE) == 0)
			pipeline &= ZIO_ERROR_PIPELINE_MASK;
	}

	/* Advance io_stage to the next bit set in the pipeline mask. */
	while (((1U << ++zio->io_stage) & pipeline) == 0)
		continue;

	ASSERT(zio->io_stage <= ZIO_STAGE_DONE);
	ASSERT(zio->io_stalled == 0);

	zio_pipeline[zio->io_stage](zio);
}

void
zio_next_stage_async(zio_t *zio)
{
1605*fa9e4066Sahrens taskq_t *tq; 1606*fa9e4066Sahrens uint32_t pipeline = zio->io_pipeline; 1607*fa9e4066Sahrens 1608*fa9e4066Sahrens ASSERT(!MUTEX_HELD(&zio->io_lock)); 1609*fa9e4066Sahrens 1610*fa9e4066Sahrens if (zio->io_error) { 1611*fa9e4066Sahrens dprintf("zio %p vdev %s offset %llx stage %d error %d\n", 1612*fa9e4066Sahrens zio, vdev_description(zio->io_vd), 1613*fa9e4066Sahrens zio->io_offset, zio->io_stage, zio->io_error); 1614*fa9e4066Sahrens if (((1U << zio->io_stage) & ZIO_VDEV_IO_PIPELINE) == 0) 1615*fa9e4066Sahrens pipeline &= ZIO_ERROR_PIPELINE_MASK; 1616*fa9e4066Sahrens } 1617*fa9e4066Sahrens 1618*fa9e4066Sahrens while (((1U << ++zio->io_stage) & pipeline) == 0) 1619*fa9e4066Sahrens continue; 1620*fa9e4066Sahrens 1621*fa9e4066Sahrens ASSERT(zio->io_stage <= ZIO_STAGE_DONE); 1622*fa9e4066Sahrens ASSERT(zio->io_stalled == 0); 1623*fa9e4066Sahrens 1624*fa9e4066Sahrens /* 1625*fa9e4066Sahrens * For performance, we'll probably want two sets of task queues: 1626*fa9e4066Sahrens * per-CPU issue taskqs and per-CPU completion taskqs. The per-CPU 1627*fa9e4066Sahrens * part is for read performance: since we have to make a pass over 1628*fa9e4066Sahrens * the data to checksum it anyway, we want to do this on the same CPU 1629*fa9e4066Sahrens * that issued the read, because (assuming CPU scheduling affinity) 1630*fa9e4066Sahrens * that thread is probably still there. Getting this optimization 1631*fa9e4066Sahrens * right avoids performance-hostile cache-to-cache transfers. 1632*fa9e4066Sahrens * 1633*fa9e4066Sahrens * Note that having two sets of task queues is also necessary for 1634*fa9e4066Sahrens * correctness: if all of the issue threads get bogged down waiting 1635*fa9e4066Sahrens * for dependent reads (e.g. metaslab freelist) to complete, then 1636*fa9e4066Sahrens * there won't be any threads available to service I/O completion 1637*fa9e4066Sahrens * interrupts. 
1638*fa9e4066Sahrens */ 1639*fa9e4066Sahrens if ((1U << zio->io_stage) & zio->io_async_stages) { 1640*fa9e4066Sahrens if (zio->io_stage < ZIO_STAGE_VDEV_IO_DONE) 1641*fa9e4066Sahrens tq = zio->io_spa->spa_zio_issue_taskq[zio->io_type]; 1642*fa9e4066Sahrens else 1643*fa9e4066Sahrens tq = zio->io_spa->spa_zio_intr_taskq[zio->io_type]; 1644*fa9e4066Sahrens (void) taskq_dispatch(tq, 1645*fa9e4066Sahrens (task_func_t *)zio_pipeline[zio->io_stage], zio, TQ_SLEEP); 1646*fa9e4066Sahrens } else { 1647*fa9e4066Sahrens zio_pipeline[zio->io_stage](zio); 1648*fa9e4066Sahrens } 1649*fa9e4066Sahrens } 1650*fa9e4066Sahrens 1651*fa9e4066Sahrens /* 1652*fa9e4066Sahrens * Try to allocate an intent log block. Return 0 on success, errno on failure. 1653*fa9e4066Sahrens */ 1654*fa9e4066Sahrens int 1655*fa9e4066Sahrens zio_alloc_blk(spa_t *spa, int checksum, uint64_t size, blkptr_t *bp, 1656*fa9e4066Sahrens uint64_t txg) 1657*fa9e4066Sahrens { 1658*fa9e4066Sahrens int error; 1659*fa9e4066Sahrens 1660*fa9e4066Sahrens spa_config_enter(spa, RW_READER); 1661*fa9e4066Sahrens 1662*fa9e4066Sahrens BP_ZERO(bp); 1663*fa9e4066Sahrens 1664*fa9e4066Sahrens error = metaslab_alloc(spa, size, BP_IDENTITY(bp), txg); 1665*fa9e4066Sahrens 1666*fa9e4066Sahrens if (error == 0) { 1667*fa9e4066Sahrens BP_SET_CHECKSUM(bp, checksum); 1668*fa9e4066Sahrens BP_SET_LSIZE(bp, size); 1669*fa9e4066Sahrens BP_SET_PSIZE(bp, size); 1670*fa9e4066Sahrens BP_SET_COMPRESS(bp, ZIO_COMPRESS_OFF); 1671*fa9e4066Sahrens BP_SET_TYPE(bp, DMU_OT_INTENT_LOG); 1672*fa9e4066Sahrens BP_SET_LEVEL(bp, 0); 1673*fa9e4066Sahrens BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER); 1674*fa9e4066Sahrens bp->blk_birth = txg; 1675*fa9e4066Sahrens } 1676*fa9e4066Sahrens 1677*fa9e4066Sahrens spa_config_exit(spa); 1678*fa9e4066Sahrens 1679*fa9e4066Sahrens return (error); 1680*fa9e4066Sahrens } 1681*fa9e4066Sahrens 1682*fa9e4066Sahrens /* 1683*fa9e4066Sahrens * Free an intent log block. 
We know it can't be a gang block, so there's 1684*fa9e4066Sahrens * nothing to do except metaslab_free() it. 1685*fa9e4066Sahrens */ 1686*fa9e4066Sahrens void 1687*fa9e4066Sahrens zio_free_blk(spa_t *spa, blkptr_t *bp, uint64_t txg) 1688*fa9e4066Sahrens { 1689*fa9e4066Sahrens ASSERT(DVA_GET_GANG(BP_IDENTITY(bp)) == 0); 1690*fa9e4066Sahrens 1691*fa9e4066Sahrens dprintf_bp(bp, "txg %llu: ", txg); 1692*fa9e4066Sahrens 1693*fa9e4066Sahrens spa_config_enter(spa, RW_READER); 1694*fa9e4066Sahrens 1695*fa9e4066Sahrens metaslab_free(spa, BP_IDENTITY(bp), txg); 1696*fa9e4066Sahrens 1697*fa9e4066Sahrens spa_config_exit(spa); 1698*fa9e4066Sahrens } 1699