/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
 * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved.
 * Copyright (c) 2014 Integros [integros.com]
 * Copyright (c) 2017, Intel Corporation.
 */

#include <sys/sysmacros.h>
#include <sys/zfs_context.h>
#include <sys/fm/fs/zfs.h>
#include <sys/spa.h>
#include <sys/txg.h>
#include <sys/spa_impl.h>
#include <sys/vdev_impl.h>
#include <sys/vdev_trim.h>
#include <sys/zio_impl.h>
#include <sys/zio_compress.h>
#include <sys/zio_checksum.h>
#include <sys/dmu_objset.h>
#include <sys/arc.h>
#include <sys/ddt.h>
#include <sys/blkptr.h>
#include <sys/zfeature.h>
#include <sys/time.h>
#include <sys/dsl_scan.h>
#include <sys/metaslab_impl.h>
#include <sys/abd.h>
#include <sys/cityhash.h>
#include <sys/dsl_crypt.h>

/*
 * ==========================================================================
 * I/O type descriptions
 * ==========================================================================
 */
const char *zio_type_name[ZIO_TYPES] = {
	"zio_null", "zio_read", "zio_write", "zio_free", "zio_claim",
	"zio_ioctl", "z_trim"
};

boolean_t zio_dva_throttle_enabled = B_TRUE;

/*
 * ==========================================================================
 * I/O kmem caches
 * ==========================================================================
 */
kmem_cache_t *zio_cache;
kmem_cache_t *zio_link_cache;
kmem_cache_t *zio_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
kmem_cache_t *zio_data_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];

#ifdef _KERNEL
extern vmem_t *zio_alloc_arena;
#endif

#define	ZIO_PIPELINE_CONTINUE		0x100
#define	ZIO_PIPELINE_STOP		0x101

/* Mark IOs as "slow" if they take longer than 30 seconds */
int zio_slow_io_ms = (30 * MILLISEC);

#define	BP_SPANB(indblkshift, level) \
	(((uint64_t)1) << ((level) * ((indblkshift) - SPA_BLKPTRSHIFT)))
#define	COMPARE_META_LEVEL	0x80000000ul
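
/*
 * Illustrative note (editor's addition, not part of the original source):
 * BP_SPANB() computes how many leaf blocks one block pointer spans at a
 * given indirection level.  With a 128K indirect block (indblkshift == 17)
 * and 128-byte block pointers (SPA_BLKPTRSHIFT == 7), each indirect block
 * holds 2^(17 - 7) == 1024 pointers, so:
 *
 *	BP_SPANB(17, 1) == 1ULL << (1 * (17 - 7)) == 1024
 *	BP_SPANB(17, 2) == 1ULL << (2 * (17 - 7)) == 1048576
 */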

/*
 * The following actions directly affect the spa's sync-to-convergence logic.
 * The values below define the sync pass when we start performing the action.
 * Care should be taken when changing these values as they directly impact
 * spa_sync() performance. Tuning these values may introduce subtle performance
 * pathologies and should only be done in the context of performance analysis.
 * These tunables will eventually be removed and replaced with #defines once
 * enough analysis has been done to determine optimal values.
 *
 * The 'zfs_sync_pass_deferred_free' pass must be greater than 1 to ensure that
 * regular blocks are not deferred.
 */
int zfs_sync_pass_deferred_free = 2; /* defer frees starting in this pass */
int zfs_sync_pass_dont_compress = 5; /* don't compress starting in this pass */
int zfs_sync_pass_rewrite = 2; /* rewrite new bps starting in this pass */

/*
 * An allocating zio is one that either currently has the DVA allocate
 * stage set or will have it later in its lifetime.
 */
#define	IO_IS_ALLOCATING(zio) ((zio)->io_orig_pipeline & ZIO_STAGE_DVA_ALLOCATE)

boolean_t zio_requeue_io_start_cut_in_line = B_TRUE;

#ifdef ZFS_DEBUG
int zio_buf_debug_limit = 16384;
#else
int zio_buf_debug_limit = 0;
#endif

static void zio_taskq_dispatch(zio_t *, zio_taskq_type_t, boolean_t);

void
zio_init(void)
{
	size_t c;
	vmem_t *data_alloc_arena = NULL;

#ifdef _KERNEL
	data_alloc_arena = zio_alloc_arena;
#endif
	zio_cache = kmem_cache_create("zio_cache",
	    sizeof (zio_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
	zio_link_cache = kmem_cache_create("zio_link_cache",
	    sizeof (zio_link_t), 0, NULL, NULL, NULL, NULL, NULL, 0);

	/*
	 * For small buffers, we want a cache for each multiple of
	 * SPA_MINBLOCKSIZE.  For larger buffers, we want a cache
	 * for each quarter-power of 2.
	 */
	for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) {
		size_t size = (c + 1) << SPA_MINBLOCKSHIFT;
		size_t p2 = size;
		size_t align = 0;
		size_t cflags = (size > zio_buf_debug_limit) ? KMC_NODEBUG : 0;

		while (!ISP2(p2))
			p2 &= p2 - 1;

#ifndef _KERNEL
		/*
		 * If we are using watchpoints, put each buffer on its own page,
		 * to eliminate the performance overhead of trapping to the
		 * kernel when modifying a non-watched buffer that shares the
		 * page with a watched buffer.
		 */
		if (arc_watch && !IS_P2ALIGNED(size, PAGESIZE))
			continue;
#endif
		if (size <= 4 * SPA_MINBLOCKSIZE) {
			align = SPA_MINBLOCKSIZE;
		} else if (IS_P2ALIGNED(size, p2 >> 2)) {
			align = MIN(p2 >> 2, PAGESIZE);
		}

		if (align != 0) {
			char name[36];
			(void) sprintf(name, "zio_buf_%lu", (ulong_t)size);
			zio_buf_cache[c] = kmem_cache_create(name, size,
			    align, NULL, NULL, NULL, NULL, NULL, cflags);

			/*
			 * Since zio_data bufs do not appear in crash dumps, we
			 * pass KMC_NOTOUCH so that no allocator metadata is
			 * stored with the buffers.
			 */
			(void) sprintf(name, "zio_data_buf_%lu", (ulong_t)size);
			zio_data_buf_cache[c] = kmem_cache_create(name, size,
			    align, NULL, NULL, NULL, NULL, data_alloc_arena,
			    cflags | KMC_NOTOUCH);
		}
	}

	while (--c != 0) {
		ASSERT(zio_buf_cache[c] != NULL);
		if (zio_buf_cache[c - 1] == NULL)
			zio_buf_cache[c - 1] = zio_buf_cache[c];

		ASSERT(zio_data_buf_cache[c] != NULL);
		if (zio_data_buf_cache[c - 1] == NULL)
			zio_data_buf_cache[c - 1] = zio_data_buf_cache[c];
	}

	zio_inject_init();
}
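
/*
 * Illustrative note (editor's addition): the loop above creates caches at
 * every SPA_MINBLOCKSIZE multiple for small sizes and at quarter-power-of-2
 * spacing beyond that (e.g. between 8K and 16K only 10K, 12K and 14K are
 * added).  The fixup loop then aliases each missing table entry to the
 * next-larger cache, so a request that maps to a size without its own cache
 * (say 8.5K) is served by the 10K cache.
 */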

void
zio_fini(void)
{
	size_t c;
	kmem_cache_t *last_cache = NULL;
	kmem_cache_t *last_data_cache = NULL;

	for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) {
		if (zio_buf_cache[c] != last_cache) {
			last_cache = zio_buf_cache[c];
			kmem_cache_destroy(zio_buf_cache[c]);
		}
		zio_buf_cache[c] = NULL;

		if (zio_data_buf_cache[c] != last_data_cache) {
			last_data_cache = zio_data_buf_cache[c];
			kmem_cache_destroy(zio_data_buf_cache[c]);
		}
		zio_data_buf_cache[c] = NULL;
	}

	kmem_cache_destroy(zio_link_cache);
	kmem_cache_destroy(zio_cache);

	zio_inject_fini();
}

/*
 * ==========================================================================
 * Allocate and free I/O buffers
 * ==========================================================================
 */

/*
 * Use zio_buf_alloc to allocate ZFS metadata.  This data will appear in a
 * crashdump if the kernel panics, so use it judiciously.  Obviously, it's
 * useful to inspect ZFS metadata, but if possible, we should avoid keeping
 * excess / transient data in-core during a crashdump.
 */
void *
zio_buf_alloc(size_t size)
{
	size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;

	VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);

	return (kmem_cache_alloc(zio_buf_cache[c], KM_PUSHPAGE));
}
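
/*
 * Illustrative note (editor's addition): the cache index rounds the request
 * up to the next SPA_MINBLOCKSIZE (512-byte) boundary.  For a 4000-byte
 * request, c = (4000 - 1) >> 9 == 7, which selects the cache serving
 * buffers of (7 + 1) << 9 == 4096 bytes.
 */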

/*
 * Use zio_data_buf_alloc to allocate data.  The data will not appear in a
 * crashdump if the kernel panics.  This exists so that we will limit the
 * amount of ZFS data that shows up in a kernel crashdump (thus reducing the
 * amount of kernel heap dumped to disk when the kernel panics).
 */
void *
zio_data_buf_alloc(size_t size)
{
	size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;

	VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);

	return (kmem_cache_alloc(zio_data_buf_cache[c], KM_PUSHPAGE));
}

void
zio_buf_free(void *buf, size_t size)
{
	size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;

	VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);

	kmem_cache_free(zio_buf_cache[c], buf);
}

void
zio_data_buf_free(void *buf, size_t size)
{
	size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;

	VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);

	kmem_cache_free(zio_data_buf_cache[c], buf);
}

/* ARGSUSED */
static void
zio_abd_free(void *abd, size_t size)
{
	abd_free((abd_t *)abd);
}

/*
 * ==========================================================================
 * Push and pop I/O transform buffers
 * ==========================================================================
 */
void
zio_push_transform(zio_t *zio, abd_t *data, uint64_t size, uint64_t bufsize,
    zio_transform_func_t *transform)
{
	zio_transform_t *zt = kmem_alloc(sizeof (zio_transform_t), KM_SLEEP);

	/*
	 * Ensure that anyone expecting this zio to contain a linear ABD isn't
	 * going to get a nasty surprise when they try to access the data.
	 */
	IMPLY(abd_is_linear(zio->io_abd), abd_is_linear(data));

	zt->zt_orig_abd = zio->io_abd;
	zt->zt_orig_size = zio->io_size;
	zt->zt_bufsize = bufsize;
	zt->zt_transform = transform;

	zt->zt_next = zio->io_transform_stack;
	zio->io_transform_stack = zt;

	zio->io_abd = data;
	zio->io_size = size;
}

void
zio_pop_transforms(zio_t *zio)
{
	zio_transform_t *zt;

	while ((zt = zio->io_transform_stack) != NULL) {
		if (zt->zt_transform != NULL)
			zt->zt_transform(zio,
			    zt->zt_orig_abd, zt->zt_orig_size);

		if (zt->zt_bufsize != 0)
			abd_free(zio->io_abd);

		zio->io_abd = zt->zt_orig_abd;
		zio->io_size = zt->zt_orig_size;
		zio->io_transform_stack = zt->zt_next;

		kmem_free(zt, sizeof (zio_transform_t));
	}
}
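
/*
 * Illustrative note (editor's addition): a read that must be decompressed
 * on completion pushes a transform pairing a scratch buffer with the
 * callback; zio_pop_transforms() later unwinds the stack in LIFO order,
 * invoking each callback and freeing each scratch buffer.  Modeled on the
 * read pipeline's setup:
 *
 *	zio_push_transform(zio, abd_alloc_sametype(zio->io_abd, psize),
 *	    psize, psize, zio_decompress);
 */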

/*
 * ==========================================================================
 * I/O transform callbacks for subblocks, decompression, and decryption
 * ==========================================================================
 */
static void
zio_subblock(zio_t *zio, abd_t *data, uint64_t size)
{
	ASSERT(zio->io_size > size);

	if (zio->io_type == ZIO_TYPE_READ)
		abd_copy(data, zio->io_abd, size);
}

static void
zio_decompress(zio_t *zio, abd_t *data, uint64_t size)
{
	if (zio->io_error == 0) {
		void *tmp = abd_borrow_buf(data, size);
		int ret = zio_decompress_data(BP_GET_COMPRESS(zio->io_bp),
		    zio->io_abd, tmp, zio->io_size, size);
		abd_return_buf_copy(data, tmp, size);

		if (ret != 0)
			zio->io_error = SET_ERROR(EIO);
	}
}

static void
zio_decrypt(zio_t *zio, abd_t *data, uint64_t size)
{
	int ret;
	void *tmp;
	blkptr_t *bp = zio->io_bp;
	spa_t *spa = zio->io_spa;
	uint64_t dsobj = zio->io_bookmark.zb_objset;
	uint64_t lsize = BP_GET_LSIZE(bp);
	dmu_object_type_t ot = BP_GET_TYPE(bp);
	uint8_t salt[ZIO_DATA_SALT_LEN];
	uint8_t iv[ZIO_DATA_IV_LEN];
	uint8_t mac[ZIO_DATA_MAC_LEN];
	boolean_t no_crypt = B_FALSE;

	ASSERT(BP_USES_CRYPT(bp));
	ASSERT3U(size, !=, 0);

	if (zio->io_error != 0)
		return;

	/*
	 * Verify the cksum of MACs stored in an indirect bp.  It will always
	 * be possible to verify this since it does not require an encryption
	 * key.
	 */
	if (BP_HAS_INDIRECT_MAC_CKSUM(bp)) {
		zio_crypt_decode_mac_bp(bp, mac);

		if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF) {
			/*
			 * We haven't decompressed the data yet, but
			 * zio_crypt_do_indirect_mac_checksum() requires
			 * decompressed data to be able to parse out the MACs
			 * from the indirect block. We decompress it now and
			 * throw away the result after we are finished.
			 */
			tmp = zio_buf_alloc(lsize);
			ret = zio_decompress_data(BP_GET_COMPRESS(bp),
			    zio->io_abd, tmp, zio->io_size, lsize);
			if (ret != 0) {
				ret = SET_ERROR(EIO);
				goto error;
			}
			ret = zio_crypt_do_indirect_mac_checksum(B_FALSE,
			    tmp, lsize, BP_SHOULD_BYTESWAP(bp), mac);
			zio_buf_free(tmp, lsize);
		} else {
			ret = zio_crypt_do_indirect_mac_checksum_abd(B_FALSE,
			    zio->io_abd, size, BP_SHOULD_BYTESWAP(bp), mac);
		}
		abd_copy(data, zio->io_abd, size);

		if (ret != 0)
			goto error;

		return;
	}

	/*
	 * If this is an authenticated block, just check the MAC.  It would be
	 * nice to separate this out into its own flag, but for the moment
	 * enum zio_flag is out of bits.
	 */
	if (BP_IS_AUTHENTICATED(bp)) {
		if (ot == DMU_OT_OBJSET) {
			ret = spa_do_crypt_objset_mac_abd(B_FALSE, spa,
			    dsobj, zio->io_abd, size, BP_SHOULD_BYTESWAP(bp));
		} else {
			zio_crypt_decode_mac_bp(bp, mac);
			ret = spa_do_crypt_mac_abd(B_FALSE, spa, dsobj,
			    zio->io_abd, size, mac);
		}
		abd_copy(data, zio->io_abd, size);

		if (zio_injection_enabled && ot != DMU_OT_DNODE && ret == 0) {
			ret = zio_handle_decrypt_injection(spa,
			    &zio->io_bookmark, ot, ECKSUM);
		}
		if (ret != 0)
			goto error;

		return;
	}

	zio_crypt_decode_params_bp(bp, salt, iv);

	if (ot == DMU_OT_INTENT_LOG) {
		tmp = abd_borrow_buf_copy(zio->io_abd, sizeof (zil_chain_t));
		zio_crypt_decode_mac_zil(tmp, mac);
		abd_return_buf(zio->io_abd, tmp, sizeof (zil_chain_t));
	} else {
		zio_crypt_decode_mac_bp(bp, mac);
	}

	ret = spa_do_crypt_abd(B_FALSE, spa, &zio->io_bookmark, BP_GET_TYPE(bp),
	    BP_GET_DEDUP(bp), BP_SHOULD_BYTESWAP(bp), salt, iv, mac, size, data,
	    zio->io_abd, &no_crypt);
	if (no_crypt)
		abd_copy(data, zio->io_abd, size);

	if (ret != 0)
		goto error;

	return;

error:
	/* assert that the key was found unless this was speculative */
	ASSERT(ret != EACCES || (zio->io_flags & ZIO_FLAG_SPECULATIVE));

	/*
	 * If there was a decryption or authentication error, return EIO as
	 * the io_error.  If this was not a speculative zio, create an ereport.
	 */
	if (ret == ECKSUM) {
		zio->io_error = SET_ERROR(EIO);
		if ((zio->io_flags & ZIO_FLAG_SPECULATIVE) == 0) {
			spa_log_error(spa, &zio->io_bookmark);
			zfs_ereport_post(FM_EREPORT_ZFS_AUTHENTICATION,
			    spa, NULL, &zio->io_bookmark, zio, 0, 0);
		}
	} else {
		zio->io_error = ret;
	}
}

/*
 * ==========================================================================
 * I/O parent/child relationships and pipeline interlocks
 * ==========================================================================
 */
zio_t *
zio_walk_parents(zio_t *cio, zio_link_t **zl)
{
	list_t *pl = &cio->io_parent_list;

	*zl = (*zl == NULL) ? list_head(pl) : list_next(pl, *zl);
	if (*zl == NULL)
		return (NULL);

	ASSERT((*zl)->zl_child == cio);
	return ((*zl)->zl_parent);
}

zio_t *
zio_walk_children(zio_t *pio, zio_link_t **zl)
{
	list_t *cl = &pio->io_child_list;

	ASSERT(MUTEX_HELD(&pio->io_lock));

	*zl = (*zl == NULL) ? list_head(cl) : list_next(cl, *zl);
	if (*zl == NULL)
		return (NULL);

	ASSERT((*zl)->zl_parent == pio);
	return ((*zl)->zl_child);
}

zio_t *
zio_unique_parent(zio_t *cio)
{
	zio_link_t *zl = NULL;
	zio_t *pio = zio_walk_parents(cio, &zl);

	VERIFY3P(zio_walk_parents(cio, &zl), ==, NULL);
	return (pio);
}
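
/*
 * Illustrative note (editor's addition): the walkers take a zio_link_t
 * cursor so that callers can iterate without the zio itself tracking
 * position.  Typical traversal of all parents of a child zio:
 *
 *	zio_link_t *zl = NULL;
 *	zio_t *pio;
 *	while ((pio = zio_walk_parents(cio, &zl)) != NULL) {
 *		... examine pio ...
 *	}
 */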

void
zio_add_child(zio_t *pio, zio_t *cio)
{
	zio_link_t *zl = kmem_cache_alloc(zio_link_cache, KM_SLEEP);

	/*
	 * Logical I/Os can have logical, gang, or vdev children.
	 * Gang I/Os can have gang or vdev children.
	 * Vdev I/Os can only have vdev children.
	 * The following ASSERT captures all of these constraints.
	 */
	ASSERT3S(cio->io_child_type, <=, pio->io_child_type);

	zl->zl_parent = pio;
	zl->zl_child = cio;

	mutex_enter(&pio->io_lock);
	mutex_enter(&cio->io_lock);

	ASSERT(pio->io_state[ZIO_WAIT_DONE] == 0);

	for (int w = 0; w < ZIO_WAIT_TYPES; w++)
		pio->io_children[cio->io_child_type][w] += !cio->io_state[w];

	list_insert_head(&pio->io_child_list, zl);
	list_insert_head(&cio->io_parent_list, zl);

	pio->io_child_count++;
	cio->io_parent_count++;

	mutex_exit(&cio->io_lock);
	mutex_exit(&pio->io_lock);
}

static void
zio_remove_child(zio_t *pio, zio_t *cio, zio_link_t *zl)
{
	ASSERT(zl->zl_parent == pio);
	ASSERT(zl->zl_child == cio);

	mutex_enter(&pio->io_lock);
	mutex_enter(&cio->io_lock);

	list_remove(&pio->io_child_list, zl);
	list_remove(&cio->io_parent_list, zl);

	pio->io_child_count--;
	cio->io_parent_count--;

	mutex_exit(&cio->io_lock);
	mutex_exit(&pio->io_lock);

	kmem_cache_free(zio_link_cache, zl);
}

static boolean_t
zio_wait_for_children(zio_t *zio, uint8_t childbits, enum zio_wait_type wait)
{
	boolean_t waiting = B_FALSE;

	mutex_enter(&zio->io_lock);
	ASSERT(zio->io_stall == NULL);
	for (int c = 0; c < ZIO_CHILD_TYPES; c++) {
		if (!(ZIO_CHILD_BIT_IS_SET(childbits, c)))
			continue;

		uint64_t *countp = &zio->io_children[c][wait];
		if (*countp != 0) {
			zio->io_stage >>= 1;
			ASSERT3U(zio->io_stage, !=, ZIO_STAGE_OPEN);
			zio->io_stall = countp;
			waiting = B_TRUE;
			break;
		}
	}
	mutex_exit(&zio->io_lock);
	return (waiting);
}

static void
zio_notify_parent(zio_t *pio, zio_t *zio, enum zio_wait_type wait)
{
	uint64_t *countp = &pio->io_children[zio->io_child_type][wait];
	int *errorp = &pio->io_child_error[zio->io_child_type];

	mutex_enter(&pio->io_lock);
	if (zio->io_error && !(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE))
		*errorp = zio_worst_error(*errorp, zio->io_error);
	pio->io_reexecute |= zio->io_reexecute;
	ASSERT3U(*countp, >, 0);

	(*countp)--;

	if (*countp == 0 && pio->io_stall == countp) {
		zio_taskq_type_t type =
		    pio->io_stage < ZIO_STAGE_VDEV_IO_START ? ZIO_TASKQ_ISSUE :
		    ZIO_TASKQ_INTERRUPT;
		pio->io_stall = NULL;
		mutex_exit(&pio->io_lock);
		/*
		 * Dispatch the parent zio in its own taskq so that
		 * the child can continue to make progress. This also
		 * prevents overflowing the stack when we have deeply nested
		 * parent-child relationships.
		 */
		zio_taskq_dispatch(pio, type, B_FALSE);
	} else {
		mutex_exit(&pio->io_lock);
	}
}

static void
zio_inherit_child_errors(zio_t *zio, enum zio_child c)
{
	if (zio->io_child_error[c] != 0 && zio->io_error == 0)
		zio->io_error = zio->io_child_error[c];
}

int
zio_bookmark_compare(const void *x1, const void *x2)
{
	const zio_t *z1 = x1;
	const zio_t *z2 = x2;

	if (z1->io_bookmark.zb_objset < z2->io_bookmark.zb_objset)
		return (-1);
	if (z1->io_bookmark.zb_objset > z2->io_bookmark.zb_objset)
		return (1);

	if (z1->io_bookmark.zb_object < z2->io_bookmark.zb_object)
		return (-1);
	if (z1->io_bookmark.zb_object > z2->io_bookmark.zb_object)
		return (1);

	if (z1->io_bookmark.zb_level < z2->io_bookmark.zb_level)
		return (-1);
	if (z1->io_bookmark.zb_level > z2->io_bookmark.zb_level)
		return (1);

	if (z1->io_bookmark.zb_blkid < z2->io_bookmark.zb_blkid)
		return (-1);
	if (z1->io_bookmark.zb_blkid > z2->io_bookmark.zb_blkid)
		return (1);

	if (z1 < z2)
		return (-1);
	if (z1 > z2)
		return (1);

	return (0);
}
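
/*
 * Illustrative note (editor's addition): the comparator orders zios by
 * (objset, object, level, blkid) and falls back to the zio pointers as a
 * final tie-breaker, so it yields a total order with no two distinct zios
 * comparing equal.  That makes it usable as an AVL comparison function,
 * e.g. for the allocation throttle's queues:
 *
 *	avl_create(&queue, zio_bookmark_compare,
 *	    sizeof (zio_t), offsetof(zio_t, io_alloc_node));
 */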

/*
 * ==========================================================================
 * Create the various types of I/O (read, write, free, etc)
 * ==========================================================================
 */
static zio_t *
zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
    abd_t *data, uint64_t lsize, uint64_t psize, zio_done_func_t *done,
    void *private, zio_type_t type, zio_priority_t priority,
    enum zio_flag flags, vdev_t *vd, uint64_t offset,
    const zbookmark_phys_t *zb, enum zio_stage stage, enum zio_stage pipeline)
{
	zio_t *zio;

	IMPLY(type != ZIO_TYPE_TRIM, psize <= SPA_MAXBLOCKSIZE);
	ASSERT(P2PHASE(psize, SPA_MINBLOCKSIZE) == 0);
	ASSERT(P2PHASE(offset, SPA_MINBLOCKSIZE) == 0);

	ASSERT(!vd || spa_config_held(spa, SCL_STATE_ALL, RW_READER));
	ASSERT(!bp || !(flags & ZIO_FLAG_CONFIG_WRITER));
	ASSERT(vd || stage == ZIO_STAGE_OPEN);

	IMPLY(lsize != psize, (flags & ZIO_FLAG_RAW_COMPRESS) != 0);

	zio = kmem_cache_alloc(zio_cache, KM_SLEEP);
	bzero(zio, sizeof (zio_t));

	mutex_init(&zio->io_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&zio->io_cv, NULL, CV_DEFAULT, NULL);

	list_create(&zio->io_parent_list, sizeof (zio_link_t),
	    offsetof(zio_link_t, zl_parent_node));
	list_create(&zio->io_child_list, sizeof (zio_link_t),
	    offsetof(zio_link_t, zl_child_node));
	metaslab_trace_init(&zio->io_alloc_list);

	if (vd != NULL)
		zio->io_child_type = ZIO_CHILD_VDEV;
	else if (flags & ZIO_FLAG_GANG_CHILD)
		zio->io_child_type = ZIO_CHILD_GANG;
	else if (flags & ZIO_FLAG_DDT_CHILD)
		zio->io_child_type = ZIO_CHILD_DDT;
	else
		zio->io_child_type = ZIO_CHILD_LOGICAL;

	if (bp != NULL) {
		zio->io_bp = (blkptr_t *)bp;
		zio->io_bp_copy = *bp;
		zio->io_bp_orig = *bp;
		if (type != ZIO_TYPE_WRITE ||
		    zio->io_child_type == ZIO_CHILD_DDT)
			zio->io_bp = &zio->io_bp_copy;	/* so caller can free */
		if (zio->io_child_type == ZIO_CHILD_LOGICAL)
			zio->io_logical = zio;
		if (zio->io_child_type > ZIO_CHILD_GANG && BP_IS_GANG(bp))
			pipeline |= ZIO_GANG_STAGES;
	}

	zio->io_spa = spa;
	zio->io_txg = txg;
	zio->io_done = done;
	zio->io_private = private;
	zio->io_type = type;
	zio->io_priority = priority;
	zio->io_vd = vd;
	zio->io_offset = offset;
	zio->io_orig_abd = zio->io_abd = data;
	zio->io_orig_size = zio->io_size = psize;
	zio->io_lsize = lsize;
	zio->io_orig_flags = zio->io_flags = flags;
	zio->io_orig_stage = zio->io_stage = stage;
	zio->io_orig_pipeline = zio->io_pipeline = pipeline;
	zio->io_pipeline_trace = ZIO_STAGE_OPEN;

	zio->io_state[ZIO_WAIT_READY] = (stage >= ZIO_STAGE_READY);
	zio->io_state[ZIO_WAIT_DONE] = (stage >= ZIO_STAGE_DONE);

	if (zb != NULL)
		zio->io_bookmark = *zb;

	if (pio != NULL) {
		if (zio->io_metaslab_class == NULL)
			zio->io_metaslab_class = pio->io_metaslab_class;
		if (zio->io_logical == NULL)
			zio->io_logical = pio->io_logical;
		if (zio->io_child_type == ZIO_CHILD_GANG)
			zio->io_gang_leader = pio->io_gang_leader;
		zio_add_child(pio, zio);
	}

	return (zio);
}

static void
zio_destroy(zio_t *zio)
{
	metaslab_trace_fini(&zio->io_alloc_list);
	list_destroy(&zio->io_parent_list);
	list_destroy(&zio->io_child_list);
	mutex_destroy(&zio->io_lock);
	cv_destroy(&zio->io_cv);
	kmem_cache_free(zio_cache, zio);
}

zio_t *
zio_null(zio_t *pio, spa_t *spa, vdev_t *vd, zio_done_func_t *done,
    void *private, enum zio_flag flags)
{
	zio_t *zio;

	zio = zio_create(pio, spa, 0, NULL, NULL, 0, 0, done, private,
	    ZIO_TYPE_NULL, ZIO_PRIORITY_NOW, flags, vd, 0, NULL,
	    ZIO_STAGE_OPEN, ZIO_INTERLOCK_PIPELINE);

	return (zio);
}

zio_t *
zio_root(spa_t *spa, zio_done_func_t *done, void *private, enum zio_flag flags)
{
	return (zio_null(NULL, spa, NULL, done, private, flags));
}
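
/*
 * Illustrative note (editor's addition): zio_root() is the usual way to
 * create a parent that gathers a batch of child i/os; nbps, bps, abds, and
 * zb below are hypothetical:
 *
 *	zio_t *rio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
 *	for (int i = 0; i < nbps; i++)
 *		zio_nowait(zio_read(rio, spa, &bps[i], abds[i],
 *		    BP_GET_PSIZE(&bps[i]), NULL, NULL,
 *		    ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL, &zb));
 *	int error = zio_wait(rio);
 */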

void
zfs_blkptr_verify(spa_t *spa, const blkptr_t *bp)
{
	if (!DMU_OT_IS_VALID(BP_GET_TYPE(bp))) {
		zfs_panic_recover("blkptr at %p has invalid TYPE %llu",
		    bp, (longlong_t)BP_GET_TYPE(bp));
	}
	if (BP_GET_CHECKSUM(bp) >= ZIO_CHECKSUM_FUNCTIONS ||
	    BP_GET_CHECKSUM(bp) <= ZIO_CHECKSUM_ON) {
		zfs_panic_recover("blkptr at %p has invalid CHECKSUM %llu",
		    bp, (longlong_t)BP_GET_CHECKSUM(bp));
	}
	if (BP_GET_COMPRESS(bp) >= ZIO_COMPRESS_FUNCTIONS ||
	    BP_GET_COMPRESS(bp) <= ZIO_COMPRESS_ON) {
		zfs_panic_recover("blkptr at %p has invalid COMPRESS %llu",
		    bp, (longlong_t)BP_GET_COMPRESS(bp));
	}
	if (BP_GET_LSIZE(bp) > SPA_MAXBLOCKSIZE) {
		zfs_panic_recover("blkptr at %p has invalid LSIZE %llu",
		    bp, (longlong_t)BP_GET_LSIZE(bp));
	}
	if (BP_GET_PSIZE(bp) > SPA_MAXBLOCKSIZE) {
		zfs_panic_recover("blkptr at %p has invalid PSIZE %llu",
		    bp, (longlong_t)BP_GET_PSIZE(bp));
	}

	if (BP_IS_EMBEDDED(bp)) {
		if (BPE_GET_ETYPE(bp) > NUM_BP_EMBEDDED_TYPES) {
			zfs_panic_recover("blkptr at %p has invalid ETYPE %llu",
			    bp, (longlong_t)BPE_GET_ETYPE(bp));
		}
	}

	/*
	 * Do not verify individual DVAs if the config is not trusted. This
	 * will be done once the zio is executed in vdev_mirror_map_alloc.
	 */
	if (!spa->spa_trust_config)
		return;

	/*
	 * Pool-specific checks.
	 *
	 * Note: it would be nice to verify that the blk_birth and
	 * BP_PHYSICAL_BIRTH() are not too large.  However, spa_freeze()
	 * allows the birth time of log blocks (and dmu_sync()-ed blocks
	 * that are in the log) to be arbitrarily large.
	 */
	for (int i = 0; i < BP_GET_NDVAS(bp); i++) {
		uint64_t vdevid = DVA_GET_VDEV(&bp->blk_dva[i]);
		if (vdevid >= spa->spa_root_vdev->vdev_children) {
			zfs_panic_recover("blkptr at %p DVA %u has invalid "
			    "VDEV %llu",
			    bp, i, (longlong_t)vdevid);
			continue;
		}
		vdev_t *vd = spa->spa_root_vdev->vdev_child[vdevid];
		if (vd == NULL) {
			zfs_panic_recover("blkptr at %p DVA %u has invalid "
			    "VDEV %llu",
			    bp, i, (longlong_t)vdevid);
			continue;
		}
		if (vd->vdev_ops == &vdev_hole_ops) {
			zfs_panic_recover("blkptr at %p DVA %u has hole "
			    "VDEV %llu",
			    bp, i, (longlong_t)vdevid);
			continue;
		}
		if (vd->vdev_ops == &vdev_missing_ops) {
			/*
			 * "missing" vdevs are valid during import, but we
			 * don't have their detailed info (e.g. asize), so
			 * we can't perform any more checks on them.
			 */
			continue;
		}
		uint64_t offset = DVA_GET_OFFSET(&bp->blk_dva[i]);
		uint64_t asize = DVA_GET_ASIZE(&bp->blk_dva[i]);
		if (BP_IS_GANG(bp))
			asize = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
		if (offset + asize > vd->vdev_asize) {
			zfs_panic_recover("blkptr at %p DVA %u has invalid "
			    "OFFSET %llu",
			    bp, i, (longlong_t)offset);
		}
	}
}

boolean_t
zfs_dva_valid(spa_t *spa, const dva_t *dva, const blkptr_t *bp)
{
	uint64_t vdevid = DVA_GET_VDEV(dva);

	if (vdevid >= spa->spa_root_vdev->vdev_children)
		return (B_FALSE);

	vdev_t *vd = spa->spa_root_vdev->vdev_child[vdevid];
	if (vd == NULL)
		return (B_FALSE);

	if (vd->vdev_ops == &vdev_hole_ops)
		return (B_FALSE);

	if (vd->vdev_ops == &vdev_missing_ops) {
		return (B_FALSE);
	}

	uint64_t offset = DVA_GET_OFFSET(dva);
	uint64_t asize = DVA_GET_ASIZE(dva);

	if (BP_IS_GANG(bp))
		asize = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
	if (offset + asize > vd->vdev_asize)
		return (B_FALSE);

	return (B_TRUE);
}

zio_t *
zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp,
    abd_t *data, uint64_t size, zio_done_func_t *done, void *private,
    zio_priority_t priority, enum zio_flag flags, const zbookmark_phys_t *zb)
{
	zio_t *zio;

	zfs_blkptr_verify(spa, bp);

	zio = zio_create(pio, spa, BP_PHYSICAL_BIRTH(bp), bp,
	    data, size, size, done, private,
	    ZIO_TYPE_READ, priority, flags, NULL, 0, zb,
	    ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ?
	    ZIO_DDT_CHILD_READ_PIPELINE : ZIO_READ_PIPELINE);

	return (zio);
}

zio_t *
zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
    abd_t *data, uint64_t lsize, uint64_t psize, const zio_prop_t *zp,
    zio_done_func_t *ready, zio_done_func_t *children_ready,
    zio_done_func_t *physdone, zio_done_func_t *done,
    void *private, zio_priority_t priority, enum zio_flag flags,
    const zbookmark_phys_t *zb)
{
	zio_t *zio;

	ASSERT(zp->zp_checksum >= ZIO_CHECKSUM_OFF &&
	    zp->zp_checksum < ZIO_CHECKSUM_FUNCTIONS &&
	    zp->zp_compress >= ZIO_COMPRESS_OFF &&
	    zp->zp_compress < ZIO_COMPRESS_FUNCTIONS &&
	    DMU_OT_IS_VALID(zp->zp_type) &&
	    zp->zp_level < 32 &&
	    zp->zp_copies > 0 &&
	    zp->zp_copies <= spa_max_replication(spa));

	zio = zio_create(pio, spa, txg, bp, data, lsize, psize, done, private,
	    ZIO_TYPE_WRITE, priority, flags, NULL, 0, zb,
	    ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ?
	    ZIO_DDT_CHILD_WRITE_PIPELINE : ZIO_WRITE_PIPELINE);

	zio->io_ready = ready;
	zio->io_children_ready = children_ready;
	zio->io_physdone = physdone;
	zio->io_prop = *zp;

	/*
	 * Data can be NULL if we are going to call zio_write_override() to
	 * provide the already-allocated BP.  But we may need the data to
	 * verify a dedup hit (if requested).  In this case, don't try to
	 * dedup (just take the already-allocated BP verbatim).  Encrypted
	 * dedup blocks need data as well so we also disable dedup in this
	 * case.
	 */
	if (data == NULL &&
	    (zio->io_prop.zp_dedup_verify || zio->io_prop.zp_encrypt)) {
		zio->io_prop.zp_dedup = zio->io_prop.zp_dedup_verify = B_FALSE;
	}

	return (zio);
}

zio_t *
zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, abd_t *data,
    uint64_t size, zio_done_func_t *done, void *private,
    zio_priority_t priority, enum zio_flag flags, zbookmark_phys_t *zb)
{
	zio_t *zio;

	zio = zio_create(pio, spa, txg, bp, data, size, size, done, private,
	    ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_IO_REWRITE, NULL, 0, zb,
	    ZIO_STAGE_OPEN, ZIO_REWRITE_PIPELINE);

	return (zio);
}

void
zio_write_override(zio_t *zio, blkptr_t *bp, int copies, boolean_t nopwrite)
{
	ASSERT(zio->io_type == ZIO_TYPE_WRITE);
	ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
	ASSERT(zio->io_stage == ZIO_STAGE_OPEN);
	ASSERT(zio->io_txg == spa_syncing_txg(zio->io_spa));

	/*
	 * We must reset the io_prop to match the values that existed
	 * when the bp was first written by dmu_sync() keeping in mind
	 * that nopwrite and dedup are mutually exclusive.
	 */
	zio->io_prop.zp_dedup = nopwrite ? B_FALSE : zio->io_prop.zp_dedup;
	zio->io_prop.zp_nopwrite = nopwrite;
	zio->io_prop.zp_copies = copies;
	zio->io_bp_override = bp;
}
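
/*
 * Illustrative note (editor's addition): the consumer of this interface is
 * the dmu_sync() path; modeled on dbuf_write(), which patches the block
 * pointer already written by dmu_sync() into the pending sync write:
 *
 *	zio_write_override(dr->dr_zio, &dr->dt.dl.dr_overridden_by,
 *	    dr->dt.dl.dr_copies, dr->dt.dl.dr_nopwrite);
 */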

void
zio_free(spa_t *spa, uint64_t txg, const blkptr_t *bp)
{
	zfs_blkptr_verify(spa, bp);

	/*
	 * The check for EMBEDDED is a performance optimization.  We
	 * process the free here (by ignoring it) rather than
	 * putting it on the list and then processing it in zio_free_sync().
	 */
	if (BP_IS_EMBEDDED(bp))
		return;
	metaslab_check_free(spa, bp);

	/*
	 * Frees that are for the currently-syncing txg, are not going to be
	 * deferred, and which will not need to do a read (i.e. not GANG or
	 * DEDUP), can be processed immediately.  Otherwise, put them on the
	 * in-memory list for later processing.
	 *
	 * Note that we only defer frees after zfs_sync_pass_deferred_free
	 * when the log space map feature is disabled. [see relevant comment
	 * in spa_sync_iterate_to_convergence()]
	 */
	if (BP_IS_GANG(bp) ||
	    BP_GET_DEDUP(bp) ||
	    txg != spa->spa_syncing_txg ||
	    (spa_sync_pass(spa) >= zfs_sync_pass_deferred_free &&
	    !spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP))) {
		bplist_append(&spa->spa_free_bplist[txg & TXG_MASK], bp);
	} else {
		VERIFY0(zio_wait(zio_free_sync(NULL, spa, txg, bp, 0)));
	}
}

zio_t *
zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
    enum zio_flag flags)
{
	zio_t *zio;
	enum zio_stage stage = ZIO_FREE_PIPELINE;

	ASSERT(!BP_IS_HOLE(bp));
	ASSERT(spa_syncing_txg(spa) == txg);

	if (BP_IS_EMBEDDED(bp))
		return (zio_null(pio, spa, NULL, NULL, NULL, 0));

	metaslab_check_free(spa, bp);
	arc_freed(spa, bp);
	dsl_scan_freed(spa, bp);

	/*
	 * GANG and DEDUP blocks can induce a read (for the gang block header,
	 * or the DDT), so issue them asynchronously so that this thread is
	 * not tied up.
	 */
	if (BP_IS_GANG(bp) || BP_GET_DEDUP(bp))
		stage |= ZIO_STAGE_ISSUE_ASYNC;

	zio = zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp),
	    BP_GET_PSIZE(bp), NULL, NULL, ZIO_TYPE_FREE, ZIO_PRIORITY_NOW,
	    flags, NULL, 0, NULL, ZIO_STAGE_OPEN, stage);

	return (zio);
}

zio_t *
zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
    zio_done_func_t *done, void *private, enum zio_flag flags)
{
	zio_t *zio;

	zfs_blkptr_verify(spa, bp);

	if (BP_IS_EMBEDDED(bp))
		return (zio_null(pio, spa, NULL, NULL, NULL, 0));

	/*
	 * A claim is an allocation of a specific block.  Claims are needed
	 * to support immediate writes in the intent log.  The issue is that
	 * immediate writes contain committed data, but in a txg that was
	 * *not* committed.  Upon opening the pool after an unclean shutdown,
	 * the intent log claims all blocks that contain immediate write data
	 * so that the SPA knows they're in use.
	 *
	 * All claims *must* be resolved in the first txg -- before the SPA
	 * starts allocating blocks -- so that nothing is allocated twice.
	 * If txg == 0 we just verify that the block is claimable.
	 */
	ASSERT3U(spa->spa_uberblock.ub_rootbp.blk_birth, <,
	    spa_min_claim_txg(spa));
	ASSERT(txg == spa_min_claim_txg(spa) || txg == 0);
	ASSERT(!BP_GET_DEDUP(bp) || !spa_writeable(spa));	/* zdb(1M) */

	zio = zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp),
	    BP_GET_PSIZE(bp), done, private, ZIO_TYPE_CLAIM, ZIO_PRIORITY_NOW,
	    flags, NULL, 0, NULL, ZIO_STAGE_OPEN, ZIO_CLAIM_PIPELINE);
	ASSERT0(zio->io_queued_timestamp);

	return (zio);
}

zio_t *
zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd,
    zio_done_func_t *done, void *private, enum zio_flag flags)
{
	zio_t *zio;
	int c;

	if (vd->vdev_children == 0) {
		zio = zio_create(pio, spa, 0, NULL, NULL, 0, 0, done, private,
		    ZIO_TYPE_IOCTL, ZIO_PRIORITY_NOW, flags, vd, 0, NULL,
		    ZIO_STAGE_OPEN, ZIO_IOCTL_PIPELINE);

		zio->io_cmd = cmd;
	} else {
		zio = zio_null(pio, spa, NULL, NULL, NULL, flags);

		for (c = 0; c < vd->vdev_children; c++)
			zio_nowait(zio_ioctl(zio, spa, vd->vdev_child[c], cmd,
			    done, private, flags));
	}

	return (zio);
}

zio_t *
zio_trim(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
    zio_done_func_t *done, void *private, zio_priority_t priority,
    enum zio_flag flags, enum trim_flag trim_flags)
{
	zio_t *zio;

	ASSERT0(vd->vdev_children);
	ASSERT0(P2PHASE(offset, 1ULL << vd->vdev_ashift));
	ASSERT0(P2PHASE(size, 1ULL << vd->vdev_ashift));
	ASSERT3U(size, !=, 0);

	zio = zio_create(pio, vd->vdev_spa, 0, NULL, NULL, size, size, done,
	    private, ZIO_TYPE_TRIM, priority, flags | ZIO_FLAG_PHYSICAL,
	    vd, offset, NULL, ZIO_STAGE_OPEN, ZIO_TRIM_PIPELINE);
	zio->io_trim_flags = trim_flags;

	return (zio);
}
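
/*
 * Illustrative note (editor's addition, hypothetical call): queueing a
 * trim of one ashift-aligned region on a leaf vdev might look like this,
 * with offset and size supplied by the caller and no trim-specific flags:
 *
 *	zio_nowait(zio_trim(pio, vd, offset, size, NULL, NULL,
 *	    ZIO_PRIORITY_TRIM, ZIO_FLAG_CANFAIL, 0));
 */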
zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, 1173770499e1SDan Kimmel abd_t *data, int checksum, zio_done_func_t *done, void *private, 117469962b56SMatthew Ahrens zio_priority_t priority, enum zio_flag flags, boolean_t labels) 1175fa9e4066Sahrens { 1176fa9e4066Sahrens zio_t *zio; 11770a4e9518Sgw 1178e14bb325SJeff Bonwick ASSERT(vd->vdev_children == 0); 1179e14bb325SJeff Bonwick ASSERT(!labels || offset + size <= VDEV_LABEL_START_SIZE || 1180e14bb325SJeff Bonwick offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE); 1181e14bb325SJeff Bonwick ASSERT3U(offset + size, <=, vd->vdev_psize); 1182fa9e4066Sahrens 11835602294fSDan Kimmel zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, size, done, 11845602294fSDan Kimmel private, ZIO_TYPE_READ, priority, flags | ZIO_FLAG_PHYSICAL, vd, 11855602294fSDan Kimmel offset, NULL, ZIO_STAGE_OPEN, ZIO_READ_PHYS_PIPELINE); 1186fa9e4066Sahrens 1187e14bb325SJeff Bonwick zio->io_prop.zp_checksum = checksum; 1188fa9e4066Sahrens 1189fa9e4066Sahrens return (zio); 1190fa9e4066Sahrens } 1191fa9e4066Sahrens 1192fa9e4066Sahrens zio_t * 1193fa9e4066Sahrens zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, 1194770499e1SDan Kimmel abd_t *data, int checksum, zio_done_func_t *done, void *private, 119569962b56SMatthew Ahrens zio_priority_t priority, enum zio_flag flags, boolean_t labels) 1196fa9e4066Sahrens { 1197fa9e4066Sahrens zio_t *zio; 11980a4e9518Sgw 1199e14bb325SJeff Bonwick ASSERT(vd->vdev_children == 0); 1200e14bb325SJeff Bonwick ASSERT(!labels || offset + size <= VDEV_LABEL_START_SIZE || 1201e14bb325SJeff Bonwick offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE); 1202e14bb325SJeff Bonwick ASSERT3U(offset + size, <=, vd->vdev_psize); 1203fa9e4066Sahrens 12045602294fSDan Kimmel zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, size, done, 12055602294fSDan Kimmel private, ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_PHYSICAL, vd, 12065602294fSDan Kimmel offset, NULL, ZIO_STAGE_OPEN, ZIO_WRITE_PHYS_PIPELINE); 1207fa9e4066Sahrens 1208e14bb325SJeff Bonwick zio->io_prop.zp_checksum = checksum; 1209fa9e4066Sahrens 121045818ee1SMatthew Ahrens if (zio_checksum_table[checksum].ci_flags & ZCHECKSUM_FLAG_EMBEDDED) { 1211fa9e4066Sahrens /* 12126e1f5caaSNeil Perrin * zec checksums are necessarily destructive -- they modify 1213e14bb325SJeff Bonwick * the end of the write buffer to hold the verifier/checksum. 1214fa9e4066Sahrens * Therefore, we must make a local copy in case the data is 1215e14bb325SJeff Bonwick * being written to multiple places in parallel. 1216fa9e4066Sahrens */ 1217770499e1SDan Kimmel abd_t *wbuf = abd_alloc_sametype(data, size); 1218770499e1SDan Kimmel abd_copy(wbuf, data, size); 1219770499e1SDan Kimmel 1220e14bb325SJeff Bonwick zio_push_transform(zio, wbuf, size, size, NULL); 1221fa9e4066Sahrens } 1222fa9e4066Sahrens 1223fa9e4066Sahrens return (zio); 1224fa9e4066Sahrens } 1225fa9e4066Sahrens 1226fa9e4066Sahrens /* 1227e14bb325SJeff Bonwick * Create a child I/O to do some work for us. 
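 * The typical callers are vdev implementations (e.g. mirror and
 * raidz) that fan a parent I/O out to one or more of their children.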
1228fa9e4066Sahrens */ 1229fa9e4066Sahrens zio_t * 1230e14bb325SJeff Bonwick zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset, 1231770499e1SDan Kimmel abd_t *data, uint64_t size, int type, zio_priority_t priority, 1232dcbf3bd6SGeorge Wilson enum zio_flag flags, zio_done_func_t *done, void *private) 1233fa9e4066Sahrens { 1234b24ab676SJeff Bonwick enum zio_stage pipeline = ZIO_VDEV_CHILD_PIPELINE; 1235e14bb325SJeff Bonwick zio_t *zio; 1236e14bb325SJeff Bonwick 12375cabbc6bSPrashanth Sreenivasa /* 12385cabbc6bSPrashanth Sreenivasa * vdev child I/Os do not propagate their error to the parent. 12395cabbc6bSPrashanth Sreenivasa * Therefore, for correct operation the caller *must* check for 12405cabbc6bSPrashanth Sreenivasa * and handle the error in the child i/o's done callback. 12415cabbc6bSPrashanth Sreenivasa * The only exceptions are i/os that we don't care about 12425cabbc6bSPrashanth Sreenivasa * (OPTIONAL or REPAIR). 12435cabbc6bSPrashanth Sreenivasa */ 12445cabbc6bSPrashanth Sreenivasa ASSERT((flags & ZIO_FLAG_OPTIONAL) || (flags & ZIO_FLAG_IO_REPAIR) || 12455cabbc6bSPrashanth Sreenivasa done != NULL); 12465cabbc6bSPrashanth Sreenivasa 1247fa9e4066Sahrens if (type == ZIO_TYPE_READ && bp != NULL) { 1248fa9e4066Sahrens /* 1249fa9e4066Sahrens * If we have the bp, then the child should perform the 1250fa9e4066Sahrens * checksum and the parent need not. This pushes error 1251fa9e4066Sahrens * detection as close to the leaves as possible and 1252fa9e4066Sahrens * eliminates redundant checksums in the interior nodes. 1253fa9e4066Sahrens */ 1254b24ab676SJeff Bonwick pipeline |= ZIO_STAGE_CHECKSUM_VERIFY; 1255b24ab676SJeff Bonwick pio->io_pipeline &= ~ZIO_STAGE_CHECKSUM_VERIFY; 1256fa9e4066Sahrens } 1257fa9e4066Sahrens 12585cabbc6bSPrashanth Sreenivasa if (vd->vdev_ops->vdev_op_leaf) { 12595cabbc6bSPrashanth Sreenivasa ASSERT0(vd->vdev_children); 1260e14bb325SJeff Bonwick offset += VDEV_LABEL_START_SIZE; 12615cabbc6bSPrashanth Sreenivasa } 1262e14bb325SJeff Bonwick 12635cabbc6bSPrashanth Sreenivasa flags |= ZIO_VDEV_CHILD_FLAGS(pio); 1264b24ab676SJeff Bonwick 1265b24ab676SJeff Bonwick /* 1266b24ab676SJeff Bonwick * If we've decided to do a repair, the write is not speculative -- 1267b24ab676SJeff Bonwick * even if the original read was. 1268b24ab676SJeff Bonwick */ 1269b24ab676SJeff Bonwick if (flags & ZIO_FLAG_IO_REPAIR) 1270b24ab676SJeff Bonwick flags &= ~ZIO_FLAG_SPECULATIVE; 1271b24ab676SJeff Bonwick 12720f7643c7SGeorge Wilson /* 12730f7643c7SGeorge Wilson * If we're creating a child I/O that is not associated with a 12740f7643c7SGeorge Wilson * top-level vdev, then the child zio is not an allocating I/O. 12750f7643c7SGeorge Wilson * If this is a retried I/O then we ignore it since we will 12760f7643c7SGeorge Wilson * have already processed the original allocating I/O. 
12770f7643c7SGeorge Wilson */ 12780f7643c7SGeorge Wilson if (flags & ZIO_FLAG_IO_ALLOCATING && 12790f7643c7SGeorge Wilson (vd != vd->vdev_top || (flags & ZIO_FLAG_IO_RETRY))) { 1280663207adSDon Brady ASSERT(pio->io_metaslab_class != NULL); 1281663207adSDon Brady ASSERT(pio->io_metaslab_class->mc_alloc_throttle_enabled); 12820f7643c7SGeorge Wilson ASSERT(type == ZIO_TYPE_WRITE); 12830f7643c7SGeorge Wilson ASSERT(priority == ZIO_PRIORITY_ASYNC_WRITE); 12840f7643c7SGeorge Wilson ASSERT(!(flags & ZIO_FLAG_IO_REPAIR)); 12850f7643c7SGeorge Wilson ASSERT(!(pio->io_flags & ZIO_FLAG_IO_REWRITE) || 12860f7643c7SGeorge Wilson pio->io_child_type == ZIO_CHILD_GANG); 12870f7643c7SGeorge Wilson 12880f7643c7SGeorge Wilson flags &= ~ZIO_FLAG_IO_ALLOCATING; 12890f7643c7SGeorge Wilson } 12900f7643c7SGeorge Wilson 12915602294fSDan Kimmel zio = zio_create(pio, pio->io_spa, pio->io_txg, bp, data, size, size, 1292b24ab676SJeff Bonwick done, private, type, priority, flags, vd, offset, &pio->io_bookmark, 1293b24ab676SJeff Bonwick ZIO_STAGE_VDEV_IO_START >> 1, pipeline); 12940f7643c7SGeorge Wilson ASSERT3U(zio->io_child_type, ==, ZIO_CHILD_VDEV); 1295fa9e4066Sahrens 129669962b56SMatthew Ahrens zio->io_physdone = pio->io_physdone; 129769962b56SMatthew Ahrens if (vd->vdev_ops->vdev_op_leaf && zio->io_logical != NULL) 129869962b56SMatthew Ahrens zio->io_logical->io_phys_children++; 129969962b56SMatthew Ahrens 1300e14bb325SJeff Bonwick return (zio); 130132b87932Sek } 130232b87932Sek 1303e14bb325SJeff Bonwick zio_t * 1304770499e1SDan Kimmel zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, abd_t *data, uint64_t size, 13053a4b1be9SMatthew Ahrens zio_type_t type, zio_priority_t priority, enum zio_flag flags, 13069a686fbcSPaul Dagnelie zio_done_func_t *done, void *private) 1307fa9e4066Sahrens { 1308e14bb325SJeff Bonwick zio_t *zio; 1309fa9e4066Sahrens 1310e14bb325SJeff Bonwick ASSERT(vd->vdev_ops->vdev_op_leaf); 1311fa9e4066Sahrens 1312e14bb325SJeff Bonwick zio = zio_create(NULL, vd->vdev_spa, 0, NULL, 13135602294fSDan Kimmel data, size, size, done, private, type, priority, 131469962b56SMatthew Ahrens flags | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY | ZIO_FLAG_DELEGATED, 1315e14bb325SJeff Bonwick vd, offset, NULL, 1316b24ab676SJeff Bonwick ZIO_STAGE_VDEV_IO_START >> 1, ZIO_VDEV_CHILD_PIPELINE); 1317fa9e4066Sahrens 1318e14bb325SJeff Bonwick return (zio); 1319e05725b1Sbonwick } 1320e05725b1Sbonwick 1321e05725b1Sbonwick void 1322e14bb325SJeff Bonwick zio_flush(zio_t *zio, vdev_t *vd) 1323e05725b1Sbonwick { 1324e14bb325SJeff Bonwick zio_nowait(zio_ioctl(zio, zio->io_spa, vd, DKIOCFLUSHWRITECACHE, 132569962b56SMatthew Ahrens NULL, NULL, 1326e14bb325SJeff Bonwick ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY)); 1327fa9e4066Sahrens } 1328fa9e4066Sahrens 13296e1f5caaSNeil Perrin void 13306e1f5caaSNeil Perrin zio_shrink(zio_t *zio, uint64_t size) 13316e1f5caaSNeil Perrin { 13321271e4b1SPrakash Surya ASSERT3P(zio->io_executor, ==, NULL); 13331271e4b1SPrakash Surya ASSERT3P(zio->io_orig_size, ==, zio->io_size); 13341271e4b1SPrakash Surya ASSERT3U(size, <=, zio->io_size); 13356e1f5caaSNeil Perrin 13366e1f5caaSNeil Perrin /* 13376e1f5caaSNeil Perrin * We don't shrink for raidz because of problems with the 13386e1f5caaSNeil Perrin * reconstruction when reading back less than the block size. 13396e1f5caaSNeil Perrin * Note, BP_IS_RAIDZ() assumes no compression. 
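 * (The intent log is the expected caller here: it can over-allocate
 * a log block and then shrink the write down to the size actually
 * used.)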
13406e1f5caaSNeil Perrin */ 13416e1f5caaSNeil Perrin ASSERT(BP_GET_COMPRESS(zio->io_bp) == ZIO_COMPRESS_OFF); 13425602294fSDan Kimmel if (!BP_IS_RAIDZ(zio->io_bp)) { 13435602294fSDan Kimmel /* we are not doing a raw write */ 13445602294fSDan Kimmel ASSERT3U(zio->io_size, ==, zio->io_lsize); 13455602294fSDan Kimmel zio->io_orig_size = zio->io_size = zio->io_lsize = size; 13465602294fSDan Kimmel } 13476e1f5caaSNeil Perrin } 13486e1f5caaSNeil Perrin 1349fa9e4066Sahrens /* 1350fa9e4066Sahrens * ========================================================================== 1351e14bb325SJeff Bonwick * Prepare to read and write logical blocks 1352fa9e4066Sahrens * ========================================================================== 1353fa9e4066Sahrens */ 1354e14bb325SJeff Bonwick 1355e05725b1Sbonwick static int 1356e14bb325SJeff Bonwick zio_read_bp_init(zio_t *zio) 1357fa9e4066Sahrens { 1358e14bb325SJeff Bonwick blkptr_t *bp = zio->io_bp; 1359eb633035STom Caputi uint64_t psize = 1360eb633035STom Caputi BP_IS_EMBEDDED(bp) ? BPE_GET_PSIZE(bp) : BP_GET_PSIZE(bp); 1361e05725b1Sbonwick 13625cabbc6bSPrashanth Sreenivasa ASSERT3P(zio->io_bp, ==, &zio->io_bp_copy); 13635cabbc6bSPrashanth Sreenivasa 136403361682SJeff Bonwick if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF && 1365f5383399SBill Moore zio->io_child_type == ZIO_CHILD_LOGICAL && 1366eb633035STom Caputi !(zio->io_flags & ZIO_FLAG_RAW_COMPRESS)) { 1367770499e1SDan Kimmel zio_push_transform(zio, abd_alloc_sametype(zio->io_abd, psize), 1368770499e1SDan Kimmel psize, psize, zio_decompress); 1369e14bb325SJeff Bonwick } 1370fa9e4066Sahrens 1371eb633035STom Caputi if (((BP_IS_PROTECTED(bp) && !(zio->io_flags & ZIO_FLAG_RAW_ENCRYPT)) || 1372eb633035STom Caputi BP_HAS_INDIRECT_MAC_CKSUM(bp)) && 1373eb633035STom Caputi zio->io_child_type == ZIO_CHILD_LOGICAL) { 1374eb633035STom Caputi zio_push_transform(zio, abd_alloc_sametype(zio->io_abd, psize), 1375eb633035STom Caputi psize, psize, zio_decrypt); 1376eb633035STom Caputi } 1377770499e1SDan Kimmel 1378eb633035STom Caputi if (BP_IS_EMBEDDED(bp) && BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA) { 1379770499e1SDan Kimmel int psize = BPE_GET_PSIZE(bp); 1380770499e1SDan Kimmel void *data = abd_borrow_buf(zio->io_abd, psize); 1381eb633035STom Caputi 1382eb633035STom Caputi zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; 1383770499e1SDan Kimmel decode_embedded_bp_compressed(bp, data); 1384770499e1SDan Kimmel abd_return_buf_copy(zio->io_abd, data, psize); 13855d7b4d43SMatthew Ahrens } else { 13865d7b4d43SMatthew Ahrens ASSERT(!BP_IS_EMBEDDED(bp)); 13875cabbc6bSPrashanth Sreenivasa ASSERT3P(zio->io_bp, ==, &zio->io_bp_copy); 13885d7b4d43SMatthew Ahrens } 13895d7b4d43SMatthew Ahrens 1390ad135b5dSChristopher Siden if (!DMU_OT_IS_METADATA(BP_GET_TYPE(bp)) && BP_GET_LEVEL(bp) == 0) 1391e14bb325SJeff Bonwick zio->io_flags |= ZIO_FLAG_DONT_CACHE; 1392fa9e4066Sahrens 1393bbfd46c4SJeff Bonwick if (BP_GET_TYPE(bp) == DMU_OT_DDT_ZAP) 1394bbfd46c4SJeff Bonwick zio->io_flags |= ZIO_FLAG_DONT_CACHE; 1395bbfd46c4SJeff Bonwick 1396b24ab676SJeff Bonwick if (BP_GET_DEDUP(bp) && zio->io_child_type == ZIO_CHILD_LOGICAL) 1397b24ab676SJeff Bonwick zio->io_pipeline = ZIO_DDT_READ_PIPELINE; 1398b24ab676SJeff Bonwick 1399e14bb325SJeff Bonwick return (ZIO_PIPELINE_CONTINUE); 1400fa9e4066Sahrens } 1401fa9e4066Sahrens 1402e05725b1Sbonwick static int 1403e14bb325SJeff Bonwick zio_write_bp_init(zio_t *zio) 14040a4e9518Sgw { 1405e14bb325SJeff Bonwick if (!IO_IS_ALLOCATING(zio)) 1406e14bb325SJeff Bonwick return (ZIO_PIPELINE_CONTINUE); 14070a4e9518Sgw 
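	/*
	 * io_bp_override, tested below, is established via
	 * zio_write_override() (e.g. on the dmu_sync()/ZIL
	 * immediate-write path) and supplies an already-written bp
	 * that may be reusable here instead of allocating a new one.
	 */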
1408b24ab676SJeff Bonwick ASSERT(zio->io_child_type != ZIO_CHILD_DDT); 1409b24ab676SJeff Bonwick 1410b24ab676SJeff Bonwick if (zio->io_bp_override) { 14110f7643c7SGeorge Wilson blkptr_t *bp = zio->io_bp; 14120f7643c7SGeorge Wilson zio_prop_t *zp = &zio->io_prop; 14130f7643c7SGeorge Wilson 1414b24ab676SJeff Bonwick ASSERT(bp->blk_birth != zio->io_txg); 1415b24ab676SJeff Bonwick ASSERT(BP_GET_DEDUP(zio->io_bp_override) == 0); 1416b24ab676SJeff Bonwick 1417b24ab676SJeff Bonwick *bp = *zio->io_bp_override; 1418b24ab676SJeff Bonwick zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; 1419b24ab676SJeff Bonwick 14205d7b4d43SMatthew Ahrens if (BP_IS_EMBEDDED(bp)) 14215d7b4d43SMatthew Ahrens return (ZIO_PIPELINE_CONTINUE); 14225d7b4d43SMatthew Ahrens 142380901aeaSGeorge Wilson /* 142480901aeaSGeorge Wilson * If we've been overridden and nopwrite is set then 142580901aeaSGeorge Wilson * set the flag accordingly to indicate that a nopwrite 142680901aeaSGeorge Wilson * has already occurred. 142780901aeaSGeorge Wilson */ 142880901aeaSGeorge Wilson if (!BP_IS_HOLE(bp) && zp->zp_nopwrite) { 142980901aeaSGeorge Wilson ASSERT(!zp->zp_dedup); 14300f7643c7SGeorge Wilson ASSERT3U(BP_GET_CHECKSUM(bp), ==, zp->zp_checksum); 143180901aeaSGeorge Wilson zio->io_flags |= ZIO_FLAG_NOPWRITE; 143280901aeaSGeorge Wilson return (ZIO_PIPELINE_CONTINUE); 143380901aeaSGeorge Wilson } 143480901aeaSGeorge Wilson 143580901aeaSGeorge Wilson ASSERT(!zp->zp_nopwrite); 143680901aeaSGeorge Wilson 1437b24ab676SJeff Bonwick if (BP_IS_HOLE(bp) || !zp->zp_dedup) 1438b24ab676SJeff Bonwick return (ZIO_PIPELINE_CONTINUE); 1439b24ab676SJeff Bonwick 144045818ee1SMatthew Ahrens ASSERT((zio_checksum_table[zp->zp_checksum].ci_flags & 144145818ee1SMatthew Ahrens ZCHECKSUM_FLAG_DEDUP) || zp->zp_dedup_verify); 1442b24ab676SJeff Bonwick 1443eb633035STom Caputi if (BP_GET_CHECKSUM(bp) == zp->zp_checksum && 1444eb633035STom Caputi !zp->zp_encrypt) { 1445b24ab676SJeff Bonwick BP_SET_DEDUP(bp, 1); 1446b24ab676SJeff Bonwick zio->io_pipeline |= ZIO_STAGE_DDT_WRITE; 1447b24ab676SJeff Bonwick return (ZIO_PIPELINE_CONTINUE); 1448b24ab676SJeff Bonwick } 14490f7643c7SGeorge Wilson 14500f7643c7SGeorge Wilson /* 14510f7643c7SGeorge Wilson * We were unable to handle this as an override bp, treat 14520f7643c7SGeorge Wilson * it as a regular write I/O. 14530f7643c7SGeorge Wilson */ 1454b39b744bSMatthew Ahrens zio->io_bp_override = NULL; 14550f7643c7SGeorge Wilson *bp = zio->io_bp_orig; 14560f7643c7SGeorge Wilson zio->io_pipeline = zio->io_orig_pipeline; 1457b24ab676SJeff Bonwick } 14580a4e9518Sgw 14590f7643c7SGeorge Wilson return (ZIO_PIPELINE_CONTINUE); 14600f7643c7SGeorge Wilson } 14610f7643c7SGeorge Wilson 14620f7643c7SGeorge Wilson static int 14630f7643c7SGeorge Wilson zio_write_compress(zio_t *zio) 14640f7643c7SGeorge Wilson { 14650f7643c7SGeorge Wilson spa_t *spa = zio->io_spa; 14660f7643c7SGeorge Wilson zio_prop_t *zp = &zio->io_prop; 14670f7643c7SGeorge Wilson enum zio_compress compress = zp->zp_compress; 14680f7643c7SGeorge Wilson blkptr_t *bp = zio->io_bp; 14695602294fSDan Kimmel uint64_t lsize = zio->io_lsize; 14705602294fSDan Kimmel uint64_t psize = zio->io_size; 14710f7643c7SGeorge Wilson int pass = 1; 14720f7643c7SGeorge Wilson 14730f7643c7SGeorge Wilson /* 14740f7643c7SGeorge Wilson * If our children haven't all reached the ready stage, 14750f7643c7SGeorge Wilson * wait for them and then repeat this pipeline stage. 
14760f7643c7SGeorge Wilson */ 1477d6e1c446SGeorge Wilson if (zio_wait_for_children(zio, ZIO_CHILD_LOGICAL_BIT | 1478d6e1c446SGeorge Wilson ZIO_CHILD_GANG_BIT, ZIO_WAIT_READY)) { 14790f7643c7SGeorge Wilson return (ZIO_PIPELINE_STOP); 1480d6e1c446SGeorge Wilson } 14810f7643c7SGeorge Wilson 14820f7643c7SGeorge Wilson if (!IO_IS_ALLOCATING(zio)) 14830f7643c7SGeorge Wilson return (ZIO_PIPELINE_CONTINUE); 14840f7643c7SGeorge Wilson 14850f7643c7SGeorge Wilson if (zio->io_children_ready != NULL) { 14860f7643c7SGeorge Wilson /* 14870f7643c7SGeorge Wilson * Now that all our children are ready, run the callback 14880f7643c7SGeorge Wilson * associated with this zio in case it wants to modify the 14890f7643c7SGeorge Wilson * data to be written. 14900f7643c7SGeorge Wilson */ 14910f7643c7SGeorge Wilson ASSERT3U(zp->zp_level, >, 0); 14920f7643c7SGeorge Wilson zio->io_children_ready(zio); 14930f7643c7SGeorge Wilson } 14940f7643c7SGeorge Wilson 14950f7643c7SGeorge Wilson ASSERT(zio->io_child_type != ZIO_CHILD_DDT); 14960f7643c7SGeorge Wilson ASSERT(zio->io_bp_override == NULL); 14970f7643c7SGeorge Wilson 149843466aaeSMax Grossman if (!BP_IS_HOLE(bp) && bp->blk_birth == zio->io_txg) { 1499e14bb325SJeff Bonwick /* 1500e14bb325SJeff Bonwick * We're rewriting an existing block, which means we're 1501e14bb325SJeff Bonwick * working on behalf of spa_sync(). For spa_sync() to 1502e14bb325SJeff Bonwick * converge, it must eventually be the case that we don't 1503e14bb325SJeff Bonwick * have to allocate new blocks. But compression changes 1504e14bb325SJeff Bonwick * the blocksize, which forces a reallocate, and makes 1505e14bb325SJeff Bonwick * convergence take longer. Therefore, after the first 1506e14bb325SJeff Bonwick * few passes, stop compressing to ensure convergence. 1507e14bb325SJeff Bonwick */ 1508b24ab676SJeff Bonwick pass = spa_sync_pass(spa); 1509b24ab676SJeff Bonwick 1510b24ab676SJeff Bonwick ASSERT(zio->io_txg == spa_syncing_txg(spa)); 1511b24ab676SJeff Bonwick ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); 1512b24ab676SJeff Bonwick ASSERT(!BP_GET_DEDUP(bp)); 1513e05725b1Sbonwick 151401f55e48SGeorge Wilson if (pass >= zfs_sync_pass_dont_compress) 1515e14bb325SJeff Bonwick compress = ZIO_COMPRESS_OFF; 1516e05725b1Sbonwick 1517e14bb325SJeff Bonwick /* Make sure someone doesn't change their mind on overwrites */ 15185d7b4d43SMatthew Ahrens ASSERT(BP_IS_EMBEDDED(bp) || MIN(zp->zp_copies + BP_IS_GANG(bp), 1519b24ab676SJeff Bonwick spa_max_replication(spa)) == BP_GET_NDVAS(bp)); 1520e14bb325SJeff Bonwick } 1521fa9e4066Sahrens 15225602294fSDan Kimmel /* If it's a compressed write that is not raw, compress the buffer. 
*/
1523eb633035STom Caputi if (compress != ZIO_COMPRESS_OFF &&
1524eb633035STom Caputi !(zio->io_flags & ZIO_FLAG_RAW_COMPRESS)) {
1525b24ab676SJeff Bonwick void *cbuf = zio_buf_alloc(lsize);
1526770499e1SDan Kimmel psize = zio_compress_data(compress, zio->io_abd, cbuf, lsize);
1527b24ab676SJeff Bonwick if (psize == 0 || psize == lsize) {
1528e14bb325SJeff Bonwick compress = ZIO_COMPRESS_OFF;
1529b24ab676SJeff Bonwick zio_buf_free(cbuf, lsize);
1530eb633035STom Caputi } else if (!zp->zp_dedup && !zp->zp_encrypt &&
1531eb633035STom Caputi psize <= BPE_PAYLOAD_SIZE &&
15325d7b4d43SMatthew Ahrens zp->zp_level == 0 && !DMU_OT_HAS_FILL(zp->zp_type) &&
15335d7b4d43SMatthew Ahrens spa_feature_is_enabled(spa, SPA_FEATURE_EMBEDDED_DATA)) {
15345d7b4d43SMatthew Ahrens encode_embedded_bp_compressed(bp,
15355d7b4d43SMatthew Ahrens cbuf, compress, lsize, psize);
15365d7b4d43SMatthew Ahrens BPE_SET_ETYPE(bp, BP_EMBEDDED_TYPE_DATA);
15375d7b4d43SMatthew Ahrens BP_SET_TYPE(bp, zio->io_prop.zp_type);
15385d7b4d43SMatthew Ahrens BP_SET_LEVEL(bp, zio->io_prop.zp_level);
15395d7b4d43SMatthew Ahrens zio_buf_free(cbuf, lsize);
15405d7b4d43SMatthew Ahrens bp->blk_birth = zio->io_txg;
15415d7b4d43SMatthew Ahrens zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
15425d7b4d43SMatthew Ahrens ASSERT(spa_feature_is_active(spa,
15435d7b4d43SMatthew Ahrens SPA_FEATURE_EMBEDDED_DATA));
15445d7b4d43SMatthew Ahrens return (ZIO_PIPELINE_CONTINUE);
1545b24ab676SJeff Bonwick } else {
15465d7b4d43SMatthew Ahrens /*
154781cd5c55SMatthew Ahrens * Round the compressed size up to the ashift
154881cd5c55SMatthew Ahrens * of the smallest-ashift device, and zero the tail.
154981cd5c55SMatthew Ahrens * This ensures that the compressed size of the BP
155081cd5c55SMatthew Ahrens * (and thus the compressratio property) is correct,
155181cd5c55SMatthew Ahrens * in that we charge for the padding used to fill out
155281cd5c55SMatthew Ahrens * the last sector.
15535d7b4d43SMatthew Ahrens */
155481cd5c55SMatthew Ahrens ASSERT3U(spa->spa_min_ashift, >=, SPA_MINBLOCKSHIFT);
155581cd5c55SMatthew Ahrens size_t rounded = (size_t)P2ROUNDUP(psize,
155681cd5c55SMatthew Ahrens 1ULL << spa->spa_min_ashift);
155781cd5c55SMatthew Ahrens if (rounded >= lsize) {
15585d7b4d43SMatthew Ahrens compress = ZIO_COMPRESS_OFF;
15595d7b4d43SMatthew Ahrens zio_buf_free(cbuf, lsize);
156081cd5c55SMatthew Ahrens psize = lsize;
15615d7b4d43SMatthew Ahrens } else {
1562770499e1SDan Kimmel abd_t *cdata = abd_get_from_buf(cbuf, lsize);
1563770499e1SDan Kimmel abd_take_ownership_of_buf(cdata, B_TRUE);
1564770499e1SDan Kimmel abd_zero_off(cdata, psize, rounded - psize);
156581cd5c55SMatthew Ahrens psize = rounded;
1566770499e1SDan Kimmel zio_push_transform(zio, cdata,
15675d7b4d43SMatthew Ahrens psize, lsize, NULL);
15685d7b4d43SMatthew Ahrens }
1569e14bb325SJeff Bonwick }
15700f7643c7SGeorge Wilson 
15710f7643c7SGeorge Wilson /*
15720f7643c7SGeorge Wilson * Clear any stale override state and restore the original bp
15730f7643c7SGeorge Wilson * and pipeline, so the block is issued as a regular write I/O.
15740f7643c7SGeorge Wilson */ 15750f7643c7SGeorge Wilson zio->io_bp_override = NULL; 15760f7643c7SGeorge Wilson *bp = zio->io_bp_orig; 15770f7643c7SGeorge Wilson zio->io_pipeline = zio->io_orig_pipeline; 1578eb633035STom Caputi 1579eb633035STom Caputi } else if ((zio->io_flags & ZIO_FLAG_RAW_ENCRYPT) != 0 && 1580eb633035STom Caputi zp->zp_type == DMU_OT_DNODE) { 1581eb633035STom Caputi /* 1582eb633035STom Caputi * The DMU actually relies on the zio layer's compression 1583eb633035STom Caputi * to free metadnode blocks that have had all contained 1584eb633035STom Caputi * dnodes freed. As a result, even when doing a raw 1585eb633035STom Caputi * receive, we must check whether the block can be compressed 1586eb633035STom Caputi * to a hole. 1587eb633035STom Caputi */ 1588eb633035STom Caputi psize = zio_compress_data(ZIO_COMPRESS_EMPTY, 1589eb633035STom Caputi zio->io_abd, NULL, lsize); 1590eb633035STom Caputi if (psize == 0) 1591eb633035STom Caputi compress = ZIO_COMPRESS_OFF; 15925602294fSDan Kimmel } else { 15935602294fSDan Kimmel ASSERT3U(psize, !=, 0); 1594e14bb325SJeff Bonwick } 1595c717a561Smaybee 1596e14bb325SJeff Bonwick /* 1597e14bb325SJeff Bonwick * The final pass of spa_sync() must be all rewrites, but the first 1598e14bb325SJeff Bonwick * few passes offer a trade-off: allocating blocks defers convergence, 1599e14bb325SJeff Bonwick * but newly allocated blocks are sequential, so they can be written 1600e14bb325SJeff Bonwick * to disk faster. Therefore, we allow the first few passes of 1601e14bb325SJeff Bonwick * spa_sync() to allocate new blocks, but force rewrites after that. 1602e14bb325SJeff Bonwick * There should only be a handful of blocks after pass 1 in any case. 1603e14bb325SJeff Bonwick */ 160443466aaeSMax Grossman if (!BP_IS_HOLE(bp) && bp->blk_birth == zio->io_txg && 160543466aaeSMax Grossman BP_GET_PSIZE(bp) == psize && 160601f55e48SGeorge Wilson pass >= zfs_sync_pass_rewrite) { 1607663207adSDon Brady VERIFY3U(psize, !=, 0); 1608b24ab676SJeff Bonwick enum zio_stage gang_stages = zio->io_pipeline & ZIO_GANG_STAGES; 1609e14bb325SJeff Bonwick zio->io_pipeline = ZIO_REWRITE_PIPELINE | gang_stages; 1610e14bb325SJeff Bonwick zio->io_flags |= ZIO_FLAG_IO_REWRITE; 1611e14bb325SJeff Bonwick } else { 1612e14bb325SJeff Bonwick BP_ZERO(bp); 1613e14bb325SJeff Bonwick zio->io_pipeline = ZIO_WRITE_PIPELINE; 1614e14bb325SJeff Bonwick } 1615fa9e4066Sahrens 1616b24ab676SJeff Bonwick if (psize == 0) { 161743466aaeSMax Grossman if (zio->io_bp_orig.blk_birth != 0 && 161843466aaeSMax Grossman spa_feature_is_active(spa, SPA_FEATURE_HOLE_BIRTH)) { 161943466aaeSMax Grossman BP_SET_LSIZE(bp, lsize); 162043466aaeSMax Grossman BP_SET_TYPE(bp, zp->zp_type); 162143466aaeSMax Grossman BP_SET_LEVEL(bp, zp->zp_level); 162243466aaeSMax Grossman BP_SET_BIRTH(bp, zio->io_txg, 0); 162343466aaeSMax Grossman } 1624e14bb325SJeff Bonwick zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; 1625e14bb325SJeff Bonwick } else { 1626e14bb325SJeff Bonwick ASSERT(zp->zp_checksum != ZIO_CHECKSUM_GANG_HEADER); 1627e14bb325SJeff Bonwick BP_SET_LSIZE(bp, lsize); 162843466aaeSMax Grossman BP_SET_TYPE(bp, zp->zp_type); 162943466aaeSMax Grossman BP_SET_LEVEL(bp, zp->zp_level); 1630b24ab676SJeff Bonwick BP_SET_PSIZE(bp, psize); 1631e14bb325SJeff Bonwick BP_SET_COMPRESS(bp, compress); 1632e14bb325SJeff Bonwick BP_SET_CHECKSUM(bp, zp->zp_checksum); 1633b24ab676SJeff Bonwick BP_SET_DEDUP(bp, zp->zp_dedup); 1634e14bb325SJeff Bonwick BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER); 1635b24ab676SJeff Bonwick if (zp->zp_dedup) { 1636b24ab676SJeff 
Bonwick ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); 1637b24ab676SJeff Bonwick ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE)); 1638eb633035STom Caputi ASSERT(!zp->zp_encrypt || 1639eb633035STom Caputi DMU_OT_IS_ENCRYPTED(zp->zp_type)); 1640b24ab676SJeff Bonwick zio->io_pipeline = ZIO_DDT_WRITE_PIPELINE; 1641b24ab676SJeff Bonwick } 164280901aeaSGeorge Wilson if (zp->zp_nopwrite) { 164380901aeaSGeorge Wilson ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); 164480901aeaSGeorge Wilson ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE)); 164580901aeaSGeorge Wilson zio->io_pipeline |= ZIO_STAGE_NOP_WRITE; 164680901aeaSGeorge Wilson } 1647b24ab676SJeff Bonwick } 1648b24ab676SJeff Bonwick return (ZIO_PIPELINE_CONTINUE); 1649b24ab676SJeff Bonwick } 1650b24ab676SJeff Bonwick 1651b24ab676SJeff Bonwick static int 1652b24ab676SJeff Bonwick zio_free_bp_init(zio_t *zio) 1653b24ab676SJeff Bonwick { 1654b24ab676SJeff Bonwick blkptr_t *bp = zio->io_bp; 1655b24ab676SJeff Bonwick 1656b24ab676SJeff Bonwick if (zio->io_child_type == ZIO_CHILD_LOGICAL) { 1657b24ab676SJeff Bonwick if (BP_GET_DEDUP(bp)) 1658b24ab676SJeff Bonwick zio->io_pipeline = ZIO_DDT_FREE_PIPELINE; 1659e14bb325SJeff Bonwick } 1660fa9e4066Sahrens 16615cabbc6bSPrashanth Sreenivasa ASSERT3P(zio->io_bp, ==, &zio->io_bp_copy); 16625cabbc6bSPrashanth Sreenivasa 1663e05725b1Sbonwick return (ZIO_PIPELINE_CONTINUE); 1664fa9e4066Sahrens } 1665fa9e4066Sahrens 1666e14bb325SJeff Bonwick /* 1667e14bb325SJeff Bonwick * ========================================================================== 1668e14bb325SJeff Bonwick * Execute the I/O pipeline 1669e14bb325SJeff Bonwick * ========================================================================== 1670e14bb325SJeff Bonwick */ 1671e14bb325SJeff Bonwick 1672e14bb325SJeff Bonwick static void 1673ec94d322SAdam Leventhal zio_taskq_dispatch(zio_t *zio, zio_taskq_type_t q, boolean_t cutinline) 1674fa9e4066Sahrens { 167580eb36f2SGeorge Wilson spa_t *spa = zio->io_spa; 1676e14bb325SJeff Bonwick zio_type_t t = zio->io_type; 16775aeb9474SGarrett D'Amore int flags = (cutinline ? TQ_FRONT : 0); 16780a4e9518Sgw 16790a4e9518Sgw /* 1680bbe36defSGeorge Wilson * If we're a config writer or a probe, the normal issue and 1681bbe36defSGeorge Wilson * interrupt threads may all be blocked waiting for the config lock. 1682bbe36defSGeorge Wilson * In this case, select the otherwise-unused taskq for ZIO_TYPE_NULL. 16830a4e9518Sgw */ 1684bbe36defSGeorge Wilson if (zio->io_flags & (ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_PROBE)) 1685e14bb325SJeff Bonwick t = ZIO_TYPE_NULL; 16860a4e9518Sgw 16870a4e9518Sgw /* 1688e14bb325SJeff Bonwick * A similar issue exists for the L2ARC write thread until L2ARC 2.0. 16890a4e9518Sgw */ 1690e14bb325SJeff Bonwick if (t == ZIO_TYPE_WRITE && zio->io_vd && zio->io_vd->vdev_aux) 1691e14bb325SJeff Bonwick t = ZIO_TYPE_NULL; 16920a4e9518Sgw 169380eb36f2SGeorge Wilson /* 1694ec94d322SAdam Leventhal * If this is a high priority I/O, then use the high priority taskq if 1695ec94d322SAdam Leventhal * available. 
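 * (The _HIGH variant of each taskq is expected to sit at index q + 1
 * -- e.g. ZIO_TASKQ_ISSUE_HIGH directly follows ZIO_TASKQ_ISSUE --
 * which is what the q++ below relies on.)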
169680eb36f2SGeorge Wilson */
16972258ad0bSGeorge Wilson if ((zio->io_priority == ZIO_PRIORITY_NOW ||
16982258ad0bSGeorge Wilson zio->io_priority == ZIO_PRIORITY_SYNC_WRITE) &&
1699ec94d322SAdam Leventhal spa->spa_zio_taskq[t][q + 1].stqs_count != 0)
170080eb36f2SGeorge Wilson q++;
170180eb36f2SGeorge Wilson 
170280eb36f2SGeorge Wilson ASSERT3U(q, <, ZIO_TASKQ_TYPES);
17035aeb9474SGarrett D'Amore 
17045aeb9474SGarrett D'Amore /*
17055aeb9474SGarrett D'Amore * NB: We are assuming that the zio can only be dispatched
17065aeb9474SGarrett D'Amore * to a single taskq at a time. It would be a grievous error
17075aeb9474SGarrett D'Amore * to dispatch the zio to another taskq at the same time.
17085aeb9474SGarrett D'Amore */
17095aeb9474SGarrett D'Amore ASSERT(zio->io_tqent.tqent_next == NULL);
1710ec94d322SAdam Leventhal spa_taskq_dispatch_ent(spa, t, q, (task_func_t *)zio_execute, zio,
1711ec94d322SAdam Leventhal flags, &zio->io_tqent);
1712e14bb325SJeff Bonwick }
17130a4e9518Sgw 
1714e14bb325SJeff Bonwick static boolean_t
1715ec94d322SAdam Leventhal zio_taskq_member(zio_t *zio, zio_taskq_type_t q)
1716e14bb325SJeff Bonwick {
1717e14bb325SJeff Bonwick kthread_t *executor = zio->io_executor;
1718e14bb325SJeff Bonwick spa_t *spa = zio->io_spa;
17190a4e9518Sgw 
1720ec94d322SAdam Leventhal for (zio_type_t t = 0; t < ZIO_TYPES; t++) {
1721ec94d322SAdam Leventhal spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
1722ec94d322SAdam Leventhal uint_t i;
1723ec94d322SAdam Leventhal for (i = 0; i < tqs->stqs_count; i++) {
1724ec94d322SAdam Leventhal if (taskq_member(tqs->stqs_taskq[i], executor))
1725ec94d322SAdam Leventhal return (B_TRUE);
1726ec94d322SAdam Leventhal }
1727ec94d322SAdam Leventhal }
17280a4e9518Sgw 
1729e14bb325SJeff Bonwick return (B_FALSE);
1730e14bb325SJeff Bonwick }
1731e05725b1Sbonwick 
1732e14bb325SJeff Bonwick static int
1733e14bb325SJeff Bonwick zio_issue_async(zio_t *zio)
1734e14bb325SJeff Bonwick {
173535a5a358SJonathan Adams zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE);
1736e14bb325SJeff Bonwick 
1737e14bb325SJeff Bonwick return (ZIO_PIPELINE_STOP);
17380a4e9518Sgw }
17390a4e9518Sgw 
1740e14bb325SJeff Bonwick void
1741e14bb325SJeff Bonwick zio_interrupt(zio_t *zio)
17420a4e9518Sgw {
174335a5a358SJonathan Adams zio_taskq_dispatch(zio, ZIO_TASKQ_INTERRUPT, B_FALSE);
1744e14bb325SJeff Bonwick }
17450a4e9518Sgw 
174697e81309SPrakash Surya void
174797e81309SPrakash Surya zio_delay_interrupt(zio_t *zio)
174897e81309SPrakash Surya {
174997e81309SPrakash Surya /*
175097e81309SPrakash Surya * The timeout_generic() function isn't defined in userspace, so
175197e81309SPrakash Surya * rather than trying to implement the function, the zio delay
175297e81309SPrakash Surya * functionality has been disabled for userspace builds.
175397e81309SPrakash Surya */
175497e81309SPrakash Surya 
175597e81309SPrakash Surya #ifdef _KERNEL
175697e81309SPrakash Surya /*
175797e81309SPrakash Surya * If io_target_timestamp is zero, then no delay has been registered
175897e81309SPrakash Surya * for this IO, so skip the delay and issue it directly to the
175997e81309SPrakash Surya * zio layer.
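 * (io_target_timestamp is normally only non-zero when an artificial
 * delay has been injected on this zio, e.g. via zinject(1M)'s I/O
 * delay support.)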
176097e81309SPrakash Surya */ 176197e81309SPrakash Surya if (zio->io_target_timestamp != 0) { 176297e81309SPrakash Surya hrtime_t now = gethrtime(); 176397e81309SPrakash Surya 176497e81309SPrakash Surya if (now >= zio->io_target_timestamp) { 176597e81309SPrakash Surya /* 176697e81309SPrakash Surya * This IO has already taken longer than the target 176797e81309SPrakash Surya * delay to complete, so we don't want to delay it 176897e81309SPrakash Surya * any longer; we "miss" the delay and issue it 176997e81309SPrakash Surya * directly to the zio layer. This is likely due to 177097e81309SPrakash Surya * the target latency being set to a value less than 177197e81309SPrakash Surya * the underlying hardware can satisfy (e.g. delay 177297e81309SPrakash Surya * set to 1ms, but the disks take 10ms to complete an 177397e81309SPrakash Surya * IO request). 177497e81309SPrakash Surya */ 177597e81309SPrakash Surya 177697e81309SPrakash Surya DTRACE_PROBE2(zio__delay__miss, zio_t *, zio, 177797e81309SPrakash Surya hrtime_t, now); 177897e81309SPrakash Surya 177997e81309SPrakash Surya zio_interrupt(zio); 178097e81309SPrakash Surya } else { 178197e81309SPrakash Surya hrtime_t diff = zio->io_target_timestamp - now; 178297e81309SPrakash Surya 178397e81309SPrakash Surya DTRACE_PROBE3(zio__delay__hit, zio_t *, zio, 178497e81309SPrakash Surya hrtime_t, now, hrtime_t, diff); 178597e81309SPrakash Surya 178697e81309SPrakash Surya (void) timeout_generic(CALLOUT_NORMAL, 178797e81309SPrakash Surya (void (*)(void *))zio_interrupt, zio, diff, 1, 0); 178897e81309SPrakash Surya } 178997e81309SPrakash Surya 179097e81309SPrakash Surya return; 179197e81309SPrakash Surya } 179297e81309SPrakash Surya #endif 179397e81309SPrakash Surya 179497e81309SPrakash Surya DTRACE_PROBE1(zio__delay__skip, zio_t *, zio); 179597e81309SPrakash Surya zio_interrupt(zio); 179697e81309SPrakash Surya } 179797e81309SPrakash Surya 1798e14bb325SJeff Bonwick /* 1799e14bb325SJeff Bonwick * Execute the I/O pipeline until one of the following occurs: 1800f7170741SWill Andrews * 1801f7170741SWill Andrews * (1) the I/O completes 1802f7170741SWill Andrews * (2) the pipeline stalls waiting for dependent child I/Os 1803f7170741SWill Andrews * (3) the I/O issues, so we're waiting for an I/O completion interrupt 1804f7170741SWill Andrews * (4) the I/O is delegated by vdev-level caching or aggregation 1805f7170741SWill Andrews * (5) the I/O is deferred due to vdev-level queueing 1806f7170741SWill Andrews * (6) the I/O is handed off to another thread. 1807f7170741SWill Andrews * 1808f7170741SWill Andrews * In all cases, the pipeline stops whenever there's no CPU work; it never 1809f7170741SWill Andrews * burns a thread in cv_wait(). 1810e14bb325SJeff Bonwick * 1811e14bb325SJeff Bonwick * There's no locking on io_stage because there's no legitimate way 1812e14bb325SJeff Bonwick * for multiple threads to be attempting to process the same I/O. 
1813e14bb325SJeff Bonwick */ 1814b24ab676SJeff Bonwick static zio_pipe_stage_t *zio_pipeline[]; 18150a4e9518Sgw 1816e14bb325SJeff Bonwick void 1817e14bb325SJeff Bonwick zio_execute(zio_t *zio) 1818e14bb325SJeff Bonwick { 1819e14bb325SJeff Bonwick zio->io_executor = curthread; 18200a4e9518Sgw 18210f7643c7SGeorge Wilson ASSERT3U(zio->io_queued_timestamp, >, 0); 18220f7643c7SGeorge Wilson 1823e14bb325SJeff Bonwick while (zio->io_stage < ZIO_STAGE_DONE) { 1824b24ab676SJeff Bonwick enum zio_stage pipeline = zio->io_pipeline; 1825b24ab676SJeff Bonwick enum zio_stage stage = zio->io_stage; 1826e14bb325SJeff Bonwick int rv; 18270a4e9518Sgw 1828e14bb325SJeff Bonwick ASSERT(!MUTEX_HELD(&zio->io_lock)); 1829b24ab676SJeff Bonwick ASSERT(ISP2(stage)); 1830b24ab676SJeff Bonwick ASSERT(zio->io_stall == NULL); 18310a4e9518Sgw 1832b24ab676SJeff Bonwick do { 1833b24ab676SJeff Bonwick stage <<= 1; 1834b24ab676SJeff Bonwick } while ((stage & pipeline) == 0); 1835e14bb325SJeff Bonwick 1836e14bb325SJeff Bonwick ASSERT(stage <= ZIO_STAGE_DONE); 18370a4e9518Sgw 18380a4e9518Sgw /* 1839e14bb325SJeff Bonwick * If we are in interrupt context and this pipeline stage 1840e14bb325SJeff Bonwick * will grab a config lock that is held across I/O, 1841b24ab676SJeff Bonwick * or may wait for an I/O that needs an interrupt thread 1842b24ab676SJeff Bonwick * to complete, issue async to avoid deadlock. 184335a5a358SJonathan Adams * 184435a5a358SJonathan Adams * For VDEV_IO_START, we cut in line so that the io will 184535a5a358SJonathan Adams * be sent to disk promptly. 18460a4e9518Sgw */ 1847b24ab676SJeff Bonwick if ((stage & ZIO_BLOCKING_STAGES) && zio->io_vd == NULL && 1848e14bb325SJeff Bonwick zio_taskq_member(zio, ZIO_TASKQ_INTERRUPT)) { 184935a5a358SJonathan Adams boolean_t cut = (stage == ZIO_STAGE_VDEV_IO_START) ? 
185035a5a358SJonathan Adams zio_requeue_io_start_cut_in_line : B_FALSE; 185135a5a358SJonathan Adams zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, cut); 1852e14bb325SJeff Bonwick return; 18530a4e9518Sgw } 18540a4e9518Sgw 1855e14bb325SJeff Bonwick zio->io_stage = stage; 18560f7643c7SGeorge Wilson zio->io_pipeline_trace |= zio->io_stage; 1857bf16b11eSMatthew Ahrens rv = zio_pipeline[highbit64(stage) - 1](zio); 18580a4e9518Sgw 1859e14bb325SJeff Bonwick if (rv == ZIO_PIPELINE_STOP) 1860e14bb325SJeff Bonwick return; 18610a4e9518Sgw 1862e14bb325SJeff Bonwick ASSERT(rv == ZIO_PIPELINE_CONTINUE); 1863e14bb325SJeff Bonwick } 18640a4e9518Sgw } 18650a4e9518Sgw 1866e14bb325SJeff Bonwick /* 1867e14bb325SJeff Bonwick * ========================================================================== 1868e14bb325SJeff Bonwick * Initiate I/O, either sync or async 1869e14bb325SJeff Bonwick * ========================================================================== 1870e14bb325SJeff Bonwick */ 1871e14bb325SJeff Bonwick int 1872e14bb325SJeff Bonwick zio_wait(zio_t *zio) 18730a4e9518Sgw { 1874e14bb325SJeff Bonwick int error; 18750a4e9518Sgw 18761271e4b1SPrakash Surya ASSERT3P(zio->io_stage, ==, ZIO_STAGE_OPEN); 18771271e4b1SPrakash Surya ASSERT3P(zio->io_executor, ==, NULL); 18780a4e9518Sgw 1879e14bb325SJeff Bonwick zio->io_waiter = curthread; 18800f7643c7SGeorge Wilson ASSERT0(zio->io_queued_timestamp); 18810f7643c7SGeorge Wilson zio->io_queued_timestamp = gethrtime(); 1882e05725b1Sbonwick 1883e14bb325SJeff Bonwick zio_execute(zio); 18840a4e9518Sgw 1885e14bb325SJeff Bonwick mutex_enter(&zio->io_lock); 1886e14bb325SJeff Bonwick while (zio->io_executor != NULL) 1887e14bb325SJeff Bonwick cv_wait(&zio->io_cv, &zio->io_lock); 1888e14bb325SJeff Bonwick mutex_exit(&zio->io_lock); 188932b87932Sek 1890e14bb325SJeff Bonwick error = zio->io_error; 1891e14bb325SJeff Bonwick zio_destroy(zio); 189232b87932Sek 1893e14bb325SJeff Bonwick return (error); 189432b87932Sek } 189532b87932Sek 1896e14bb325SJeff Bonwick void 1897e14bb325SJeff Bonwick zio_nowait(zio_t *zio) 18980a4e9518Sgw { 18991271e4b1SPrakash Surya ASSERT3P(zio->io_executor, ==, NULL); 1900fa9e4066Sahrens 1901a3f829aeSBill Moore if (zio->io_child_type == ZIO_CHILD_LOGICAL && 1902a3f829aeSBill Moore zio_unique_parent(zio) == NULL) { 1903ea8dc4b6Seschrock /* 1904e14bb325SJeff Bonwick * This is a logical async I/O with no parent to wait for it. 190554d692b7SGeorge Wilson * We add it to the spa_async_root_zio "Godfather" I/O which 190654d692b7SGeorge Wilson * will ensure they complete prior to unloading the pool. 
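 *
 * For example (an illustrative sketch only; all identifiers here --
 * spa, bp, abd, size, my_done, my_arg, zb -- are placeholders, not
 * code from this file):
 *
 *	zio_nowait(zio_read(NULL, spa, bp, abd, size, my_done, my_arg,
 *	    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &zb));
 *
 * A zio issued this way has no unique parent, so it is adopted here.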
1907ea8dc4b6Seschrock */ 1908e14bb325SJeff Bonwick spa_t *spa = zio->io_spa; 190954d692b7SGeorge Wilson 19106f834bc1SMatthew Ahrens zio_add_child(spa->spa_async_zio_root[CPU_SEQID], zio); 1911e14bb325SJeff Bonwick } 1912ea8dc4b6Seschrock 19130f7643c7SGeorge Wilson ASSERT0(zio->io_queued_timestamp); 19140f7643c7SGeorge Wilson zio->io_queued_timestamp = gethrtime(); 1915e14bb325SJeff Bonwick zio_execute(zio); 1916e14bb325SJeff Bonwick } 1917ea8dc4b6Seschrock 1918e14bb325SJeff Bonwick /* 1919e14bb325SJeff Bonwick * ========================================================================== 19201271e4b1SPrakash Surya * Reexecute, cancel, or suspend/resume failed I/O 1921e14bb325SJeff Bonwick * ========================================================================== 1922e14bb325SJeff Bonwick */ 1923fa9e4066Sahrens 1924e14bb325SJeff Bonwick static void 1925e14bb325SJeff Bonwick zio_reexecute(zio_t *pio) 1926e14bb325SJeff Bonwick { 1927a3f829aeSBill Moore zio_t *cio, *cio_next; 1928a3f829aeSBill Moore 1929a3f829aeSBill Moore ASSERT(pio->io_child_type == ZIO_CHILD_LOGICAL); 1930a3f829aeSBill Moore ASSERT(pio->io_orig_stage == ZIO_STAGE_OPEN); 1931f5383399SBill Moore ASSERT(pio->io_gang_leader == NULL); 1932f5383399SBill Moore ASSERT(pio->io_gang_tree == NULL); 1933e05725b1Sbonwick 1934e14bb325SJeff Bonwick pio->io_flags = pio->io_orig_flags; 1935e14bb325SJeff Bonwick pio->io_stage = pio->io_orig_stage; 1936e14bb325SJeff Bonwick pio->io_pipeline = pio->io_orig_pipeline; 1937e14bb325SJeff Bonwick pio->io_reexecute = 0; 193880901aeaSGeorge Wilson pio->io_flags |= ZIO_FLAG_REEXECUTED; 19390f7643c7SGeorge Wilson pio->io_pipeline_trace = 0; 1940e14bb325SJeff Bonwick pio->io_error = 0; 1941a3f829aeSBill Moore for (int w = 0; w < ZIO_WAIT_TYPES; w++) 1942a3f829aeSBill Moore pio->io_state[w] = 0; 1943e14bb325SJeff Bonwick for (int c = 0; c < ZIO_CHILD_TYPES; c++) 1944e14bb325SJeff Bonwick pio->io_child_error[c] = 0; 19450a4e9518Sgw 1946b24ab676SJeff Bonwick if (IO_IS_ALLOCATING(pio)) 1947b24ab676SJeff Bonwick BP_ZERO(pio->io_bp); 1948d58459f4Sek 1949e14bb325SJeff Bonwick /* 1950e14bb325SJeff Bonwick * As we reexecute pio's children, new children could be created. 1951a3f829aeSBill Moore * New children go to the head of pio's io_child_list, however, 1952e14bb325SJeff Bonwick * so we will (correctly) not reexecute them. The key is that 1953a3f829aeSBill Moore * the remainder of pio's io_child_list, from 'cio_next' onward, 1954a3f829aeSBill Moore * cannot be affected by any side effects of reexecuting 'cio'. 1955e14bb325SJeff Bonwick */ 19560f7643c7SGeorge Wilson zio_link_t *zl = NULL; 1957a3874b8bSToomas Soome mutex_enter(&pio->io_lock); 19580f7643c7SGeorge Wilson for (cio = zio_walk_children(pio, &zl); cio != NULL; cio = cio_next) { 19590f7643c7SGeorge Wilson cio_next = zio_walk_children(pio, &zl); 1960a3f829aeSBill Moore for (int w = 0; w < ZIO_WAIT_TYPES; w++) 1961a3f829aeSBill Moore pio->io_children[cio->io_child_type][w]++; 1962e14bb325SJeff Bonwick mutex_exit(&pio->io_lock); 1963a3f829aeSBill Moore zio_reexecute(cio); 1964a3874b8bSToomas Soome mutex_enter(&pio->io_lock); 1965fa9e4066Sahrens } 1966a3874b8bSToomas Soome mutex_exit(&pio->io_lock); 1967e05725b1Sbonwick 1968e14bb325SJeff Bonwick /* 1969e14bb325SJeff Bonwick * Now that all children have been reexecuted, execute the parent. 197054d692b7SGeorge Wilson * We don't reexecute "The Godfather" I/O here as it's the 197148bbca81SDaniel Hoffman * responsibility of the caller to wait on it. 
1972e14bb325SJeff Bonwick */ 19730f7643c7SGeorge Wilson if (!(pio->io_flags & ZIO_FLAG_GODFATHER)) { 19740f7643c7SGeorge Wilson pio->io_queued_timestamp = gethrtime(); 197554d692b7SGeorge Wilson zio_execute(pio); 19760f7643c7SGeorge Wilson } 19770a4e9518Sgw } 19780a4e9518Sgw 1979e14bb325SJeff Bonwick void 1980e0f1c0afSOlaf Faaland zio_suspend(spa_t *spa, zio_t *zio, zio_suspend_reason_t reason) 19810a4e9518Sgw { 1982e14bb325SJeff Bonwick if (spa_get_failmode(spa) == ZIO_FAILURE_MODE_PANIC) 1983e14bb325SJeff Bonwick fm_panic("Pool '%s' has encountered an uncorrectable I/O " 1984e14bb325SJeff Bonwick "failure and the failure mode property for this pool " 1985e14bb325SJeff Bonwick "is set to panic.", spa_name(spa)); 19860a4e9518Sgw 1987eb633035STom Caputi zfs_ereport_post(FM_EREPORT_ZFS_IO_FAILURE, spa, NULL, 1988eb633035STom Caputi NULL, NULL, 0, 0); 19890a4e9518Sgw 1990e14bb325SJeff Bonwick mutex_enter(&spa->spa_suspend_lock); 1991fa9e4066Sahrens 1992e14bb325SJeff Bonwick if (spa->spa_suspend_zio_root == NULL) 199354d692b7SGeorge Wilson spa->spa_suspend_zio_root = zio_root(spa, NULL, NULL, 199454d692b7SGeorge Wilson ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | 199554d692b7SGeorge Wilson ZIO_FLAG_GODFATHER); 1996fa9e4066Sahrens 1997e0f1c0afSOlaf Faaland spa->spa_suspended = reason; 1998fa9e4066Sahrens 1999e14bb325SJeff Bonwick if (zio != NULL) { 200054d692b7SGeorge Wilson ASSERT(!(zio->io_flags & ZIO_FLAG_GODFATHER)); 2001e14bb325SJeff Bonwick ASSERT(zio != spa->spa_suspend_zio_root); 2002e14bb325SJeff Bonwick ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); 2003a3f829aeSBill Moore ASSERT(zio_unique_parent(zio) == NULL); 2004e14bb325SJeff Bonwick ASSERT(zio->io_stage == ZIO_STAGE_DONE); 2005e14bb325SJeff Bonwick zio_add_child(spa->spa_suspend_zio_root, zio); 2006e14bb325SJeff Bonwick } 2007fa9e4066Sahrens 2008e14bb325SJeff Bonwick mutex_exit(&spa->spa_suspend_lock); 2009e14bb325SJeff Bonwick } 2010fa9e4066Sahrens 201154d692b7SGeorge Wilson int 2012e14bb325SJeff Bonwick zio_resume(spa_t *spa) 2013e14bb325SJeff Bonwick { 201454d692b7SGeorge Wilson zio_t *pio; 2015fa9e4066Sahrens 2016b3995adbSahrens /* 2017e14bb325SJeff Bonwick * Reexecute all previously suspended i/o. 2018b3995adbSahrens */ 2019e14bb325SJeff Bonwick mutex_enter(&spa->spa_suspend_lock); 2020e0f1c0afSOlaf Faaland spa->spa_suspended = ZIO_SUSPEND_NONE; 2021e14bb325SJeff Bonwick cv_broadcast(&spa->spa_suspend_cv); 2022e14bb325SJeff Bonwick pio = spa->spa_suspend_zio_root; 2023e14bb325SJeff Bonwick spa->spa_suspend_zio_root = NULL; 2024e14bb325SJeff Bonwick mutex_exit(&spa->spa_suspend_lock); 2025e14bb325SJeff Bonwick 2026e14bb325SJeff Bonwick if (pio == NULL) 202754d692b7SGeorge Wilson return (0); 2028e14bb325SJeff Bonwick 202954d692b7SGeorge Wilson zio_reexecute(pio); 203054d692b7SGeorge Wilson return (zio_wait(pio)); 2031e14bb325SJeff Bonwick } 2032e14bb325SJeff Bonwick 2033e14bb325SJeff Bonwick void 2034e14bb325SJeff Bonwick zio_resume_wait(spa_t *spa) 2035e14bb325SJeff Bonwick { 2036e14bb325SJeff Bonwick mutex_enter(&spa->spa_suspend_lock); 2037e14bb325SJeff Bonwick while (spa_suspended(spa)) 2038e14bb325SJeff Bonwick cv_wait(&spa->spa_suspend_cv, &spa->spa_suspend_lock); 2039e14bb325SJeff Bonwick mutex_exit(&spa->spa_suspend_lock); 2040fa9e4066Sahrens } 2041fa9e4066Sahrens 2042fa9e4066Sahrens /* 2043fa9e4066Sahrens * ========================================================================== 2044e14bb325SJeff Bonwick * Gang blocks. 
2045e14bb325SJeff Bonwick * 2046e14bb325SJeff Bonwick * A gang block is a collection of small blocks that looks to the DMU 2047e14bb325SJeff Bonwick * like one large block. When zio_dva_allocate() cannot find a block 2048e14bb325SJeff Bonwick * of the requested size, due to either severe fragmentation or the pool 2049e14bb325SJeff Bonwick * being nearly full, it calls zio_write_gang_block() to construct the 2050e14bb325SJeff Bonwick * block from smaller fragments. 2051e14bb325SJeff Bonwick * 2052e14bb325SJeff Bonwick * A gang block consists of a gang header (zio_gbh_phys_t) and up to 2053e14bb325SJeff Bonwick * three (SPA_GBH_NBLKPTRS) gang members. The gang header is just like 2054e14bb325SJeff Bonwick * an indirect block: it's an array of block pointers. It consumes 2055e14bb325SJeff Bonwick * only one sector and hence is allocatable regardless of fragmentation. 2056e14bb325SJeff Bonwick * The gang header's bps point to its gang members, which hold the data. 2057e14bb325SJeff Bonwick * 2058e14bb325SJeff Bonwick * Gang blocks are self-checksumming, using the bp's <vdev, offset, txg> 2059e14bb325SJeff Bonwick * as the verifier to ensure uniqueness of the SHA256 checksum. 2060e14bb325SJeff Bonwick * Critically, the gang block bp's blk_cksum is the checksum of the data, 2061e14bb325SJeff Bonwick * not the gang header. This ensures that data block signatures (needed for 2062e14bb325SJeff Bonwick * deduplication) are independent of how the block is physically stored. 2063e14bb325SJeff Bonwick * 2064e14bb325SJeff Bonwick * Gang blocks can be nested: a gang member may itself be a gang block. 2065e14bb325SJeff Bonwick * Thus every gang block is a tree in which root and all interior nodes are 2066e14bb325SJeff Bonwick * gang headers, and the leaves are normal blocks that contain user data. 2067e14bb325SJeff Bonwick * The root of the gang tree is called the gang leader. 2068e14bb325SJeff Bonwick * 2069e14bb325SJeff Bonwick * To perform any operation (read, rewrite, free, claim) on a gang block, 2070e14bb325SJeff Bonwick * zio_gang_assemble() first assembles the gang tree (minus data leaves) 2071e14bb325SJeff Bonwick * in the io_gang_tree field of the original logical i/o by recursively 2072e14bb325SJeff Bonwick * reading the gang leader and all gang headers below it. This yields 2073e14bb325SJeff Bonwick * an in-core tree containing the contents of every gang header and the 2074e14bb325SJeff Bonwick * bps for every constituent of the gang block. 2075e14bb325SJeff Bonwick * 2076e14bb325SJeff Bonwick * With the gang tree now assembled, zio_gang_issue() just walks the gang tree 2077e14bb325SJeff Bonwick * and invokes a callback on each bp. To free a gang block, zio_gang_issue() 2078e14bb325SJeff Bonwick * calls zio_free_gang() -- a trivial wrapper around zio_free() -- for each bp. 2079e14bb325SJeff Bonwick * zio_claim_gang() provides a similarly trivial wrapper for zio_claim(). 2080e14bb325SJeff Bonwick * zio_read_gang() is a wrapper around zio_read() that omits reading gang 2081e14bb325SJeff Bonwick * headers, since we already have those in io_gang_tree. zio_rewrite_gang() 2082e14bb325SJeff Bonwick * performs a zio_rewrite() of the data or, for gang headers, a zio_rewrite() 2083e14bb325SJeff Bonwick * of the gang header plus zio_checksum_compute() of the data to update the 2084e14bb325SJeff Bonwick * gang header's blk_cksum as described above. 
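 *
 * For example (purely illustrative), a logical block that ganged
 * twice might assemble into a tree like this, with each header
 * holding up to SPA_GBH_NBLKPTRS bps:
 *
 *	gang leader (gang header)
 *		bp[0] --> data block
 *		bp[1] --> data block
 *		bp[2] --> gang header
 *			bp[0] --> data block
 *			bp[1] --> data block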
2085e14bb325SJeff Bonwick * 2086e14bb325SJeff Bonwick * The two-phase assemble/issue model solves the problem of partial failure -- 2087e14bb325SJeff Bonwick * what if you'd freed part of a gang block but then couldn't read the 2088e14bb325SJeff Bonwick * gang header for another part? Assembling the entire gang tree first 2089e14bb325SJeff Bonwick * ensures that all the necessary gang header I/O has succeeded before 2090e14bb325SJeff Bonwick * starting the actual work of free, claim, or write. Once the gang tree 2091e14bb325SJeff Bonwick * is assembled, free and claim are in-memory operations that cannot fail. 2092e14bb325SJeff Bonwick * 2093e14bb325SJeff Bonwick * In the event that a gang write fails, zio_dva_unallocate() walks the 2094e14bb325SJeff Bonwick * gang tree to immediately free (i.e. insert back into the space map) 2095e14bb325SJeff Bonwick * everything we've allocated. This ensures that we don't get ENOSPC 2096e14bb325SJeff Bonwick * errors during repeated suspend/resume cycles due to a flaky device. 2097e14bb325SJeff Bonwick * 2098e14bb325SJeff Bonwick * Gang rewrites only happen during sync-to-convergence. If we can't assemble 2099e14bb325SJeff Bonwick * the gang tree, we won't modify the block, so we can safely defer the free 2100e14bb325SJeff Bonwick * (knowing that the block is still intact). If we *can* assemble the gang 2101e14bb325SJeff Bonwick * tree, then even if some of the rewrites fail, zio_dva_unallocate() will free 2102e14bb325SJeff Bonwick * each constituent bp and we can allocate a new block on the next sync pass. 2103e14bb325SJeff Bonwick * 2104e14bb325SJeff Bonwick * In all cases, the gang tree allows complete recovery from partial failure. 2105fa9e4066Sahrens * ========================================================================== 2106fa9e4066Sahrens */ 2107e14bb325SJeff Bonwick 2108770499e1SDan Kimmel static void 2109770499e1SDan Kimmel zio_gang_issue_func_done(zio_t *zio) 2110770499e1SDan Kimmel { 2111770499e1SDan Kimmel abd_put(zio->io_abd); 2112770499e1SDan Kimmel } 2113770499e1SDan Kimmel 2114e14bb325SJeff Bonwick static zio_t * 2115770499e1SDan Kimmel zio_read_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, abd_t *data, 2116770499e1SDan Kimmel uint64_t offset) 2117fa9e4066Sahrens { 2118e14bb325SJeff Bonwick if (gn != NULL) 2119e14bb325SJeff Bonwick return (pio); 2120fa9e4066Sahrens 2121770499e1SDan Kimmel return (zio_read(pio, pio->io_spa, bp, abd_get_offset(data, offset), 2122770499e1SDan Kimmel BP_GET_PSIZE(bp), zio_gang_issue_func_done, 2123770499e1SDan Kimmel NULL, pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), 2124e14bb325SJeff Bonwick &pio->io_bookmark)); 2125e14bb325SJeff Bonwick } 2126e14bb325SJeff Bonwick 2127770499e1SDan Kimmel static zio_t * 2128770499e1SDan Kimmel zio_rewrite_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, abd_t *data, 2129770499e1SDan Kimmel uint64_t offset) 2130e14bb325SJeff Bonwick { 2131e14bb325SJeff Bonwick zio_t *zio; 2132e14bb325SJeff Bonwick 2133e14bb325SJeff Bonwick if (gn != NULL) { 2134770499e1SDan Kimmel abd_t *gbh_abd = 2135770499e1SDan Kimmel abd_get_from_buf(gn->gn_gbh, SPA_GANGBLOCKSIZE); 2136e14bb325SJeff Bonwick zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp, 2137770499e1SDan Kimmel gbh_abd, SPA_GANGBLOCKSIZE, zio_gang_issue_func_done, NULL, 2138770499e1SDan Kimmel pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), 2139770499e1SDan Kimmel &pio->io_bookmark); 2140fa9e4066Sahrens /* 2141e14bb325SJeff Bonwick * As we rewrite each gang header, the pipeline will compute 2142e14bb325SJeff Bonwick * a 
new gang block header checksum for it; but no one will 2143e14bb325SJeff Bonwick * compute a new data checksum, so we do that here. The one 2144e14bb325SJeff Bonwick * exception is the gang leader: the pipeline already computed 2145e14bb325SJeff Bonwick * its data checksum because that stage precedes gang assembly. 2146e14bb325SJeff Bonwick * (Presently, nothing actually uses interior data checksums; 2147e14bb325SJeff Bonwick * this is just good hygiene.) 2148fa9e4066Sahrens */ 2149f5383399SBill Moore if (gn != pio->io_gang_leader->io_gang_tree) { 2150770499e1SDan Kimmel abd_t *buf = abd_get_offset(data, offset); 2151770499e1SDan Kimmel 2152e14bb325SJeff Bonwick zio_checksum_compute(zio, BP_GET_CHECKSUM(bp), 2153770499e1SDan Kimmel buf, BP_GET_PSIZE(bp)); 2154770499e1SDan Kimmel 2155770499e1SDan Kimmel abd_put(buf); 2156e14bb325SJeff Bonwick } 2157b24ab676SJeff Bonwick /* 2158b24ab676SJeff Bonwick * If we are here to damage data for testing purposes, 2159b24ab676SJeff Bonwick * leave the GBH alone so that we can detect the damage. 2160b24ab676SJeff Bonwick */ 2161b24ab676SJeff Bonwick if (pio->io_gang_leader->io_flags & ZIO_FLAG_INDUCE_DAMAGE) 2162b24ab676SJeff Bonwick zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES; 2163fa9e4066Sahrens } else { 2164e14bb325SJeff Bonwick zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp, 2165770499e1SDan Kimmel abd_get_offset(data, offset), BP_GET_PSIZE(bp), 2166770499e1SDan Kimmel zio_gang_issue_func_done, NULL, pio->io_priority, 2167e14bb325SJeff Bonwick ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark); 2168fa9e4066Sahrens } 2169fa9e4066Sahrens 2170e14bb325SJeff Bonwick return (zio); 2171e14bb325SJeff Bonwick } 2172fa9e4066Sahrens 2173e14bb325SJeff Bonwick /* ARGSUSED */ 2174770499e1SDan Kimmel static zio_t * 2175770499e1SDan Kimmel zio_free_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, abd_t *data, 2176770499e1SDan Kimmel uint64_t offset) 2177e14bb325SJeff Bonwick { 2178b24ab676SJeff Bonwick return (zio_free_sync(pio, pio->io_spa, pio->io_txg, bp, 2179b24ab676SJeff Bonwick ZIO_GANG_CHILD_FLAGS(pio))); 2180fa9e4066Sahrens } 2181fa9e4066Sahrens 2182e14bb325SJeff Bonwick /* ARGSUSED */ 2183770499e1SDan Kimmel static zio_t * 2184770499e1SDan Kimmel zio_claim_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, abd_t *data, 2185770499e1SDan Kimmel uint64_t offset) 2186fa9e4066Sahrens { 2187e14bb325SJeff Bonwick return (zio_claim(pio, pio->io_spa, pio->io_txg, bp, 2188e14bb325SJeff Bonwick NULL, NULL, ZIO_GANG_CHILD_FLAGS(pio))); 2189e14bb325SJeff Bonwick } 2190fa9e4066Sahrens 2191e14bb325SJeff Bonwick static zio_gang_issue_func_t *zio_gang_issue_func[ZIO_TYPES] = { 2192e14bb325SJeff Bonwick NULL, 2193e14bb325SJeff Bonwick zio_read_gang, 2194e14bb325SJeff Bonwick zio_rewrite_gang, 2195e14bb325SJeff Bonwick zio_free_gang, 2196e14bb325SJeff Bonwick zio_claim_gang, 2197e14bb325SJeff Bonwick NULL 2198e14bb325SJeff Bonwick }; 2199fa9e4066Sahrens 2200e14bb325SJeff Bonwick static void zio_gang_tree_assemble_done(zio_t *zio); 2201fa9e4066Sahrens 2202e14bb325SJeff Bonwick static zio_gang_node_t * 2203e14bb325SJeff Bonwick zio_gang_node_alloc(zio_gang_node_t **gnpp) 2204e14bb325SJeff Bonwick { 2205e14bb325SJeff Bonwick zio_gang_node_t *gn; 2206fa9e4066Sahrens 2207e14bb325SJeff Bonwick ASSERT(*gnpp == NULL); 2208fa9e4066Sahrens 2209e14bb325SJeff Bonwick gn = kmem_zalloc(sizeof (*gn), KM_SLEEP); 2210e14bb325SJeff Bonwick gn->gn_gbh = zio_buf_alloc(SPA_GANGBLOCKSIZE); 2211e14bb325SJeff Bonwick *gnpp = gn; 2212e14bb325SJeff Bonwick 2213e14bb325SJeff Bonwick return (gn); 
2214fa9e4066Sahrens } 2215fa9e4066Sahrens 2216fa9e4066Sahrens static void 2217e14bb325SJeff Bonwick zio_gang_node_free(zio_gang_node_t **gnpp) 2218fa9e4066Sahrens { 2219e14bb325SJeff Bonwick zio_gang_node_t *gn = *gnpp; 2220fa9e4066Sahrens 2221e14bb325SJeff Bonwick for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) 2222e14bb325SJeff Bonwick ASSERT(gn->gn_child[g] == NULL); 2223e14bb325SJeff Bonwick 2224e14bb325SJeff Bonwick zio_buf_free(gn->gn_gbh, SPA_GANGBLOCKSIZE); 2225e14bb325SJeff Bonwick kmem_free(gn, sizeof (*gn)); 2226e14bb325SJeff Bonwick *gnpp = NULL; 2227fa9e4066Sahrens } 2228fa9e4066Sahrens 2229e14bb325SJeff Bonwick static void 2230e14bb325SJeff Bonwick zio_gang_tree_free(zio_gang_node_t **gnpp) 2231fa9e4066Sahrens { 2232e14bb325SJeff Bonwick zio_gang_node_t *gn = *gnpp; 2233fa9e4066Sahrens 2234e14bb325SJeff Bonwick if (gn == NULL) 2235e14bb325SJeff Bonwick return; 2236fa9e4066Sahrens 2237e14bb325SJeff Bonwick for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) 2238e14bb325SJeff Bonwick zio_gang_tree_free(&gn->gn_child[g]); 2239fa9e4066Sahrens 2240e14bb325SJeff Bonwick zio_gang_node_free(gnpp); 2241fa9e4066Sahrens } 2242fa9e4066Sahrens 2243e14bb325SJeff Bonwick static void 2244f5383399SBill Moore zio_gang_tree_assemble(zio_t *gio, blkptr_t *bp, zio_gang_node_t **gnpp) 2245fa9e4066Sahrens { 2246e14bb325SJeff Bonwick zio_gang_node_t *gn = zio_gang_node_alloc(gnpp); 2247770499e1SDan Kimmel abd_t *gbh_abd = abd_get_from_buf(gn->gn_gbh, SPA_GANGBLOCKSIZE); 2248e14bb325SJeff Bonwick 2249f5383399SBill Moore ASSERT(gio->io_gang_leader == gio); 2250e14bb325SJeff Bonwick ASSERT(BP_IS_GANG(bp)); 2251fa9e4066Sahrens 2252770499e1SDan Kimmel zio_nowait(zio_read(gio, gio->io_spa, bp, gbh_abd, SPA_GANGBLOCKSIZE, 2253770499e1SDan Kimmel zio_gang_tree_assemble_done, gn, gio->io_priority, 2254770499e1SDan Kimmel ZIO_GANG_CHILD_FLAGS(gio), &gio->io_bookmark)); 2255e14bb325SJeff Bonwick } 2256fa9e4066Sahrens 2257e14bb325SJeff Bonwick static void 2258e14bb325SJeff Bonwick zio_gang_tree_assemble_done(zio_t *zio) 2259e14bb325SJeff Bonwick { 2260f5383399SBill Moore zio_t *gio = zio->io_gang_leader; 2261e14bb325SJeff Bonwick zio_gang_node_t *gn = zio->io_private; 2262e14bb325SJeff Bonwick blkptr_t *bp = zio->io_bp; 2263fa9e4066Sahrens 2264f5383399SBill Moore ASSERT(gio == zio_unique_parent(zio)); 2265b24ab676SJeff Bonwick ASSERT(zio->io_child_count == 0); 2266fa9e4066Sahrens 2267e14bb325SJeff Bonwick if (zio->io_error) 2268e14bb325SJeff Bonwick return; 2269fa9e4066Sahrens 2270770499e1SDan Kimmel /* this ABD was created from a linear buf in zio_gang_tree_assemble */ 2271e14bb325SJeff Bonwick if (BP_SHOULD_BYTESWAP(bp)) 2272770499e1SDan Kimmel byteswap_uint64_array(abd_to_buf(zio->io_abd), zio->io_size); 2273fa9e4066Sahrens 2274770499e1SDan Kimmel ASSERT3P(abd_to_buf(zio->io_abd), ==, gn->gn_gbh); 2275e14bb325SJeff Bonwick ASSERT(zio->io_size == SPA_GANGBLOCKSIZE); 22766e1f5caaSNeil Perrin ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC); 2277e05725b1Sbonwick 2278770499e1SDan Kimmel abd_put(zio->io_abd); 2279770499e1SDan Kimmel 2280e14bb325SJeff Bonwick for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) { 2281e14bb325SJeff Bonwick blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g]; 2282e14bb325SJeff Bonwick if (!BP_IS_GANG(gbp)) 2283e14bb325SJeff Bonwick continue; 2284f5383399SBill Moore zio_gang_tree_assemble(gio, gbp, &gn->gn_child[g]); 2285e14bb325SJeff Bonwick } 2286fa9e4066Sahrens } 2287fa9e4066Sahrens 2288e14bb325SJeff Bonwick static void 2289770499e1SDan Kimmel zio_gang_tree_issue(zio_t *pio, zio_gang_node_t *gn, blkptr_t *bp, 
abd_t *data, 2290770499e1SDan Kimmel uint64_t offset) 2291fa9e4066Sahrens { 2292f5383399SBill Moore zio_t *gio = pio->io_gang_leader; 2293e14bb325SJeff Bonwick zio_t *zio; 2294fa9e4066Sahrens 2295e14bb325SJeff Bonwick ASSERT(BP_IS_GANG(bp) == !!gn); 2296f5383399SBill Moore ASSERT(BP_GET_CHECKSUM(bp) == BP_GET_CHECKSUM(gio->io_bp)); 2297f5383399SBill Moore ASSERT(BP_GET_LSIZE(bp) == BP_GET_PSIZE(bp) || gn == gio->io_gang_tree); 2298fa9e4066Sahrens 2299e14bb325SJeff Bonwick /* 2300e14bb325SJeff Bonwick * If you're a gang header, your data is in gn->gn_gbh. 2301e14bb325SJeff Bonwick * If you're a gang member, your data is in 'data' and gn == NULL. 2302e14bb325SJeff Bonwick */ 2303770499e1SDan Kimmel zio = zio_gang_issue_func[gio->io_type](pio, bp, gn, data, offset); 2304fa9e4066Sahrens 2305e14bb325SJeff Bonwick if (gn != NULL) { 23066e1f5caaSNeil Perrin ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC); 2307fa9e4066Sahrens 2308e14bb325SJeff Bonwick for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) { 2309e14bb325SJeff Bonwick blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g]; 2310e14bb325SJeff Bonwick if (BP_IS_HOLE(gbp)) 2311e14bb325SJeff Bonwick continue; 2312770499e1SDan Kimmel zio_gang_tree_issue(zio, gn->gn_child[g], gbp, data, 2313770499e1SDan Kimmel offset); 2314770499e1SDan Kimmel offset += BP_GET_PSIZE(gbp); 2315e14bb325SJeff Bonwick } 2316fa9e4066Sahrens } 2317fa9e4066Sahrens 2318f5383399SBill Moore if (gn == gio->io_gang_tree) 2319770499e1SDan Kimmel ASSERT3U(gio->io_size, ==, offset); 2320e05725b1Sbonwick 2321e14bb325SJeff Bonwick if (zio != pio) 2322e14bb325SJeff Bonwick zio_nowait(zio); 2323fa9e4066Sahrens } 2324fa9e4066Sahrens 2325e05725b1Sbonwick static int 2326e14bb325SJeff Bonwick zio_gang_assemble(zio_t *zio) 2327fa9e4066Sahrens { 2328e14bb325SJeff Bonwick blkptr_t *bp = zio->io_bp; 2329fa9e4066Sahrens 2330f5383399SBill Moore ASSERT(BP_IS_GANG(bp) && zio->io_gang_leader == NULL); 2331f5383399SBill Moore ASSERT(zio->io_child_type > ZIO_CHILD_GANG); 2332f5383399SBill Moore 2333f5383399SBill Moore zio->io_gang_leader = zio; 2334fa9e4066Sahrens 2335e14bb325SJeff Bonwick zio_gang_tree_assemble(zio, bp, &zio->io_gang_tree); 2336e05725b1Sbonwick 2337e05725b1Sbonwick return (ZIO_PIPELINE_CONTINUE); 2338fa9e4066Sahrens } 2339fa9e4066Sahrens 2340e05725b1Sbonwick static int 2341e14bb325SJeff Bonwick zio_gang_issue(zio_t *zio) 2342fa9e4066Sahrens { 2343e14bb325SJeff Bonwick blkptr_t *bp = zio->io_bp; 2344fa9e4066Sahrens 2345d6e1c446SGeorge Wilson if (zio_wait_for_children(zio, ZIO_CHILD_GANG_BIT, ZIO_WAIT_DONE)) { 2346e14bb325SJeff Bonwick return (ZIO_PIPELINE_STOP); 2347d6e1c446SGeorge Wilson } 2348fa9e4066Sahrens 2349f5383399SBill Moore ASSERT(BP_IS_GANG(bp) && zio->io_gang_leader == zio); 2350f5383399SBill Moore ASSERT(zio->io_child_type > ZIO_CHILD_GANG); 2351fa9e4066Sahrens 2352e14bb325SJeff Bonwick if (zio->io_child_error[ZIO_CHILD_GANG] == 0) 2353770499e1SDan Kimmel zio_gang_tree_issue(zio, zio->io_gang_tree, bp, zio->io_abd, 2354770499e1SDan Kimmel 0); 2355e14bb325SJeff Bonwick else 2356f5383399SBill Moore zio_gang_tree_free(&zio->io_gang_tree); 2357fa9e4066Sahrens 2358e14bb325SJeff Bonwick zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; 2359e05725b1Sbonwick 2360e05725b1Sbonwick return (ZIO_PIPELINE_CONTINUE); 2361fa9e4066Sahrens } 2362fa9e4066Sahrens 2363fa9e4066Sahrens static void 2364e14bb325SJeff Bonwick zio_write_gang_member_ready(zio_t *zio) 2365fa9e4066Sahrens { 2366a3f829aeSBill Moore zio_t *pio = zio_unique_parent(zio); 2367f5383399SBill Moore zio_t *gio = zio->io_gang_leader; 
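	/*
	 * The loop below folds each child DVA's allocated size into the
	 * matching DVA of the gang header (the parent bp), so a gang
	 * DVA's asize ends up covering the entire subtree beneath it.
	 */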
236844cd46caSbillm dva_t *cdva = zio->io_bp->blk_dva; 236944cd46caSbillm dva_t *pdva = pio->io_bp->blk_dva; 2370fa9e4066Sahrens uint64_t asize; 2371fa9e4066Sahrens 2372e14bb325SJeff Bonwick if (BP_IS_HOLE(zio->io_bp)) 2373e14bb325SJeff Bonwick return; 2374e14bb325SJeff Bonwick 2375e14bb325SJeff Bonwick ASSERT(BP_IS_HOLE(&zio->io_bp_orig)); 2376e14bb325SJeff Bonwick 2377e14bb325SJeff Bonwick ASSERT(zio->io_child_type == ZIO_CHILD_GANG); 2378b24ab676SJeff Bonwick ASSERT3U(zio->io_prop.zp_copies, ==, gio->io_prop.zp_copies); 2379b24ab676SJeff Bonwick ASSERT3U(zio->io_prop.zp_copies, <=, BP_GET_NDVAS(zio->io_bp)); 2380b24ab676SJeff Bonwick ASSERT3U(pio->io_prop.zp_copies, <=, BP_GET_NDVAS(pio->io_bp)); 238144cd46caSbillm ASSERT3U(BP_GET_NDVAS(zio->io_bp), <=, BP_GET_NDVAS(pio->io_bp)); 2382fa9e4066Sahrens 2383fa9e4066Sahrens mutex_enter(&pio->io_lock); 2384e14bb325SJeff Bonwick for (int d = 0; d < BP_GET_NDVAS(zio->io_bp); d++) { 238544cd46caSbillm ASSERT(DVA_GET_GANG(&pdva[d])); 238644cd46caSbillm asize = DVA_GET_ASIZE(&pdva[d]); 238744cd46caSbillm asize += DVA_GET_ASIZE(&cdva[d]); 238844cd46caSbillm DVA_SET_ASIZE(&pdva[d], asize); 238944cd46caSbillm } 2390fa9e4066Sahrens mutex_exit(&pio->io_lock); 2391fa9e4066Sahrens } 2392fa9e4066Sahrens 2393770499e1SDan Kimmel static void 2394770499e1SDan Kimmel zio_write_gang_done(zio_t *zio) 2395770499e1SDan Kimmel { 23967341a7deSBrad Lewis /* 23977341a7deSBrad Lewis * The io_abd field will be NULL for a zio with no data. The io_flags 23987341a7deSBrad Lewis * will initially have the ZIO_FLAG_NODATA bit flag set, but we can't 23997341a7deSBrad Lewis * check for it here as it is cleared in zio_ready. 24007341a7deSBrad Lewis */ 24017341a7deSBrad Lewis if (zio->io_abd != NULL) 24027341a7deSBrad Lewis abd_put(zio->io_abd); 2403770499e1SDan Kimmel } 2404770499e1SDan Kimmel 24050a4e9518Sgw static int 2406e14bb325SJeff Bonwick zio_write_gang_block(zio_t *pio) 2407fa9e4066Sahrens { 2408e14bb325SJeff Bonwick spa_t *spa = pio->io_spa; 24090f7643c7SGeorge Wilson metaslab_class_t *mc = spa_normal_class(spa); 2410e14bb325SJeff Bonwick blkptr_t *bp = pio->io_bp; 2411f5383399SBill Moore zio_t *gio = pio->io_gang_leader; 2412e14bb325SJeff Bonwick zio_t *zio; 2413e14bb325SJeff Bonwick zio_gang_node_t *gn, **gnpp; 2414fa9e4066Sahrens zio_gbh_phys_t *gbh; 2415770499e1SDan Kimmel abd_t *gbh_abd; 2416e14bb325SJeff Bonwick uint64_t txg = pio->io_txg; 2417e14bb325SJeff Bonwick uint64_t resid = pio->io_size; 2418e14bb325SJeff Bonwick uint64_t lsize; 2419b24ab676SJeff Bonwick int copies = gio->io_prop.zp_copies; 2420b24ab676SJeff Bonwick int gbh_copies = MIN(copies + 1, spa_max_replication(spa)); 2421e14bb325SJeff Bonwick zio_prop_t zp; 2422fa9e4066Sahrens int error; 24237341a7deSBrad Lewis boolean_t has_data = !(pio->io_flags & ZIO_FLAG_NODATA); 2424fa9e4066Sahrens 2425eb633035STom Caputi /* 2426eb633035STom Caputi * encrypted blocks need DVA[2] free so encrypted gang headers can't 2427eb633035STom Caputi * have a third copy. 
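	 * (The salt and most of the IV of an encrypted bp are stored in
	 * the space DVA[2] would otherwise occupy, so that slot can never
	 * hold a real allocation.)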
2428eb633035STom Caputi */ 2429eb633035STom Caputi if (gio->io_prop.zp_encrypt && gbh_copies >= SPA_DVAS_PER_BP) 2430eb633035STom Caputi gbh_copies = SPA_DVAS_PER_BP - 1; 2431eb633035STom Caputi 24320f7643c7SGeorge Wilson int flags = METASLAB_HINTBP_FAVOR | METASLAB_GANG_HEADER; 24330f7643c7SGeorge Wilson if (pio->io_flags & ZIO_FLAG_IO_ALLOCATING) { 24340f7643c7SGeorge Wilson ASSERT(pio->io_priority == ZIO_PRIORITY_ASYNC_WRITE); 24357341a7deSBrad Lewis ASSERT(has_data); 24360f7643c7SGeorge Wilson 24370f7643c7SGeorge Wilson flags |= METASLAB_ASYNC_ALLOC; 2438e914ace2STim Schumacher VERIFY(zfs_refcount_held(&mc->mc_alloc_slots[pio->io_allocator], 2439f78cdc34SPaul Dagnelie pio)); 24400f7643c7SGeorge Wilson 24410f7643c7SGeorge Wilson /* 24420f7643c7SGeorge Wilson * The logical zio has already placed a reservation for 24430f7643c7SGeorge Wilson * 'copies' allocation slots but gang blocks may require 24440f7643c7SGeorge Wilson * additional copies. These additional copies 24450f7643c7SGeorge Wilson * (i.e. gbh_copies - copies) are guaranteed to succeed 24460f7643c7SGeorge Wilson * since metaslab_class_throttle_reserve() always allows 24470f7643c7SGeorge Wilson * additional reservations for gang blocks. 24480f7643c7SGeorge Wilson */ 24490f7643c7SGeorge Wilson VERIFY(metaslab_class_throttle_reserve(mc, gbh_copies - copies, 2450f78cdc34SPaul Dagnelie pio->io_allocator, pio, flags)); 24510f7643c7SGeorge Wilson } 24520f7643c7SGeorge Wilson 24530f7643c7SGeorge Wilson error = metaslab_alloc(spa, mc, SPA_GANGBLOCKSIZE, 24548363e80aSGeorge Wilson bp, gbh_copies, txg, pio == gio ? NULL : gio->io_bp, flags, 2455f78cdc34SPaul Dagnelie &pio->io_alloc_list, pio, pio->io_allocator); 2456e05725b1Sbonwick if (error) { 24570f7643c7SGeorge Wilson if (pio->io_flags & ZIO_FLAG_IO_ALLOCATING) { 24580f7643c7SGeorge Wilson ASSERT(pio->io_priority == ZIO_PRIORITY_ASYNC_WRITE); 24597341a7deSBrad Lewis ASSERT(has_data); 24600f7643c7SGeorge Wilson 24610f7643c7SGeorge Wilson /* 24620f7643c7SGeorge Wilson * If we failed to allocate the gang block header then 24630f7643c7SGeorge Wilson * we remove any additional allocation reservations that 24640f7643c7SGeorge Wilson * we placed here. The original reservation will 24650f7643c7SGeorge Wilson * be removed when the logical I/O goes to the ready 24660f7643c7SGeorge Wilson * stage. 24670f7643c7SGeorge Wilson */ 24680f7643c7SGeorge Wilson metaslab_class_throttle_unreserve(mc, 2469f78cdc34SPaul Dagnelie gbh_copies - copies, pio->io_allocator, pio); 24700f7643c7SGeorge Wilson } 2471e14bb325SJeff Bonwick pio->io_error = error; 2472e05725b1Sbonwick return (ZIO_PIPELINE_CONTINUE); 2473e05725b1Sbonwick } 2474fa9e4066Sahrens 2475f5383399SBill Moore if (pio == gio) { 2476f5383399SBill Moore gnpp = &gio->io_gang_tree; 2477e14bb325SJeff Bonwick } else { 2478e14bb325SJeff Bonwick gnpp = pio->io_private; 2479e14bb325SJeff Bonwick ASSERT(pio->io_ready == zio_write_gang_member_ready); 2480fa9e4066Sahrens } 2481fa9e4066Sahrens 2482e14bb325SJeff Bonwick gn = zio_gang_node_alloc(gnpp); 2483e14bb325SJeff Bonwick gbh = gn->gn_gbh; 2484e14bb325SJeff Bonwick bzero(gbh, SPA_GANGBLOCKSIZE); 2485770499e1SDan Kimmel gbh_abd = abd_get_from_buf(gbh, SPA_GANGBLOCKSIZE); 2486fa9e4066Sahrens 2487e14bb325SJeff Bonwick /* 2488e14bb325SJeff Bonwick * Create the gang header. 
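	 * The header is a single SPA_GANGBLOCKSIZE (512-byte) block whose
	 * zio_gbh_phys_t holds up to SPA_GBH_NBLKPTRS child blkptrs. The
	 * loop further below splits the residual write across the
	 * remaining slots, rounding each piece up to SPA_MINBLOCKSIZE.
	 * Illustrative arithmetic for a hypothetical 100K (102400-byte)
	 * write with SPA_GBH_NBLKPTRS == 3:
	 *
	 *   g=0: lsize = P2ROUNDUP(102400 / 3, 512) = 34304, resid = 68096
	 *   g=1: lsize = P2ROUNDUP(68096 / 2, 512) = 34304, resid = 33792
	 *   g=2: lsize = P2ROUNDUP(33792 / 1, 512) = 33792, resid = 0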
2489e14bb325SJeff Bonwick */ 2490770499e1SDan Kimmel zio = zio_rewrite(pio, spa, txg, bp, gbh_abd, SPA_GANGBLOCKSIZE, 2491770499e1SDan Kimmel zio_write_gang_done, NULL, pio->io_priority, 2492770499e1SDan Kimmel ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark); 2493fa9e4066Sahrens 2494e14bb325SJeff Bonwick /* 2495e14bb325SJeff Bonwick * Create and nowait the gang children. 2496e14bb325SJeff Bonwick */ 2497e14bb325SJeff Bonwick for (int g = 0; resid != 0; resid -= lsize, g++) { 2498e14bb325SJeff Bonwick lsize = P2ROUNDUP(resid / (SPA_GBH_NBLKPTRS - g), 2499e14bb325SJeff Bonwick SPA_MINBLOCKSIZE); 2500e14bb325SJeff Bonwick ASSERT(lsize >= SPA_MINBLOCKSIZE && lsize <= resid); 2501e14bb325SJeff Bonwick 2502f5383399SBill Moore zp.zp_checksum = gio->io_prop.zp_checksum; 2503e14bb325SJeff Bonwick zp.zp_compress = ZIO_COMPRESS_OFF; 2504e14bb325SJeff Bonwick zp.zp_type = DMU_OT_NONE; 2505e14bb325SJeff Bonwick zp.zp_level = 0; 2506b24ab676SJeff Bonwick zp.zp_copies = gio->io_prop.zp_copies; 250780901aeaSGeorge Wilson zp.zp_dedup = B_FALSE; 250880901aeaSGeorge Wilson zp.zp_dedup_verify = B_FALSE; 250980901aeaSGeorge Wilson zp.zp_nopwrite = B_FALSE; 2510eb633035STom Caputi zp.zp_encrypt = gio->io_prop.zp_encrypt; 2511eb633035STom Caputi zp.zp_byteorder = gio->io_prop.zp_byteorder; 2512eb633035STom Caputi bzero(zp.zp_salt, ZIO_DATA_SALT_LEN); 2513eb633035STom Caputi bzero(zp.zp_iv, ZIO_DATA_IV_LEN); 2514eb633035STom Caputi bzero(zp.zp_mac, ZIO_DATA_MAC_LEN); 2515e14bb325SJeff Bonwick 25160f7643c7SGeorge Wilson zio_t *cio = zio_write(zio, spa, txg, &gbh->zg_blkptr[g], 25177341a7deSBrad Lewis has_data ? abd_get_offset(pio->io_abd, pio->io_size - 25187341a7deSBrad Lewis resid) : NULL, lsize, lsize, &zp, 25197341a7deSBrad Lewis zio_write_gang_member_ready, NULL, NULL, 2520770499e1SDan Kimmel zio_write_gang_done, &gn->gn_child[g], pio->io_priority, 25210f7643c7SGeorge Wilson ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark); 25220f7643c7SGeorge Wilson 25230f7643c7SGeorge Wilson if (pio->io_flags & ZIO_FLAG_IO_ALLOCATING) { 25240f7643c7SGeorge Wilson ASSERT(pio->io_priority == ZIO_PRIORITY_ASYNC_WRITE); 25257341a7deSBrad Lewis ASSERT(has_data); 25260f7643c7SGeorge Wilson 25270f7643c7SGeorge Wilson /* 25280f7643c7SGeorge Wilson * Gang children won't throttle but we should 25290f7643c7SGeorge Wilson * account for their work, so reserve an allocation 25300f7643c7SGeorge Wilson * slot for them here. 25310f7643c7SGeorge Wilson */ 25320f7643c7SGeorge Wilson VERIFY(metaslab_class_throttle_reserve(mc, 2533f78cdc34SPaul Dagnelie zp.zp_copies, cio->io_allocator, cio, flags)); 25340f7643c7SGeorge Wilson } 25350f7643c7SGeorge Wilson zio_nowait(cio); 2536e14bb325SJeff Bonwick } 2537e05725b1Sbonwick 253844cd46caSbillm /* 2539e14bb325SJeff Bonwick * Set pio's pipeline to just wait for zio to finish. 254044cd46caSbillm */ 2541e14bb325SJeff Bonwick pio->io_pipeline = ZIO_INTERLOCK_PIPELINE; 2542e14bb325SJeff Bonwick 2543e14bb325SJeff Bonwick zio_nowait(zio); 2544e14bb325SJeff Bonwick 2545e14bb325SJeff Bonwick return (ZIO_PIPELINE_CONTINUE); 2546fa9e4066Sahrens } 2547fa9e4066Sahrens 254880901aeaSGeorge Wilson /* 254945818ee1SMatthew Ahrens * The zio_nop_write stage in the pipeline determines if allocating a 255045818ee1SMatthew Ahrens * new bp is necessary. The nopwrite feature can handle writes in 255145818ee1SMatthew Ahrens * either syncing or open context (i.e. zil writes) and as a result is 255245818ee1SMatthew Ahrens * mutually exclusive with dedup. 
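 * The classic beneficiary is a workload that rewrites files with
 * largely unchanged contents (e.g. some backup tools): blocks whose
 * checksums match their on-disk copies keep their existing allocation.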
255345818ee1SMatthew Ahrens * 255445818ee1SMatthew Ahrens * By leveraging a cryptographically secure checksum, such as SHA256, we 255545818ee1SMatthew Ahrens * can compare the checksums of the new data and the old to determine if 255645818ee1SMatthew Ahrens * allocating a new block is required. Note that our requirements for 255745818ee1SMatthew Ahrens * cryptographic strength are fairly weak: there can't be any accidental 255845818ee1SMatthew Ahrens * hash collisions, but we don't need to be secure against intentional 255945818ee1SMatthew Ahrens * (malicious) collisions. To trigger a nopwrite, you have to be able 256045818ee1SMatthew Ahrens * to write the file to begin with, and triggering an incorrect (hash 256145818ee1SMatthew Ahrens * collision) nopwrite is no worse than simply writing to the file. 256245818ee1SMatthew Ahrens * That said, there are no known attacks against the checksum algorithms 256345818ee1SMatthew Ahrens * used for nopwrite, assuming that the salt and the checksums 256445818ee1SMatthew Ahrens * themselves remain secret. 256580901aeaSGeorge Wilson */ 256680901aeaSGeorge Wilson static int 256780901aeaSGeorge Wilson zio_nop_write(zio_t *zio) 256880901aeaSGeorge Wilson { 256980901aeaSGeorge Wilson blkptr_t *bp = zio->io_bp; 257080901aeaSGeorge Wilson blkptr_t *bp_orig = &zio->io_bp_orig; 257180901aeaSGeorge Wilson zio_prop_t *zp = &zio->io_prop; 257280901aeaSGeorge Wilson 257380901aeaSGeorge Wilson ASSERT(BP_GET_LEVEL(bp) == 0); 257480901aeaSGeorge Wilson ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE)); 257580901aeaSGeorge Wilson ASSERT(zp->zp_nopwrite); 257680901aeaSGeorge Wilson ASSERT(!zp->zp_dedup); 257780901aeaSGeorge Wilson ASSERT(zio->io_bp_override == NULL); 257880901aeaSGeorge Wilson ASSERT(IO_IS_ALLOCATING(zio)); 257980901aeaSGeorge Wilson 258080901aeaSGeorge Wilson /* 258180901aeaSGeorge Wilson * Check to see if the original bp and the new bp have matching 258280901aeaSGeorge Wilson * characteristics (i.e. same checksum, compression algorithms, etc). 258380901aeaSGeorge Wilson * If they don't then just continue with the pipeline which will 258480901aeaSGeorge Wilson * allocate a new bp. 258580901aeaSGeorge Wilson */ 258680901aeaSGeorge Wilson if (BP_IS_HOLE(bp_orig) || 258745818ee1SMatthew Ahrens !(zio_checksum_table[BP_GET_CHECKSUM(bp)].ci_flags & 258845818ee1SMatthew Ahrens ZCHECKSUM_FLAG_NOPWRITE) || 2589eb633035STom Caputi BP_IS_ENCRYPTED(bp) || BP_IS_ENCRYPTED(bp_orig) || 259080901aeaSGeorge Wilson BP_GET_CHECKSUM(bp) != BP_GET_CHECKSUM(bp_orig) || 259180901aeaSGeorge Wilson BP_GET_COMPRESS(bp) != BP_GET_COMPRESS(bp_orig) || 259280901aeaSGeorge Wilson BP_GET_DEDUP(bp) != BP_GET_DEDUP(bp_orig) || 259380901aeaSGeorge Wilson zp->zp_copies != BP_GET_NDVAS(bp_orig)) 259480901aeaSGeorge Wilson return (ZIO_PIPELINE_CONTINUE); 259580901aeaSGeorge Wilson 259680901aeaSGeorge Wilson /* 259780901aeaSGeorge Wilson * If the checksums match then reset the pipeline so that we 259880901aeaSGeorge Wilson * avoid allocating a new bp and issuing any I/O. 
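	 * The old bp is copied over the new one wholesale, so its birth
	 * txg survives, and ZIO_FLAG_NOPWRITE signals to the done
	 * callbacks that the existing on-disk block was kept.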
259980901aeaSGeorge Wilson */ 260080901aeaSGeorge Wilson if (ZIO_CHECKSUM_EQUAL(bp->blk_cksum, bp_orig->blk_cksum)) { 260145818ee1SMatthew Ahrens ASSERT(zio_checksum_table[zp->zp_checksum].ci_flags & 260245818ee1SMatthew Ahrens ZCHECKSUM_FLAG_NOPWRITE); 260380901aeaSGeorge Wilson ASSERT3U(BP_GET_PSIZE(bp), ==, BP_GET_PSIZE(bp_orig)); 260480901aeaSGeorge Wilson ASSERT3U(BP_GET_LSIZE(bp), ==, BP_GET_LSIZE(bp_orig)); 260580901aeaSGeorge Wilson ASSERT(zp->zp_compress != ZIO_COMPRESS_OFF); 260680901aeaSGeorge Wilson ASSERT(bcmp(&bp->blk_prop, &bp_orig->blk_prop, 260780901aeaSGeorge Wilson sizeof (uint64_t)) == 0); 260880901aeaSGeorge Wilson 260980901aeaSGeorge Wilson *bp = *bp_orig; 261080901aeaSGeorge Wilson zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; 261180901aeaSGeorge Wilson zio->io_flags |= ZIO_FLAG_NOPWRITE; 261280901aeaSGeorge Wilson } 261380901aeaSGeorge Wilson 261480901aeaSGeorge Wilson return (ZIO_PIPELINE_CONTINUE); 261580901aeaSGeorge Wilson } 261680901aeaSGeorge Wilson 2617fa9e4066Sahrens /* 2618fa9e4066Sahrens * ========================================================================== 2619b24ab676SJeff Bonwick * Dedup 2620fa9e4066Sahrens * ========================================================================== 2621fa9e4066Sahrens */ 2622b24ab676SJeff Bonwick static void 2623b24ab676SJeff Bonwick zio_ddt_child_read_done(zio_t *zio) 2624b24ab676SJeff Bonwick { 2625b24ab676SJeff Bonwick blkptr_t *bp = zio->io_bp; 2626b24ab676SJeff Bonwick ddt_entry_t *dde = zio->io_private; 2627b24ab676SJeff Bonwick ddt_phys_t *ddp; 2628b24ab676SJeff Bonwick zio_t *pio = zio_unique_parent(zio); 2629b24ab676SJeff Bonwick 2630b24ab676SJeff Bonwick mutex_enter(&pio->io_lock); 2631b24ab676SJeff Bonwick ddp = ddt_phys_select(dde, bp); 2632b24ab676SJeff Bonwick if (zio->io_error == 0) 2633b24ab676SJeff Bonwick ddt_phys_clear(ddp); /* this ddp doesn't need repair */ 2634770499e1SDan Kimmel 2635770499e1SDan Kimmel if (zio->io_error == 0 && dde->dde_repair_abd == NULL) 2636770499e1SDan Kimmel dde->dde_repair_abd = zio->io_abd; 2637b24ab676SJeff Bonwick else 2638770499e1SDan Kimmel abd_free(zio->io_abd); 2639b24ab676SJeff Bonwick mutex_exit(&pio->io_lock); 2640b24ab676SJeff Bonwick } 2641b24ab676SJeff Bonwick 2642b24ab676SJeff Bonwick static int 2643b24ab676SJeff Bonwick zio_ddt_read_start(zio_t *zio) 2644b24ab676SJeff Bonwick { 2645b24ab676SJeff Bonwick blkptr_t *bp = zio->io_bp; 2646b24ab676SJeff Bonwick 2647b24ab676SJeff Bonwick ASSERT(BP_GET_DEDUP(bp)); 2648b24ab676SJeff Bonwick ASSERT(BP_GET_PSIZE(bp) == zio->io_size); 2649b24ab676SJeff Bonwick ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); 2650b24ab676SJeff Bonwick 2651b24ab676SJeff Bonwick if (zio->io_child_error[ZIO_CHILD_DDT]) { 2652b24ab676SJeff Bonwick ddt_t *ddt = ddt_select(zio->io_spa, bp); 2653b24ab676SJeff Bonwick ddt_entry_t *dde = ddt_repair_start(ddt, bp); 2654b24ab676SJeff Bonwick ddt_phys_t *ddp = dde->dde_phys; 2655b24ab676SJeff Bonwick ddt_phys_t *ddp_self = ddt_phys_select(dde, bp); 2656b24ab676SJeff Bonwick blkptr_t blk; 2657b24ab676SJeff Bonwick 2658b24ab676SJeff Bonwick ASSERT(zio->io_vsd == NULL); 2659b24ab676SJeff Bonwick zio->io_vsd = dde; 2660b24ab676SJeff Bonwick 2661b24ab676SJeff Bonwick if (ddp_self == NULL) 2662b24ab676SJeff Bonwick return (ZIO_PIPELINE_CONTINUE); 2663b24ab676SJeff Bonwick 2664b24ab676SJeff Bonwick for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) { 2665b24ab676SJeff Bonwick if (ddp->ddp_phys_birth == 0 || ddp == ddp_self) 2666b24ab676SJeff Bonwick continue; 2667bbfd46c4SJeff Bonwick 
ddt_bp_create(ddt->ddt_checksum, &dde->dde_key, ddp, 2668bbfd46c4SJeff Bonwick &blk); 2669b24ab676SJeff Bonwick zio_nowait(zio_read(zio, zio->io_spa, &blk, 2670770499e1SDan Kimmel abd_alloc_for_io(zio->io_size, B_TRUE), 2671770499e1SDan Kimmel zio->io_size, zio_ddt_child_read_done, dde, 2672770499e1SDan Kimmel zio->io_priority, ZIO_DDT_CHILD_FLAGS(zio) | 2673770499e1SDan Kimmel ZIO_FLAG_DONT_PROPAGATE, &zio->io_bookmark)); 2674b24ab676SJeff Bonwick } 2675b24ab676SJeff Bonwick return (ZIO_PIPELINE_CONTINUE); 2676b24ab676SJeff Bonwick } 2677b24ab676SJeff Bonwick 2678b24ab676SJeff Bonwick zio_nowait(zio_read(zio, zio->io_spa, bp, 2679770499e1SDan Kimmel zio->io_abd, zio->io_size, NULL, NULL, zio->io_priority, 2680b24ab676SJeff Bonwick ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark)); 2681b24ab676SJeff Bonwick 2682b24ab676SJeff Bonwick return (ZIO_PIPELINE_CONTINUE); 2683b24ab676SJeff Bonwick } 2684e14bb325SJeff Bonwick 2685b24ab676SJeff Bonwick static int 2686b24ab676SJeff Bonwick zio_ddt_read_done(zio_t *zio) 2687b24ab676SJeff Bonwick { 2688b24ab676SJeff Bonwick blkptr_t *bp = zio->io_bp; 2689b24ab676SJeff Bonwick 2690d6e1c446SGeorge Wilson if (zio_wait_for_children(zio, ZIO_CHILD_DDT_BIT, ZIO_WAIT_DONE)) { 2691b24ab676SJeff Bonwick return (ZIO_PIPELINE_STOP); 2692d6e1c446SGeorge Wilson } 2693b24ab676SJeff Bonwick 2694b24ab676SJeff Bonwick ASSERT(BP_GET_DEDUP(bp)); 2695b24ab676SJeff Bonwick ASSERT(BP_GET_PSIZE(bp) == zio->io_size); 2696b24ab676SJeff Bonwick ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); 2697b24ab676SJeff Bonwick 2698b24ab676SJeff Bonwick if (zio->io_child_error[ZIO_CHILD_DDT]) { 2699b24ab676SJeff Bonwick ddt_t *ddt = ddt_select(zio->io_spa, bp); 2700b24ab676SJeff Bonwick ddt_entry_t *dde = zio->io_vsd; 2701b24ab676SJeff Bonwick if (ddt == NULL) { 2702b16da2e2SGeorge Wilson ASSERT(spa_load_state(zio->io_spa) != SPA_LOAD_NONE); 2703b24ab676SJeff Bonwick return (ZIO_PIPELINE_CONTINUE); 2704b24ab676SJeff Bonwick } 2705b24ab676SJeff Bonwick if (dde == NULL) { 2706b24ab676SJeff Bonwick zio->io_stage = ZIO_STAGE_DDT_READ_START >> 1; 270735a5a358SJonathan Adams zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE); 2708b24ab676SJeff Bonwick return (ZIO_PIPELINE_STOP); 2709b24ab676SJeff Bonwick } 2710770499e1SDan Kimmel if (dde->dde_repair_abd != NULL) { 2711770499e1SDan Kimmel abd_copy(zio->io_abd, dde->dde_repair_abd, 2712770499e1SDan Kimmel zio->io_size); 2713b24ab676SJeff Bonwick zio->io_child_error[ZIO_CHILD_DDT] = 0; 2714b24ab676SJeff Bonwick } 2715b24ab676SJeff Bonwick ddt_repair_done(ddt, dde); 2716b24ab676SJeff Bonwick zio->io_vsd = NULL; 2717b24ab676SJeff Bonwick } 2718b24ab676SJeff Bonwick 2719b24ab676SJeff Bonwick ASSERT(zio->io_vsd == NULL); 2720b24ab676SJeff Bonwick 2721b24ab676SJeff Bonwick return (ZIO_PIPELINE_CONTINUE); 2722b24ab676SJeff Bonwick } 2723b24ab676SJeff Bonwick 2724b24ab676SJeff Bonwick static boolean_t 2725b24ab676SJeff Bonwick zio_ddt_collision(zio_t *zio, ddt_t *ddt, ddt_entry_t *dde) 2726b24ab676SJeff Bonwick { 2727b24ab676SJeff Bonwick spa_t *spa = zio->io_spa; 2728eb633035STom Caputi boolean_t do_raw = !!(zio->io_flags & ZIO_FLAG_RAW); 27295602294fSDan Kimmel 27305602294fSDan Kimmel /* We should never get a raw, override zio */ 27315602294fSDan Kimmel ASSERT(!(zio->io_bp_override && do_raw)); 2732b24ab676SJeff Bonwick 2733b24ab676SJeff Bonwick /* 2734b24ab676SJeff Bonwick * Note: we compare the original data, not the transformed data, 2735b24ab676SJeff Bonwick * because when zio->io_bp is an override bp, we will not have 2736b24ab676SJeff Bonwick * 
pushed the I/O transforms. That's an important optimization 2737b24ab676SJeff Bonwick * because otherwise we'd compress/encrypt all dmu_sync() data twice. 2738eb633035STom Caputi * However, we should never get a raw, override zio so in these 2739eb633035STom Caputi * cases we can compare the io_abd directly. This is useful because 2740eb633035STom Caputi * it allows us to do dedup verification even if we don't have access 2741eb633035STom Caputi * to the original data (for instance, if the encryption keys aren't 2742eb633035STom Caputi * loaded). 2743b24ab676SJeff Bonwick */ 2744eb633035STom Caputi 2745b24ab676SJeff Bonwick for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) { 2746b24ab676SJeff Bonwick zio_t *lio = dde->dde_lead_zio[p]; 2747b24ab676SJeff Bonwick 2748eb633035STom Caputi if (lio != NULL && do_raw) { 2749eb633035STom Caputi return (lio->io_size != zio->io_size || 2750eb633035STom Caputi abd_cmp(zio->io_abd, lio->io_abd, 2751eb633035STom Caputi zio->io_size) != 0); 2752eb633035STom Caputi } else if (lio != NULL) { 2753b24ab676SJeff Bonwick return (lio->io_orig_size != zio->io_orig_size || 2754770499e1SDan Kimmel abd_cmp(zio->io_orig_abd, lio->io_orig_abd, 2755b24ab676SJeff Bonwick zio->io_orig_size) != 0); 2756b24ab676SJeff Bonwick } 2757b24ab676SJeff Bonwick } 2758b24ab676SJeff Bonwick 2759b24ab676SJeff Bonwick for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) { 2760b24ab676SJeff Bonwick ddt_phys_t *ddp = &dde->dde_phys[p]; 2761b24ab676SJeff Bonwick 2762eb633035STom Caputi if (ddp->ddp_phys_birth != 0 && do_raw) { 2763eb633035STom Caputi blkptr_t blk = *zio->io_bp; 2764eb633035STom Caputi uint64_t psize; 2765eb633035STom Caputi abd_t *tmpabd; 2766eb633035STom Caputi int error; 2767eb633035STom Caputi 2768eb633035STom Caputi ddt_bp_fill(ddp, &blk, ddp->ddp_phys_birth); 2769eb633035STom Caputi psize = BP_GET_PSIZE(&blk); 2770eb633035STom Caputi 2771eb633035STom Caputi if (psize != zio->io_size) 2772eb633035STom Caputi return (B_TRUE); 2773eb633035STom Caputi 2774eb633035STom Caputi ddt_exit(ddt); 2775eb633035STom Caputi 2776eb633035STom Caputi tmpabd = abd_alloc_for_io(psize, B_TRUE); 2777eb633035STom Caputi 2778eb633035STom Caputi error = zio_wait(zio_read(NULL, spa, &blk, tmpabd, 2779eb633035STom Caputi psize, NULL, NULL, ZIO_PRIORITY_SYNC_READ, 2780eb633035STom Caputi ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | 2781eb633035STom Caputi ZIO_FLAG_RAW, &zio->io_bookmark)); 2782eb633035STom Caputi 2783eb633035STom Caputi if (error == 0) { 2784eb633035STom Caputi if (abd_cmp(tmpabd, zio->io_abd, psize) != 0) 2785eb633035STom Caputi error = SET_ERROR(ENOENT); 2786eb633035STom Caputi } 2787eb633035STom Caputi 2788eb633035STom Caputi abd_free(tmpabd); 2789eb633035STom Caputi ddt_enter(ddt); 2790eb633035STom Caputi return (error != 0); 2791eb633035STom Caputi } else if (ddp->ddp_phys_birth != 0) { 2792b24ab676SJeff Bonwick arc_buf_t *abuf = NULL; 27937adb730bSGeorge Wilson arc_flags_t aflags = ARC_FLAG_WAIT; 27945602294fSDan Kimmel int zio_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE; 2795b24ab676SJeff Bonwick blkptr_t blk = *zio->io_bp; 2796b24ab676SJeff Bonwick int error; 2797b24ab676SJeff Bonwick 2798b24ab676SJeff Bonwick ddt_bp_fill(ddp, &blk, ddp->ddp_phys_birth); 2799b24ab676SJeff Bonwick 2800eb633035STom Caputi if (BP_GET_LSIZE(&blk) != zio->io_orig_size) 2801eb633035STom Caputi return (B_TRUE); 2802eb633035STom Caputi 2803b24ab676SJeff Bonwick ddt_exit(ddt); 2804b24ab676SJeff Bonwick 28055602294fSDan Kimmel /* 28065602294fSDan Kimmel * Intuitively, it would make
more sense to compare 2807770499e1SDan Kimmel * io_abd than io_orig_abd in the raw case since you 28085602294fSDan Kimmel * don't want to look at any transformations that have 28095602294fSDan Kimmel * happened to the data. However, for raw I/Os the 2810770499e1SDan Kimmel * data will actually be the same in io_abd and 2811770499e1SDan Kimmel * io_orig_abd, so all we have to do is issue this as 28125602294fSDan Kimmel * a raw ARC read. 28135602294fSDan Kimmel */ 28145602294fSDan Kimmel if (do_raw) { 28155602294fSDan Kimmel zio_flags |= ZIO_FLAG_RAW; 28165602294fSDan Kimmel ASSERT3U(zio->io_size, ==, zio->io_orig_size); 2817770499e1SDan Kimmel ASSERT0(abd_cmp(zio->io_abd, zio->io_orig_abd, 28185602294fSDan Kimmel zio->io_size)); 28195602294fSDan Kimmel ASSERT3P(zio->io_transform_stack, ==, NULL); 28205602294fSDan Kimmel } 28215602294fSDan Kimmel 28221b912ec7SGeorge Wilson error = arc_read(NULL, spa, &blk, 2823b24ab676SJeff Bonwick arc_getbuf_func, &abuf, ZIO_PRIORITY_SYNC_READ, 28245602294fSDan Kimmel zio_flags, &aflags, &zio->io_bookmark); 2825b24ab676SJeff Bonwick 2826b24ab676SJeff Bonwick if (error == 0) { 2827eb633035STom Caputi if (abd_cmp_buf(zio->io_orig_abd, abuf->b_data, 2828b24ab676SJeff Bonwick zio->io_orig_size) != 0) 2829eb633035STom Caputi error = SET_ERROR(ENOENT); 2830dcbf3bd6SGeorge Wilson arc_buf_destroy(abuf, &abuf); 2831b24ab676SJeff Bonwick } 2832b24ab676SJeff Bonwick 2833b24ab676SJeff Bonwick ddt_enter(ddt); 2834b24ab676SJeff Bonwick return (error != 0); 2835b24ab676SJeff Bonwick } 2836b24ab676SJeff Bonwick } 2837b24ab676SJeff Bonwick 2838b24ab676SJeff Bonwick return (B_FALSE); 2839b24ab676SJeff Bonwick } 2840b24ab676SJeff Bonwick 2841b24ab676SJeff Bonwick static void 2842b24ab676SJeff Bonwick zio_ddt_child_write_ready(zio_t *zio) 2843b24ab676SJeff Bonwick { 2844b24ab676SJeff Bonwick int p = zio->io_prop.zp_copies; 2845b24ab676SJeff Bonwick ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp); 2846b24ab676SJeff Bonwick ddt_entry_t *dde = zio->io_private; 2847b24ab676SJeff Bonwick ddt_phys_t *ddp = &dde->dde_phys[p]; 2848b24ab676SJeff Bonwick zio_t *pio; 2849b24ab676SJeff Bonwick 2850b24ab676SJeff Bonwick if (zio->io_error) 2851b24ab676SJeff Bonwick return; 2852b24ab676SJeff Bonwick 2853b24ab676SJeff Bonwick ddt_enter(ddt); 2854b24ab676SJeff Bonwick 2855b24ab676SJeff Bonwick ASSERT(dde->dde_lead_zio[p] == zio); 2856b24ab676SJeff Bonwick 2857b24ab676SJeff Bonwick ddt_phys_fill(ddp, zio->io_bp); 2858b24ab676SJeff Bonwick 28590f7643c7SGeorge Wilson zio_link_t *zl = NULL; 28600f7643c7SGeorge Wilson while ((pio = zio_walk_parents(zio, &zl)) != NULL) 2861b24ab676SJeff Bonwick ddt_bp_fill(ddp, pio->io_bp, zio->io_txg); 2862b24ab676SJeff Bonwick 2863b24ab676SJeff Bonwick ddt_exit(ddt); 2864b24ab676SJeff Bonwick } 2865b24ab676SJeff Bonwick 2866b24ab676SJeff Bonwick static void 2867b24ab676SJeff Bonwick zio_ddt_child_write_done(zio_t *zio) 2868b24ab676SJeff Bonwick { 2869b24ab676SJeff Bonwick int p = zio->io_prop.zp_copies; 2870b24ab676SJeff Bonwick ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp); 2871b24ab676SJeff Bonwick ddt_entry_t *dde = zio->io_private; 2872b24ab676SJeff Bonwick ddt_phys_t *ddp = &dde->dde_phys[p]; 2873b24ab676SJeff Bonwick 2874b24ab676SJeff Bonwick ddt_enter(ddt); 2875b24ab676SJeff Bonwick 2876b24ab676SJeff Bonwick ASSERT(ddp->ddp_refcnt == 0); 2877b24ab676SJeff Bonwick ASSERT(dde->dde_lead_zio[p] == zio); 2878b24ab676SJeff Bonwick dde->dde_lead_zio[p] = NULL; 2879b24ab676SJeff Bonwick 2880b24ab676SJeff Bonwick if (zio->io_error == 0) { 28810f7643c7SGeorge 
Wilson zio_link_t *zl = NULL; 28820f7643c7SGeorge Wilson while (zio_walk_parents(zio, &zl) != NULL) 2883b24ab676SJeff Bonwick ddt_phys_addref(ddp); 2884b24ab676SJeff Bonwick } else { 2885b24ab676SJeff Bonwick ddt_phys_clear(ddp); 2886b24ab676SJeff Bonwick } 2887b24ab676SJeff Bonwick 2888b24ab676SJeff Bonwick ddt_exit(ddt); 2889b24ab676SJeff Bonwick } 2890b24ab676SJeff Bonwick 2891b24ab676SJeff Bonwick static void 2892b24ab676SJeff Bonwick zio_ddt_ditto_write_done(zio_t *zio) 2893b24ab676SJeff Bonwick { 2894b24ab676SJeff Bonwick int p = DDT_PHYS_DITTO; 2895b24ab676SJeff Bonwick zio_prop_t *zp = &zio->io_prop; 2896b24ab676SJeff Bonwick blkptr_t *bp = zio->io_bp; 2897b24ab676SJeff Bonwick ddt_t *ddt = ddt_select(zio->io_spa, bp); 2898b24ab676SJeff Bonwick ddt_entry_t *dde = zio->io_private; 2899b24ab676SJeff Bonwick ddt_phys_t *ddp = &dde->dde_phys[p]; 2900b24ab676SJeff Bonwick ddt_key_t *ddk = &dde->dde_key; 2901b24ab676SJeff Bonwick 2902b24ab676SJeff Bonwick ddt_enter(ddt); 2903b24ab676SJeff Bonwick 2904b24ab676SJeff Bonwick ASSERT(ddp->ddp_refcnt == 0); 2905b24ab676SJeff Bonwick ASSERT(dde->dde_lead_zio[p] == zio); 2906b24ab676SJeff Bonwick dde->dde_lead_zio[p] = NULL; 2907b24ab676SJeff Bonwick 2908b24ab676SJeff Bonwick if (zio->io_error == 0) { 2909b24ab676SJeff Bonwick ASSERT(ZIO_CHECKSUM_EQUAL(bp->blk_cksum, ddk->ddk_cksum)); 2910b24ab676SJeff Bonwick ASSERT(zp->zp_copies < SPA_DVAS_PER_BP); 2911b24ab676SJeff Bonwick ASSERT(zp->zp_copies == BP_GET_NDVAS(bp) - BP_IS_GANG(bp)); 2912b24ab676SJeff Bonwick if (ddp->ddp_phys_birth != 0) 2913b24ab676SJeff Bonwick ddt_phys_free(ddt, ddk, ddp, zio->io_txg); 2914b24ab676SJeff Bonwick ddt_phys_fill(ddp, bp); 2915b24ab676SJeff Bonwick } 2916b24ab676SJeff Bonwick 2917b24ab676SJeff Bonwick ddt_exit(ddt); 2918b24ab676SJeff Bonwick } 2919b24ab676SJeff Bonwick 2920b24ab676SJeff Bonwick static int 2921b24ab676SJeff Bonwick zio_ddt_write(zio_t *zio) 2922b24ab676SJeff Bonwick { 2923b24ab676SJeff Bonwick spa_t *spa = zio->io_spa; 2924b24ab676SJeff Bonwick blkptr_t *bp = zio->io_bp; 2925b24ab676SJeff Bonwick uint64_t txg = zio->io_txg; 2926b24ab676SJeff Bonwick zio_prop_t *zp = &zio->io_prop; 2927b24ab676SJeff Bonwick int p = zp->zp_copies; 2928b24ab676SJeff Bonwick int ditto_copies; 2929b24ab676SJeff Bonwick zio_t *cio = NULL; 2930b24ab676SJeff Bonwick zio_t *dio = NULL; 2931b24ab676SJeff Bonwick ddt_t *ddt = ddt_select(spa, bp); 2932b24ab676SJeff Bonwick ddt_entry_t *dde; 2933b24ab676SJeff Bonwick ddt_phys_t *ddp; 2934b24ab676SJeff Bonwick 2935b24ab676SJeff Bonwick ASSERT(BP_GET_DEDUP(bp)); 2936b24ab676SJeff Bonwick ASSERT(BP_GET_CHECKSUM(bp) == zp->zp_checksum); 2937b24ab676SJeff Bonwick ASSERT(BP_IS_HOLE(bp) || zio->io_bp_override); 29385602294fSDan Kimmel ASSERT(!(zio->io_bp_override && (zio->io_flags & ZIO_FLAG_RAW))); 2939b24ab676SJeff Bonwick 2940b24ab676SJeff Bonwick ddt_enter(ddt); 2941b24ab676SJeff Bonwick dde = ddt_lookup(ddt, bp, B_TRUE); 2942b24ab676SJeff Bonwick ddp = &dde->dde_phys[p]; 2943b24ab676SJeff Bonwick 2944b24ab676SJeff Bonwick if (zp->zp_dedup_verify && zio_ddt_collision(zio, ddt, dde)) { 2945b24ab676SJeff Bonwick /* 2946b24ab676SJeff Bonwick * If we're using a weak checksum, upgrade to a strong checksum 2947b24ab676SJeff Bonwick * and try again. If we're already using a strong checksum, 2948b24ab676SJeff Bonwick * we can't resolve it, so just convert to an ordinary write. 2949b24ab676SJeff Bonwick * (And automatically e-mail a paper to Nature?) 
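	 * (The upgrade target is spa_dedup_checksum(), i.e. SHA-256, so
	 * this path only fires when dedup=verify is paired with a
	 * checksum lacking ZCHECKSUM_FLAG_DEDUP.)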
2950b24ab676SJeff Bonwick */ 295145818ee1SMatthew Ahrens if (!(zio_checksum_table[zp->zp_checksum].ci_flags & 295245818ee1SMatthew Ahrens ZCHECKSUM_FLAG_DEDUP)) { 2953b24ab676SJeff Bonwick zp->zp_checksum = spa_dedup_checksum(spa); 2954b24ab676SJeff Bonwick zio_pop_transforms(zio); 2955b24ab676SJeff Bonwick zio->io_stage = ZIO_STAGE_OPEN; 2956b24ab676SJeff Bonwick BP_ZERO(bp); 2957b24ab676SJeff Bonwick } else { 295880901aeaSGeorge Wilson zp->zp_dedup = B_FALSE; 29595602294fSDan Kimmel BP_SET_DEDUP(bp, B_FALSE); 2960b24ab676SJeff Bonwick } 29615602294fSDan Kimmel ASSERT(!BP_GET_DEDUP(bp)); 2962b24ab676SJeff Bonwick zio->io_pipeline = ZIO_WRITE_PIPELINE; 2963b24ab676SJeff Bonwick ddt_exit(ddt); 2964b24ab676SJeff Bonwick return (ZIO_PIPELINE_CONTINUE); 2965b24ab676SJeff Bonwick } 2966b24ab676SJeff Bonwick 2967b24ab676SJeff Bonwick ditto_copies = ddt_ditto_copies_needed(ddt, dde, ddp); 2968b24ab676SJeff Bonwick ASSERT(ditto_copies < SPA_DVAS_PER_BP); 2969b24ab676SJeff Bonwick 2970b24ab676SJeff Bonwick if (ditto_copies > ddt_ditto_copies_present(dde) && 2971b24ab676SJeff Bonwick dde->dde_lead_zio[DDT_PHYS_DITTO] == NULL) { 2972b24ab676SJeff Bonwick zio_prop_t czp = *zp; 2973b24ab676SJeff Bonwick 2974b24ab676SJeff Bonwick czp.zp_copies = ditto_copies; 2975b24ab676SJeff Bonwick 2976b24ab676SJeff Bonwick /* 2977b24ab676SJeff Bonwick * If we arrived here with an override bp, we won't have run 2978b24ab676SJeff Bonwick * the transform stack, so we won't have the data we need to 2979b24ab676SJeff Bonwick * generate a child i/o. So, toss the override bp and restart. 2980b24ab676SJeff Bonwick * This is safe, because using the override bp is just an 2981b24ab676SJeff Bonwick * optimization; and it's rare, so the cost doesn't matter. 2982b24ab676SJeff Bonwick */ 2983b24ab676SJeff Bonwick if (zio->io_bp_override) { 2984b24ab676SJeff Bonwick zio_pop_transforms(zio); 2985b24ab676SJeff Bonwick zio->io_stage = ZIO_STAGE_OPEN; 2986b24ab676SJeff Bonwick zio->io_pipeline = ZIO_WRITE_PIPELINE; 2987b24ab676SJeff Bonwick zio->io_bp_override = NULL; 2988b24ab676SJeff Bonwick BP_ZERO(bp); 2989b24ab676SJeff Bonwick ddt_exit(ddt); 2990b24ab676SJeff Bonwick return (ZIO_PIPELINE_CONTINUE); 2991b24ab676SJeff Bonwick } 2992b24ab676SJeff Bonwick 2993770499e1SDan Kimmel dio = zio_write(zio, spa, txg, bp, zio->io_orig_abd, 29945602294fSDan Kimmel zio->io_orig_size, zio->io_orig_size, &czp, NULL, NULL, 29958df0bcf0SPaul Dagnelie NULL, zio_ddt_ditto_write_done, dde, zio->io_priority, 2996b24ab676SJeff Bonwick ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark); 2997b24ab676SJeff Bonwick 2998770499e1SDan Kimmel zio_push_transform(dio, zio->io_abd, zio->io_size, 0, NULL); 2999b24ab676SJeff Bonwick dde->dde_lead_zio[DDT_PHYS_DITTO] = dio; 3000b24ab676SJeff Bonwick } 3001b24ab676SJeff Bonwick 3002b24ab676SJeff Bonwick if (ddp->ddp_phys_birth != 0 || dde->dde_lead_zio[p] != NULL) { 3003b24ab676SJeff Bonwick if (ddp->ddp_phys_birth != 0) 3004b24ab676SJeff Bonwick ddt_bp_fill(ddp, bp, txg); 3005b24ab676SJeff Bonwick if (dde->dde_lead_zio[p] != NULL) 3006b24ab676SJeff Bonwick zio_add_child(zio, dde->dde_lead_zio[p]); 3007b24ab676SJeff Bonwick else 3008b24ab676SJeff Bonwick ddt_phys_addref(ddp); 3009b24ab676SJeff Bonwick } else if (zio->io_bp_override) { 3010b24ab676SJeff Bonwick ASSERT(bp->blk_birth == txg); 3011b24ab676SJeff Bonwick ASSERT(BP_EQUAL(bp, zio->io_bp_override)); 3012b24ab676SJeff Bonwick ddt_phys_fill(ddp, bp); 3013b24ab676SJeff Bonwick ddt_phys_addref(ddp); 3014b24ab676SJeff Bonwick } else { 3015770499e1SDan Kimmel cio = 
zio_write(zio, spa, txg, bp, zio->io_orig_abd, 30165602294fSDan Kimmel zio->io_orig_size, zio->io_orig_size, zp, 30178df0bcf0SPaul Dagnelie zio_ddt_child_write_ready, NULL, NULL, 3018b24ab676SJeff Bonwick zio_ddt_child_write_done, dde, zio->io_priority, 3019b24ab676SJeff Bonwick ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark); 3020b24ab676SJeff Bonwick 3021770499e1SDan Kimmel zio_push_transform(cio, zio->io_abd, zio->io_size, 0, NULL); 3022b24ab676SJeff Bonwick dde->dde_lead_zio[p] = cio; 3023b24ab676SJeff Bonwick } 3024b24ab676SJeff Bonwick 3025b24ab676SJeff Bonwick ddt_exit(ddt); 3026b24ab676SJeff Bonwick 3027b24ab676SJeff Bonwick if (cio) 3028b24ab676SJeff Bonwick zio_nowait(cio); 3029b24ab676SJeff Bonwick if (dio) 3030b24ab676SJeff Bonwick zio_nowait(dio); 3031b24ab676SJeff Bonwick 3032b24ab676SJeff Bonwick return (ZIO_PIPELINE_CONTINUE); 3033b24ab676SJeff Bonwick } 3034b24ab676SJeff Bonwick 30353f9d6ad7SLin Ling ddt_entry_t *freedde; /* for debugging */ 30363f9d6ad7SLin Ling 3037b24ab676SJeff Bonwick static int 3038b24ab676SJeff Bonwick zio_ddt_free(zio_t *zio) 3039b24ab676SJeff Bonwick { 3040b24ab676SJeff Bonwick spa_t *spa = zio->io_spa; 3041b24ab676SJeff Bonwick blkptr_t *bp = zio->io_bp; 3042b24ab676SJeff Bonwick ddt_t *ddt = ddt_select(spa, bp); 3043b24ab676SJeff Bonwick ddt_entry_t *dde; 3044b24ab676SJeff Bonwick ddt_phys_t *ddp; 3045b24ab676SJeff Bonwick 3046b24ab676SJeff Bonwick ASSERT(BP_GET_DEDUP(bp)); 3047b24ab676SJeff Bonwick ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); 3048b24ab676SJeff Bonwick 3049b24ab676SJeff Bonwick ddt_enter(ddt); 30503f9d6ad7SLin Ling freedde = dde = ddt_lookup(ddt, bp, B_TRUE); 3051b24ab676SJeff Bonwick ddp = ddt_phys_select(dde, bp); 3052b24ab676SJeff Bonwick ddt_phys_decref(ddp); 3053b24ab676SJeff Bonwick ddt_exit(ddt); 3054b24ab676SJeff Bonwick 3055b24ab676SJeff Bonwick return (ZIO_PIPELINE_CONTINUE); 3056b24ab676SJeff Bonwick } 3057b24ab676SJeff Bonwick 3058b24ab676SJeff Bonwick /* 3059b24ab676SJeff Bonwick * ========================================================================== 3060b24ab676SJeff Bonwick * Allocate and free blocks 3061b24ab676SJeff Bonwick * ========================================================================== 3062b24ab676SJeff Bonwick */ 30630f7643c7SGeorge Wilson 30640f7643c7SGeorge Wilson static zio_t * 3065f78cdc34SPaul Dagnelie zio_io_to_allocate(spa_t *spa, int allocator) 30660f7643c7SGeorge Wilson { 30670f7643c7SGeorge Wilson zio_t *zio; 30680f7643c7SGeorge Wilson 3069f78cdc34SPaul Dagnelie ASSERT(MUTEX_HELD(&spa->spa_alloc_locks[allocator])); 30700f7643c7SGeorge Wilson 3071f78cdc34SPaul Dagnelie zio = avl_first(&spa->spa_alloc_trees[allocator]); 30720f7643c7SGeorge Wilson if (zio == NULL) 30730f7643c7SGeorge Wilson return (NULL); 30740f7643c7SGeorge Wilson 30750f7643c7SGeorge Wilson ASSERT(IO_IS_ALLOCATING(zio)); 30760f7643c7SGeorge Wilson 30770f7643c7SGeorge Wilson /* 30780f7643c7SGeorge Wilson * Try to place a reservation for this zio. If we're unable to 30790f7643c7SGeorge Wilson * reserve then we throttle. 
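	 * A throttled zio simply stays queued on its
	 * spa_alloc_trees[allocator] AVL tree; zio_allocate_dispatch()
	 * pulls it back out once a completing I/O releases its
	 * reservation.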
30800f7643c7SGeorge Wilson */ 3081f78cdc34SPaul Dagnelie ASSERT3U(zio->io_allocator, ==, allocator); 3082663207adSDon Brady if (!metaslab_class_throttle_reserve(zio->io_metaslab_class, 3083f78cdc34SPaul Dagnelie zio->io_prop.zp_copies, zio->io_allocator, zio, 0)) { 30840f7643c7SGeorge Wilson return (NULL); 30850f7643c7SGeorge Wilson } 30860f7643c7SGeorge Wilson 3087f78cdc34SPaul Dagnelie avl_remove(&spa->spa_alloc_trees[allocator], zio); 30880f7643c7SGeorge Wilson ASSERT3U(zio->io_stage, <, ZIO_STAGE_DVA_ALLOCATE); 30890f7643c7SGeorge Wilson 30900f7643c7SGeorge Wilson return (zio); 30910f7643c7SGeorge Wilson } 30920f7643c7SGeorge Wilson 30930f7643c7SGeorge Wilson static int 30940f7643c7SGeorge Wilson zio_dva_throttle(zio_t *zio) 30950f7643c7SGeorge Wilson { 30960f7643c7SGeorge Wilson spa_t *spa = zio->io_spa; 30970f7643c7SGeorge Wilson zio_t *nio; 3098663207adSDon Brady metaslab_class_t *mc; 3099663207adSDon Brady 3100663207adSDon Brady /* locate an appropriate allocation class */ 3101663207adSDon Brady mc = spa_preferred_class(spa, zio->io_size, zio->io_prop.zp_type, 3102663207adSDon Brady zio->io_prop.zp_level, zio->io_prop.zp_zpl_smallblk); 31030f7643c7SGeorge Wilson 31040f7643c7SGeorge Wilson if (zio->io_priority == ZIO_PRIORITY_SYNC_WRITE || 3105663207adSDon Brady !mc->mc_alloc_throttle_enabled || 31060f7643c7SGeorge Wilson zio->io_child_type == ZIO_CHILD_GANG || 31070f7643c7SGeorge Wilson zio->io_flags & ZIO_FLAG_NODATA) { 31080f7643c7SGeorge Wilson return (ZIO_PIPELINE_CONTINUE); 31090f7643c7SGeorge Wilson } 31100f7643c7SGeorge Wilson 31110f7643c7SGeorge Wilson ASSERT(zio->io_child_type > ZIO_CHILD_GANG); 31120f7643c7SGeorge Wilson 31130f7643c7SGeorge Wilson ASSERT3U(zio->io_queued_timestamp, >, 0); 31140f7643c7SGeorge Wilson ASSERT(zio->io_stage == ZIO_STAGE_DVA_THROTTLE); 31150f7643c7SGeorge Wilson 3116f78cdc34SPaul Dagnelie zbookmark_phys_t *bm = &zio->io_bookmark; 3117f78cdc34SPaul Dagnelie /* 3118f78cdc34SPaul Dagnelie * We want to try to use as many allocators as possible to help improve 3119f78cdc34SPaul Dagnelie * performance, but we also want logically adjacent IOs to be physically 3120f78cdc34SPaul Dagnelie * adjacent to improve sequential read performance. We chunk each object 3121f78cdc34SPaul Dagnelie * into 2^20 block regions, and then hash based on the objset, object, 3122f78cdc34SPaul Dagnelie * level, and region to accomplish both of these goals. 3123f78cdc34SPaul Dagnelie */ 3124f78cdc34SPaul Dagnelie zio->io_allocator = cityhash4(bm->zb_objset, bm->zb_object, 3125f78cdc34SPaul Dagnelie bm->zb_level, bm->zb_blkid >> 20) % spa->spa_alloc_count; 3126f78cdc34SPaul Dagnelie mutex_enter(&spa->spa_alloc_locks[zio->io_allocator]); 31270f7643c7SGeorge Wilson ASSERT(zio->io_type == ZIO_TYPE_WRITE); 3128663207adSDon Brady zio->io_metaslab_class = mc; 3129f78cdc34SPaul Dagnelie avl_add(&spa->spa_alloc_trees[zio->io_allocator], zio); 3130663207adSDon Brady nio = zio_io_to_allocate(spa, zio->io_allocator); 3131f78cdc34SPaul Dagnelie mutex_exit(&spa->spa_alloc_locks[zio->io_allocator]); 31320f7643c7SGeorge Wilson 31330f7643c7SGeorge Wilson if (nio == zio) 31340f7643c7SGeorge Wilson return (ZIO_PIPELINE_CONTINUE); 31350f7643c7SGeorge Wilson 31360f7643c7SGeorge Wilson if (nio != NULL) { 31370f7643c7SGeorge Wilson ASSERT(nio->io_stage == ZIO_STAGE_DVA_THROTTLE); 31380f7643c7SGeorge Wilson /* 31390f7643c7SGeorge Wilson * We are passing control to a new zio so make sure that 31400f7643c7SGeorge Wilson * it is processed by a different thread. 
We do this to 31410f7643c7SGeorge Wilson * avoid stack overflows that can occur when parents are 31420f7643c7SGeorge Wilson * throttled and children are making progress. We allow 31430f7643c7SGeorge Wilson * it to go to the head of the taskq since it's already 31440f7643c7SGeorge Wilson * been waiting. 31450f7643c7SGeorge Wilson */ 31460f7643c7SGeorge Wilson zio_taskq_dispatch(nio, ZIO_TASKQ_ISSUE, B_TRUE); 31470f7643c7SGeorge Wilson } 31480f7643c7SGeorge Wilson return (ZIO_PIPELINE_STOP); 31490f7643c7SGeorge Wilson } 31500f7643c7SGeorge Wilson 3151663207adSDon Brady static void 3152f78cdc34SPaul Dagnelie zio_allocate_dispatch(spa_t *spa, int allocator) 31530f7643c7SGeorge Wilson { 31540f7643c7SGeorge Wilson zio_t *zio; 31550f7643c7SGeorge Wilson 3156f78cdc34SPaul Dagnelie mutex_enter(&spa->spa_alloc_locks[allocator]); 3157f78cdc34SPaul Dagnelie zio = zio_io_to_allocate(spa, allocator); 3158f78cdc34SPaul Dagnelie mutex_exit(&spa->spa_alloc_locks[allocator]); 31590f7643c7SGeorge Wilson if (zio == NULL) 31600f7643c7SGeorge Wilson return; 31610f7643c7SGeorge Wilson 31620f7643c7SGeorge Wilson ASSERT3U(zio->io_stage, ==, ZIO_STAGE_DVA_THROTTLE); 31630f7643c7SGeorge Wilson ASSERT0(zio->io_error); 31640f7643c7SGeorge Wilson zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_TRUE); 31650f7643c7SGeorge Wilson } 31660f7643c7SGeorge Wilson 3167e05725b1Sbonwick static int 3168fa9e4066Sahrens zio_dva_allocate(zio_t *zio) 3169fa9e4066Sahrens { 31708654d025Sperrin spa_t *spa = zio->io_spa; 3171663207adSDon Brady metaslab_class_t *mc; 3172fa9e4066Sahrens blkptr_t *bp = zio->io_bp; 3173fa9e4066Sahrens int error; 317409c9d376SGeorge Wilson int flags = 0; 3175fa9e4066Sahrens 3176f5383399SBill Moore if (zio->io_gang_leader == NULL) { 3177f5383399SBill Moore ASSERT(zio->io_child_type > ZIO_CHILD_GANG); 3178f5383399SBill Moore zio->io_gang_leader = zio; 3179f5383399SBill Moore } 3180f5383399SBill Moore 3181fa9e4066Sahrens ASSERT(BP_IS_HOLE(bp)); 3182fb09f5aaSMadhav Suresh ASSERT0(BP_GET_NDVAS(bp)); 3183b24ab676SJeff Bonwick ASSERT3U(zio->io_prop.zp_copies, >, 0); 3184b24ab676SJeff Bonwick ASSERT3U(zio->io_prop.zp_copies, <=, spa_max_replication(spa)); 3185fa9e4066Sahrens ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp)); 3186fa9e4066Sahrens 3187663207adSDon Brady if (zio->io_flags & ZIO_FLAG_NODATA) 31880f7643c7SGeorge Wilson flags |= METASLAB_DONT_THROTTLE; 3189663207adSDon Brady if (zio->io_flags & ZIO_FLAG_GANG_CHILD) 31900f7643c7SGeorge Wilson flags |= METASLAB_GANG_CHILD; 3191663207adSDon Brady if (zio->io_priority == ZIO_PRIORITY_ASYNC_WRITE) 31920f7643c7SGeorge Wilson flags |= METASLAB_ASYNC_ALLOC; 3193663207adSDon Brady 3194663207adSDon Brady /* 3195663207adSDon Brady * if not already chosen, locate an appropriate allocation class 3196663207adSDon Brady */ 3197663207adSDon Brady mc = zio->io_metaslab_class; 3198663207adSDon Brady if (mc == NULL) { 3199663207adSDon Brady mc = spa_preferred_class(spa, zio->io_size, 3200663207adSDon Brady zio->io_prop.zp_type, zio->io_prop.zp_level, 3201663207adSDon Brady zio->io_prop.zp_zpl_smallblk); 3202663207adSDon Brady zio->io_metaslab_class = mc; 32030f7643c7SGeorge Wilson } 32040f7643c7SGeorge Wilson 3205e14bb325SJeff Bonwick error = metaslab_alloc(spa, mc, zio->io_size, bp, 32068363e80aSGeorge Wilson zio->io_prop.zp_copies, zio->io_txg, NULL, flags, 3207f78cdc34SPaul Dagnelie &zio->io_alloc_list, zio, zio->io_allocator); 3208fa9e4066Sahrens 3209663207adSDon Brady /* 3210663207adSDon Brady * Fallback to normal class when an alloc class is full 3211663207adSDon Brady */ 
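	 * (e.g. a special or dedup class vdev running out of space and
	 * returning ENOSPC from metaslab_alloc() above).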
3212663207adSDon Brady if (error == ENOSPC && mc != spa_normal_class(spa)) { 3213663207adSDon Brady /* 3214663207adSDon Brady * If throttling, transfer reservation over to normal class. 3215663207adSDon Brady * The io_allocator slot can remain the same even though we 3216663207adSDon Brady * are switching classes. 3217663207adSDon Brady */ 3218663207adSDon Brady if (mc->mc_alloc_throttle_enabled && 3219663207adSDon Brady (zio->io_flags & ZIO_FLAG_IO_ALLOCATING)) { 3220663207adSDon Brady metaslab_class_throttle_unreserve(mc, 3221663207adSDon Brady zio->io_prop.zp_copies, zio->io_allocator, zio); 3222663207adSDon Brady zio->io_flags &= ~ZIO_FLAG_IO_ALLOCATING; 3223663207adSDon Brady 3224663207adSDon Brady mc = spa_normal_class(spa); 3225663207adSDon Brady VERIFY(metaslab_class_throttle_reserve(mc, 3226663207adSDon Brady zio->io_prop.zp_copies, zio->io_allocator, zio, 3227663207adSDon Brady flags | METASLAB_MUST_RESERVE)); 3228663207adSDon Brady } else { 3229663207adSDon Brady mc = spa_normal_class(spa); 3230663207adSDon Brady } 3231663207adSDon Brady zio->io_metaslab_class = mc; 3232663207adSDon Brady 3233663207adSDon Brady error = metaslab_alloc(spa, mc, zio->io_size, bp, 3234663207adSDon Brady zio->io_prop.zp_copies, zio->io_txg, NULL, flags, 3235663207adSDon Brady &zio->io_alloc_list, zio, zio->io_allocator); 3236663207adSDon Brady } 3237663207adSDon Brady 32380f7643c7SGeorge Wilson if (error != 0) { 323921f7c81cSMatthew Ahrens zfs_dbgmsg("%s: metaslab allocation failure: zio %p, " 324009c9d376SGeorge Wilson "size %llu, error %d", spa_name(spa), zio, zio->io_size, 324109c9d376SGeorge Wilson error); 3242e14bb325SJeff Bonwick if (error == ENOSPC && zio->io_size > SPA_MINBLOCKSIZE) 3243e14bb325SJeff Bonwick return (zio_write_gang_block(zio)); 3244fa9e4066Sahrens zio->io_error = error; 3245fa9e4066Sahrens } 3246e05725b1Sbonwick 3247e05725b1Sbonwick return (ZIO_PIPELINE_CONTINUE); 3248fa9e4066Sahrens } 3249fa9e4066Sahrens 3250e05725b1Sbonwick static int 3251fa9e4066Sahrens zio_dva_free(zio_t *zio) 3252fa9e4066Sahrens { 3253e14bb325SJeff Bonwick metaslab_free(zio->io_spa, zio->io_bp, zio->io_txg, B_FALSE); 3254fa9e4066Sahrens 3255e05725b1Sbonwick return (ZIO_PIPELINE_CONTINUE); 3256fa9e4066Sahrens } 3257fa9e4066Sahrens 3258e05725b1Sbonwick static int 3259fa9e4066Sahrens zio_dva_claim(zio_t *zio) 3260fa9e4066Sahrens { 3261e14bb325SJeff Bonwick int error; 3262e14bb325SJeff Bonwick 3263e14bb325SJeff Bonwick error = metaslab_claim(zio->io_spa, zio->io_bp, zio->io_txg); 3264e14bb325SJeff Bonwick if (error) 3265e14bb325SJeff Bonwick zio->io_error = error; 3266fa9e4066Sahrens 3267e05725b1Sbonwick return (ZIO_PIPELINE_CONTINUE); 3268fa9e4066Sahrens } 3269fa9e4066Sahrens 3270e14bb325SJeff Bonwick /* 3271e14bb325SJeff Bonwick * Undo an allocation. This is used by zio_done() when an I/O fails 3272e14bb325SJeff Bonwick * and we want to give back the block we just allocated. 3273e14bb325SJeff Bonwick * This handles both normal blocks and gang blocks. 
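 * For gang blocks the gang tree is walked recursively, so every child
 * bp recorded in each gang header is returned as well.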
3274e14bb325SJeff Bonwick */ 3275e14bb325SJeff Bonwick static void 3276e14bb325SJeff Bonwick zio_dva_unallocate(zio_t *zio, zio_gang_node_t *gn, blkptr_t *bp) 3277e14bb325SJeff Bonwick { 3278e14bb325SJeff Bonwick ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp)); 3279b24ab676SJeff Bonwick ASSERT(zio->io_bp_override == NULL); 3280e14bb325SJeff Bonwick 3281e14bb325SJeff Bonwick if (!BP_IS_HOLE(bp)) 3282b24ab676SJeff Bonwick metaslab_free(zio->io_spa, bp, bp->blk_birth, B_TRUE); 3283e14bb325SJeff Bonwick 3284e14bb325SJeff Bonwick if (gn != NULL) { 3285e14bb325SJeff Bonwick for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) { 3286e14bb325SJeff Bonwick zio_dva_unallocate(zio, gn->gn_child[g], 3287e14bb325SJeff Bonwick &gn->gn_gbh->zg_blkptr[g]); 3288e14bb325SJeff Bonwick } 3289e14bb325SJeff Bonwick } 3290e14bb325SJeff Bonwick } 3291e14bb325SJeff Bonwick 3292e14bb325SJeff Bonwick /* 3293e14bb325SJeff Bonwick * Try to allocate an intent log block. Return 0 on success, errno on failure. 3294e14bb325SJeff Bonwick */ 3295e14bb325SJeff Bonwick int 3296eb633035STom Caputi zio_alloc_zil(spa_t *spa, objset_t *os, uint64_t txg, blkptr_t *new_bp, 3297f78cdc34SPaul Dagnelie blkptr_t *old_bp, uint64_t size, boolean_t *slog) 3298e14bb325SJeff Bonwick { 3299e09fa4daSNeil Perrin int error = 1; 33008363e80aSGeorge Wilson zio_alloc_list_t io_alloc_list; 3301e14bb325SJeff Bonwick 3302b24ab676SJeff Bonwick ASSERT(txg > spa_syncing_txg(spa)); 3303b24ab676SJeff Bonwick 33048363e80aSGeorge Wilson metaslab_trace_init(&io_alloc_list); 3305663207adSDon Brady 3306663207adSDon Brady /* 3307663207adSDon Brady * Block pointer fields are useful to metaslabs for stats and debugging. 3308663207adSDon Brady * Fill in the obvious ones before calling into metaslab_alloc(). 3309663207adSDon Brady */ 3310663207adSDon Brady BP_SET_TYPE(new_bp, DMU_OT_INTENT_LOG); 3311663207adSDon Brady BP_SET_PSIZE(new_bp, size); 3312663207adSDon Brady BP_SET_LEVEL(new_bp, 0); 3313663207adSDon Brady 3314f78cdc34SPaul Dagnelie /* 3315f78cdc34SPaul Dagnelie * When allocating a zil block, we don't have information about 3316f78cdc34SPaul Dagnelie * the final destination of the block except the objset it's part 3317f78cdc34SPaul Dagnelie * of, so we just hash the objset ID to pick the allocator to get 3318f78cdc34SPaul Dagnelie * some parallelism. 
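	 * The mapping is deterministic: cityhash4(0, 0, 0, ds_object) %
	 * spa_alloc_count, so every ZIL block for a given objset is
	 * funneled through the same allocator.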
3319f78cdc34SPaul Dagnelie */ 3320c5ee4681SAlexander Motin error = metaslab_alloc(spa, spa_log_class(spa), size, new_bp, 1, 3321f78cdc34SPaul Dagnelie txg, old_bp, METASLAB_HINTBP_AVOID, &io_alloc_list, NULL, 3322eb633035STom Caputi cityhash4(0, 0, 0, 3323eb633035STom Caputi os->os_dsl_dataset->ds_object) % spa->spa_alloc_count); 3324c5ee4681SAlexander Motin if (error == 0) { 3325c5ee4681SAlexander Motin *slog = TRUE; 3326c5ee4681SAlexander Motin } else { 3327b24ab676SJeff Bonwick error = metaslab_alloc(spa, spa_normal_class(spa), size, 33288363e80aSGeorge Wilson new_bp, 1, txg, old_bp, METASLAB_HINTBP_AVOID, 3329eb633035STom Caputi &io_alloc_list, NULL, cityhash4(0, 0, 0, 3330eb633035STom Caputi os->os_dsl_dataset->ds_object) % spa->spa_alloc_count); 3331c5ee4681SAlexander Motin if (error == 0) 3332c5ee4681SAlexander Motin *slog = FALSE; 3333840345f6SGeorge Wilson } 33348363e80aSGeorge Wilson metaslab_trace_fini(&io_alloc_list); 3335e14bb325SJeff Bonwick 3336e14bb325SJeff Bonwick if (error == 0) { 3337e14bb325SJeff Bonwick BP_SET_LSIZE(new_bp, size); 3338e14bb325SJeff Bonwick BP_SET_PSIZE(new_bp, size); 3339e14bb325SJeff Bonwick BP_SET_COMPRESS(new_bp, ZIO_COMPRESS_OFF); 33406e1f5caaSNeil Perrin BP_SET_CHECKSUM(new_bp, 33416e1f5caaSNeil Perrin spa_version(spa) >= SPA_VERSION_SLIM_ZIL 33426e1f5caaSNeil Perrin ? ZIO_CHECKSUM_ZILOG2 : ZIO_CHECKSUM_ZILOG); 3343e14bb325SJeff Bonwick BP_SET_TYPE(new_bp, DMU_OT_INTENT_LOG); 3344e14bb325SJeff Bonwick BP_SET_LEVEL(new_bp, 0); 3345b24ab676SJeff Bonwick BP_SET_DEDUP(new_bp, 0); 3346e14bb325SJeff Bonwick BP_SET_BYTEORDER(new_bp, ZFS_HOST_BYTEORDER); 3347eb633035STom Caputi 3348eb633035STom Caputi /* 3349eb633035STom Caputi * encrypted blocks will require an IV and salt. We generate 3350eb633035STom Caputi * these now since we will not be rewriting the bp at 3351eb633035STom Caputi * rewrite time. 3352eb633035STom Caputi */ 3353eb633035STom Caputi if (os->os_encrypted) { 3354eb633035STom Caputi uint8_t iv[ZIO_DATA_IV_LEN]; 3355eb633035STom Caputi uint8_t salt[ZIO_DATA_SALT_LEN]; 3356eb633035STom Caputi 3357eb633035STom Caputi BP_SET_CRYPT(new_bp, B_TRUE); 3358eb633035STom Caputi VERIFY0(spa_crypt_get_salt(spa, 3359eb633035STom Caputi dmu_objset_id(os), salt)); 3360eb633035STom Caputi VERIFY0(zio_crypt_generate_iv(iv)); 3361eb633035STom Caputi 3362eb633035STom Caputi zio_crypt_encode_params_bp(new_bp, salt, iv); 3363eb633035STom Caputi } 33641271e4b1SPrakash Surya } else { 33651271e4b1SPrakash Surya zfs_dbgmsg("%s: zil block allocation failure: " 33661271e4b1SPrakash Surya "size %llu, error %d", spa_name(spa), size, error); 3367e14bb325SJeff Bonwick } 3368e14bb325SJeff Bonwick 3369e14bb325SJeff Bonwick return (error); 3370e14bb325SJeff Bonwick } 3371e14bb325SJeff Bonwick 3372fa9e4066Sahrens /* 3373fa9e4066Sahrens * ========================================================================== 3374fa9e4066Sahrens * Read and write to physical devices 3375fa9e4066Sahrens * ========================================================================== 3376fa9e4066Sahrens */ 3377738f37bcSGeorge Wilson 3378738f37bcSGeorge Wilson /* 3379738f37bcSGeorge Wilson * Issue an I/O to the underlying vdev. Typically the issue pipeline 3380738f37bcSGeorge Wilson * stops after this stage and will resume upon I/O completion. 3381738f37bcSGeorge Wilson * However, there are instances where the vdev layer may need to 3382738f37bcSGeorge Wilson * continue the pipeline when an I/O was not issued. 
Since the I/O 3383738f37bcSGeorge Wilson * that was sent to the vdev layer might be different than the one 3384738f37bcSGeorge Wilson * currently active in the pipeline (see vdev_queue_io()), we explicitly 3385738f37bcSGeorge Wilson * force the underlying vdev layers to call either zio_execute() or 3386738f37bcSGeorge Wilson * zio_interrupt() to ensure that the pipeline continues with the correct I/O. 3387738f37bcSGeorge Wilson */ 3388e05725b1Sbonwick static int 338944cd46caSbillm zio_vdev_io_start(zio_t *zio) 3390fa9e4066Sahrens { 3391fa9e4066Sahrens vdev_t *vd = zio->io_vd; 339244cd46caSbillm uint64_t align; 33930a4e9518Sgw spa_t *spa = zio->io_spa; 33940a4e9518Sgw 3395*dd50e0ccSTony Hutter zio->io_delay = 0; 3396*dd50e0ccSTony Hutter 3397e14bb325SJeff Bonwick ASSERT(zio->io_error == 0); 3398e14bb325SJeff Bonwick ASSERT(zio->io_child_error[ZIO_CHILD_VDEV] == 0); 3399fa9e4066Sahrens 3400e14bb325SJeff Bonwick if (vd == NULL) { 3401e14bb325SJeff Bonwick if (!(zio->io_flags & ZIO_FLAG_CONFIG_WRITER)) 3402e14bb325SJeff Bonwick spa_config_enter(spa, SCL_ZIO, zio, RW_READER); 3403fa9e4066Sahrens 3404e14bb325SJeff Bonwick /* 3405e14bb325SJeff Bonwick * The mirror_ops handle multiple DVAs in a single BP. 3406e14bb325SJeff Bonwick */ 3407738f37bcSGeorge Wilson vdev_mirror_ops.vdev_op_io_start(zio); 3408738f37bcSGeorge Wilson return (ZIO_PIPELINE_STOP); 3409fa9e4066Sahrens } 3410fa9e4066Sahrens 34110f7643c7SGeorge Wilson ASSERT3P(zio->io_logical, !=, zio); 34126f793812SPavel Zakharov if (zio->io_type == ZIO_TYPE_WRITE) { 34136f793812SPavel Zakharov ASSERT(spa->spa_trust_config); 34146f793812SPavel Zakharov 3415a3874b8bSToomas Soome /* 3416a3874b8bSToomas Soome * Note: the code can handle other kinds of writes, 3417a3874b8bSToomas Soome * but we don't expect them. 3418a3874b8bSToomas Soome */ 34196f793812SPavel Zakharov if (zio->io_vd->vdev_removing) { 34206f793812SPavel Zakharov ASSERT(zio->io_flags & 34216f793812SPavel Zakharov (ZIO_FLAG_PHYSICAL | ZIO_FLAG_SELF_HEAL | 34223a4b1be9SMatthew Ahrens ZIO_FLAG_RESILVER | ZIO_FLAG_INDUCE_DAMAGE)); 34236f793812SPavel Zakharov } 34245cabbc6bSPrashanth Sreenivasa } 34250f7643c7SGeorge Wilson 3426e14bb325SJeff Bonwick align = 1ULL << vd->vdev_top->vdev_ashift; 3427e14bb325SJeff Bonwick 34282a104a52SAlex Reece if (!(zio->io_flags & ZIO_FLAG_PHYSICAL) && 34292a104a52SAlex Reece P2PHASE(zio->io_size, align) != 0) { 34302a104a52SAlex Reece /* Transform logical writes to be a full physical block size. */ 3431ecc2d604Sbonwick uint64_t asize = P2ROUNDUP(zio->io_size, align); 3432770499e1SDan Kimmel abd_t *abuf = abd_alloc_sametype(zio->io_abd, asize); 3433e14bb325SJeff Bonwick ASSERT(vd == vd->vdev_top); 3434ecc2d604Sbonwick if (zio->io_type == ZIO_TYPE_WRITE) { 3435770499e1SDan Kimmel abd_copy(abuf, zio->io_abd, zio->io_size); 3436770499e1SDan Kimmel abd_zero_off(abuf, zio->io_size, asize - zio->io_size); 3437ecc2d604Sbonwick } 3438e14bb325SJeff Bonwick zio_push_transform(zio, abuf, asize, asize, zio_subblock); 3439ecc2d604Sbonwick } 3440ecc2d604Sbonwick 34412a104a52SAlex Reece /* 34422a104a52SAlex Reece * If this is not a physical io, make sure that it is properly aligned 34432a104a52SAlex Reece * before proceeding. 
34442a104a52SAlex Reece */ 34452a104a52SAlex Reece if (!(zio->io_flags & ZIO_FLAG_PHYSICAL)) { 34462a104a52SAlex Reece ASSERT0(P2PHASE(zio->io_offset, align)); 34472a104a52SAlex Reece ASSERT0(P2PHASE(zio->io_size, align)); 34482a104a52SAlex Reece } else { 34492a104a52SAlex Reece /* 34502a104a52SAlex Reece * For physical writes, we allow 512b aligned writes and assume 34512a104a52SAlex Reece * the device will perform a read-modify-write as necessary. 34522a104a52SAlex Reece */ 34532a104a52SAlex Reece ASSERT0(P2PHASE(zio->io_offset, SPA_MINBLOCKSIZE)); 34542a104a52SAlex Reece ASSERT0(P2PHASE(zio->io_size, SPA_MINBLOCKSIZE)); 34552a104a52SAlex Reece } 34562a104a52SAlex Reece 3457f9af39baSGeorge Wilson VERIFY(zio->io_type != ZIO_TYPE_WRITE || spa_writeable(spa)); 34588ad4d6ddSJeff Bonwick 34598ad4d6ddSJeff Bonwick /* 34608ad4d6ddSJeff Bonwick * If this is a repair I/O, and there's no self-healing involved -- 34618ad4d6ddSJeff Bonwick * that is, we're just resilvering what we expect to resilver -- 34628ad4d6ddSJeff Bonwick * then don't do the I/O unless zio's txg is actually in vd's DTL. 34633a4b1be9SMatthew Ahrens * This prevents spurious resilvering. 34643a4b1be9SMatthew Ahrens * 34653a4b1be9SMatthew Ahrens * There are a few ways that we can end up creating these spurious 34663a4b1be9SMatthew Ahrens * resilver i/os: 34673a4b1be9SMatthew Ahrens * 34683a4b1be9SMatthew Ahrens * 1. A resilver i/o will be issued if any DVA in the BP has a 34693a4b1be9SMatthew Ahrens * dirty DTL. The mirror code will issue resilver writes to 34703a4b1be9SMatthew Ahrens * each DVA, including the one(s) that are not on vdevs with dirty 34713a4b1be9SMatthew Ahrens * DTLs. 34723a4b1be9SMatthew Ahrens * 34733a4b1be9SMatthew Ahrens * 2. With nested replication, which happens when we have a 34743a4b1be9SMatthew Ahrens * "replacing" or "spare" vdev that's a child of a mirror or raidz. 34753a4b1be9SMatthew Ahrens * For example, given mirror(replacing(A+B), C), it's likely that 34763a4b1be9SMatthew Ahrens * only A is out of date (it's the new device). In this case, we'll 34773a4b1be9SMatthew Ahrens * read from C, then use the data to resilver A+B -- but we don't 34783a4b1be9SMatthew Ahrens * actually want to resilver B, just A. The top-level mirror has no 34793a4b1be9SMatthew Ahrens * way to know this, so instead we just discard unnecessary repairs 34803a4b1be9SMatthew Ahrens * as we work our way down the vdev tree. 34813a4b1be9SMatthew Ahrens * 34823a4b1be9SMatthew Ahrens * 3. ZTEST also creates mirrors of mirrors, mirrors of raidz, etc. 34833a4b1be9SMatthew Ahrens * The same logic applies to any form of nested replication: ditto 34843a4b1be9SMatthew Ahrens * + mirror, RAID-Z + replacing, etc. 34853a4b1be9SMatthew Ahrens * 34863a4b1be9SMatthew Ahrens * However, indirect vdevs point off to other vdevs which may have 34873a4b1be9SMatthew Ahrens * DTL's, so we never bypass them. The child i/os on concrete vdevs 34883a4b1be9SMatthew Ahrens * will be properly bypassed instead. 
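 *
 * To make the check below concrete with the mirror(replacing(A+B), C)
 * example above: the unnecessary repair write headed for B arrives here
 * with ZIO_FLAG_IO_REPAIR set and ZIO_FLAG_SELF_HEAL clear; since
 * vdev_dtl_contains(B, DTL_PARTIAL, io_txg, 1) is B_FALSE, the write is
 * bypassed rather than issued.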
34898ad4d6ddSJeff Bonwick */ 34908ad4d6ddSJeff Bonwick if ((zio->io_flags & ZIO_FLAG_IO_REPAIR) && 34918ad4d6ddSJeff Bonwick !(zio->io_flags & ZIO_FLAG_SELF_HEAL) && 34928ad4d6ddSJeff Bonwick zio->io_txg != 0 && /* not a delegated i/o */ 34933a4b1be9SMatthew Ahrens vd->vdev_ops != &vdev_indirect_ops && 34948ad4d6ddSJeff Bonwick !vdev_dtl_contains(vd, DTL_PARTIAL, zio->io_txg, 1)) { 34958ad4d6ddSJeff Bonwick ASSERT(zio->io_type == ZIO_TYPE_WRITE); 34968ad4d6ddSJeff Bonwick zio_vdev_io_bypass(zio); 34978ad4d6ddSJeff Bonwick return (ZIO_PIPELINE_CONTINUE); 34988ad4d6ddSJeff Bonwick } 3499fa9e4066Sahrens 3500084fd14fSBrian Behlendorf if (vd->vdev_ops->vdev_op_leaf && (zio->io_type == ZIO_TYPE_READ || 3501084fd14fSBrian Behlendorf zio->io_type == ZIO_TYPE_WRITE || zio->io_type == ZIO_TYPE_TRIM)) { 3502e14bb325SJeff Bonwick 350343466aaeSMax Grossman if (zio->io_type == ZIO_TYPE_READ && vdev_cache_read(zio)) 3504a3f829aeSBill Moore return (ZIO_PIPELINE_CONTINUE); 3505e14bb325SJeff Bonwick 3506e14bb325SJeff Bonwick if ((zio = vdev_queue_io(zio)) == NULL) 3507e14bb325SJeff Bonwick return (ZIO_PIPELINE_STOP); 3508e14bb325SJeff Bonwick 3509e14bb325SJeff Bonwick if (!vdev_accessible(vd, zio)) { 3510be6fd75aSMatthew Ahrens zio->io_error = SET_ERROR(ENXIO); 3511e14bb325SJeff Bonwick zio_interrupt(zio); 3512e14bb325SJeff Bonwick return (ZIO_PIPELINE_STOP); 3513e14bb325SJeff Bonwick } 3514*dd50e0ccSTony Hutter zio->io_delay = gethrtime(); 3515e14bb325SJeff Bonwick } 3516e14bb325SJeff Bonwick 3517738f37bcSGeorge Wilson vd->vdev_ops->vdev_op_io_start(zio); 3518738f37bcSGeorge Wilson return (ZIO_PIPELINE_STOP); 3519fa9e4066Sahrens } 3520fa9e4066Sahrens 3521e05725b1Sbonwick static int 3522fa9e4066Sahrens zio_vdev_io_done(zio_t *zio) 3523fa9e4066Sahrens { 3524e14bb325SJeff Bonwick vdev_t *vd = zio->io_vd; 3525e14bb325SJeff Bonwick vdev_ops_t *ops = vd ? 
vd->vdev_ops : &vdev_mirror_ops; 3526e14bb325SJeff Bonwick boolean_t unexpected_error = B_FALSE; 3527e05725b1Sbonwick 3528d6e1c446SGeorge Wilson if (zio_wait_for_children(zio, ZIO_CHILD_VDEV_BIT, ZIO_WAIT_DONE)) { 3529e14bb325SJeff Bonwick return (ZIO_PIPELINE_STOP); 3530d6e1c446SGeorge Wilson } 3531fa9e4066Sahrens 3532084fd14fSBrian Behlendorf ASSERT(zio->io_type == ZIO_TYPE_READ || 3533084fd14fSBrian Behlendorf zio->io_type == ZIO_TYPE_WRITE || zio->io_type == ZIO_TYPE_TRIM); 3534e14bb325SJeff Bonwick 3535*dd50e0ccSTony Hutter if (zio->io_delay) 3536*dd50e0ccSTony Hutter zio->io_delay = gethrtime() - zio->io_delay; 3537*dd50e0ccSTony Hutter 3538e14bb325SJeff Bonwick if (vd != NULL && vd->vdev_ops->vdev_op_leaf) { 3539e14bb325SJeff Bonwick 3540e14bb325SJeff Bonwick vdev_queue_io_done(zio); 3541fa9e4066Sahrens 3542e14bb325SJeff Bonwick if (zio->io_type == ZIO_TYPE_WRITE) 3543e14bb325SJeff Bonwick vdev_cache_write(zio); 3544e14bb325SJeff Bonwick 3545e14bb325SJeff Bonwick if (zio_injection_enabled && zio->io_error == 0) 35468956713aSEric Schrock zio->io_error = zio_handle_device_injection(vd, 35478956713aSEric Schrock zio, EIO); 3548e14bb325SJeff Bonwick 3549e14bb325SJeff Bonwick if (zio_injection_enabled && zio->io_error == 0) 3550e14bb325SJeff Bonwick zio->io_error = zio_handle_label_injection(zio, EIO); 3551e14bb325SJeff Bonwick 3552084fd14fSBrian Behlendorf if (zio->io_error && zio->io_type != ZIO_TYPE_TRIM) { 3553e14bb325SJeff Bonwick if (!vdev_accessible(vd, zio)) { 3554be6fd75aSMatthew Ahrens zio->io_error = SET_ERROR(ENXIO); 3555e14bb325SJeff Bonwick } else { 3556e14bb325SJeff Bonwick unexpected_error = B_TRUE; 3557e14bb325SJeff Bonwick } 3558e14bb325SJeff Bonwick } 355951ece835Seschrock } 3560fa9e4066Sahrens 3561e14bb325SJeff Bonwick ops->vdev_op_io_done(zio); 3562e14bb325SJeff Bonwick 3563e14bb325SJeff Bonwick if (unexpected_error) 3564a3f829aeSBill Moore VERIFY(vdev_probe(vd, zio) == NULL); 3565e14bb325SJeff Bonwick 3566e14bb325SJeff Bonwick return (ZIO_PIPELINE_CONTINUE); 3567fa9e4066Sahrens } 3568fa9e4066Sahrens 3569a3874b8bSToomas Soome /* 3570a3874b8bSToomas Soome * This function is used to change the priority of an existing zio that is 3571a3874b8bSToomas Soome * currently in-flight. This is used by the arc to upgrade priority in the 3572a3874b8bSToomas Soome * event that a demand read is made for a block that is currently queued 3573a3874b8bSToomas Soome * as a scrub or async read IO. Otherwise, the high priority read request 3574a3874b8bSToomas Soome * would end up having to wait for the lower priority IO. 
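 *
 * A hypothetical call site (the zio name is illustrative): when the ARC
 * discovers that the block it needs for a demand read is already queued
 * as a lower-priority I/O, it can upgrade it with
 *
 *	zio_change_priority(inflight_zio, ZIO_PRIORITY_SYNC_READ);
 *
 * which recurses through the children below and adjusts the leaf vdev
 * queues via vdev_queue_change_io_priority().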
3575a3874b8bSToomas Soome */ 3576a3874b8bSToomas Soome void 3577a3874b8bSToomas Soome zio_change_priority(zio_t *pio, zio_priority_t priority) 3578a3874b8bSToomas Soome { 3579a3874b8bSToomas Soome zio_t *cio, *cio_next; 3580a3874b8bSToomas Soome zio_link_t *zl = NULL; 3581a3874b8bSToomas Soome 3582a3874b8bSToomas Soome ASSERT3U(priority, <, ZIO_PRIORITY_NUM_QUEUEABLE); 3583a3874b8bSToomas Soome 3584a3874b8bSToomas Soome if (pio->io_vd != NULL && pio->io_vd->vdev_ops->vdev_op_leaf) { 3585a3874b8bSToomas Soome vdev_queue_change_io_priority(pio, priority); 3586a3874b8bSToomas Soome } else { 3587a3874b8bSToomas Soome pio->io_priority = priority; 3588a3874b8bSToomas Soome } 3589a3874b8bSToomas Soome 3590a3874b8bSToomas Soome mutex_enter(&pio->io_lock); 3591a3874b8bSToomas Soome for (cio = zio_walk_children(pio, &zl); cio != NULL; cio = cio_next) { 3592a3874b8bSToomas Soome cio_next = zio_walk_children(pio, &zl); 3593a3874b8bSToomas Soome zio_change_priority(cio, priority); 3594a3874b8bSToomas Soome } 3595a3874b8bSToomas Soome mutex_exit(&pio->io_lock); 3596a3874b8bSToomas Soome } 3597a3874b8bSToomas Soome 359822fe2c88SJonathan Adams /* 359922fe2c88SJonathan Adams * For non-raidz ZIOs, we can just copy aside the bad data read from the 360022fe2c88SJonathan Adams * disk, and use that to finish the checksum ereport later. 360122fe2c88SJonathan Adams */ 360222fe2c88SJonathan Adams static void 360322fe2c88SJonathan Adams zio_vsd_default_cksum_finish(zio_cksum_report_t *zcr, 3604eb633035STom Caputi const abd_t *good_buf) 360522fe2c88SJonathan Adams { 360622fe2c88SJonathan Adams /* no processing needed */ 360722fe2c88SJonathan Adams zfs_ereport_finish_checksum(zcr, good_buf, zcr->zcr_cbdata, B_FALSE); 360822fe2c88SJonathan Adams } 360922fe2c88SJonathan Adams 361022fe2c88SJonathan Adams /*ARGSUSED*/ 361122fe2c88SJonathan Adams void 361222fe2c88SJonathan Adams zio_vsd_default_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *ignored) 361322fe2c88SJonathan Adams { 3614eb633035STom Caputi void *abd = abd_alloc_sametype(zio->io_abd, zio->io_size); 361522fe2c88SJonathan Adams 3616eb633035STom Caputi abd_copy(abd, zio->io_abd, zio->io_size); 361722fe2c88SJonathan Adams 361822fe2c88SJonathan Adams zcr->zcr_cbinfo = zio->io_size; 3619eb633035STom Caputi zcr->zcr_cbdata = abd; 362022fe2c88SJonathan Adams zcr->zcr_finish = zio_vsd_default_cksum_finish; 3621eb633035STom Caputi zcr->zcr_free = zio_abd_free; 362222fe2c88SJonathan Adams } 362322fe2c88SJonathan Adams 3624e05725b1Sbonwick static int 3625fa9e4066Sahrens zio_vdev_io_assess(zio_t *zio) 3626fa9e4066Sahrens { 3627fa9e4066Sahrens vdev_t *vd = zio->io_vd; 3628e14bb325SJeff Bonwick 3629d6e1c446SGeorge Wilson if (zio_wait_for_children(zio, ZIO_CHILD_VDEV_BIT, ZIO_WAIT_DONE)) { 3630e14bb325SJeff Bonwick return (ZIO_PIPELINE_STOP); 3631d6e1c446SGeorge Wilson } 3632e14bb325SJeff Bonwick 3633e14bb325SJeff Bonwick if (vd == NULL && !(zio->io_flags & ZIO_FLAG_CONFIG_WRITER)) 3634e14bb325SJeff Bonwick spa_config_exit(zio->io_spa, SCL_ZIO, zio); 3635e14bb325SJeff Bonwick 3636e14bb325SJeff Bonwick if (zio->io_vsd != NULL) { 363722fe2c88SJonathan Adams zio->io_vsd_ops->vsd_free(zio); 3638e14bb325SJeff Bonwick zio->io_vsd = NULL; 3639ecc2d604Sbonwick } 3640ecc2d604Sbonwick 3641e14bb325SJeff Bonwick if (zio_injection_enabled && zio->io_error == 0) 3642ea8dc4b6Seschrock zio->io_error = zio_handle_fault_injection(zio, EIO); 3643ea8dc4b6Seschrock 3644fa9e4066Sahrens /* 3645fa9e4066Sahrens * If the I/O failed, determine whether we should attempt to retry it. 
364635a5a358SJonathan Adams * 364735a5a358SJonathan Adams * On retry, we cut in line in the issue queue, since we don't want 364835a5a358SJonathan Adams * compression/checksumming/etc. work to prevent our (cheap) IO reissue. 3649fa9e4066Sahrens */ 3650e14bb325SJeff Bonwick if (zio->io_error && vd == NULL && 3651e14bb325SJeff Bonwick !(zio->io_flags & (ZIO_FLAG_DONT_RETRY | ZIO_FLAG_IO_RETRY))) { 3652e14bb325SJeff Bonwick ASSERT(!(zio->io_flags & ZIO_FLAG_DONT_QUEUE)); /* not a leaf */ 3653e14bb325SJeff Bonwick ASSERT(!(zio->io_flags & ZIO_FLAG_IO_BYPASS)); /* not a leaf */ 3654fa9e4066Sahrens zio->io_error = 0; 3655e14bb325SJeff Bonwick zio->io_flags |= ZIO_FLAG_IO_RETRY | 3656e14bb325SJeff Bonwick ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_AGGREGATE; 3657b24ab676SJeff Bonwick zio->io_stage = ZIO_STAGE_VDEV_IO_START >> 1; 365835a5a358SJonathan Adams zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, 365935a5a358SJonathan Adams zio_requeue_io_start_cut_in_line); 3660e14bb325SJeff Bonwick return (ZIO_PIPELINE_STOP); 3661ea8dc4b6Seschrock } 3662fa9e4066Sahrens 3663e14bb325SJeff Bonwick /* 3664e14bb325SJeff Bonwick * If we got an error on a leaf device, convert it to ENXIO 3665e14bb325SJeff Bonwick * if the device is not accessible at all. 3666e14bb325SJeff Bonwick */ 3667e14bb325SJeff Bonwick if (zio->io_error && vd != NULL && vd->vdev_ops->vdev_op_leaf && 3668e14bb325SJeff Bonwick !vdev_accessible(vd, zio)) 3669be6fd75aSMatthew Ahrens zio->io_error = SET_ERROR(ENXIO); 3670e14bb325SJeff Bonwick 3671e14bb325SJeff Bonwick /* 3672e14bb325SJeff Bonwick * If we can't write to an interior vdev (mirror or RAID-Z), 3673e14bb325SJeff Bonwick * set vdev_cant_write so that we stop trying to allocate from it. 3674e14bb325SJeff Bonwick */ 3675e14bb325SJeff Bonwick if (zio->io_error == ENXIO && zio->io_type == ZIO_TYPE_WRITE && 36763b2aab18SMatthew Ahrens vd != NULL && !vd->vdev_ops->vdev_op_leaf) { 3677e14bb325SJeff Bonwick vd->vdev_cant_write = B_TRUE; 36783b2aab18SMatthew Ahrens } 3679e14bb325SJeff Bonwick 3680295438baSHans Rosenfeld /* 3681295438baSHans Rosenfeld * If a cache flush returns ENOTSUP or ENOTTY, we know that no future 3682084fd14fSBrian Behlendorf * attempts will ever succeed. In this case we set a persistent 3683084fd14fSBrian Behlendorf * boolean flag so that we don't bother with it in the future. 
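 *
 * Concretely, the first flush that completes with
 *
 *	zio->io_type == ZIO_TYPE_IOCTL &&
 *	zio->io_cmd == DKIOCFLUSHWRITECACHE &&
 *	(zio->io_error == ENOTSUP || zio->io_error == ENOTTY)
 *
 * latches vd->vdev_nowritecache = B_TRUE, and later flush requests for
 * this vdev can be skipped entirely.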
3684295438baSHans Rosenfeld */ 3685295438baSHans Rosenfeld if ((zio->io_error == ENOTSUP || zio->io_error == ENOTTY) && 3686295438baSHans Rosenfeld zio->io_type == ZIO_TYPE_IOCTL && 3687295438baSHans Rosenfeld zio->io_cmd == DKIOCFLUSHWRITECACHE && vd != NULL) 3688295438baSHans Rosenfeld vd->vdev_nowritecache = B_TRUE; 3689295438baSHans Rosenfeld 3690e14bb325SJeff Bonwick if (zio->io_error) 3691e14bb325SJeff Bonwick zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; 3692e14bb325SJeff Bonwick 369369962b56SMatthew Ahrens if (vd != NULL && vd->vdev_ops->vdev_op_leaf && 369469962b56SMatthew Ahrens zio->io_physdone != NULL) { 369569962b56SMatthew Ahrens ASSERT(!(zio->io_flags & ZIO_FLAG_DELEGATED)); 369669962b56SMatthew Ahrens ASSERT(zio->io_child_type == ZIO_CHILD_VDEV); 369769962b56SMatthew Ahrens zio->io_physdone(zio->io_logical); 369869962b56SMatthew Ahrens } 369969962b56SMatthew Ahrens 3700e05725b1Sbonwick return (ZIO_PIPELINE_CONTINUE); 3701fa9e4066Sahrens } 3702fa9e4066Sahrens 3703fa9e4066Sahrens void 3704fa9e4066Sahrens zio_vdev_io_reissue(zio_t *zio) 3705fa9e4066Sahrens { 3706fa9e4066Sahrens ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START); 3707fa9e4066Sahrens ASSERT(zio->io_error == 0); 3708fa9e4066Sahrens 3709b24ab676SJeff Bonwick zio->io_stage >>= 1; 3710fa9e4066Sahrens } 3711fa9e4066Sahrens 3712fa9e4066Sahrens void 3713fa9e4066Sahrens zio_vdev_io_redone(zio_t *zio) 3714fa9e4066Sahrens { 3715fa9e4066Sahrens ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_DONE); 3716fa9e4066Sahrens 3717b24ab676SJeff Bonwick zio->io_stage >>= 1; 3718fa9e4066Sahrens } 3719fa9e4066Sahrens 3720fa9e4066Sahrens void 3721fa9e4066Sahrens zio_vdev_io_bypass(zio_t *zio) 3722fa9e4066Sahrens { 3723fa9e4066Sahrens ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START); 3724fa9e4066Sahrens ASSERT(zio->io_error == 0); 3725fa9e4066Sahrens 3726fa9e4066Sahrens zio->io_flags |= ZIO_FLAG_IO_BYPASS; 3727b24ab676SJeff Bonwick zio->io_stage = ZIO_STAGE_VDEV_IO_ASSESS >> 1; 3728fa9e4066Sahrens } 3729fa9e4066Sahrens 3730eb633035STom Caputi /* 3731eb633035STom Caputi * ========================================================================== 3732eb633035STom Caputi * Encrypt and store encryption parameters 3733eb633035STom Caputi * ========================================================================== 3734eb633035STom Caputi */ 3735eb633035STom Caputi 3736eb633035STom Caputi 3737eb633035STom Caputi /* 3738eb633035STom Caputi * This function is used for ZIO_STAGE_ENCRYPT. It is responsible for 3739eb633035STom Caputi * managing the storage of encryption parameters and passing them to the 3740eb633035STom Caputi * lower-level encryption functions. 
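 *
 * For the common case of an ordinary encrypted level-0 data block, the
 * body below boils down to the following sketch (the special cases for
 * gang, ZIL, indirect, objset, and authenticated-only blocks each take
 * an early return first):
 *
 *	spa_do_crypt_abd(B_TRUE, spa, &zio->io_bookmark, BP_GET_TYPE(bp),
 *	    BP_GET_DEDUP(bp), BP_SHOULD_BYTESWAP(bp), salt, iv, mac,
 *	    psize, zio->io_abd, eabd, &no_crypt);
 *	zio_crypt_encode_params_bp(bp, salt, iv);
 *	zio_crypt_encode_mac_bp(bp, mac);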
3741eb633035STom Caputi */ 3742eb633035STom Caputi static int 3743eb633035STom Caputi zio_encrypt(zio_t *zio) 3744eb633035STom Caputi { 3745eb633035STom Caputi zio_prop_t *zp = &zio->io_prop; 3746eb633035STom Caputi spa_t *spa = zio->io_spa; 3747eb633035STom Caputi blkptr_t *bp = zio->io_bp; 3748eb633035STom Caputi uint64_t psize = BP_GET_PSIZE(bp); 3749eb633035STom Caputi uint64_t dsobj = zio->io_bookmark.zb_objset; 3750eb633035STom Caputi dmu_object_type_t ot = BP_GET_TYPE(bp); 3751eb633035STom Caputi void *enc_buf = NULL; 3752eb633035STom Caputi abd_t *eabd = NULL; 3753eb633035STom Caputi uint8_t salt[ZIO_DATA_SALT_LEN]; 3754eb633035STom Caputi uint8_t iv[ZIO_DATA_IV_LEN]; 3755eb633035STom Caputi uint8_t mac[ZIO_DATA_MAC_LEN]; 3756eb633035STom Caputi boolean_t no_crypt = B_FALSE; 3757eb633035STom Caputi 3758eb633035STom Caputi /* the root zio already encrypted the data */ 3759eb633035STom Caputi if (zio->io_child_type == ZIO_CHILD_GANG) 3760eb633035STom Caputi return (ZIO_PIPELINE_CONTINUE); 3761eb633035STom Caputi 3762eb633035STom Caputi /* only ZIL blocks are re-encrypted on rewrite */ 3763eb633035STom Caputi if (!IO_IS_ALLOCATING(zio) && ot != DMU_OT_INTENT_LOG) 3764eb633035STom Caputi return (ZIO_PIPELINE_CONTINUE); 3765eb633035STom Caputi 3766eb633035STom Caputi if (!(zp->zp_encrypt || BP_IS_ENCRYPTED(bp))) { 3767eb633035STom Caputi BP_SET_CRYPT(bp, B_FALSE); 3768eb633035STom Caputi return (ZIO_PIPELINE_CONTINUE); 3769eb633035STom Caputi } 3770eb633035STom Caputi 3771eb633035STom Caputi /* if we are doing raw encryption set the provided encryption params */ 3772eb633035STom Caputi if (zio->io_flags & ZIO_FLAG_RAW_ENCRYPT) { 3773eb633035STom Caputi ASSERT0(BP_GET_LEVEL(bp)); 3774eb633035STom Caputi BP_SET_CRYPT(bp, B_TRUE); 3775eb633035STom Caputi BP_SET_BYTEORDER(bp, zp->zp_byteorder); 3776eb633035STom Caputi if (ot != DMU_OT_OBJSET) 3777eb633035STom Caputi zio_crypt_encode_mac_bp(bp, zp->zp_mac); 3778eb633035STom Caputi 3779eb633035STom Caputi /* dnode blocks must be written out in the provided byteorder */ 3780eb633035STom Caputi if (zp->zp_byteorder != ZFS_HOST_BYTEORDER && 3781eb633035STom Caputi ot == DMU_OT_DNODE) { 3782eb633035STom Caputi void *bswap_buf = zio_buf_alloc(psize); 3783eb633035STom Caputi abd_t *babd = abd_get_from_buf(bswap_buf, psize); 3784eb633035STom Caputi 3785eb633035STom Caputi ASSERT3U(BP_GET_COMPRESS(bp), ==, ZIO_COMPRESS_OFF); 3786eb633035STom Caputi abd_copy_to_buf(bswap_buf, zio->io_abd, psize); 3787eb633035STom Caputi dmu_ot_byteswap[DMU_OT_BYTESWAP(ot)].ob_func(bswap_buf, 3788eb633035STom Caputi psize); 3789eb633035STom Caputi 3790eb633035STom Caputi abd_take_ownership_of_buf(babd, B_TRUE); 3791eb633035STom Caputi zio_push_transform(zio, babd, psize, psize, NULL); 3792eb633035STom Caputi } 3793eb633035STom Caputi 3794eb633035STom Caputi if (DMU_OT_IS_ENCRYPTED(ot)) 3795eb633035STom Caputi zio_crypt_encode_params_bp(bp, zp->zp_salt, zp->zp_iv); 3796eb633035STom Caputi return (ZIO_PIPELINE_CONTINUE); 3797eb633035STom Caputi } 3798eb633035STom Caputi 3799eb633035STom Caputi /* indirect blocks only maintain a cksum of the lower level MACs */ 3800eb633035STom Caputi if (BP_GET_LEVEL(bp) > 0) { 3801eb633035STom Caputi BP_SET_CRYPT(bp, B_TRUE); 3802eb633035STom Caputi VERIFY0(zio_crypt_do_indirect_mac_checksum_abd(B_TRUE, 3803eb633035STom Caputi zio->io_orig_abd, BP_GET_LSIZE(bp), BP_SHOULD_BYTESWAP(bp), 3804eb633035STom Caputi mac)); 3805eb633035STom Caputi zio_crypt_encode_mac_bp(bp, mac); 3806eb633035STom Caputi return (ZIO_PIPELINE_CONTINUE); 
3807eb633035STom Caputi 	} 3808eb633035STom Caputi 3809eb633035STom Caputi 	/* 3810eb633035STom Caputi 	 * Objset blocks are a special case since they have 2 256-bit MACs 3811eb633035STom Caputi 	 * embedded within them. 3812eb633035STom Caputi 	 */ 3813eb633035STom Caputi 	if (ot == DMU_OT_OBJSET) { 3814eb633035STom Caputi 	ASSERT0(DMU_OT_IS_ENCRYPTED(ot)); 3815eb633035STom Caputi 	ASSERT3U(BP_GET_COMPRESS(bp), ==, ZIO_COMPRESS_OFF); 3816eb633035STom Caputi 	BP_SET_CRYPT(bp, B_TRUE); 3817eb633035STom Caputi 	VERIFY0(spa_do_crypt_objset_mac_abd(B_TRUE, spa, dsobj, 3818eb633035STom Caputi 	zio->io_abd, psize, BP_SHOULD_BYTESWAP(bp))); 3819eb633035STom Caputi 	return (ZIO_PIPELINE_CONTINUE); 3820eb633035STom Caputi 	} 3821eb633035STom Caputi 3822eb633035STom Caputi 	/* unencrypted object types are only authenticated with a MAC */ 3823eb633035STom Caputi 	if (!DMU_OT_IS_ENCRYPTED(ot)) { 3824eb633035STom Caputi 	BP_SET_CRYPT(bp, B_TRUE); 3825eb633035STom Caputi 	VERIFY0(spa_do_crypt_mac_abd(B_TRUE, spa, dsobj, 3826eb633035STom Caputi 	zio->io_abd, psize, mac)); 3827eb633035STom Caputi 	zio_crypt_encode_mac_bp(bp, mac); 3828eb633035STom Caputi 	return (ZIO_PIPELINE_CONTINUE); 3829eb633035STom Caputi 	} 3830eb633035STom Caputi 3831eb633035STom Caputi 	/* 3832eb633035STom Caputi 	 * Later passes of sync-to-convergence may decide to rewrite data 3833eb633035STom Caputi 	 * in place to avoid more disk reallocations. This presents a problem 3834eb633035STom Caputi 	 * for encryption because this constitutes rewriting the new data with 3835eb633035STom Caputi 	 * the same encryption key and IV. However, this only applies to blocks 3836eb633035STom Caputi 	 * in the MOS (particularly the spacemaps) and we do not encrypt the 3837eb633035STom Caputi 	 * MOS. We assert that the zio is allocating or an intent log write 3838eb633035STom Caputi 	 * to enforce this. 3839eb633035STom Caputi 	 */ 3840eb633035STom Caputi 	ASSERT(IO_IS_ALLOCATING(zio) || ot == DMU_OT_INTENT_LOG); 3841eb633035STom Caputi 	ASSERT(BP_GET_LEVEL(bp) == 0 || ot == DMU_OT_INTENT_LOG); 3842eb633035STom Caputi 	ASSERT(spa_feature_is_active(spa, SPA_FEATURE_ENCRYPTION)); 3843eb633035STom Caputi 	ASSERT3U(psize, !=, 0); 3844eb633035STom Caputi 3845eb633035STom Caputi 	enc_buf = zio_buf_alloc(psize); 3846eb633035STom Caputi 	eabd = abd_get_from_buf(enc_buf, psize); 3847eb633035STom Caputi 	abd_take_ownership_of_buf(eabd, B_TRUE); 3848eb633035STom Caputi 3849eb633035STom Caputi 	/* 3850eb633035STom Caputi 	 * For an explanation of what encryption parameters are stored 3851eb633035STom Caputi 	 * where, see the block comment in zio_crypt.c. 3852eb633035STom Caputi 	 */ 3853eb633035STom Caputi 	if (ot == DMU_OT_INTENT_LOG) { 3854eb633035STom Caputi 	zio_crypt_decode_params_bp(bp, salt, iv); 3855eb633035STom Caputi 	} else { 3856eb633035STom Caputi 	BP_SET_CRYPT(bp, B_TRUE); 3857eb633035STom Caputi 	} 3858eb633035STom Caputi 3859eb633035STom Caputi 	/* Perform the encryption. This should not fail */ 3860eb633035STom Caputi 	VERIFY0(spa_do_crypt_abd(B_TRUE, spa, &zio->io_bookmark, 3861eb633035STom Caputi 	BP_GET_TYPE(bp), BP_GET_DEDUP(bp), BP_SHOULD_BYTESWAP(bp), 3862eb633035STom Caputi 	salt, iv, mac, psize, zio->io_abd, eabd, &no_crypt)); 3863eb633035STom Caputi 3864eb633035STom Caputi 	/* encode encryption metadata into the bp */ 3865eb633035STom Caputi 	if (ot == DMU_OT_INTENT_LOG) { 3866eb633035STom Caputi 	/* 3867eb633035STom Caputi 	 * ZIL blocks store the MAC in the embedded checksum, so the 3868eb633035STom Caputi 	 * transform must always be applied. 
3869eb633035STom Caputi */ 3870eb633035STom Caputi zio_crypt_encode_mac_zil(enc_buf, mac); 3871eb633035STom Caputi zio_push_transform(zio, eabd, psize, psize, NULL); 3872eb633035STom Caputi } else { 3873eb633035STom Caputi BP_SET_CRYPT(bp, B_TRUE); 3874eb633035STom Caputi zio_crypt_encode_params_bp(bp, salt, iv); 3875eb633035STom Caputi zio_crypt_encode_mac_bp(bp, mac); 3876eb633035STom Caputi 3877eb633035STom Caputi if (no_crypt) { 3878eb633035STom Caputi ASSERT3U(ot, ==, DMU_OT_DNODE); 3879eb633035STom Caputi abd_free(eabd); 3880eb633035STom Caputi } else { 3881eb633035STom Caputi zio_push_transform(zio, eabd, psize, psize, NULL); 3882eb633035STom Caputi } 3883eb633035STom Caputi } 3884eb633035STom Caputi 3885eb633035STom Caputi return (ZIO_PIPELINE_CONTINUE); 3886eb633035STom Caputi } 3887eb633035STom Caputi 3888fa9e4066Sahrens /* 3889fa9e4066Sahrens * ========================================================================== 3890fa9e4066Sahrens * Generate and verify checksums 3891fa9e4066Sahrens * ========================================================================== 3892fa9e4066Sahrens */ 3893e05725b1Sbonwick static int 3894fa9e4066Sahrens zio_checksum_generate(zio_t *zio) 3895fa9e4066Sahrens { 3896fa9e4066Sahrens blkptr_t *bp = zio->io_bp; 3897e14bb325SJeff Bonwick enum zio_checksum checksum; 3898fa9e4066Sahrens 3899e14bb325SJeff Bonwick if (bp == NULL) { 3900e14bb325SJeff Bonwick /* 3901e14bb325SJeff Bonwick * This is zio_write_phys(). 3902e14bb325SJeff Bonwick * We're either generating a label checksum, or none at all. 3903e14bb325SJeff Bonwick */ 3904e14bb325SJeff Bonwick checksum = zio->io_prop.zp_checksum; 3905e14bb325SJeff Bonwick 3906e14bb325SJeff Bonwick if (checksum == ZIO_CHECKSUM_OFF) 3907e14bb325SJeff Bonwick return (ZIO_PIPELINE_CONTINUE); 3908fa9e4066Sahrens 3909e14bb325SJeff Bonwick ASSERT(checksum == ZIO_CHECKSUM_LABEL); 3910e14bb325SJeff Bonwick } else { 3911e14bb325SJeff Bonwick if (BP_IS_GANG(bp) && zio->io_child_type == ZIO_CHILD_GANG) { 3912e14bb325SJeff Bonwick ASSERT(!IO_IS_ALLOCATING(zio)); 3913e14bb325SJeff Bonwick checksum = ZIO_CHECKSUM_GANG_HEADER; 3914e14bb325SJeff Bonwick } else { 3915e14bb325SJeff Bonwick checksum = BP_GET_CHECKSUM(bp); 3916e14bb325SJeff Bonwick } 3917e14bb325SJeff Bonwick } 3918fa9e4066Sahrens 3919770499e1SDan Kimmel zio_checksum_compute(zio, checksum, zio->io_abd, zio->io_size); 3920fa9e4066Sahrens 3921e05725b1Sbonwick return (ZIO_PIPELINE_CONTINUE); 3922fa9e4066Sahrens } 3923fa9e4066Sahrens 3924e05725b1Sbonwick static int 3925e14bb325SJeff Bonwick zio_checksum_verify(zio_t *zio) 3926fa9e4066Sahrens { 392722fe2c88SJonathan Adams zio_bad_cksum_t info; 3928e14bb325SJeff Bonwick blkptr_t *bp = zio->io_bp; 3929e14bb325SJeff Bonwick int error; 3930fa9e4066Sahrens 3931b24ab676SJeff Bonwick ASSERT(zio->io_vd != NULL); 3932b24ab676SJeff Bonwick 3933e14bb325SJeff Bonwick if (bp == NULL) { 3934e14bb325SJeff Bonwick /* 3935e14bb325SJeff Bonwick * This is zio_read_phys(). 3936e14bb325SJeff Bonwick * We're either verifying a label checksum, or nothing at all. 
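 *
 * For example, vdev label reads issued through zio_read_phys() use
 * ZIO_CHECKSUM_LABEL and take the verification path below, while a
 * caller passing ZIO_CHECKSUM_OFF gets no verification at all.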
3937e14bb325SJeff Bonwick 	 */ 3938e14bb325SJeff Bonwick 	if (zio->io_prop.zp_checksum == ZIO_CHECKSUM_OFF) 3939e14bb325SJeff Bonwick 	return (ZIO_PIPELINE_CONTINUE); 3940fa9e4066Sahrens 3941e14bb325SJeff Bonwick 	ASSERT(zio->io_prop.zp_checksum == ZIO_CHECKSUM_LABEL); 3942e14bb325SJeff Bonwick 	} 3943fa9e4066Sahrens 394422fe2c88SJonathan Adams 	if ((error = zio_checksum_error(zio, &info)) != 0) { 3945e14bb325SJeff Bonwick 	zio->io_error = error; 3946373dc1cfSMatthew Ahrens 	if (error == ECKSUM && 3947373dc1cfSMatthew Ahrens 	!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { 394822fe2c88SJonathan Adams 	zfs_ereport_start_checksum(zio->io_spa, 3949eb633035STom Caputi 	zio->io_vd, &zio->io_bookmark, zio, 3950eb633035STom Caputi 	zio->io_offset, zio->io_size, NULL, &info); 3951e14bb325SJeff Bonwick 	} 3952fa9e4066Sahrens 	} 3953fa9e4066Sahrens 3954e05725b1Sbonwick 	return (ZIO_PIPELINE_CONTINUE); 3955fa9e4066Sahrens 	} 3956fa9e4066Sahrens 3957fa9e4066Sahrens 	/* 3958fa9e4066Sahrens 	 * Called by RAID-Z to ensure we don't compute the checksum twice. 3959fa9e4066Sahrens 	 */ 3960fa9e4066Sahrens 	void 3961fa9e4066Sahrens 	zio_checksum_verified(zio_t *zio) 3962fa9e4066Sahrens 	{ 3963b24ab676SJeff Bonwick 	zio->io_pipeline &= ~ZIO_STAGE_CHECKSUM_VERIFY; 3964fa9e4066Sahrens 	} 3965fa9e4066Sahrens 3966fa9e4066Sahrens 	/* 3967e14bb325SJeff Bonwick 	 * ========================================================================== 3968e14bb325SJeff Bonwick 	 * Error rank. Errors are ranked in the order 0, ENXIO, ECKSUM, EIO, other. 39695d7b4d43SMatthew Ahrens 	 * An error of 0 indicates success. ENXIO indicates whole-device failure, 3970e14bb325SJeff Bonwick 	 * which may be transient (e.g. unplugged) or permanent. ECKSUM and EIO 3971e14bb325SJeff Bonwick 	 * indicate errors that are specific to one I/O, and most likely permanent. 3972e14bb325SJeff Bonwick 	 * Any other error is presumed to be worse because we weren't expecting it. 3973e14bb325SJeff Bonwick 	 * ========================================================================== 3974fa9e4066Sahrens 	 */ 3975e14bb325SJeff Bonwick 	int 3976e14bb325SJeff Bonwick 	zio_worst_error(int e1, int e2) 3977fa9e4066Sahrens 	{ 3978e14bb325SJeff Bonwick 	static int zio_error_rank[] = { 0, ENXIO, ECKSUM, EIO }; 3979e14bb325SJeff Bonwick 	int r1, r2; 3980e14bb325SJeff Bonwick 3981e14bb325SJeff Bonwick 	for (r1 = 0; r1 < sizeof (zio_error_rank) / sizeof (int); r1++) 3982e14bb325SJeff Bonwick 	if (e1 == zio_error_rank[r1]) 3983e14bb325SJeff Bonwick 	break; 3984e14bb325SJeff Bonwick 3985e14bb325SJeff Bonwick 	for (r2 = 0; r2 < sizeof (zio_error_rank) / sizeof (int); r2++) 3986e14bb325SJeff Bonwick 	if (e2 == zio_error_rank[r2]) 3987e14bb325SJeff Bonwick 	break; 398844cd46caSbillm 3989e14bb325SJeff Bonwick 	return (r1 > r2 ? 
e1 : e2); 3990fa9e4066Sahrens } 3991fa9e4066Sahrens 3992fa9e4066Sahrens /* 3993fa9e4066Sahrens * ========================================================================== 3994e14bb325SJeff Bonwick * I/O completion 3995fa9e4066Sahrens * ========================================================================== 3996fa9e4066Sahrens */ 3997e14bb325SJeff Bonwick static int 3998e14bb325SJeff Bonwick zio_ready(zio_t *zio) 3999fa9e4066Sahrens { 4000e14bb325SJeff Bonwick blkptr_t *bp = zio->io_bp; 4001a3f829aeSBill Moore zio_t *pio, *pio_next; 40020f7643c7SGeorge Wilson zio_link_t *zl = NULL; 4003fa9e4066Sahrens 4004d6e1c446SGeorge Wilson if (zio_wait_for_children(zio, ZIO_CHILD_GANG_BIT | ZIO_CHILD_DDT_BIT, 4005d6e1c446SGeorge Wilson ZIO_WAIT_READY)) { 4006f5383399SBill Moore return (ZIO_PIPELINE_STOP); 4007d6e1c446SGeorge Wilson } 4008fa9e4066Sahrens 4009f5383399SBill Moore if (zio->io_ready) { 4010e14bb325SJeff Bonwick ASSERT(IO_IS_ALLOCATING(zio)); 401180901aeaSGeorge Wilson ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp) || 401280901aeaSGeorge Wilson (zio->io_flags & ZIO_FLAG_NOPWRITE)); 4013e14bb325SJeff Bonwick ASSERT(zio->io_children[ZIO_CHILD_GANG][ZIO_WAIT_READY] == 0); 4014fa9e4066Sahrens 4015e14bb325SJeff Bonwick zio->io_ready(zio); 4016e14bb325SJeff Bonwick } 4017fa9e4066Sahrens 4018e14bb325SJeff Bonwick if (bp != NULL && bp != &zio->io_bp_copy) 4019e14bb325SJeff Bonwick zio->io_bp_copy = *bp; 4020fa9e4066Sahrens 40210f7643c7SGeorge Wilson if (zio->io_error != 0) { 4022e14bb325SJeff Bonwick zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; 4023fa9e4066Sahrens 40240f7643c7SGeorge Wilson if (zio->io_flags & ZIO_FLAG_IO_ALLOCATING) { 40250f7643c7SGeorge Wilson ASSERT(IO_IS_ALLOCATING(zio)); 40260f7643c7SGeorge Wilson ASSERT(zio->io_priority == ZIO_PRIORITY_ASYNC_WRITE); 4027663207adSDon Brady ASSERT(zio->io_metaslab_class != NULL); 4028663207adSDon Brady 40290f7643c7SGeorge Wilson /* 40300f7643c7SGeorge Wilson * We were unable to allocate anything, unreserve and 40310f7643c7SGeorge Wilson * issue the next I/O to allocate. 40320f7643c7SGeorge Wilson */ 40330f7643c7SGeorge Wilson metaslab_class_throttle_unreserve( 4034663207adSDon Brady zio->io_metaslab_class, zio->io_prop.zp_copies, 4035663207adSDon Brady zio->io_allocator, zio); 4036f78cdc34SPaul Dagnelie zio_allocate_dispatch(zio->io_spa, zio->io_allocator); 40370f7643c7SGeorge Wilson } 40380f7643c7SGeorge Wilson } 40390f7643c7SGeorge Wilson 4040a3f829aeSBill Moore mutex_enter(&zio->io_lock); 4041a3f829aeSBill Moore zio->io_state[ZIO_WAIT_READY] = 1; 40420f7643c7SGeorge Wilson pio = zio_walk_parents(zio, &zl); 4043a3f829aeSBill Moore mutex_exit(&zio->io_lock); 4044a3f829aeSBill Moore 4045a3f829aeSBill Moore /* 4046a3f829aeSBill Moore * As we notify zio's parents, new parents could be added. 4047a3f829aeSBill Moore * New parents go to the head of zio's io_parent_list, however, 4048a3f829aeSBill Moore * so we will (correctly) not notify them. The remainder of zio's 4049a3f829aeSBill Moore * io_parent_list, from 'pio_next' onward, cannot change because 4050a3f829aeSBill Moore * all parents must wait for us to be done before they can be done. 
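 *
 * For instance, a parent attached concurrently by zio_add_child() is
 * linked ahead of the 'pio' we captured under io_lock above, so the
 * walk below (correctly) never reaches it.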
4051a3f829aeSBill Moore */ 4052a3f829aeSBill Moore for (; pio != NULL; pio = pio_next) { 40530f7643c7SGeorge Wilson pio_next = zio_walk_parents(zio, &zl); 4054e14bb325SJeff Bonwick zio_notify_parent(pio, zio, ZIO_WAIT_READY); 4055a3f829aeSBill Moore } 4056fa9e4066Sahrens 4057b24ab676SJeff Bonwick if (zio->io_flags & ZIO_FLAG_NODATA) { 4058b24ab676SJeff Bonwick if (BP_IS_GANG(bp)) { 4059b24ab676SJeff Bonwick zio->io_flags &= ~ZIO_FLAG_NODATA; 4060b24ab676SJeff Bonwick } else { 4061770499e1SDan Kimmel ASSERT((uintptr_t)zio->io_abd < SPA_MAXBLOCKSIZE); 4062b24ab676SJeff Bonwick zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES; 4063b24ab676SJeff Bonwick } 4064b24ab676SJeff Bonwick } 4065b24ab676SJeff Bonwick 4066a33cae98STim Haley if (zio_injection_enabled && 4067a33cae98STim Haley zio->io_spa->spa_syncing_txg == zio->io_txg) 4068a33cae98STim Haley zio_handle_ignored_writes(zio); 4069a33cae98STim Haley 4070e14bb325SJeff Bonwick return (ZIO_PIPELINE_CONTINUE); 4071fa9e4066Sahrens } 4072fa9e4066Sahrens 40730f7643c7SGeorge Wilson /* 40740f7643c7SGeorge Wilson * Update the allocation throttle accounting. 40750f7643c7SGeorge Wilson */ 40760f7643c7SGeorge Wilson static void 40770f7643c7SGeorge Wilson zio_dva_throttle_done(zio_t *zio) 40780f7643c7SGeorge Wilson { 40790f7643c7SGeorge Wilson zio_t *lio = zio->io_logical; 40800f7643c7SGeorge Wilson zio_t *pio = zio_unique_parent(zio); 40810f7643c7SGeorge Wilson vdev_t *vd = zio->io_vd; 40820f7643c7SGeorge Wilson int flags = METASLAB_ASYNC_ALLOC; 40830f7643c7SGeorge Wilson 40840f7643c7SGeorge Wilson ASSERT3P(zio->io_bp, !=, NULL); 40850f7643c7SGeorge Wilson ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE); 40860f7643c7SGeorge Wilson ASSERT3U(zio->io_priority, ==, ZIO_PRIORITY_ASYNC_WRITE); 40870f7643c7SGeorge Wilson ASSERT3U(zio->io_child_type, ==, ZIO_CHILD_VDEV); 40880f7643c7SGeorge Wilson ASSERT(vd != NULL); 40890f7643c7SGeorge Wilson ASSERT3P(vd, ==, vd->vdev_top); 40900f7643c7SGeorge Wilson ASSERT(!(zio->io_flags & (ZIO_FLAG_IO_REPAIR | ZIO_FLAG_IO_RETRY))); 40910f7643c7SGeorge Wilson ASSERT(zio->io_flags & ZIO_FLAG_IO_ALLOCATING); 40920f7643c7SGeorge Wilson ASSERT(!(lio->io_flags & ZIO_FLAG_IO_REWRITE)); 40930f7643c7SGeorge Wilson ASSERT(!(lio->io_orig_flags & ZIO_FLAG_NODATA)); 40940f7643c7SGeorge Wilson 40950f7643c7SGeorge Wilson /* 40960f7643c7SGeorge Wilson * Parents of gang children can have two flavors -- ones that 40970f7643c7SGeorge Wilson * allocated the gang header (will have ZIO_FLAG_IO_REWRITE set) 40980f7643c7SGeorge Wilson * and ones that allocated the constituent blocks. The allocation 40990f7643c7SGeorge Wilson * throttle needs to know the allocating parent zio so we must find 41000f7643c7SGeorge Wilson * it here. 41010f7643c7SGeorge Wilson */ 41020f7643c7SGeorge Wilson if (pio->io_child_type == ZIO_CHILD_GANG) { 41030f7643c7SGeorge Wilson /* 41040f7643c7SGeorge Wilson * If our parent is a rewrite gang child then our grandparent 41050f7643c7SGeorge Wilson * would have been the one that performed the allocation. 
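 *
 * A rough picture of the gang case (one possible shape):
 *
 *	pio'	(allocating logical zio)
 *	  pio	(gang child rewriting the gang header, ZIO_FLAG_IO_REWRITE)
 *	    zio	(this vdev write)
 *
 * in which case we hop up one extra level so that the unreserve below
 * is charged against the allocating zio.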
41060f7643c7SGeorge Wilson */ 41070f7643c7SGeorge Wilson if (pio->io_flags & ZIO_FLAG_IO_REWRITE) 41080f7643c7SGeorge Wilson pio = zio_unique_parent(pio); 41090f7643c7SGeorge Wilson flags |= METASLAB_GANG_CHILD; 41100f7643c7SGeorge Wilson } 41110f7643c7SGeorge Wilson 41120f7643c7SGeorge Wilson ASSERT(IO_IS_ALLOCATING(pio)); 41130f7643c7SGeorge Wilson ASSERT3P(zio, !=, zio->io_logical); 41140f7643c7SGeorge Wilson ASSERT(zio->io_logical != NULL); 41150f7643c7SGeorge Wilson ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REPAIR)); 41160f7643c7SGeorge Wilson ASSERT0(zio->io_flags & ZIO_FLAG_NOPWRITE); 4117663207adSDon Brady ASSERT(zio->io_metaslab_class != NULL); 41180f7643c7SGeorge Wilson 41190f7643c7SGeorge Wilson mutex_enter(&pio->io_lock); 4120f78cdc34SPaul Dagnelie metaslab_group_alloc_decrement(zio->io_spa, vd->vdev_id, pio, flags, 4121f78cdc34SPaul Dagnelie pio->io_allocator, B_TRUE); 41220f7643c7SGeorge Wilson mutex_exit(&pio->io_lock); 41230f7643c7SGeorge Wilson 4124663207adSDon Brady metaslab_class_throttle_unreserve(zio->io_metaslab_class, 1, 4125663207adSDon Brady pio->io_allocator, pio); 41260f7643c7SGeorge Wilson 41270f7643c7SGeorge Wilson /* 41280f7643c7SGeorge Wilson * Call into the pipeline to see if there is more work that 41290f7643c7SGeorge Wilson * needs to be done. If there is work to be done it will be 41300f7643c7SGeorge Wilson * dispatched to another taskq thread. 41310f7643c7SGeorge Wilson */ 4132f78cdc34SPaul Dagnelie zio_allocate_dispatch(zio->io_spa, pio->io_allocator); 41330f7643c7SGeorge Wilson } 41340f7643c7SGeorge Wilson 4135e14bb325SJeff Bonwick static int 4136e14bb325SJeff Bonwick zio_done(zio_t *zio) 4137d63d470bSgw { 4138e14bb325SJeff Bonwick spa_t *spa = zio->io_spa; 4139e14bb325SJeff Bonwick zio_t *lio = zio->io_logical; 4140e14bb325SJeff Bonwick blkptr_t *bp = zio->io_bp; 4141e14bb325SJeff Bonwick vdev_t *vd = zio->io_vd; 4142e14bb325SJeff Bonwick uint64_t psize = zio->io_size; 4143a3f829aeSBill Moore zio_t *pio, *pio_next; 41440f7643c7SGeorge Wilson zio_link_t *zl = NULL; 4145d63d470bSgw 4146e14bb325SJeff Bonwick /* 4147f5383399SBill Moore * If our children haven't all completed, 4148e14bb325SJeff Bonwick * wait for them and then repeat this pipeline stage. 4149e14bb325SJeff Bonwick */ 4150d6e1c446SGeorge Wilson if (zio_wait_for_children(zio, ZIO_CHILD_ALL_BITS, ZIO_WAIT_DONE)) { 4151e14bb325SJeff Bonwick return (ZIO_PIPELINE_STOP); 4152d6e1c446SGeorge Wilson } 4153d63d470bSgw 41540f7643c7SGeorge Wilson /* 41550f7643c7SGeorge Wilson * If the allocation throttle is enabled, then update the accounting. 41560f7643c7SGeorge Wilson * We only track child I/Os that are part of an allocating async 41570f7643c7SGeorge Wilson * write. We must do this since the allocation is performed 41580f7643c7SGeorge Wilson * by the logical I/O but the actual write is done by child I/Os. 41590f7643c7SGeorge Wilson */ 41600f7643c7SGeorge Wilson if (zio->io_flags & ZIO_FLAG_IO_ALLOCATING && 41610f7643c7SGeorge Wilson zio->io_child_type == ZIO_CHILD_VDEV) { 4162663207adSDon Brady ASSERT(zio->io_metaslab_class != NULL); 4163663207adSDon Brady ASSERT(zio->io_metaslab_class->mc_alloc_throttle_enabled); 41640f7643c7SGeorge Wilson zio_dva_throttle_done(zio); 41650f7643c7SGeorge Wilson } 41660f7643c7SGeorge Wilson 41670f7643c7SGeorge Wilson /* 41680f7643c7SGeorge Wilson * If the allocation throttle is enabled, verify that 41690f7643c7SGeorge Wilson * we have decremented the refcounts for every I/O that was throttled. 
41700f7643c7SGeorge Wilson */ 41710f7643c7SGeorge Wilson if (zio->io_flags & ZIO_FLAG_IO_ALLOCATING) { 41720f7643c7SGeorge Wilson ASSERT(zio->io_type == ZIO_TYPE_WRITE); 41730f7643c7SGeorge Wilson ASSERT(zio->io_priority == ZIO_PRIORITY_ASYNC_WRITE); 41740f7643c7SGeorge Wilson ASSERT(bp != NULL); 4175663207adSDon Brady 4176f78cdc34SPaul Dagnelie metaslab_group_alloc_verify(spa, zio->io_bp, zio, 4177f78cdc34SPaul Dagnelie zio->io_allocator); 4178e914ace2STim Schumacher VERIFY(zfs_refcount_not_held( 4179663207adSDon Brady &zio->io_metaslab_class->mc_alloc_slots[zio->io_allocator], 4180663207adSDon Brady zio)); 41810f7643c7SGeorge Wilson } 41820f7643c7SGeorge Wilson 4183e14bb325SJeff Bonwick for (int c = 0; c < ZIO_CHILD_TYPES; c++) 4184e14bb325SJeff Bonwick for (int w = 0; w < ZIO_WAIT_TYPES; w++) 4185e14bb325SJeff Bonwick ASSERT(zio->io_children[c][w] == 0); 4186e14bb325SJeff Bonwick 41875d7b4d43SMatthew Ahrens if (bp != NULL && !BP_IS_EMBEDDED(bp)) { 4188e14bb325SJeff Bonwick ASSERT(bp->blk_pad[0] == 0); 4189e14bb325SJeff Bonwick ASSERT(bp->blk_pad[1] == 0); 4190e14bb325SJeff Bonwick ASSERT(bcmp(bp, &zio->io_bp_copy, sizeof (blkptr_t)) == 0 || 4191a3f829aeSBill Moore (bp == zio_unique_parent(zio)->io_bp)); 4192e14bb325SJeff Bonwick if (zio->io_type == ZIO_TYPE_WRITE && !BP_IS_HOLE(bp) && 4193b24ab676SJeff Bonwick zio->io_bp_override == NULL && 4194e14bb325SJeff Bonwick !(zio->io_flags & ZIO_FLAG_IO_REPAIR)) { 4195b24ab676SJeff Bonwick ASSERT3U(zio->io_prop.zp_copies, <=, BP_GET_NDVAS(bp)); 4196e14bb325SJeff Bonwick ASSERT(BP_COUNT_GANG(bp) == 0 || 4197e14bb325SJeff Bonwick (BP_COUNT_GANG(bp) == BP_GET_NDVAS(bp))); 4198e14bb325SJeff Bonwick } 419980901aeaSGeorge Wilson if (zio->io_flags & ZIO_FLAG_NOPWRITE) 420080901aeaSGeorge Wilson VERIFY(BP_EQUAL(bp, &zio->io_bp_orig)); 4201e14bb325SJeff Bonwick } 4202fa9e4066Sahrens 4203e14bb325SJeff Bonwick /* 4204b24ab676SJeff Bonwick * If there were child vdev/gang/ddt errors, they apply to us now. 4205e14bb325SJeff Bonwick */ 4206e14bb325SJeff Bonwick zio_inherit_child_errors(zio, ZIO_CHILD_VDEV); 4207e14bb325SJeff Bonwick zio_inherit_child_errors(zio, ZIO_CHILD_GANG); 4208b24ab676SJeff Bonwick zio_inherit_child_errors(zio, ZIO_CHILD_DDT); 4209b24ab676SJeff Bonwick 4210b24ab676SJeff Bonwick /* 4211b24ab676SJeff Bonwick * If the I/O on the transformed data was successful, generate any 4212b24ab676SJeff Bonwick * checksum reports now while we still have the transformed data. 
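 *
 * For example, with psize = 512 and zcr_align = 4096, the loop below
 * copies the data into a P2ROUNDUP(512, 4096) = 4096-byte abd and
 * zero-fills the trailing 3584 bytes before handing it to zcr_finish().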
4213b24ab676SJeff Bonwick */ 4214b24ab676SJeff Bonwick if (zio->io_error == 0) { 4215b24ab676SJeff Bonwick while (zio->io_cksum_report != NULL) { 4216b24ab676SJeff Bonwick zio_cksum_report_t *zcr = zio->io_cksum_report; 4217b24ab676SJeff Bonwick uint64_t align = zcr->zcr_align; 4218b24ab676SJeff Bonwick uint64_t asize = P2ROUNDUP(psize, align); 4219770499e1SDan Kimmel abd_t *adata = zio->io_abd; 4220b24ab676SJeff Bonwick 4221b24ab676SJeff Bonwick if (asize != psize) { 4222eb633035STom Caputi adata = abd_alloc(asize, B_TRUE); 4223770499e1SDan Kimmel abd_copy(adata, zio->io_abd, psize); 4224770499e1SDan Kimmel abd_zero_off(adata, psize, asize - psize); 4225b24ab676SJeff Bonwick } 4226b24ab676SJeff Bonwick 4227b24ab676SJeff Bonwick zio->io_cksum_report = zcr->zcr_next; 4228b24ab676SJeff Bonwick zcr->zcr_next = NULL; 4229eb633035STom Caputi zcr->zcr_finish(zcr, adata); 4230b24ab676SJeff Bonwick zfs_ereport_free_checksum(zcr); 4231b24ab676SJeff Bonwick 4232b24ab676SJeff Bonwick if (asize != psize) 4233770499e1SDan Kimmel abd_free(adata); 4234b24ab676SJeff Bonwick } 4235b24ab676SJeff Bonwick } 4236e14bb325SJeff Bonwick 4237e14bb325SJeff Bonwick zio_pop_transforms(zio); /* note: may set zio->io_error */ 4238e14bb325SJeff Bonwick 4239e14bb325SJeff Bonwick vdev_stat_update(zio, psize); 4240e14bb325SJeff Bonwick 4241*dd50e0ccSTony Hutter if (zio->io_delay >= MSEC2NSEC(zio_slow_io_ms)) { 4242*dd50e0ccSTony Hutter if (zio->io_vd != NULL && !vdev_is_dead(zio->io_vd)) { 4243*dd50e0ccSTony Hutter /* 4244*dd50e0ccSTony Hutter * We want to only increment our slow IO counters if 4245*dd50e0ccSTony Hutter * the IO is valid (i.e. not if the drive is removed). 4246*dd50e0ccSTony Hutter * 4247*dd50e0ccSTony Hutter * zfs_ereport_post() will also do these checks, but 4248*dd50e0ccSTony Hutter * it can also have other failures, so we need to 4249*dd50e0ccSTony Hutter * increment the slow_io counters independent of it. 4250*dd50e0ccSTony Hutter */ 4251*dd50e0ccSTony Hutter if (zfs_ereport_is_valid(FM_EREPORT_ZFS_DELAY, 4252*dd50e0ccSTony Hutter zio->io_spa, zio->io_vd, zio)) { 4253*dd50e0ccSTony Hutter mutex_enter(&zio->io_vd->vdev_stat_lock); 4254*dd50e0ccSTony Hutter zio->io_vd->vdev_stat.vs_slow_ios++; 4255*dd50e0ccSTony Hutter mutex_exit(&zio->io_vd->vdev_stat_lock); 4256*dd50e0ccSTony Hutter 4257*dd50e0ccSTony Hutter zfs_ereport_post(FM_EREPORT_ZFS_DELAY, 4258*dd50e0ccSTony Hutter zio->io_spa, zio->io_vd, &zio->io_bookmark, 4259*dd50e0ccSTony Hutter zio, 0, 0); 4260*dd50e0ccSTony Hutter } 4261*dd50e0ccSTony Hutter } 4262*dd50e0ccSTony Hutter } 4263*dd50e0ccSTony Hutter 4264e14bb325SJeff Bonwick if (zio->io_error) { 4265e14bb325SJeff Bonwick /* 4266e14bb325SJeff Bonwick * If this I/O is attached to a particular vdev, 4267e14bb325SJeff Bonwick * generate an error message describing the I/O failure 4268e14bb325SJeff Bonwick * at the block level. We ignore these errors if the 4269e14bb325SJeff Bonwick * device is currently unavailable. 
4270e14bb325SJeff Bonwick */ 4271e14bb325SJeff Bonwick if (zio->io_error != ECKSUM && vd != NULL && !vdev_is_dead(vd)) 4272eb633035STom Caputi zfs_ereport_post(FM_EREPORT_ZFS_IO, spa, vd, 4273eb633035STom Caputi &zio->io_bookmark, zio, 0, 0); 4274e14bb325SJeff Bonwick 42758f18d1faSGeorge Wilson if ((zio->io_error == EIO || !(zio->io_flags & 42768f18d1faSGeorge Wilson (ZIO_FLAG_SPECULATIVE | ZIO_FLAG_DONT_PROPAGATE))) && 42778f18d1faSGeorge Wilson zio == lio) { 4278e14bb325SJeff Bonwick /* 4279e14bb325SJeff Bonwick * For logical I/O requests, tell the SPA to log the 4280e14bb325SJeff Bonwick * error and generate a logical data ereport. 4281e14bb325SJeff Bonwick */ 4282eb633035STom Caputi spa_log_error(spa, &zio->io_bookmark); 4283eb633035STom Caputi zfs_ereport_post(FM_EREPORT_ZFS_DATA, spa, NULL, 4284eb633035STom Caputi &zio->io_bookmark, zio, 0, 0); 4285e14bb325SJeff Bonwick } 4286e14bb325SJeff Bonwick } 4287fa9e4066Sahrens 4288e14bb325SJeff Bonwick if (zio->io_error && zio == lio) { 4289e14bb325SJeff Bonwick /* 4290e14bb325SJeff Bonwick * Determine whether zio should be reexecuted. This will 4291e14bb325SJeff Bonwick * propagate all the way to the root via zio_notify_parent(). 4292e14bb325SJeff Bonwick */ 4293e14bb325SJeff Bonwick ASSERT(vd == NULL && bp != NULL); 4294b24ab676SJeff Bonwick ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); 4295e14bb325SJeff Bonwick 4296b24ab676SJeff Bonwick if (IO_IS_ALLOCATING(zio) && 4297b24ab676SJeff Bonwick !(zio->io_flags & ZIO_FLAG_CANFAIL)) { 4298e14bb325SJeff Bonwick if (zio->io_error != ENOSPC) 4299e14bb325SJeff Bonwick zio->io_reexecute |= ZIO_REEXECUTE_NOW; 4300e14bb325SJeff Bonwick else 4301e14bb325SJeff Bonwick zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND; 4302b24ab676SJeff Bonwick } 4303e14bb325SJeff Bonwick 4304e14bb325SJeff Bonwick if ((zio->io_type == ZIO_TYPE_READ || 4305e14bb325SJeff Bonwick zio->io_type == ZIO_TYPE_FREE) && 430644ecc532SGeorge Wilson !(zio->io_flags & ZIO_FLAG_SCAN_THREAD) && 4307e14bb325SJeff Bonwick zio->io_error == ENXIO && 4308b16da2e2SGeorge Wilson spa_load_state(spa) == SPA_LOAD_NONE && 4309e14bb325SJeff Bonwick spa_get_failmode(spa) != ZIO_FAILURE_MODE_CONTINUE) 4310e14bb325SJeff Bonwick zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND; 4311e14bb325SJeff Bonwick 4312e14bb325SJeff Bonwick if (!(zio->io_flags & ZIO_FLAG_CANFAIL) && !zio->io_reexecute) 4313e14bb325SJeff Bonwick zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND; 431422fe2c88SJonathan Adams 431522fe2c88SJonathan Adams /* 431622fe2c88SJonathan Adams * Here is a possibly good place to attempt to do 431722fe2c88SJonathan Adams * either combinatorial reconstruction or error correction 431822fe2c88SJonathan Adams * based on checksums. It also might be a good place 431922fe2c88SJonathan Adams * to send out preliminary ereports before we suspend 432022fe2c88SJonathan Adams * processing. 432122fe2c88SJonathan Adams */ 4322d63d470bSgw } 4323d63d470bSgw 432467bd71c6Sperrin /* 4325e14bb325SJeff Bonwick * If there were logical child errors, they apply to us now. 4326e14bb325SJeff Bonwick * We defer this until now to avoid conflating logical child 4327e14bb325SJeff Bonwick * errors with errors that happened to the zio itself when 4328e14bb325SJeff Bonwick * updating vdev stats and reporting FMA events above. 
432967bd71c6Sperrin */ 4330e14bb325SJeff Bonwick zio_inherit_child_errors(zio, ZIO_CHILD_LOGICAL); 43318654d025Sperrin 4332b24ab676SJeff Bonwick if ((zio->io_error || zio->io_reexecute) && 4333b24ab676SJeff Bonwick IO_IS_ALLOCATING(zio) && zio->io_gang_leader == zio && 433480901aeaSGeorge Wilson !(zio->io_flags & (ZIO_FLAG_IO_REWRITE | ZIO_FLAG_NOPWRITE))) 4335f5383399SBill Moore zio_dva_unallocate(zio, zio->io_gang_tree, bp); 4336f5383399SBill Moore 4337f5383399SBill Moore zio_gang_tree_free(&zio->io_gang_tree); 4338f5383399SBill Moore 433933a372edSGeorge Wilson /* 434033a372edSGeorge Wilson * Godfather I/Os should never suspend. 434133a372edSGeorge Wilson */ 434233a372edSGeorge Wilson if ((zio->io_flags & ZIO_FLAG_GODFATHER) && 434333a372edSGeorge Wilson (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND)) 434433a372edSGeorge Wilson zio->io_reexecute = 0; 434533a372edSGeorge Wilson 434633a372edSGeorge Wilson if (zio->io_reexecute) { 4347e14bb325SJeff Bonwick /* 4348e14bb325SJeff Bonwick * This is a logical I/O that wants to reexecute. 4349e14bb325SJeff Bonwick * 4350e14bb325SJeff Bonwick * Reexecute is top-down. When an i/o fails, if it's not 4351e14bb325SJeff Bonwick * the root, it simply notifies its parent and sticks around. 4352e14bb325SJeff Bonwick * The parent, seeing that it still has children in zio_done(), 4353e14bb325SJeff Bonwick * does the same. This percolates all the way up to the root. 4354e14bb325SJeff Bonwick * The root i/o will reexecute or suspend the entire tree. 4355e14bb325SJeff Bonwick * 4356e14bb325SJeff Bonwick * This approach ensures that zio_reexecute() honors 4357e14bb325SJeff Bonwick * all the original i/o dependency relationships, e.g. 4358e14bb325SJeff Bonwick * parents not executing until children are ready. 4359e14bb325SJeff Bonwick */ 4360e14bb325SJeff Bonwick ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); 4361fa9e4066Sahrens 4362f5383399SBill Moore zio->io_gang_leader = NULL; 4363e14bb325SJeff Bonwick 4364a3f829aeSBill Moore mutex_enter(&zio->io_lock); 4365a3f829aeSBill Moore zio->io_state[ZIO_WAIT_DONE] = 1; 4366a3f829aeSBill Moore mutex_exit(&zio->io_lock); 4367a3f829aeSBill Moore 436854d692b7SGeorge Wilson /* 436954d692b7SGeorge Wilson * "The Godfather" I/O monitors its children but is 437054d692b7SGeorge Wilson * not a true parent to them. It will track them through 437154d692b7SGeorge Wilson * the pipeline but severs its ties whenever they get into 437254d692b7SGeorge Wilson * trouble (e.g. suspended). This allows "The Godfather" 437354d692b7SGeorge Wilson * I/O to return status without blocking. 437454d692b7SGeorge Wilson */ 43750f7643c7SGeorge Wilson zl = NULL; 43760f7643c7SGeorge Wilson for (pio = zio_walk_parents(zio, &zl); pio != NULL; 43770f7643c7SGeorge Wilson pio = pio_next) { 43780f7643c7SGeorge Wilson zio_link_t *remove_zl = zl; 43790f7643c7SGeorge Wilson pio_next = zio_walk_parents(zio, &zl); 438054d692b7SGeorge Wilson 438154d692b7SGeorge Wilson if ((pio->io_flags & ZIO_FLAG_GODFATHER) && 438254d692b7SGeorge Wilson (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND)) { 43830f7643c7SGeorge Wilson zio_remove_child(pio, zio, remove_zl); 438454d692b7SGeorge Wilson zio_notify_parent(pio, zio, ZIO_WAIT_DONE); 438554d692b7SGeorge Wilson } 438654d692b7SGeorge Wilson } 438754d692b7SGeorge Wilson 4388a3f829aeSBill Moore if ((pio = zio_unique_parent(zio)) != NULL) { 4389e14bb325SJeff Bonwick /* 4390e14bb325SJeff Bonwick * We're not a root i/o, so there's nothing to do 4391e14bb325SJeff Bonwick * but notify our parent. 
Don't propagate errors 4392e14bb325SJeff Bonwick * upward since we haven't permanently failed yet. 4393e14bb325SJeff Bonwick */ 439433a372edSGeorge Wilson ASSERT(!(zio->io_flags & ZIO_FLAG_GODFATHER)); 4395e14bb325SJeff Bonwick zio->io_flags |= ZIO_FLAG_DONT_PROPAGATE; 4396e14bb325SJeff Bonwick zio_notify_parent(pio, zio, ZIO_WAIT_DONE); 4397e14bb325SJeff Bonwick } else if (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND) { 4398e14bb325SJeff Bonwick /* 4399e14bb325SJeff Bonwick * We'd fail again if we reexecuted now, so suspend 4400e14bb325SJeff Bonwick * until conditions improve (e.g. device comes online). 4401e14bb325SJeff Bonwick */ 4402e0f1c0afSOlaf Faaland zio_suspend(zio->io_spa, zio, ZIO_SUSPEND_IOERR); 4403e14bb325SJeff Bonwick } else { 4404e14bb325SJeff Bonwick /* 4405e14bb325SJeff Bonwick * Reexecution is potentially a huge amount of work. 4406e14bb325SJeff Bonwick * Hand it off to the otherwise-unused claim taskq. 4407e14bb325SJeff Bonwick */ 44085aeb9474SGarrett D'Amore ASSERT(zio->io_tqent.tqent_next == NULL); 4409ec94d322SAdam Leventhal spa_taskq_dispatch_ent(spa, ZIO_TYPE_CLAIM, 4410ec94d322SAdam Leventhal ZIO_TASKQ_ISSUE, (task_func_t *)zio_reexecute, zio, 4411ec94d322SAdam Leventhal 0, &zio->io_tqent); 4412e14bb325SJeff Bonwick } 4413e14bb325SJeff Bonwick return (ZIO_PIPELINE_STOP); 4414fa9e4066Sahrens } 4415fa9e4066Sahrens 4416b24ab676SJeff Bonwick ASSERT(zio->io_child_count == 0); 441733a372edSGeorge Wilson ASSERT(zio->io_reexecute == 0); 4418e14bb325SJeff Bonwick ASSERT(zio->io_error == 0 || (zio->io_flags & ZIO_FLAG_CANFAIL)); 4419fa9e4066Sahrens 4420b24ab676SJeff Bonwick /* 4421b24ab676SJeff Bonwick * Report any checksum errors, since the I/O is complete. 4422b24ab676SJeff Bonwick */ 442322fe2c88SJonathan Adams while (zio->io_cksum_report != NULL) { 4424b24ab676SJeff Bonwick zio_cksum_report_t *zcr = zio->io_cksum_report; 4425b24ab676SJeff Bonwick zio->io_cksum_report = zcr->zcr_next; 4426b24ab676SJeff Bonwick zcr->zcr_next = NULL; 4427b24ab676SJeff Bonwick zcr->zcr_finish(zcr, NULL); 4428b24ab676SJeff Bonwick zfs_ereport_free_checksum(zcr); 442922fe2c88SJonathan Adams } 443022fe2c88SJonathan Adams 4431a3f829aeSBill Moore /* 4432a3f829aeSBill Moore * It is the responsibility of the done callback to ensure that this 4433a3f829aeSBill Moore * particular zio is no longer discoverable for adoption, and as 4434a3f829aeSBill Moore * such, cannot acquire any new parents. 
4435a3f829aeSBill Moore */ 4436e14bb325SJeff Bonwick if (zio->io_done) 4437e14bb325SJeff Bonwick zio->io_done(zio); 4438fa9e4066Sahrens 4439a3f829aeSBill Moore mutex_enter(&zio->io_lock); 4440a3f829aeSBill Moore zio->io_state[ZIO_WAIT_DONE] = 1; 4441a3f829aeSBill Moore mutex_exit(&zio->io_lock); 4442fa9e4066Sahrens 44430f7643c7SGeorge Wilson zl = NULL; 44440f7643c7SGeorge Wilson for (pio = zio_walk_parents(zio, &zl); pio != NULL; pio = pio_next) { 44450f7643c7SGeorge Wilson zio_link_t *remove_zl = zl; 44460f7643c7SGeorge Wilson pio_next = zio_walk_parents(zio, &zl); 44470f7643c7SGeorge Wilson zio_remove_child(pio, zio, remove_zl); 4448e14bb325SJeff Bonwick zio_notify_parent(pio, zio, ZIO_WAIT_DONE); 4449e14bb325SJeff Bonwick } 4450fa9e4066Sahrens 4451e14bb325SJeff Bonwick if (zio->io_waiter != NULL) { 4452e14bb325SJeff Bonwick mutex_enter(&zio->io_lock); 4453e14bb325SJeff Bonwick zio->io_executor = NULL; 4454e14bb325SJeff Bonwick cv_broadcast(&zio->io_cv); 4455e14bb325SJeff Bonwick mutex_exit(&zio->io_lock); 4456e14bb325SJeff Bonwick } else { 4457e14bb325SJeff Bonwick zio_destroy(zio); 4458e14bb325SJeff Bonwick } 4459fa9e4066Sahrens 4460e14bb325SJeff Bonwick return (ZIO_PIPELINE_STOP); 4461fa9e4066Sahrens } 446246341222Sperrin 446346341222Sperrin /* 4464e14bb325SJeff Bonwick * ========================================================================== 4465e14bb325SJeff Bonwick * I/O pipeline definition 4466e14bb325SJeff Bonwick * ========================================================================== 446746341222Sperrin */ 4468b24ab676SJeff Bonwick static zio_pipe_stage_t *zio_pipeline[] = { 4469e14bb325SJeff Bonwick NULL, 4470e14bb325SJeff Bonwick zio_read_bp_init, 44710f7643c7SGeorge Wilson zio_write_bp_init, 4472b24ab676SJeff Bonwick zio_free_bp_init, 4473b24ab676SJeff Bonwick zio_issue_async, 44740f7643c7SGeorge Wilson zio_write_compress, 4475eb633035STom Caputi zio_encrypt, 4476e14bb325SJeff Bonwick zio_checksum_generate, 447780901aeaSGeorge Wilson zio_nop_write, 4478b24ab676SJeff Bonwick zio_ddt_read_start, 4479b24ab676SJeff Bonwick zio_ddt_read_done, 4480b24ab676SJeff Bonwick zio_ddt_write, 4481b24ab676SJeff Bonwick zio_ddt_free, 4482e14bb325SJeff Bonwick zio_gang_assemble, 4483e14bb325SJeff Bonwick zio_gang_issue, 44840f7643c7SGeorge Wilson zio_dva_throttle, 4485e14bb325SJeff Bonwick zio_dva_allocate, 4486e14bb325SJeff Bonwick zio_dva_free, 4487e14bb325SJeff Bonwick zio_dva_claim, 4488e14bb325SJeff Bonwick zio_ready, 4489e14bb325SJeff Bonwick zio_vdev_io_start, 4490e14bb325SJeff Bonwick zio_vdev_io_done, 4491e14bb325SJeff Bonwick zio_vdev_io_assess, 4492e14bb325SJeff Bonwick zio_checksum_verify, 4493e14bb325SJeff Bonwick zio_done 4494e14bb325SJeff Bonwick }; 4495ad135b5dSChristopher Siden 4496ad135b5dSChristopher Siden 4497ad135b5dSChristopher Siden 4498ad135b5dSChristopher Siden 4499a2cdcdd2SPaul Dagnelie /* 4500a2cdcdd2SPaul Dagnelie * Compare two zbookmark_phys_t's to see which we would reach first in a 4501a2cdcdd2SPaul Dagnelie * pre-order traversal of the object tree. 4502a2cdcdd2SPaul Dagnelie * 4503a2cdcdd2SPaul Dagnelie * This is simple in every case aside from the meta-dnode object. For all other 4504a2cdcdd2SPaul Dagnelie * objects, we traverse them in order (object 1 before object 2, and so on). 4505a2cdcdd2SPaul Dagnelie * However, all of these objects are traversed while traversing object 0, since 4506a2cdcdd2SPaul Dagnelie * the data it points to is the list of objects. 
Thus, we need to convert to a 4507a2cdcdd2SPaul Dagnelie * canonical representation so we can compare meta-dnode bookmarks to 4508a2cdcdd2SPaul Dagnelie * non-meta-dnode bookmarks. 4509a2cdcdd2SPaul Dagnelie * 4510a2cdcdd2SPaul Dagnelie * We do this by calculating "equivalents" for each field of the zbookmark. 4511a2cdcdd2SPaul Dagnelie * zbookmarks outside of the meta-dnode use their own object and level, and 4512a2cdcdd2SPaul Dagnelie * calculate the level 0 equivalent (the first L0 blkid that is contained in the 4513a2cdcdd2SPaul Dagnelie * blocks this bookmark refers to) by multiplying their blkid by their span 4514a2cdcdd2SPaul Dagnelie * (the number of L0 blocks contained within one block at their level). 4515a2cdcdd2SPaul Dagnelie * zbookmarks inside the meta-dnode calculate their object equivalent 4516a2cdcdd2SPaul Dagnelie * (which is L0equiv * dnodes per data block), use 0 for their L0equiv, and use 4517a2cdcdd2SPaul Dagnelie * level + 1<<31 (any value larger than a level could ever be) for their level. 4518a2cdcdd2SPaul Dagnelie * This causes them to always compare before a bookmark in their object 4519a2cdcdd2SPaul Dagnelie * equivalent, compare appropriately to bookmarks in other objects, and to 4520a2cdcdd2SPaul Dagnelie * compare appropriately to other bookmarks in the meta-dnode. 4521a2cdcdd2SPaul Dagnelie */ 4522a2cdcdd2SPaul Dagnelie int 4523a2cdcdd2SPaul Dagnelie zbookmark_compare(uint16_t dbss1, uint8_t ibs1, uint16_t dbss2, uint8_t ibs2, 4524a2cdcdd2SPaul Dagnelie const zbookmark_phys_t *zb1, const zbookmark_phys_t *zb2) 4525a2cdcdd2SPaul Dagnelie { 4526a2cdcdd2SPaul Dagnelie /* 4527a2cdcdd2SPaul Dagnelie * These variables represent the "equivalent" values for the zbookmark, 4528a2cdcdd2SPaul Dagnelie * after converting zbookmarks inside the meta dnode to their 4529a2cdcdd2SPaul Dagnelie * normal-object equivalents. 4530a2cdcdd2SPaul Dagnelie */ 4531a2cdcdd2SPaul Dagnelie uint64_t zb1obj, zb2obj; 4532a2cdcdd2SPaul Dagnelie uint64_t zb1L0, zb2L0; 4533a2cdcdd2SPaul Dagnelie uint64_t zb1level, zb2level; 4534ad135b5dSChristopher Siden 4535a2cdcdd2SPaul Dagnelie if (zb1->zb_object == zb2->zb_object && 4536a2cdcdd2SPaul Dagnelie zb1->zb_level == zb2->zb_level && 4537a2cdcdd2SPaul Dagnelie zb1->zb_blkid == zb2->zb_blkid) 4538a2cdcdd2SPaul Dagnelie return (0); 4539a2cdcdd2SPaul Dagnelie 4540a2cdcdd2SPaul Dagnelie /* 4541a2cdcdd2SPaul Dagnelie * BP_SPANB calculates the span in blocks. 
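 *
 * As a worked example: with 16K indirect blocks (indblkshift == 14),
 * each indirect level fans out to 1 << (14 - SPA_BLKPTRSHIFT) == 128
 * children, so BP_SPANB(14, 2) == 128 * 128 == 16384, and a level-2
 * bookmark with zb_blkid == 3 below gets the L0 equivalent
 * 3 * 16384 == 49152.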

	if (zb1->zb_object == DMU_META_DNODE_OBJECT) {
		zb1obj = zb1L0 * (dbss1 << (SPA_MINBLOCKSHIFT - DNODE_SHIFT));
		zb1L0 = 0;
		zb1level = zb1->zb_level + COMPARE_META_LEVEL;
	} else {
		zb1obj = zb1->zb_object;
		zb1level = zb1->zb_level;
	}

	if (zb2->zb_object == DMU_META_DNODE_OBJECT) {
		zb2obj = zb2L0 * (dbss2 << (SPA_MINBLOCKSHIFT - DNODE_SHIFT));
		zb2L0 = 0;
		zb2level = zb2->zb_level + COMPARE_META_LEVEL;
	} else {
		zb2obj = zb2->zb_object;
		zb2level = zb2->zb_level;
	}

	/* Now that we have a canonical representation, do the comparison. */
	if (zb1obj != zb2obj)
		return (zb1obj < zb2obj ? -1 : 1);
	else if (zb1L0 != zb2L0)
		return (zb1L0 < zb2L0 ? -1 : 1);
	else if (zb1level != zb2level)
		return (zb1level > zb2level ? -1 : 1);
	/*
	 * This can (theoretically) happen if the bookmarks have the same
	 * object and level but different blkids, which is only possible when
	 * the two block sizes differ. There is presently no way to change the
	 * indirect block sizes.
	 */
	return (0);
}

/*
 * This function checks the following: given that last_block is the place
 * that our traversal stopped last time, does that guarantee that we've
 * visited every node under subtree_root? The raw output of zbookmark_compare
 * cannot answer that by itself, so we pass in a modified version of
 * subtree_root: by incrementing the block id, and then checking whether that
 * modified bookmark sorts at or before last_block, we can tell whether or
 * not having visited last_block implies that all of subtree_root's children
 * have been visited.
 */
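/*
 * Worked example (illustrative, not part of the original source): with 16K
 * indirect blocks (128 block pointers each), a level-1 subtree_root with
 * blkid 4 covers L0 blkids 512 through 639. Incrementing the blkid gives the
 * first leaf *after* the subtree (5 * 128 == 640); if last_block falls at or
 * beyond that point in traversal order, every leaf under subtree_root must
 * already have been visited.
 */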
boolean_t
zbookmark_subtree_completed(const dnode_phys_t *dnp,
    const zbookmark_phys_t *subtree_root, const zbookmark_phys_t *last_block)
{
	zbookmark_phys_t mod_zb = *subtree_root;
	mod_zb.zb_blkid++;
	ASSERT(last_block->zb_level == 0);

	/* The objset_phys_t isn't before anything. */
	if (dnp == NULL)
		return (B_FALSE);

	/*
	 * We pass in 1ULL << (DNODE_BLOCK_SHIFT - SPA_MINBLOCKSHIFT) for the
	 * data block size in sectors, because that variable is only used if
	 * the bookmark refers to a block in the meta-dnode. Since we don't
	 * know without examining it what object it refers to, and there's no
	 * harm in passing in this value in other cases, we always pass it in.
	 *
	 * We pass in 0 for the indirect block size shift because zb2 must be
	 * level 0. The indirect block size is only used to calculate the span
	 * of the bookmark, but since the bookmark must be level 0, the span is
	 * always 1, so the math works out.
	 *
	 * If you make changes to how the zbookmark_compare code works, be sure
	 * to verify that this code still works afterwards.
	 */
	return (zbookmark_compare(dnp->dn_datablkszsec, dnp->dn_indblkshift,
	    1ULL << (DNODE_BLOCK_SHIFT - SPA_MINBLOCKSHIFT), 0, &mod_zb,
	    last_block) <= 0);
}
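/*
 * Usage sketch (hypothetical, not part of the original source): a resumable
 * traversal that persists its position as a level-0 zbookmark_phys_t could
 * prune whole subtrees like this; 'resume' stands in for wherever the saved
 * bookmark lives. Kept under #if 0 since it is illustrative only.
 */
#if 0
static boolean_t
traverse_can_skip(const dnode_phys_t *dnp, const zbookmark_phys_t *subtree,
    const zbookmark_phys_t *resume)
{
	/*
	 * If every leaf under 'subtree' sorts before the saved resume
	 * point, the previous pass already visited it, so the caller may
	 * skip the whole branch.
	 */
	return (zbookmark_subtree_completed(dnp, subtree, resume));
}
#endif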