/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2011, 2017 by Delphix. All rights reserved.
 * Copyright (c) 2014 Integros [integros.com]
 */

/* Portions Copyright 2010 Robert Milkowski */

#include <sys/zfs_context.h>
#include <sys/spa.h>
#include <sys/dmu.h>
#include <sys/zap.h>
#include <sys/arc.h>
#include <sys/stat.h>
#include <sys/resource.h>
#include <sys/zil.h>
#include <sys/zil_impl.h>
#include <sys/dsl_dataset.h>
#include <sys/vdev_impl.h>
#include <sys/dmu_tx.h>
#include <sys/dsl_pool.h>
#include <sys/abd.h>

/*
 * The ZFS Intent Log (ZIL) saves "transaction records" (itxs) of system
 * calls that change the file system. Each itx holds enough information
 * to be able to replay it after a system crash, power loss, or
 * equivalent failure mode. These are stored in memory until either:
 *
 *   1. they are committed to the pool by the DMU transaction group
 *      (txg), at which point they can be discarded; or
 *   2. they are committed to the on-disk ZIL for the dataset being
 *      modified (e.g. due to an fsync, O_DSYNC, or other synchronous
 *      requirement).
 *
 * In the event of a crash or power loss, the itxs contained by each
 * dataset's on-disk ZIL will be replayed when that dataset is first
 * instantiated (e.g. if the dataset is a normal filesystem, when it is
 * first mounted).
 *
 * As hinted at above, there is one ZIL per dataset (both the in-memory
 * representation, and the on-disk representation). The on-disk format
 * consists of 3 parts:
 *
 * 	- a single, per-dataset, ZIL header; which points to a chain of
 * 	- zero or more ZIL blocks; each of which contains
 * 	- zero or more ZIL records
 *
 * A ZIL record holds the information necessary to replay a single
 * system call transaction. A ZIL block can hold many ZIL records, and
 * the blocks are chained together, similarly to a singly linked list.
 *
 * Each ZIL block contains a block pointer (blkptr_t) to the next ZIL
 * block in the chain, and the ZIL header points to the first block in
 * the chain.
 *
 * Note, there is not a fixed place in the pool to hold these ZIL
 * blocks; they are dynamically allocated and freed as needed from the
 * blocks available on the pool, though they can be preferentially
 * allocated from a dedicated "log" vdev.
 */
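/*
 * Illustrative sketch (not part of the original source): the chain
 * structure described above can be pictured as the ZIL header naming
 * the first block, and each block naming its successor via an embedded
 * blkptr_t. A conceptual walk of the chain, ignoring checksum
 * validation and I/O (both handled for real by zil_parse() and
 * zil_read_log_block() below), looks like this:
 *
 *	blkptr_t bp = zh->zh_log;		// first block in the chain
 *	while (!BP_IS_HOLE(&bp)) {
 *		// ... read the block, process its records ...
 *		bp = <the zil_chain_t embedded in the block>.zc_next_blk;
 *	}
 *
 * A hole block pointer marks the end of the chain.
 */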
/*
 * This controls the amount of time that a ZIL block (lwb) will remain
 * "open" when it isn't "full", and it has a thread waiting for it to be
 * committed to stable storage. Please refer to the zil_commit_waiter()
 * function (and the comments within it) for more details.
 */
int zfs_commit_timeout_pct = 5;
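/*
 * A sketch of how a percentage-based timeout like this is typically
 * applied (hypothetical helper; the authoritative logic lives in
 * zil_commit_waiter(), which is not part of this excerpt). The idea is
 * to scale the deadline with the recently observed lwb latency, which
 * zil_lwb_flush_vdevs_done() records in zl_last_lwb_latency below:
 *
 *	hrtime_t
 *	zil_commit_timeout_example(zilog_t *zilog)
 *	{
 *		return ((zilog->zl_last_lwb_latency *
 *		    zfs_commit_timeout_pct) / 100);
 *	}
 *
 * i.e. with the default of 5, an lwb is left open for at most ~5% of
 * the time the previous lwb took to reach stable storage.
 */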
/*
 * Disable intent logging replay. This global ZIL switch affects all pools.
 */
int zil_replay_disable = 0;

/*
 * Tunable parameter for debugging or performance analysis. Setting
 * zfs_nocacheflush will cause corruption on power loss if a volatile
 * out-of-order write cache is enabled.
 */
boolean_t zfs_nocacheflush = B_FALSE;

/*
 * Limit SLOG write size per commit executed with synchronous priority.
 * Any writes above that will be executed with lower (asynchronous) priority
 * to limit potential SLOG device abuse by a single active ZIL writer.
 */
uint64_t zil_slog_bulk = 768 * 1024;

static kmem_cache_t *zil_lwb_cache;
static kmem_cache_t *zil_zcw_cache;

static void zil_async_to_sync(zilog_t *zilog, uint64_t foid);

#define	LWB_EMPTY(lwb) ((BP_GET_LSIZE(&lwb->lwb_blk) - \
    sizeof (zil_chain_t)) == (lwb->lwb_sz - lwb->lwb_nused))

static int
zil_bp_compare(const void *x1, const void *x2)
{
	const dva_t *dva1 = &((zil_bp_node_t *)x1)->zn_dva;
	const dva_t *dva2 = &((zil_bp_node_t *)x2)->zn_dva;

	if (DVA_GET_VDEV(dva1) < DVA_GET_VDEV(dva2))
		return (-1);
	if (DVA_GET_VDEV(dva1) > DVA_GET_VDEV(dva2))
		return (1);

	if (DVA_GET_OFFSET(dva1) < DVA_GET_OFFSET(dva2))
		return (-1);
	if (DVA_GET_OFFSET(dva1) > DVA_GET_OFFSET(dva2))
		return (1);

	return (0);
}

static void
zil_bp_tree_init(zilog_t *zilog)
{
	avl_create(&zilog->zl_bp_tree, zil_bp_compare,
	    sizeof (zil_bp_node_t), offsetof(zil_bp_node_t, zn_node));
}

static void
zil_bp_tree_fini(zilog_t *zilog)
{
	avl_tree_t *t = &zilog->zl_bp_tree;
	zil_bp_node_t *zn;
	void *cookie = NULL;

	while ((zn = avl_destroy_nodes(t, &cookie)) != NULL)
		kmem_free(zn, sizeof (zil_bp_node_t));

	avl_destroy(t);
}

int
zil_bp_tree_add(zilog_t *zilog, const blkptr_t *bp)
{
	avl_tree_t *t = &zilog->zl_bp_tree;
	const dva_t *dva;
	zil_bp_node_t *zn;
	avl_index_t where;

	if (BP_IS_EMBEDDED(bp))
		return (0);

	dva = BP_IDENTITY(bp);

	if (avl_find(t, dva, &where) != NULL)
		return (SET_ERROR(EEXIST));

	zn = kmem_alloc(sizeof (zil_bp_node_t), KM_SLEEP);
	zn->zn_dva = *dva;
	avl_insert(t, zn, where);

	return (0);
}
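/*
 * Illustrative note (not part of the original source): zl_bp_tree is a
 * visited-set keyed on the block's first DVA, so a traversal touches
 * each log block at most once. A caller can rely on the EEXIST return
 * to detect revisits, e.g.:
 *
 *	if (zil_bp_tree_add(zilog, bp) != 0) {
 *		// already seen (or previously claimed); skip this block
 *		return (0);
 *	}
 *	// first visit; claim, free, or otherwise process bp here
 *
 * This is exactly how zil_claim_log_block() and zil_free_log_record()
 * below avoid double-claiming and double-freeing blocks.
 */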
static zil_header_t *
zil_header_in_syncing_context(zilog_t *zilog)
{
	return ((zil_header_t *)zilog->zl_header);
}

static void
zil_init_log_chain(zilog_t *zilog, blkptr_t *bp)
{
	zio_cksum_t *zc = &bp->blk_cksum;

	zc->zc_word[ZIL_ZC_GUID_0] = spa_get_random(-1ULL);
	zc->zc_word[ZIL_ZC_GUID_1] = spa_get_random(-1ULL);
	zc->zc_word[ZIL_ZC_OBJSET] = dmu_objset_id(zilog->zl_os);
	zc->zc_word[ZIL_ZC_SEQ] = 1ULL;
}
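/*
 * Illustrative note (not part of the original source): the embedded
 * checksum words initialized above are what make the log chain
 * self-validating. Each block's expected checksum is derived from its
 * predecessor's, with the sequence word incremented by one:
 *
 *	zio_cksum_t expect = bp->blk_cksum;	// checksum of block N
 *	expect.zc_word[ZIL_ZC_SEQ]++;		// expected for block N+1
 *
 * If the next block read from disk doesn't carry this exact checksum
 * (stale block, torn write, etc.), the chain is considered ended.
 * zil_read_log_block() below implements precisely this check.
 */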
/*
 * Read a log block and make sure it's valid.
 */
static int
zil_read_log_block(zilog_t *zilog, const blkptr_t *bp, blkptr_t *nbp, void *dst,
    char **end)
{
	enum zio_flag zio_flags = ZIO_FLAG_CANFAIL;
	arc_flags_t aflags = ARC_FLAG_WAIT;
	arc_buf_t *abuf = NULL;
	zbookmark_phys_t zb;
	int error;

	if (zilog->zl_header->zh_claim_txg == 0)
		zio_flags |= ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SCRUB;

	if (!(zilog->zl_header->zh_flags & ZIL_CLAIM_LR_SEQ_VALID))
		zio_flags |= ZIO_FLAG_SPECULATIVE;

	SET_BOOKMARK(&zb, bp->blk_cksum.zc_word[ZIL_ZC_OBJSET],
	    ZB_ZIL_OBJECT, ZB_ZIL_LEVEL, bp->blk_cksum.zc_word[ZIL_ZC_SEQ]);

	error = arc_read(NULL, zilog->zl_spa, bp, arc_getbuf_func, &abuf,
	    ZIO_PRIORITY_SYNC_READ, zio_flags, &aflags, &zb);

	if (error == 0) {
		zio_cksum_t cksum = bp->blk_cksum;

		/*
		 * Validate the checksummed log block.
		 *
		 * Sequence numbers should be... sequential. The checksum
		 * verifier for the next block should be bp's checksum plus 1.
		 *
		 * Also check the log chain linkage and size used.
		 */
		cksum.zc_word[ZIL_ZC_SEQ]++;

		if (BP_GET_CHECKSUM(bp) == ZIO_CHECKSUM_ZILOG2) {
			zil_chain_t *zilc = abuf->b_data;
			char *lr = (char *)(zilc + 1);
			uint64_t len = zilc->zc_nused - sizeof (zil_chain_t);

			if (bcmp(&cksum, &zilc->zc_next_blk.blk_cksum,
			    sizeof (cksum)) || BP_IS_HOLE(&zilc->zc_next_blk)) {
				error = SET_ERROR(ECKSUM);
			} else {
				ASSERT3U(len, <=, SPA_OLD_MAXBLOCKSIZE);
				bcopy(lr, dst, len);
				*end = (char *)dst + len;
				*nbp = zilc->zc_next_blk;
			}
		} else {
			char *lr = abuf->b_data;
			uint64_t size = BP_GET_LSIZE(bp);
			zil_chain_t *zilc = (zil_chain_t *)(lr + size) - 1;

			if (bcmp(&cksum, &zilc->zc_next_blk.blk_cksum,
			    sizeof (cksum)) || BP_IS_HOLE(&zilc->zc_next_blk) ||
			    (zilc->zc_nused > (size - sizeof (*zilc)))) {
				error = SET_ERROR(ECKSUM);
			} else {
				ASSERT3U(zilc->zc_nused, <=,
				    SPA_OLD_MAXBLOCKSIZE);
				bcopy(lr, dst, zilc->zc_nused);
				*end = (char *)dst + zilc->zc_nused;
				*nbp = zilc->zc_next_blk;
			}
		}

		arc_buf_destroy(abuf, &abuf);
	}

	return (error);
}
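/*
 * Illustrative note (not part of the original source): the two branches
 * above correspond to the two on-disk lwb layouts, distinguished by the
 * block's checksum type. With ZIO_CHECKSUM_ZILOG2 the zil_chain_t sits
 * at the head of the block; with the older ZIO_CHECKSUM_ZILOG it sits
 * at the tail:
 *
 *	ZILOG2:	| zil_chain_t | log records ...              |
 *	ZILOG:	| log records ...              | zil_chain_t |
 *
 * In both cases the zil_chain_t carries zc_nused (bytes of records
 * actually used) and zc_next_blk (the pointer to, and the expected
 * checksum of, the next block in the chain).
 */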
/*
 * Read a TX_WRITE log data block.
 */
static int
zil_read_log_data(zilog_t *zilog, const lr_write_t *lr, void *wbuf)
{
	enum zio_flag zio_flags = ZIO_FLAG_CANFAIL;
	const blkptr_t *bp = &lr->lr_blkptr;
	arc_flags_t aflags = ARC_FLAG_WAIT;
	arc_buf_t *abuf = NULL;
	zbookmark_phys_t zb;
	int error;

	if (BP_IS_HOLE(bp)) {
		if (wbuf != NULL)
			bzero(wbuf, MAX(BP_GET_LSIZE(bp), lr->lr_length));
		return (0);
	}

	if (zilog->zl_header->zh_claim_txg == 0)
		zio_flags |= ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SCRUB;

	SET_BOOKMARK(&zb, dmu_objset_id(zilog->zl_os), lr->lr_foid,
	    ZB_ZIL_LEVEL, lr->lr_offset / BP_GET_LSIZE(bp));

	error = arc_read(NULL, zilog->zl_spa, bp, arc_getbuf_func, &abuf,
	    ZIO_PRIORITY_SYNC_READ, zio_flags, &aflags, &zb);

	if (error == 0) {
		if (wbuf != NULL)
			bcopy(abuf->b_data, wbuf, arc_buf_size(abuf));
		arc_buf_destroy(abuf, &abuf);
	}

	return (error);
}
/*
 * Parse the intent log, and call parse_func for each valid record within.
 */
int
zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func,
    zil_parse_lr_func_t *parse_lr_func, void *arg, uint64_t txg)
{
	const zil_header_t *zh = zilog->zl_header;
	boolean_t claimed = !!zh->zh_claim_txg;
	uint64_t claim_blk_seq = claimed ? zh->zh_claim_blk_seq : UINT64_MAX;
	uint64_t claim_lr_seq = claimed ? zh->zh_claim_lr_seq : UINT64_MAX;
	uint64_t max_blk_seq = 0;
	uint64_t max_lr_seq = 0;
	uint64_t blk_count = 0;
	uint64_t lr_count = 0;
	blkptr_t blk, next_blk;
	char *lrbuf, *lrp;
	int error = 0;

	/*
	 * Old logs didn't record the maximum zh_claim_lr_seq.
	 */
	if (!(zh->zh_flags & ZIL_CLAIM_LR_SEQ_VALID))
		claim_lr_seq = UINT64_MAX;

	/*
	 * Starting at the block pointed to by zh_log we read the log chain.
	 * For each block in the chain we strongly check that block to
	 * ensure its validity. We stop when an invalid block is found.
	 * For each block pointer in the chain we call parse_blk_func().
	 * For each record in each valid block we call parse_lr_func().
	 * If the log has been claimed, stop if we encounter a sequence
	 * number greater than the highest claimed sequence number.
	 */
	lrbuf = zio_buf_alloc(SPA_OLD_MAXBLOCKSIZE);
	zil_bp_tree_init(zilog);

	for (blk = zh->zh_log; !BP_IS_HOLE(&blk); blk = next_blk) {
		uint64_t blk_seq = blk.blk_cksum.zc_word[ZIL_ZC_SEQ];
		int reclen;
		char *end;

		if (blk_seq > claim_blk_seq)
			break;
		if ((error = parse_blk_func(zilog, &blk, arg, txg)) != 0)
			break;
		ASSERT3U(max_blk_seq, <, blk_seq);
		max_blk_seq = blk_seq;
		blk_count++;

		if (max_lr_seq == claim_lr_seq && max_blk_seq == claim_blk_seq)
			break;

		error = zil_read_log_block(zilog, &blk, &next_blk, lrbuf, &end);
		if (error != 0)
			break;

		for (lrp = lrbuf; lrp < end; lrp += reclen) {
			lr_t *lr = (lr_t *)lrp;
			reclen = lr->lrc_reclen;
			ASSERT3U(reclen, >=, sizeof (lr_t));
			if (lr->lrc_seq > claim_lr_seq)
				goto done;
			if ((error = parse_lr_func(zilog, lr, arg, txg)) != 0)
				goto done;
			ASSERT3U(max_lr_seq, <, lr->lrc_seq);
			max_lr_seq = lr->lrc_seq;
			lr_count++;
		}
	}
done:
	zilog->zl_parse_error = error;
	zilog->zl_parse_blk_seq = max_blk_seq;
	zilog->zl_parse_lr_seq = max_lr_seq;
	zilog->zl_parse_blk_count = blk_count;
	zilog->zl_parse_lr_count = lr_count;

	ASSERT(!claimed || !(zh->zh_flags & ZIL_CLAIM_LR_SEQ_VALID) ||
	    (max_blk_seq == claim_blk_seq && max_lr_seq == claim_lr_seq));

	zil_bp_tree_fini(zilog);
	zio_buf_free(lrbuf, SPA_OLD_MAXBLOCKSIZE);

	return (error);
}
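/*
 * Illustrative sketch (not part of the original source): zil_parse() is
 * the generic traversal used by every consumer of the on-disk log. A
 * minimal pair of callbacks that simply counts the blocks it visits
 * might look like this (hypothetical code, shown only to make the
 * callback contract concrete; returning nonzero from either callback
 * stops the walk):
 *
 *	static int
 *	count_blk(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t txg)
 *	{
 *		(*(uint64_t *)arg)++;
 *		return (0);
 *	}
 *
 *	static int
 *	count_lr(zilog_t *zilog, lr_t *lr, void *arg, uint64_t txg)
 *	{
 *		return (0);
 *	}
 *
 *	uint64_t nblks = 0;
 *	(void) zil_parse(zilog, count_blk, count_lr, &nblks,
 *	    zilog->zl_header->zh_claim_txg);
 *
 * The claim, free, and replay paths below all follow this pattern with
 * their own callback pairs.
 */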
static int
zil_claim_log_block(zilog_t *zilog, blkptr_t *bp, void *tx, uint64_t first_txg)
{
	/*
	 * Claim log block if not already committed and not already claimed.
	 * If tx == NULL, just verify that the block is claimable.
	 */
	if (BP_IS_HOLE(bp) || bp->blk_birth < first_txg ||
	    zil_bp_tree_add(zilog, bp) != 0)
		return (0);

	return (zio_wait(zio_claim(NULL, zilog->zl_spa,
	    tx == NULL ? 0 : first_txg, bp, spa_claim_notify, NULL,
	    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SCRUB)));
}

static int
zil_claim_log_record(zilog_t *zilog, lr_t *lrc, void *tx, uint64_t first_txg)
{
	lr_write_t *lr = (lr_write_t *)lrc;
	int error;

	if (lrc->lrc_txtype != TX_WRITE)
		return (0);

	/*
	 * If the block is not readable, don't claim it. This can happen
	 * in normal operation when a log block is written to disk before
	 * some of the dmu_sync() blocks it points to. In this case, the
	 * transaction cannot have been committed to anyone (we would have
	 * waited for all writes to be stable first), so it is semantically
	 * correct to declare this the end of the log.
	 */
	if (lr->lr_blkptr.blk_birth >= first_txg &&
	    (error = zil_read_log_data(zilog, lr, NULL)) != 0)
		return (error);
	return (zil_claim_log_block(zilog, &lr->lr_blkptr, tx, first_txg));
}

/* ARGSUSED */
static int
zil_free_log_block(zilog_t *zilog, blkptr_t *bp, void *tx, uint64_t claim_txg)
{
	zio_free_zil(zilog->zl_spa, dmu_tx_get_txg(tx), bp);

	return (0);
}
static int
zil_free_log_record(zilog_t *zilog, lr_t *lrc, void *tx, uint64_t claim_txg)
{
	lr_write_t *lr = (lr_write_t *)lrc;
	blkptr_t *bp = &lr->lr_blkptr;

	/*
	 * If we previously claimed it, we need to free it.
	 */
	if (claim_txg != 0 && lrc->lrc_txtype == TX_WRITE &&
	    bp->blk_birth >= claim_txg && zil_bp_tree_add(zilog, bp) == 0 &&
	    !BP_IS_HOLE(bp))
		zio_free(zilog->zl_spa, dmu_tx_get_txg(tx), bp);

	return (0);
}

static int
zil_lwb_vdev_compare(const void *x1, const void *x2)
{
	const uint64_t v1 = ((zil_vdev_node_t *)x1)->zv_vdev;
	const uint64_t v2 = ((zil_vdev_node_t *)x2)->zv_vdev;

	if (v1 < v2)
		return (-1);
	if (v1 > v2)
		return (1);

	return (0);
}

static lwb_t *
zil_alloc_lwb(zilog_t *zilog, blkptr_t *bp, boolean_t slog, uint64_t txg)
{
	lwb_t *lwb;

	lwb = kmem_cache_alloc(zil_lwb_cache, KM_SLEEP);
	lwb->lwb_zilog = zilog;
	lwb->lwb_blk = *bp;
	lwb->lwb_slog = slog;
	lwb->lwb_state = LWB_STATE_CLOSED;
	lwb->lwb_buf = zio_buf_alloc(BP_GET_LSIZE(bp));
	lwb->lwb_max_txg = txg;
	lwb->lwb_write_zio = NULL;
	lwb->lwb_root_zio = NULL;
	lwb->lwb_tx = NULL;
	lwb->lwb_issued_timestamp = 0;
	if (BP_GET_CHECKSUM(bp) == ZIO_CHECKSUM_ZILOG2) {
		lwb->lwb_nused = sizeof (zil_chain_t);
		lwb->lwb_sz = BP_GET_LSIZE(bp);
	} else {
		lwb->lwb_nused = 0;
		lwb->lwb_sz = BP_GET_LSIZE(bp) - sizeof (zil_chain_t);
	}

	mutex_enter(&zilog->zl_lock);
	list_insert_tail(&zilog->zl_lwb_list, lwb);
	mutex_exit(&zilog->zl_lock);

	ASSERT(!MUTEX_HELD(&lwb->lwb_vdev_lock));
	ASSERT(avl_is_empty(&lwb->lwb_vdev_tree));
	ASSERT(list_is_empty(&lwb->lwb_waiters));

	return (lwb);
}
static void
zil_free_lwb(zilog_t *zilog, lwb_t *lwb)
{
	ASSERT(MUTEX_HELD(&zilog->zl_lock));
	ASSERT(!MUTEX_HELD(&lwb->lwb_vdev_lock));
	ASSERT(list_is_empty(&lwb->lwb_waiters));

	if (lwb->lwb_state == LWB_STATE_OPENED) {
		avl_tree_t *t = &lwb->lwb_vdev_tree;
		void *cookie = NULL;
		zil_vdev_node_t *zv;

		while ((zv = avl_destroy_nodes(t, &cookie)) != NULL)
			kmem_free(zv, sizeof (*zv));

		ASSERT3P(lwb->lwb_root_zio, !=, NULL);
		ASSERT3P(lwb->lwb_write_zio, !=, NULL);

		zio_cancel(lwb->lwb_root_zio);
		zio_cancel(lwb->lwb_write_zio);

		lwb->lwb_root_zio = NULL;
		lwb->lwb_write_zio = NULL;
	} else {
		ASSERT3S(lwb->lwb_state, !=, LWB_STATE_ISSUED);
	}

	ASSERT(avl_is_empty(&lwb->lwb_vdev_tree));
	ASSERT3P(lwb->lwb_write_zio, ==, NULL);
	ASSERT3P(lwb->lwb_root_zio, ==, NULL);

	/*
	 * Clear the zilog's field to indicate this lwb is no longer
	 * valid, and prevent use-after-free errors.
	 */
	if (zilog->zl_last_lwb_opened == lwb)
		zilog->zl_last_lwb_opened = NULL;

	kmem_cache_free(zil_lwb_cache, lwb);
}
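/*
 * Illustrative note (not part of the original source): pulling together
 * the states referenced above, an lwb moves through a simple one-way
 * state machine over its lifetime:
 *
 *	LWB_STATE_CLOSED	allocated by zil_alloc_lwb(); no zios yet
 *	    |
 *	    v
 *	LWB_STATE_OPENED	zil_lwb_write_open() created its root and
 *	    |			write zios; itxs may be committed to it
 *	    v
 *	LWB_STATE_ISSUED	zil_lwb_write_issue() issued the write;
 *	    |			no more itxs may be added
 *	    v
 *	LWB_STATE_DONE		zil_lwb_flush_vdevs_done() ran; contents
 *				are stable on disk and waiters were woken
 *
 * zil_free_lwb() above can tear down a CLOSED, OPENED, or DONE lwb
 * (cancelling the zios of an OPENED one), but never an ISSUED one, as
 * its assertion enforces.
 */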
/*
 * Called when we create in-memory log transactions so that we know
 * to clean up the itxs at the end of spa_sync().
 */
void
zilog_dirty(zilog_t *zilog, uint64_t txg)
{
	dsl_pool_t *dp = zilog->zl_dmu_pool;
	dsl_dataset_t *ds = dmu_objset_ds(zilog->zl_os);

	ASSERT(spa_writeable(zilog->zl_spa));

	if (ds->ds_is_snapshot)
		panic("dirtying snapshot!");

	if (txg_list_add(&dp->dp_dirty_zilogs, zilog, txg)) {
		/* up the hold count until we can be written out */
		dmu_buf_add_ref(ds->ds_dbuf, zilog);

		zilog->zl_dirty_max_txg = MAX(txg, zilog->zl_dirty_max_txg);
	}
}

/*
 * Determine if the zil is dirty in the specified txg. Callers wanting to
 * ensure that the dirty state does not change must hold the itxg_lock for
 * the specified txg. Holding the lock will ensure that the zil cannot be
 * dirtied (zil_itx_assign) or cleaned (zil_clean) while we check its current
 * state.
 */
boolean_t
zilog_is_dirty_in_txg(zilog_t *zilog, uint64_t txg)
{
	dsl_pool_t *dp = zilog->zl_dmu_pool;

	if (txg_list_member(&dp->dp_dirty_zilogs, zilog, txg & TXG_MASK))
		return (B_TRUE);
	return (B_FALSE);
}

/*
 * Determine if the zil is dirty. The zil is considered dirty if it has
 * any pending itx records that have not been cleaned by zil_clean().
 */
boolean_t
zilog_is_dirty(zilog_t *zilog)
{
	dsl_pool_t *dp = zilog->zl_dmu_pool;

	for (int t = 0; t < TXG_SIZE; t++) {
		if (txg_list_member(&dp->dp_dirty_zilogs, zilog, t))
			return (B_TRUE);
	}
	return (B_FALSE);
}
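/*
 * Illustrative note (not part of the original source): dp_dirty_zilogs
 * is a txg_list_t, i.e. a small ring of per-txg lists indexed by
 * txg & TXG_MASK (TXG_SIZE slots in total). That is why
 * zilog_is_dirty_in_txg() probes a single masked slot, while
 * zilog_is_dirty() must sweep all TXG_SIZE slots. For example, with a
 * TXG_SIZE of 4, a zilog dirtied in txg 103 is found in slot
 * 103 & 3 == 3.
 */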
/*
 * Create an on-disk intent log.
 */
static lwb_t *
zil_create(zilog_t *zilog)
{
	const zil_header_t *zh = zilog->zl_header;
	lwb_t *lwb = NULL;
	uint64_t txg = 0;
	dmu_tx_t *tx = NULL;
	blkptr_t blk;
	int error = 0;
	boolean_t slog = FALSE;

	/*
	 * Wait for any previous destroy to complete.
	 */
	txg_wait_synced(zilog->zl_dmu_pool, zilog->zl_destroy_txg);

	ASSERT(zh->zh_claim_txg == 0);
	ASSERT(zh->zh_replay_seq == 0);

	blk = zh->zh_log;

	/*
	 * Allocate an initial log block if:
	 *    - there isn't one already
	 *    - the existing block is the wrong endianness
	 */
	if (BP_IS_HOLE(&blk) || BP_SHOULD_BYTESWAP(&blk)) {
		tx = dmu_tx_create(zilog->zl_os);
		VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
		dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx);
		txg = dmu_tx_get_txg(tx);

		if (!BP_IS_HOLE(&blk)) {
			zio_free_zil(zilog->zl_spa, txg, &blk);
			BP_ZERO(&blk);
		}

		error = zio_alloc_zil(zilog->zl_spa, txg, &blk, NULL,
		    ZIL_MIN_BLKSZ, &slog);

		if (error == 0)
			zil_init_log_chain(zilog, &blk);
	}

	/*
	 * Allocate a log write block (lwb) for the first log block.
	 */
	if (error == 0)
		lwb = zil_alloc_lwb(zilog, &blk, slog, txg);

	/*
	 * If we just allocated the first log block, commit our transaction
	 * and wait for zil_sync() to stuff the block pointer into zh_log.
	 * (zh is part of the MOS, so we cannot modify it in open context.)
	 */
	if (tx != NULL) {
		dmu_tx_commit(tx);
		txg_wait_synced(zilog->zl_dmu_pool, txg);
	}

	ASSERT(bcmp(&blk, &zh->zh_log, sizeof (blk)) == 0);

	return (lwb);
}

/*
 * In one tx, free all log blocks and clear the log header. If keep_first
 * is set, then we're replaying a log with no content. We want to keep the
 * first block, however, so that the first synchronous transaction doesn't
 * require a txg_wait_synced() in zil_create(). We don't need to
 * txg_wait_synced() here either when keep_first is set, because both
 * zil_create() and zil_destroy() will wait for any in-progress destroys
 * to complete.
 */
void
zil_destroy(zilog_t *zilog, boolean_t keep_first)
{
	const zil_header_t *zh = zilog->zl_header;
	lwb_t *lwb;
	dmu_tx_t *tx;
	uint64_t txg;

	/*
	 * Wait for any previous destroy to complete.
	 */
	txg_wait_synced(zilog->zl_dmu_pool, zilog->zl_destroy_txg);

	zilog->zl_old_header = *zh;		/* debugging aid */

	if (BP_IS_HOLE(&zh->zh_log))
		return;

	tx = dmu_tx_create(zilog->zl_os);
	VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
	dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx);
	txg = dmu_tx_get_txg(tx);

	mutex_enter(&zilog->zl_lock);

	ASSERT3U(zilog->zl_destroy_txg, <, txg);
	zilog->zl_destroy_txg = txg;
	zilog->zl_keep_first = keep_first;

	if (!list_is_empty(&zilog->zl_lwb_list)) {
		ASSERT(zh->zh_claim_txg == 0);
		VERIFY(!keep_first);
		while ((lwb = list_head(&zilog->zl_lwb_list)) != NULL) {
			list_remove(&zilog->zl_lwb_list, lwb);
			if (lwb->lwb_buf != NULL)
				zio_buf_free(lwb->lwb_buf, lwb->lwb_sz);
			zio_free(zilog->zl_spa, txg, &lwb->lwb_blk);
			zil_free_lwb(zilog, lwb);
		}
	} else if (!keep_first) {
		zil_destroy_sync(zilog, tx);
	}
	mutex_exit(&zilog->zl_lock);

	dmu_tx_commit(tx);
}

void
zil_destroy_sync(zilog_t *zilog, dmu_tx_t *tx)
{
	ASSERT(list_is_empty(&zilog->zl_lwb_list));
	(void) zil_parse(zilog, zil_free_log_block,
	    zil_free_log_record, tx, zilog->zl_header->zh_claim_txg);
}
int
zil_claim(dsl_pool_t *dp, dsl_dataset_t *ds, void *txarg)
{
	dmu_tx_t *tx = txarg;
	uint64_t first_txg = dmu_tx_get_txg(tx);
	zilog_t *zilog;
	zil_header_t *zh;
	objset_t *os;
	int error;

	error = dmu_objset_own_obj(dp, ds->ds_object,
	    DMU_OST_ANY, B_FALSE, FTAG, &os);
	if (error != 0) {
		/*
		 * EBUSY indicates that the objset is inconsistent, in which
		 * case it cannot have a ZIL.
		 */
		if (error != EBUSY) {
			cmn_err(CE_WARN, "can't open objset for %llu, error %u",
			    (unsigned long long)ds->ds_object, error);
		}
		return (0);
	}

	zilog = dmu_objset_zil(os);
	zh = zil_header_in_syncing_context(zilog);

	if (spa_get_log_state(zilog->zl_spa) == SPA_LOG_CLEAR) {
		if (!BP_IS_HOLE(&zh->zh_log))
			zio_free_zil(zilog->zl_spa, first_txg, &zh->zh_log);
		BP_ZERO(&zh->zh_log);
		dsl_dataset_dirty(dmu_objset_ds(os), tx);
		dmu_objset_disown(os, FTAG);
		return (0);
	}

	/*
	 * Claim all log blocks if we haven't already done so, and remember
	 * the highest claimed sequence number. This ensures that if we can
	 * read only part of the log now (e.g. due to a missing device),
	 * but we can read the entire log later, we will not try to replay
	 * or destroy beyond the last block we successfully claimed.
	 */
	ASSERT3U(zh->zh_claim_txg, <=, first_txg);
	if (zh->zh_claim_txg == 0 && !BP_IS_HOLE(&zh->zh_log)) {
		(void) zil_parse(zilog, zil_claim_log_block,
		    zil_claim_log_record, tx, first_txg);
		zh->zh_claim_txg = first_txg;
		zh->zh_claim_blk_seq = zilog->zl_parse_blk_seq;
		zh->zh_claim_lr_seq = zilog->zl_parse_lr_seq;
		if (zilog->zl_parse_lr_count || zilog->zl_parse_blk_count > 1)
			zh->zh_flags |= ZIL_REPLAY_NEEDED;
		zh->zh_flags |= ZIL_CLAIM_LR_SEQ_VALID;
		dsl_dataset_dirty(dmu_objset_ds(os), tx);
	}

	ASSERT3U(first_txg, ==, (spa_last_synced_txg(zilog->zl_spa) + 1));
	dmu_objset_disown(os, FTAG);
	return (0);
}
/*
 * Check the log by walking the log chain.
 * Checksum errors are ok as they indicate the end of the chain.
 * Any other error (no device or read failure) returns an error.
 */
/* ARGSUSED */
int
zil_check_log_chain(dsl_pool_t *dp, dsl_dataset_t *ds, void *tx)
{
	zilog_t *zilog;
	objset_t *os;
	blkptr_t *bp;
	int error;

	ASSERT(tx == NULL);

	error = dmu_objset_from_ds(ds, &os);
	if (error != 0) {
		cmn_err(CE_WARN, "can't open objset %llu, error %d",
		    (unsigned long long)ds->ds_object, error);
		return (0);
	}

	zilog = dmu_objset_zil(os);
	bp = (blkptr_t *)&zilog->zl_header->zh_log;

	/*
	 * Check the first block and determine if it's on a log device
	 * which may have been removed or faulted prior to loading this
	 * pool. If so, there's no point in checking the rest of the log
	 * as its content should have already been synced to the pool.
	 */
	if (!BP_IS_HOLE(bp)) {
		vdev_t *vd;
		boolean_t valid = B_TRUE;

		spa_config_enter(os->os_spa, SCL_STATE, FTAG, RW_READER);
		vd = vdev_lookup_top(os->os_spa, DVA_GET_VDEV(&bp->blk_dva[0]));
		if (vd->vdev_islog && vdev_is_dead(vd))
			valid = vdev_log_state_valid(vd);
		spa_config_exit(os->os_spa, SCL_STATE, FTAG);

		if (!valid)
			return (0);
	}

	/*
	 * Because tx == NULL, zil_claim_log_block() will not actually claim
	 * any blocks, but just determine whether it is possible to do so.
	 * In addition to checking the log chain, zil_claim_log_block()
	 * will invoke zio_claim() with a done func of spa_claim_notify(),
	 * which will update spa_max_claim_txg. See spa_load() for details.
	 */
	error = zil_parse(zilog, zil_claim_log_block, zil_claim_log_record, tx,
	    zilog->zl_header->zh_claim_txg ? -1ULL : spa_first_txg(os->os_spa));

	return ((error == ECKSUM || error == ENOENT) ? 0 : error);
}
/*
 * When an itx is "skipped", this function is used to properly mark the
 * waiter as "done", and signal any thread(s) waiting on it. An itx can
 * be skipped (and not committed to an lwb) for a variety of reasons,
 * one of them being that the itx was committed via spa_sync(), prior to
 * it being committed to an lwb; this can happen if a thread calling
 * zil_commit() is racing with spa_sync().
 */
static void
zil_commit_waiter_skip(zil_commit_waiter_t *zcw)
{
	mutex_enter(&zcw->zcw_lock);
	ASSERT3B(zcw->zcw_done, ==, B_FALSE);
	zcw->zcw_done = B_TRUE;
	cv_broadcast(&zcw->zcw_cv);
	mutex_exit(&zcw->zcw_lock);
}

/*
 * This function is used when the given waiter is to be linked into an
 * lwb's "lwb_waiters" list; i.e. when the itx is committed to the lwb.
 * At this point, the waiter will no longer be referenced by the itx,
 * and instead, will be referenced by the lwb.
 */
static void
zil_commit_waiter_link_lwb(zil_commit_waiter_t *zcw, lwb_t *lwb)
{
	mutex_enter(&zcw->zcw_lock);
	ASSERT(!list_link_active(&zcw->zcw_node));
	ASSERT3P(zcw->zcw_lwb, ==, NULL);
	ASSERT3P(lwb, !=, NULL);
	ASSERT(lwb->lwb_state == LWB_STATE_OPENED ||
	    lwb->lwb_state == LWB_STATE_ISSUED);

	list_insert_tail(&lwb->lwb_waiters, zcw);
	zcw->zcw_lwb = lwb;
	mutex_exit(&zcw->zcw_lock);
}
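/*
 * Illustrative sketch (not part of the original source): a commit
 * waiter exists so a thread in zil_commit() can sleep until its itx is
 * stable. The waiting side (implemented for real by
 * zil_commit_waiter(), which is outside this excerpt) amounts to the
 * classic condvar idiom over the fields manipulated above:
 *
 *	mutex_enter(&zcw->zcw_lock);
 *	while (!zcw->zcw_done)
 *		cv_wait(&zcw->zcw_cv, &zcw->zcw_lock);
 *	mutex_exit(&zcw->zcw_lock);
 *
 * zcw_done is set (and zcw_cv broadcast) either here in
 * zil_commit_waiter_skip(), or in zil_lwb_flush_vdevs_done() once the
 * waiter's lwb is stable on disk.
 */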
/*
 * This function is used when zio_alloc_zil() fails to allocate a ZIL
 * block, and the given waiter must be linked to the "nolwb waiters"
 * list inside of zil_process_commit_list().
 */
static void
zil_commit_waiter_link_nolwb(zil_commit_waiter_t *zcw, list_t *nolwb)
{
	mutex_enter(&zcw->zcw_lock);
	ASSERT(!list_link_active(&zcw->zcw_node));
	ASSERT3P(zcw->zcw_lwb, ==, NULL);
	list_insert_tail(nolwb, zcw);
	mutex_exit(&zcw->zcw_lock);
}

void
zil_lwb_add_block(lwb_t *lwb, const blkptr_t *bp)
{
	avl_tree_t *t = &lwb->lwb_vdev_tree;
	avl_index_t where;
	zil_vdev_node_t *zv, zvsearch;
	int ndvas = BP_GET_NDVAS(bp);
	int i;

	if (zfs_nocacheflush)
		return;

	mutex_enter(&lwb->lwb_vdev_lock);
	for (i = 0; i < ndvas; i++) {
		zvsearch.zv_vdev = DVA_GET_VDEV(&bp->blk_dva[i]);
		if (avl_find(t, &zvsearch, &where) == NULL) {
			zv = kmem_alloc(sizeof (*zv), KM_SLEEP);
			zv->zv_vdev = zvsearch.zv_vdev;
			avl_insert(t, zv, where);
		}
	}
	mutex_exit(&lwb->lwb_vdev_lock);
}

void
zil_lwb_add_txg(lwb_t *lwb, uint64_t txg)
{
	lwb->lwb_max_txg = MAX(lwb->lwb_max_txg, txg);
}
/*
 * This function is called after all VDEVs associated with a given lwb
 * write have completed their DKIOCFLUSHWRITECACHE command; or as soon
 * as the lwb write completes, if "zfs_nocacheflush" is set.
 *
 * The intention is for this function to be called as soon as the
 * contents of an lwb are considered "stable" on disk, and will survive
 * any sudden loss of power. At this point, any threads waiting for the
 * lwb to reach this state are signalled, and the "waiter" structures
 * are marked "done".
 */
static void
zil_lwb_flush_vdevs_done(zio_t *zio)
{
	lwb_t *lwb = zio->io_private;
	zilog_t *zilog = lwb->lwb_zilog;
	dmu_tx_t *tx = lwb->lwb_tx;
	zil_commit_waiter_t *zcw;

	spa_config_exit(zilog->zl_spa, SCL_STATE, lwb);

	zio_buf_free(lwb->lwb_buf, lwb->lwb_sz);

	mutex_enter(&zilog->zl_lock);

	/*
	 * Ensure the lwb buffer pointer is cleared before releasing the
	 * txg. If we have had an allocation failure and the txg is
	 * waiting to sync then we want zil_sync() to remove the lwb so
	 * that it's not picked up as the next new one in
	 * zil_process_commit_list(). zil_sync() will only remove the
	 * lwb if lwb_buf is null.
	 */
	lwb->lwb_buf = NULL;
	lwb->lwb_tx = NULL;

	ASSERT3U(lwb->lwb_issued_timestamp, >, 0);
	zilog->zl_last_lwb_latency = gethrtime() - lwb->lwb_issued_timestamp;

	lwb->lwb_root_zio = NULL;
	lwb->lwb_state = LWB_STATE_DONE;

	if (zilog->zl_last_lwb_opened == lwb) {
		/*
		 * Remember the highest committed log sequence number
		 * for ztest. We only update this value when all the log
		 * writes succeeded, because ztest wants to ASSERT that
		 * it got the whole log chain.
		 */
		zilog->zl_commit_lr_seq = zilog->zl_lr_seq;
	}

	while ((zcw = list_head(&lwb->lwb_waiters)) != NULL) {
		mutex_enter(&zcw->zcw_lock);

		ASSERT(list_link_active(&zcw->zcw_node));
		list_remove(&lwb->lwb_waiters, zcw);

		ASSERT3P(zcw->zcw_lwb, ==, lwb);
		zcw->zcw_lwb = NULL;

		zcw->zcw_zio_error = zio->io_error;

		ASSERT3B(zcw->zcw_done, ==, B_FALSE);
		zcw->zcw_done = B_TRUE;
		cv_broadcast(&zcw->zcw_cv);

		mutex_exit(&zcw->zcw_lock);
	}

	mutex_exit(&zilog->zl_lock);

	/*
	 * Now that we've written this log block, we have a stable pointer
	 * to the next block in the chain, so it's OK to let the txg in
	 * which we allocated the next block sync.
	 */
	dmu_tx_commit(tx);
}
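/*
 * Illustrative note (not part of the original source): each lwb is
 * backed by a small zio tree, which explains the division of labor
 * between the two completion callbacks:
 *
 *	lwb_root_zio ----------> done: zil_lwb_flush_vdevs_done()
 *	    |
 *	    +-- lwb_write_zio -> done: zil_lwb_write_done()
 *	    |
 *	    +-- zio_flush() of each vdev touched by the lwb, issued
 *	        by zil_lwb_write_done() as children of the root
 *
 * The root zio cannot complete until all of its children do, so
 * zil_lwb_flush_vdevs_done() runs only after the write and every cache
 * flush have finished; that's what makes it the "stable on disk"
 * notification point described above.
 */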
/*
 * This is called when an lwb write completes. This means this specific
 * lwb was written to disk, and all dependent lwbs have also been
 * written to disk.
 *
 * At this point, a DKIOCFLUSHWRITECACHE command hasn't been issued to
 * the VDEVs involved in writing out this specific lwb. The lwb will be
 * "done" once zil_lwb_flush_vdevs_done() is called, which occurs in the
 * zio completion callback for the lwb's root zio.
 */
static void
zil_lwb_write_done(zio_t *zio)
{
	lwb_t *lwb = zio->io_private;
	spa_t *spa = zio->io_spa;
	zilog_t *zilog = lwb->lwb_zilog;
	avl_tree_t *t = &lwb->lwb_vdev_tree;
	void *cookie = NULL;
	zil_vdev_node_t *zv;

	ASSERT3S(spa_config_held(spa, SCL_STATE, RW_READER), !=, 0);

	ASSERT(BP_GET_COMPRESS(zio->io_bp) == ZIO_COMPRESS_OFF);
	ASSERT(BP_GET_TYPE(zio->io_bp) == DMU_OT_INTENT_LOG);
	ASSERT(BP_GET_LEVEL(zio->io_bp) == 0);
	ASSERT(BP_GET_BYTEORDER(zio->io_bp) == ZFS_HOST_BYTEORDER);
	ASSERT(!BP_IS_GANG(zio->io_bp));
	ASSERT(!BP_IS_HOLE(zio->io_bp));
	ASSERT(BP_GET_FILL(zio->io_bp) == 0);

	abd_put(zio->io_abd);

	ASSERT3S(lwb->lwb_state, ==, LWB_STATE_ISSUED);

	mutex_enter(&zilog->zl_lock);
	lwb->lwb_write_zio = NULL;
	mutex_exit(&zilog->zl_lock);

	if (avl_numnodes(t) == 0)
		return;

	/*
	 * If there was an IO error, we're not going to call zio_flush()
	 * on these vdevs, so we simply empty the tree and free the
	 * nodes. We avoid calling zio_flush() since there isn't any
	 * good reason for doing so, after the lwb block failed to be
	 * written out.
	 */
	if (zio->io_error != 0) {
		while ((zv = avl_destroy_nodes(t, &cookie)) != NULL)
			kmem_free(zv, sizeof (*zv));
		return;
	}

	while ((zv = avl_destroy_nodes(t, &cookie)) != NULL) {
		vdev_t *vd = vdev_lookup_top(spa, zv->zv_vdev);
		if (vd != NULL)
			zio_flush(lwb->lwb_root_zio, vd);
		kmem_free(zv, sizeof (*zv));
	}
}
/*
 * This function's purpose is to "open" an lwb such that it is ready to
 * accept new itxs being committed to it. To do this, the lwb's zio
 * structures are created, and linked to the lwb. This function is
 * idempotent; if the passed in lwb has already been opened, this
 * function is essentially a no-op.
 */
static void
zil_lwb_write_open(zilog_t *zilog, lwb_t *lwb)
{
	zbookmark_phys_t zb;
	zio_priority_t prio;

	ASSERT(MUTEX_HELD(&zilog->zl_writer_lock));
	ASSERT3P(lwb, !=, NULL);
	EQUIV(lwb->lwb_root_zio == NULL, lwb->lwb_state == LWB_STATE_CLOSED);
	EQUIV(lwb->lwb_root_zio != NULL, lwb->lwb_state == LWB_STATE_OPENED);

	SET_BOOKMARK(&zb, lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_OBJSET],
	    ZB_ZIL_OBJECT, ZB_ZIL_LEVEL,
	    lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_SEQ]);

	if (lwb->lwb_root_zio == NULL) {
		abd_t *lwb_abd = abd_get_from_buf(lwb->lwb_buf,
		    BP_GET_LSIZE(&lwb->lwb_blk));

		if (!lwb->lwb_slog || zilog->zl_cur_used <= zil_slog_bulk)
			prio = ZIO_PRIORITY_SYNC_WRITE;
		else
			prio = ZIO_PRIORITY_ASYNC_WRITE;

		lwb->lwb_root_zio = zio_root(zilog->zl_spa,
		    zil_lwb_flush_vdevs_done, lwb, ZIO_FLAG_CANFAIL);
		ASSERT3P(lwb->lwb_root_zio, !=, NULL);

		lwb->lwb_write_zio = zio_rewrite(lwb->lwb_root_zio,
		    zilog->zl_spa, 0, &lwb->lwb_blk, lwb_abd,
		    BP_GET_LSIZE(&lwb->lwb_blk), zil_lwb_write_done, lwb,
		    prio, ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE, &zb);
		ASSERT3P(lwb->lwb_write_zio, !=, NULL);

		lwb->lwb_state = LWB_STATE_OPENED;

		mutex_enter(&zilog->zl_lock);

		/*
		 * The zilog's "zl_last_lwb_opened" field is used to
		 * build the lwb/zio dependency chain, which is used to
		 * preserve the ordering of lwb completions that is
		 * required by the semantics of the ZIL. Each new lwb
		 * zio becomes a parent of the "previous" lwb zio, such
		 * that the new lwb's zio cannot complete until the
		 * "previous" lwb's zio completes.
		 *
		 * This is required by the semantics of zil_commit();
		 * the commit waiters attached to the lwbs will be woken
		 * in the lwb zio's completion callback, so this zio
		 * dependency graph ensures the waiters are woken in the
		 * correct order (the same order the lwbs were created).
		 */
		lwb_t *last_lwb_opened = zilog->zl_last_lwb_opened;
		if (last_lwb_opened != NULL &&
		    last_lwb_opened->lwb_state != LWB_STATE_DONE) {
			ASSERT(last_lwb_opened->lwb_state == LWB_STATE_OPENED ||
			    last_lwb_opened->lwb_state == LWB_STATE_ISSUED);
			ASSERT3P(last_lwb_opened->lwb_root_zio, !=, NULL);
			zio_add_child(lwb->lwb_root_zio,
			    last_lwb_opened->lwb_root_zio);
		}
		zilog->zl_last_lwb_opened = lwb;

		mutex_exit(&zilog->zl_lock);
	}

	ASSERT3P(lwb->lwb_root_zio, !=, NULL);
	ASSERT3P(lwb->lwb_write_zio, !=, NULL);
	ASSERT3S(lwb->lwb_state, ==, LWB_STATE_OPENED);
}
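/*
 * Illustrative note (not part of the original source): the effect of
 * the parent/child links established above is a strict completion
 * order. If three lwbs are opened in the order A, B, C, then:
 *
 *	root(C) --child--> root(B) --child--> root(A)
 *
 * and a zio parent never completes before its children, so
 * zil_lwb_flush_vdevs_done() fires for A, then B, then C, even if the
 * underlying device finishes the writes in some other order. Commit
 * waiters are therefore always woken in lwb-creation order.
 */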
1144*1271e4b1SPrakash Surya */ 1145*1271e4b1SPrakash Surya lwb_t *last_lwb_opened = zilog->zl_last_lwb_opened; 1146*1271e4b1SPrakash Surya if (last_lwb_opened != NULL && 1147*1271e4b1SPrakash Surya last_lwb_opened->lwb_state != LWB_STATE_DONE) { 1148*1271e4b1SPrakash Surya ASSERT(last_lwb_opened->lwb_state == LWB_STATE_OPENED || 1149*1271e4b1SPrakash Surya last_lwb_opened->lwb_state == LWB_STATE_ISSUED); 1150*1271e4b1SPrakash Surya ASSERT3P(last_lwb_opened->lwb_root_zio, !=, NULL); 1151*1271e4b1SPrakash Surya zio_add_child(lwb->lwb_root_zio, 1152*1271e4b1SPrakash Surya last_lwb_opened->lwb_root_zio); 1153*1271e4b1SPrakash Surya } 1154*1271e4b1SPrakash Surya zilog->zl_last_lwb_opened = lwb; 1155*1271e4b1SPrakash Surya 1156*1271e4b1SPrakash Surya mutex_exit(&zilog->zl_lock); 115767bd71c6Sperrin } 1158*1271e4b1SPrakash Surya 1159*1271e4b1SPrakash Surya ASSERT3P(lwb->lwb_root_zio, !=, NULL); 1160*1271e4b1SPrakash Surya ASSERT3P(lwb->lwb_write_zio, !=, NULL); 1161*1271e4b1SPrakash Surya ASSERT3S(lwb->lwb_state, ==, LWB_STATE_OPENED); 1162c5c6ffa0Smaybee } 1163c5c6ffa0Smaybee 11646e1f5caaSNeil Perrin /* 11656e1f5caaSNeil Perrin * Define a limited set of intent log block sizes. 1166f7170741SWill Andrews * 11676e1f5caaSNeil Perrin * These must be a multiple of 4KB. Note only the amount used (again 11686e1f5caaSNeil Perrin * aligned to 4KB) actually gets written. However, we can't always just 1169b5152584SMatthew Ahrens * allocate SPA_OLD_MAXBLOCKSIZE as the slog space could be exhausted. 11706e1f5caaSNeil Perrin */ 11716e1f5caaSNeil Perrin uint64_t zil_block_buckets[] = { 11726e1f5caaSNeil Perrin 4096, /* non TX_WRITE */ 11736e1f5caaSNeil Perrin 8192+4096, /* data base */ 11746e1f5caaSNeil Perrin 32*1024 + 4096, /* NFS writes */ 11756e1f5caaSNeil Perrin UINT64_MAX 11766e1f5caaSNeil Perrin }; 11776e1f5caaSNeil Perrin 1178fa9e4066Sahrens /* 1179fa9e4066Sahrens * Start a log block write and advance to the next log block. 1180fa9e4066Sahrens * Calls are serialized. 1181fa9e4066Sahrens */ 1182fa9e4066Sahrens static lwb_t * 1183*1271e4b1SPrakash Surya zil_lwb_write_issue(zilog_t *zilog, lwb_t *lwb) 1184fa9e4066Sahrens { 11856e1f5caaSNeil Perrin lwb_t *nlwb = NULL; 11866e1f5caaSNeil Perrin zil_chain_t *zilc; 1187d80c45e0Sbonwick spa_t *spa = zilog->zl_spa; 11886e1f5caaSNeil Perrin blkptr_t *bp; 1189b24ab676SJeff Bonwick dmu_tx_t *tx; 1190fa9e4066Sahrens uint64_t txg; 1191ada693c4SNeil Perrin uint64_t zil_blksz, wsz; 11926e1f5caaSNeil Perrin int i, error; 1193c5ee4681SAlexander Motin boolean_t slog; 11946e1f5caaSNeil Perrin 1195*1271e4b1SPrakash Surya ASSERT(MUTEX_HELD(&zilog->zl_writer_lock)); 1196*1271e4b1SPrakash Surya ASSERT3P(lwb->lwb_root_zio, !=, NULL); 1197*1271e4b1SPrakash Surya ASSERT3P(lwb->lwb_write_zio, !=, NULL); 1198*1271e4b1SPrakash Surya ASSERT3S(lwb->lwb_state, ==, LWB_STATE_OPENED); 1199*1271e4b1SPrakash Surya 12006e1f5caaSNeil Perrin if (BP_GET_CHECKSUM(&lwb->lwb_blk) == ZIO_CHECKSUM_ZILOG2) { 12016e1f5caaSNeil Perrin zilc = (zil_chain_t *)lwb->lwb_buf; 12026e1f5caaSNeil Perrin bp = &zilc->zc_next_blk; 12036e1f5caaSNeil Perrin } else { 12046e1f5caaSNeil Perrin zilc = (zil_chain_t *)(lwb->lwb_buf + lwb->lwb_sz); 12056e1f5caaSNeil Perrin bp = &zilc->zc_next_blk; 12066e1f5caaSNeil Perrin } 1207fa9e4066Sahrens 12086e1f5caaSNeil Perrin ASSERT(lwb->lwb_nused <= lwb->lwb_sz); 1209fa9e4066Sahrens 1210fa9e4066Sahrens /* 1211fa9e4066Sahrens * Allocate the next block and save its address in this block 1212fa9e4066Sahrens * before writing it in order to establish the log chain. 
1213fa9e4066Sahrens * Note that if the allocation of nlwb synced before we wrote 1214fa9e4066Sahrens * the block that points at it (lwb), we'd leak it if we crashed. 1215b24ab676SJeff Bonwick * Therefore, we don't do dmu_tx_commit() until zil_lwb_write_done(). 1216b24ab676SJeff Bonwick * We dirty the dataset to ensure that zil_sync() will be called 1217b24ab676SJeff Bonwick * to clean up in the event of allocation failure or I/O failure. 1218fa9e4066Sahrens */ 1219*1271e4b1SPrakash Surya 1220b24ab676SJeff Bonwick tx = dmu_tx_create(zilog->zl_os); 1221d28671a3SAndriy Gapon 1222d28671a3SAndriy Gapon /* 1223d28671a3SAndriy Gapon * Since we are not going to create any new dirty data and we can even 1224d28671a3SAndriy Gapon * help with clearing the existing dirty data, we should not be subject 1225d28671a3SAndriy Gapon * to the dirty data based delays. 1226d28671a3SAndriy Gapon * We (ab)use TXG_WAITED to bypass the delay mechanism. 1227d28671a3SAndriy Gapon * One side effect from using TXG_WAITED is that dmu_tx_assign() can 1228d28671a3SAndriy Gapon * fail if the pool is suspended. Those are dramatic circumstances, 1229d28671a3SAndriy Gapon * so we return NULL to signal that the normal ZIL processing is not 1230d28671a3SAndriy Gapon * possible and txg_wait_synced() should be used to ensure that the data 1231d28671a3SAndriy Gapon * is on disk. 1232d28671a3SAndriy Gapon */ 1233d28671a3SAndriy Gapon error = dmu_tx_assign(tx, TXG_WAITED); 1234d28671a3SAndriy Gapon if (error != 0) { 1235d28671a3SAndriy Gapon ASSERT3S(error, ==, EIO); 1236d28671a3SAndriy Gapon dmu_tx_abort(tx); 1237d28671a3SAndriy Gapon return (NULL); 1238d28671a3SAndriy Gapon } 1239b24ab676SJeff Bonwick dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx); 1240b24ab676SJeff Bonwick txg = dmu_tx_get_txg(tx); 1241b24ab676SJeff Bonwick 1242b24ab676SJeff Bonwick lwb->lwb_tx = tx; 1243fa9e4066Sahrens 1244fa9e4066Sahrens /* 12456e1f5caaSNeil Perrin * Log blocks are pre-allocated. Here we select the size of the next 12466e1f5caaSNeil Perrin * block, based on size used in the last block. 12476e1f5caaSNeil Perrin * - first find the smallest bucket that will fit the block from a 12486e1f5caaSNeil Perrin * limited set of block sizes. This is because it's faster to write 12496e1f5caaSNeil Perrin * blocks allocated from the same metaslab as they are adjacent or 12506e1f5caaSNeil Perrin * close. 12516e1f5caaSNeil Perrin * - next find the maximum from the new suggested size and an array of 12526e1f5caaSNeil Perrin * previous sizes. This lessens a picket fence effect of wrongly 12536e1f5caaSNeil Perrin * guessing the size if we have a stream of say 2k, 64k, 2k, 64k 12546e1f5caaSNeil Perrin * requests. 12556e1f5caaSNeil Perrin * 12566e1f5caaSNeil Perrin * Note we only write what is used, but we can't just allocate 12576e1f5caaSNeil Perrin * the maximum block size because we can exhaust the available 12586e1f5caaSNeil Perrin * pool log space.
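 *
 * For example (hypothetical numbers): with zl_cur_used at 20KB,
 * the first pass picks the smallest bucket that fits
 * 20KB + sizeof (zil_chain_t), i.e. the 32*1024 + 4096 bucket;
 * if zl_prev_blks still holds a recent SPA_OLD_MAXBLOCKSIZE
 * (128KB) entry, the MAX() pass keeps the final size at 128KB
 * rather than shrinking immediately after one small commit.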
1259fa9e4066Sahrens */ 12606e1f5caaSNeil Perrin zil_blksz = zilog->zl_cur_used + sizeof (zil_chain_t); 12616e1f5caaSNeil Perrin for (i = 0; zil_blksz > zil_block_buckets[i]; i++) 12626e1f5caaSNeil Perrin continue; 12636e1f5caaSNeil Perrin zil_blksz = zil_block_buckets[i]; 12646e1f5caaSNeil Perrin if (zil_blksz == UINT64_MAX) 1265b5152584SMatthew Ahrens zil_blksz = SPA_OLD_MAXBLOCKSIZE; 12666e1f5caaSNeil Perrin zilog->zl_prev_blks[zilog->zl_prev_rotor] = zil_blksz; 12676e1f5caaSNeil Perrin for (i = 0; i < ZIL_PREV_BLKS; i++) 12686e1f5caaSNeil Perrin zil_blksz = MAX(zil_blksz, zilog->zl_prev_blks[i]); 12696e1f5caaSNeil Perrin zilog->zl_prev_rotor = (zilog->zl_prev_rotor + 1) & (ZIL_PREV_BLKS - 1); 1270fa9e4066Sahrens 127167bd71c6Sperrin BP_ZERO(bp); 1272*1271e4b1SPrakash Surya 127367bd71c6Sperrin /* pass the old blkptr in order to spread log blocks across devs */ 1274c5ee4681SAlexander Motin error = zio_alloc_zil(spa, txg, bp, &lwb->lwb_blk, zil_blksz, &slog); 12753b2aab18SMatthew Ahrens if (error == 0) { 12766e1f5caaSNeil Perrin ASSERT3U(bp->blk_birth, ==, txg); 12776e1f5caaSNeil Perrin bp->blk_cksum = lwb->lwb_blk.blk_cksum; 12786e1f5caaSNeil Perrin bp->blk_cksum.zc_word[ZIL_ZC_SEQ]++; 1279d63d470bSgw 1280ea8dc4b6Seschrock /* 1281*1271e4b1SPrakash Surya * Allocate a new log write block (lwb). 1282ea8dc4b6Seschrock */ 1283c5ee4681SAlexander Motin nlwb = zil_alloc_lwb(zilog, bp, slog, txg); 1284fa9e4066Sahrens } 1285fa9e4066Sahrens 12866e1f5caaSNeil Perrin if (BP_GET_CHECKSUM(&lwb->lwb_blk) == ZIO_CHECKSUM_ZILOG2) { 12876e1f5caaSNeil Perrin /* For Slim ZIL only write what is used. */ 1288ada693c4SNeil Perrin wsz = P2ROUNDUP_TYPED(lwb->lwb_nused, ZIL_MIN_BLKSZ, uint64_t); 1289ada693c4SNeil Perrin ASSERT3U(wsz, <=, lwb->lwb_sz); 1290*1271e4b1SPrakash Surya zio_shrink(lwb->lwb_write_zio, wsz); 1291fa9e4066Sahrens 1292ada693c4SNeil Perrin } else { 1293ada693c4SNeil Perrin wsz = lwb->lwb_sz; 12946e1f5caaSNeil Perrin } 1295ada693c4SNeil Perrin 12966e1f5caaSNeil Perrin zilc->zc_pad = 0; 12976e1f5caaSNeil Perrin zilc->zc_nused = lwb->lwb_nused; 12986e1f5caaSNeil Perrin zilc->zc_eck.zec_cksum = lwb->lwb_blk.blk_cksum; 1299fa9e4066Sahrens 1300ada693c4SNeil Perrin /* 1301ada693c4SNeil Perrin * clear unused data for security 1302ada693c4SNeil Perrin */ 1303ada693c4SNeil Perrin bzero(lwb->lwb_buf + lwb->lwb_nused, wsz - lwb->lwb_nused); 1304ada693c4SNeil Perrin 1305*1271e4b1SPrakash Surya spa_config_enter(zilog->zl_spa, SCL_STATE, lwb, RW_READER); 1306*1271e4b1SPrakash Surya 1307*1271e4b1SPrakash Surya zil_lwb_add_block(lwb, &lwb->lwb_blk); 1308*1271e4b1SPrakash Surya lwb->lwb_issued_timestamp = gethrtime(); 1309*1271e4b1SPrakash Surya lwb->lwb_state = LWB_STATE_ISSUED; 1310*1271e4b1SPrakash Surya 1311*1271e4b1SPrakash Surya zio_nowait(lwb->lwb_root_zio); 1312*1271e4b1SPrakash Surya zio_nowait(lwb->lwb_write_zio); 131367bd71c6Sperrin 1314fa9e4066Sahrens /* 13156e1f5caaSNeil Perrin * If there was an allocation failure then nlwb will be null which 13166e1f5caaSNeil Perrin * forces a txg_wait_synced(). 
1317fa9e4066Sahrens */ 1318fa9e4066Sahrens return (nlwb); 1319fa9e4066Sahrens } 1320fa9e4066Sahrens 1321fa9e4066Sahrens static lwb_t * 1322fa9e4066Sahrens zil_lwb_commit(zilog_t *zilog, itx_t *itx, lwb_t *lwb) 1323fa9e4066Sahrens { 1324c5ee4681SAlexander Motin lr_t *lrcb, *lrc; 1325c5ee4681SAlexander Motin lr_write_t *lrwb, *lrw; 1326b24ab676SJeff Bonwick char *lr_buf; 1327c5ee4681SAlexander Motin uint64_t dlen, dnow, lwb_sp, reclen, txg; 1328fa9e4066Sahrens 1329*1271e4b1SPrakash Surya ASSERT(MUTEX_HELD(&zilog->zl_writer_lock)); 1330*1271e4b1SPrakash Surya ASSERT3P(lwb, !=, NULL); 1331*1271e4b1SPrakash Surya ASSERT3P(lwb->lwb_buf, !=, NULL); 1332*1271e4b1SPrakash Surya 1333*1271e4b1SPrakash Surya zil_lwb_write_open(zilog, lwb); 1334b24ab676SJeff Bonwick 1335*1271e4b1SPrakash Surya lrc = &itx->itx_lr; 1336*1271e4b1SPrakash Surya lrw = (lr_write_t *)lrc; 1337*1271e4b1SPrakash Surya 1338*1271e4b1SPrakash Surya /* 1339*1271e4b1SPrakash Surya * A commit itx doesn't represent any on-disk state; instead 1340*1271e4b1SPrakash Surya * it's simply used as a placeholder on the commit list, and 1341*1271e4b1SPrakash Surya * provides a mechanism for attaching a "commit waiter" onto the 1342*1271e4b1SPrakash Surya * correct lwb (such that the waiter can be signalled upon 1343*1271e4b1SPrakash Surya * completion of that lwb). Thus, we don't process this itx's 1344*1271e4b1SPrakash Surya * log record if it's a commit itx (these itxs don't have log 1345*1271e4b1SPrakash Surya * records), and instead link the itx's waiter onto the lwb's 1346*1271e4b1SPrakash Surya * list of waiters. 1347*1271e4b1SPrakash Surya * 1348*1271e4b1SPrakash Surya * For more details, see the comment above zil_commit(). 1349*1271e4b1SPrakash Surya */ 1350*1271e4b1SPrakash Surya if (lrc->lrc_txtype == TX_COMMIT) { 1351*1271e4b1SPrakash Surya zil_commit_waiter_link_lwb(itx->itx_private, lwb); 1352*1271e4b1SPrakash Surya itx->itx_private = NULL; 1353*1271e4b1SPrakash Surya return (lwb); 1354*1271e4b1SPrakash Surya } 1355fa9e4066Sahrens 1356c5ee4681SAlexander Motin if (lrc->lrc_txtype == TX_WRITE && itx->itx_wr_state == WR_NEED_COPY) { 1357c5c6ffa0Smaybee dlen = P2ROUNDUP_TYPED( 1358b24ab676SJeff Bonwick lrw->lr_length, sizeof (uint64_t), uint64_t); 1359c5ee4681SAlexander Motin } else { 1360c5ee4681SAlexander Motin dlen = 0; 1361c5ee4681SAlexander Motin } 1362c5ee4681SAlexander Motin reclen = lrc->lrc_reclen; 1363104e2ed7Sperrin zilog->zl_cur_used += (reclen + dlen); 1364c5ee4681SAlexander Motin txg = lrc->lrc_txg; 136522ac5be4Sperrin 1366*1271e4b1SPrakash Surya ASSERT3U(zilog->zl_cur_used, <, UINT64_MAX - (reclen + dlen)); 136767bd71c6Sperrin 1368c5ee4681SAlexander Motin cont: 1369fa9e4066Sahrens /* 1370fa9e4066Sahrens * If this record won't fit in the current log block, start a new one. 1371c5ee4681SAlexander Motin * For WR_NEED_COPY optimize layout for minimal number of chunks.
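 *
 * A rough example (illustrative numbers; the exact cutover
 * depends on ZIL_MAX_WASTE_SPACE and ZIL_MAX_LOG_DATA): with
 * 40KB free in the lwb (lwb_sp), a 2KB record header (reclen),
 * and 50KB of WR_NEED_COPY data (dlen), the first pass writes
 * the header plus dnow = 38KB of data, then loops back to
 * "cont:", issues this lwb, and writes the remaining 12KB
 * (under its own header) into the next lwb.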
1372fa9e4066Sahrens */ 1373c5ee4681SAlexander Motin lwb_sp = lwb->lwb_sz - lwb->lwb_nused; 1374c5ee4681SAlexander Motin if (reclen > lwb_sp || (reclen + dlen > lwb_sp && 1375c5ee4681SAlexander Motin lwb_sp < ZIL_MAX_WASTE_SPACE && (dlen % ZIL_MAX_LOG_DATA == 0 || 1376c5ee4681SAlexander Motin lwb_sp < reclen + dlen % ZIL_MAX_LOG_DATA))) { 1377*1271e4b1SPrakash Surya lwb = zil_lwb_write_issue(zilog, lwb); 1378c5c6ffa0Smaybee if (lwb == NULL) 1379fa9e4066Sahrens return (NULL); 1380*1271e4b1SPrakash Surya zil_lwb_write_open(zilog, lwb); 13816e1f5caaSNeil Perrin ASSERT(LWB_EMPTY(lwb)); 1382c5ee4681SAlexander Motin lwb_sp = lwb->lwb_sz - lwb->lwb_nused; 1383c5ee4681SAlexander Motin ASSERT3U(reclen + MIN(dlen, sizeof (uint64_t)), <=, lwb_sp); 1384fa9e4066Sahrens } 1385fa9e4066Sahrens 1386c5ee4681SAlexander Motin dnow = MIN(dlen, lwb_sp - reclen); 1387b24ab676SJeff Bonwick lr_buf = lwb->lwb_buf + lwb->lwb_nused; 1388b24ab676SJeff Bonwick bcopy(lrc, lr_buf, reclen); 1389c5ee4681SAlexander Motin lrcb = (lr_t *)lr_buf; /* Like lrc, but inside lwb. */ 1390c5ee4681SAlexander Motin lrwb = (lr_write_t *)lrcb; /* Like lrw, but inside lwb. */ 1391c5c6ffa0Smaybee 1392c5c6ffa0Smaybee /* 1393c5c6ffa0Smaybee * If it's a write, fetch the data or get its blkptr as appropriate. 1394c5c6ffa0Smaybee */ 1395c5c6ffa0Smaybee if (lrc->lrc_txtype == TX_WRITE) { 1396c5c6ffa0Smaybee if (txg > spa_freeze_txg(zilog->zl_spa)) 1397c5c6ffa0Smaybee txg_wait_synced(zilog->zl_dmu_pool, txg); 1398c5c6ffa0Smaybee if (itx->itx_wr_state != WR_COPIED) { 1399c5c6ffa0Smaybee char *dbuf; 1400c5c6ffa0Smaybee int error; 1401c5c6ffa0Smaybee 1402c5ee4681SAlexander Motin if (itx->itx_wr_state == WR_NEED_COPY) { 1403b24ab676SJeff Bonwick dbuf = lr_buf + reclen; 1404c5ee4681SAlexander Motin lrcb->lrc_reclen += dnow; 1405c5ee4681SAlexander Motin if (lrwb->lr_length > dnow) 1406c5ee4681SAlexander Motin lrwb->lr_length = dnow; 1407c5ee4681SAlexander Motin lrw->lr_offset += dnow; 1408c5ee4681SAlexander Motin lrw->lr_length -= dnow; 1409c5c6ffa0Smaybee } else { 1410c5c6ffa0Smaybee ASSERT(itx->itx_wr_state == WR_INDIRECT); 1411c5c6ffa0Smaybee dbuf = NULL; 1412c5c6ffa0Smaybee } 1413*1271e4b1SPrakash Surya 1414*1271e4b1SPrakash Surya /* 1415*1271e4b1SPrakash Surya * We pass in the "lwb_write_zio" rather than 1416*1271e4b1SPrakash Surya * "lwb_root_zio" so that the "lwb_write_zio" 1417*1271e4b1SPrakash Surya * becomes the parent of any zios created by 1418*1271e4b1SPrakash Surya * the "zl_get_data" callback. The vdevs are 1419*1271e4b1SPrakash Surya * flushed after the "lwb_write_zio" completes, 1420*1271e4b1SPrakash Surya * so we want to make sure that completion 1421*1271e4b1SPrakash Surya * callback waits for these additional zios, 1422*1271e4b1SPrakash Surya * such that the vdevs used by those zios will 1423*1271e4b1SPrakash Surya * be included in the lwb's vdev tree, and those 1424*1271e4b1SPrakash Surya * vdevs will be properly flushed. If we passed 1425*1271e4b1SPrakash Surya * in "lwb_root_zio" here, then these additional 1426*1271e4b1SPrakash Surya * vdevs may not be flushed; e.g. if these zios 1427*1271e4b1SPrakash Surya * completed after "lwb_write_zio" completed.
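 *
 * The assumed zio tree for an indirect write is roughly:
 *
 *	lwb_root_zio
 *	    `-- lwb_write_zio
 *	            `-- zio(s) created by zl_get_data
 *
 * so the write zio's completion (and the vdev flushes that
 * follow it) waits on the data zios it parents.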
1428*1271e4b1SPrakash Surya */ 1429*1271e4b1SPrakash Surya error = zilog->zl_get_data(itx->itx_private, 1430*1271e4b1SPrakash Surya lrwb, dbuf, lwb, lwb->lwb_write_zio); 1431*1271e4b1SPrakash Surya 1432c87b8fc5SMark J Musante if (error == EIO) { 1433c87b8fc5SMark J Musante txg_wait_synced(zilog->zl_dmu_pool, txg); 1434c87b8fc5SMark J Musante return (lwb); 1435c87b8fc5SMark J Musante } 14363b2aab18SMatthew Ahrens if (error != 0) { 1437c5c6ffa0Smaybee ASSERT(error == ENOENT || error == EEXIST || 1438c5c6ffa0Smaybee error == EALREADY); 1439c5c6ffa0Smaybee return (lwb); 1440c5c6ffa0Smaybee } 1441c5c6ffa0Smaybee } 1442104e2ed7Sperrin } 1443c5c6ffa0Smaybee 1444b24ab676SJeff Bonwick /* 1445b24ab676SJeff Bonwick * We're actually making an entry, so update lrc_seq to be the 1446b24ab676SJeff Bonwick * log record sequence number. Note that this is generally not 1447b24ab676SJeff Bonwick * equal to the itx sequence number because not all transactions 1448b24ab676SJeff Bonwick * are synchronous, and sometimes spa_sync() gets there first. 1449b24ab676SJeff Bonwick */ 1450*1271e4b1SPrakash Surya lrcb->lrc_seq = ++zilog->zl_lr_seq; 1451c5ee4681SAlexander Motin lwb->lwb_nused += reclen + dnow; 1452*1271e4b1SPrakash Surya 1453*1271e4b1SPrakash Surya zil_lwb_add_txg(lwb, txg); 1454*1271e4b1SPrakash Surya 14556e1f5caaSNeil Perrin ASSERT3U(lwb->lwb_nused, <=, lwb->lwb_sz); 1456fb09f5aaSMadhav Suresh ASSERT0(P2PHASE(lwb->lwb_nused, sizeof (uint64_t))); 1457fa9e4066Sahrens 1458c5ee4681SAlexander Motin dlen -= dnow; 1459c5ee4681SAlexander Motin if (dlen > 0) { 1460c5ee4681SAlexander Motin zilog->zl_cur_used += reclen; 1461c5ee4681SAlexander Motin goto cont; 1462c5ee4681SAlexander Motin } 1463c5ee4681SAlexander Motin 1464fa9e4066Sahrens return (lwb); 1465fa9e4066Sahrens } 1466fa9e4066Sahrens 1467fa9e4066Sahrens itx_t * 1468da6c28aaSamw zil_itx_create(uint64_t txtype, size_t lrsize) 1469fa9e4066Sahrens { 1470fa9e4066Sahrens itx_t *itx; 1471fa9e4066Sahrens 1472b4d654b0Sperrin lrsize = P2ROUNDUP_TYPED(lrsize, sizeof (uint64_t), size_t); 1473fa9e4066Sahrens 1474fa9e4066Sahrens itx = kmem_alloc(offsetof(itx_t, itx_lr) + lrsize, KM_SLEEP); 1475fa9e4066Sahrens itx->itx_lr.lrc_txtype = txtype; 1476fa9e4066Sahrens itx->itx_lr.lrc_reclen = lrsize; 1477fa9e4066Sahrens itx->itx_lr.lrc_seq = 0; /* defensive */ 14785002558fSNeil Perrin itx->itx_sync = B_TRUE; /* default is synchronous */ 1479fa9e4066Sahrens 1480fa9e4066Sahrens return (itx); 1481fa9e4066Sahrens } 1482fa9e4066Sahrens 1483b24ab676SJeff Bonwick void 1484b24ab676SJeff Bonwick zil_itx_destroy(itx_t *itx) 1485b24ab676SJeff Bonwick { 1486b24ab676SJeff Bonwick kmem_free(itx, offsetof(itx_t, itx_lr) + itx->itx_lr.lrc_reclen); 1487b24ab676SJeff Bonwick } 1488b24ab676SJeff Bonwick 14895002558fSNeil Perrin /* 14905002558fSNeil Perrin * Free up the sync and async itxs. The itxs_t has already been detached 14915002558fSNeil Perrin * so no locks are needed. 
14925002558fSNeil Perrin */ 14935002558fSNeil Perrin static void 14945002558fSNeil Perrin zil_itxg_clean(itxs_t *itxs) 1495fa9e4066Sahrens { 14965002558fSNeil Perrin itx_t *itx; 14975002558fSNeil Perrin list_t *list; 14985002558fSNeil Perrin avl_tree_t *t; 14995002558fSNeil Perrin void *cookie; 15005002558fSNeil Perrin itx_async_node_t *ian; 15015002558fSNeil Perrin 15025002558fSNeil Perrin list = &itxs->i_sync_list; 15035002558fSNeil Perrin while ((itx = list_head(list)) != NULL) { 1504*1271e4b1SPrakash Surya /* 1505*1271e4b1SPrakash Surya * In the general case, commit itxs will not be found 1506*1271e4b1SPrakash Surya * here, as they'll be committed to an lwb via 1507*1271e4b1SPrakash Surya * zil_lwb_commit(), and free'd in that function. Having 1508*1271e4b1SPrakash Surya * said that, it is still possible for commit itxs to be 1509*1271e4b1SPrakash Surya * found here, due to the following race: 1510*1271e4b1SPrakash Surya * 1511*1271e4b1SPrakash Surya * - a thread calls zil_commit() which assigns the 1512*1271e4b1SPrakash Surya * commit itx to a per-txg i_sync_list 1513*1271e4b1SPrakash Surya * - zil_itxg_clean() is called (e.g. via spa_sync()) 1514*1271e4b1SPrakash Surya * while the waiter is still on the i_sync_list 1515*1271e4b1SPrakash Surya * 1516*1271e4b1SPrakash Surya * There's nothing to prevent syncing the txg while the 1517*1271e4b1SPrakash Surya * waiter is on the i_sync_list. This normally doesn't 1518*1271e4b1SPrakash Surya * happen because spa_sync() is slower than zil_commit(), 1519*1271e4b1SPrakash Surya * but if zil_commit() calls txg_wait_synced() (e.g. 1520*1271e4b1SPrakash Surya * because zil_create() or zil_commit_writer_stall() is 1521*1271e4b1SPrakash Surya * called) we will hit this case. 1522*1271e4b1SPrakash Surya */ 1523*1271e4b1SPrakash Surya if (itx->itx_lr.lrc_txtype == TX_COMMIT) 1524*1271e4b1SPrakash Surya zil_commit_waiter_skip(itx->itx_private); 1525*1271e4b1SPrakash Surya 15265002558fSNeil Perrin list_remove(list, itx); 1527*1271e4b1SPrakash Surya zil_itx_destroy(itx); 15285002558fSNeil Perrin } 1529fa9e4066Sahrens 15305002558fSNeil Perrin cookie = NULL; 15315002558fSNeil Perrin t = &itxs->i_async_tree; 15325002558fSNeil Perrin while ((ian = avl_destroy_nodes(t, &cookie)) != NULL) { 15335002558fSNeil Perrin list = &ian->ia_list; 15345002558fSNeil Perrin while ((itx = list_head(list)) != NULL) { 15355002558fSNeil Perrin list_remove(list, itx); 1536*1271e4b1SPrakash Surya /* commit itxs should never be on the async lists. 
*/ 1537*1271e4b1SPrakash Surya ASSERT3U(itx->itx_lr.lrc_txtype, !=, TX_COMMIT); 1538*1271e4b1SPrakash Surya zil_itx_destroy(itx); 15395002558fSNeil Perrin } 15405002558fSNeil Perrin list_destroy(list); 15415002558fSNeil Perrin kmem_free(ian, sizeof (itx_async_node_t)); 15425002558fSNeil Perrin } 15435002558fSNeil Perrin avl_destroy(t); 1544fa9e4066Sahrens 15455002558fSNeil Perrin kmem_free(itxs, sizeof (itxs_t)); 15465002558fSNeil Perrin } 15475002558fSNeil Perrin 15485002558fSNeil Perrin static int 15495002558fSNeil Perrin zil_aitx_compare(const void *x1, const void *x2) 15505002558fSNeil Perrin { 15515002558fSNeil Perrin const uint64_t o1 = ((itx_async_node_t *)x1)->ia_foid; 15525002558fSNeil Perrin const uint64_t o2 = ((itx_async_node_t *)x2)->ia_foid; 1553fa9e4066Sahrens 15545002558fSNeil Perrin if (o1 < o2) 15555002558fSNeil Perrin return (-1); 15565002558fSNeil Perrin if (o1 > o2) 15575002558fSNeil Perrin return (1); 15585002558fSNeil Perrin 15595002558fSNeil Perrin return (0); 1560fa9e4066Sahrens } 1561fa9e4066Sahrens 1562fa9e4066Sahrens /* 15635002558fSNeil Perrin * Remove all async itx with the given oid. 1564fa9e4066Sahrens */ 156591de656bSNeil Perrin static void 15665002558fSNeil Perrin zil_remove_async(zilog_t *zilog, uint64_t oid) 1567fa9e4066Sahrens { 15685002558fSNeil Perrin uint64_t otxg, txg; 15695002558fSNeil Perrin itx_async_node_t *ian; 15705002558fSNeil Perrin avl_tree_t *t; 15715002558fSNeil Perrin avl_index_t where; 1572a584ef65Sjohansen list_t clean_list; 1573fa9e4066Sahrens itx_t *itx; 1574fa9e4066Sahrens 15755002558fSNeil Perrin ASSERT(oid != 0); 1576a584ef65Sjohansen list_create(&clean_list, sizeof (itx_t), offsetof(itx_t, itx_node)); 1577a584ef65Sjohansen 15785002558fSNeil Perrin if (spa_freeze_txg(zilog->zl_spa) != UINT64_MAX) /* ziltest support */ 15795002558fSNeil Perrin otxg = ZILTEST_TXG; 15805002558fSNeil Perrin else 15815002558fSNeil Perrin otxg = spa_last_synced_txg(zilog->zl_spa) + 1; 1582a584ef65Sjohansen 15835002558fSNeil Perrin for (txg = otxg; txg < (otxg + TXG_CONCURRENT_STATES); txg++) { 15845002558fSNeil Perrin itxg_t *itxg = &zilog->zl_itxg[txg & TXG_MASK]; 15855002558fSNeil Perrin 15865002558fSNeil Perrin mutex_enter(&itxg->itxg_lock); 15875002558fSNeil Perrin if (itxg->itxg_txg != txg) { 15885002558fSNeil Perrin mutex_exit(&itxg->itxg_lock); 15895002558fSNeil Perrin continue; 15905002558fSNeil Perrin } 1591a584ef65Sjohansen 15925002558fSNeil Perrin /* 15935002558fSNeil Perrin * Locate the object node and append its list. 15945002558fSNeil Perrin */ 15955002558fSNeil Perrin t = &itxg->itxg_itxs->i_async_tree; 15965002558fSNeil Perrin ian = avl_find(t, &oid, &where); 15975002558fSNeil Perrin if (ian != NULL) 15985002558fSNeil Perrin list_move_tail(&clean_list, &ian->ia_list); 15995002558fSNeil Perrin mutex_exit(&itxg->itxg_lock); 16005002558fSNeil Perrin } 1601a584ef65Sjohansen while ((itx = list_head(&clean_list)) != NULL) { 1602a584ef65Sjohansen list_remove(&clean_list, itx); 1603*1271e4b1SPrakash Surya /* commit itxs should never be on the async lists. 
*/ 1604*1271e4b1SPrakash Surya ASSERT3U(itx->itx_lr.lrc_txtype, !=, TX_COMMIT); 1605*1271e4b1SPrakash Surya zil_itx_destroy(itx); 1606a584ef65Sjohansen } 1607a584ef65Sjohansen list_destroy(&clean_list); 1608fa9e4066Sahrens } 1609fa9e4066Sahrens 16105002558fSNeil Perrin void 16115002558fSNeil Perrin zil_itx_assign(zilog_t *zilog, itx_t *itx, dmu_tx_t *tx) 16125002558fSNeil Perrin { 16135002558fSNeil Perrin uint64_t txg; 16145002558fSNeil Perrin itxg_t *itxg; 16155002558fSNeil Perrin itxs_t *itxs, *clean = NULL; 16165002558fSNeil Perrin 16175002558fSNeil Perrin /* 161891de656bSNeil Perrin * Object ids can be re-instantiated in the next txg so 16195002558fSNeil Perrin * remove any async transactions to avoid future leaks. 16205002558fSNeil Perrin * This can happen if a fsync occurs on the re-instantiated 16215002558fSNeil Perrin * object for a WR_INDIRECT or WR_NEED_COPY write, which gets 16225002558fSNeil Perrin * the new file data and flushes a write record for the old object. 16235002558fSNeil Perrin */ 16245002558fSNeil Perrin if ((itx->itx_lr.lrc_txtype & ~TX_CI) == TX_REMOVE) 162551bd2f97SNeil Perrin zil_remove_async(zilog, itx->itx_oid); 16265002558fSNeil Perrin 162791de656bSNeil Perrin /* 162891de656bSNeil Perrin * Ensure the data of a renamed file is committed before the rename. 162991de656bSNeil Perrin */ 163091de656bSNeil Perrin if ((itx->itx_lr.lrc_txtype & ~TX_CI) == TX_RENAME) 163191de656bSNeil Perrin zil_async_to_sync(zilog, itx->itx_oid); 163291de656bSNeil Perrin 1633ce636f8bSMatthew Ahrens if (spa_freeze_txg(zilog->zl_spa) != UINT64_MAX) 16345002558fSNeil Perrin txg = ZILTEST_TXG; 16355002558fSNeil Perrin else 16365002558fSNeil Perrin txg = dmu_tx_get_txg(tx); 16375002558fSNeil Perrin 16385002558fSNeil Perrin itxg = &zilog->zl_itxg[txg & TXG_MASK]; 16395002558fSNeil Perrin mutex_enter(&itxg->itxg_lock); 16405002558fSNeil Perrin itxs = itxg->itxg_itxs; 16415002558fSNeil Perrin if (itxg->itxg_txg != txg) { 16425002558fSNeil Perrin if (itxs != NULL) { 16435002558fSNeil Perrin /* 16445002558fSNeil Perrin * The zil_clean callback hasn't got around to cleaning 16455002558fSNeil Perrin * this itxg. Save the itxs for release below. 16465002558fSNeil Perrin * This should be rare. 
16475002558fSNeil Perrin */ 164843297f97SGeorge Wilson zfs_dbgmsg("zil_itx_assign: missed itx cleanup for " 164943297f97SGeorge Wilson "txg %llu", itxg->itxg_txg); 16505002558fSNeil Perrin clean = itxg->itxg_itxs; 16515002558fSNeil Perrin } 16525002558fSNeil Perrin itxg->itxg_txg = txg; 16535002558fSNeil Perrin itxs = itxg->itxg_itxs = kmem_zalloc(sizeof (itxs_t), KM_SLEEP); 16545002558fSNeil Perrin 16555002558fSNeil Perrin list_create(&itxs->i_sync_list, sizeof (itx_t), 16565002558fSNeil Perrin offsetof(itx_t, itx_node)); 16575002558fSNeil Perrin avl_create(&itxs->i_async_tree, zil_aitx_compare, 16585002558fSNeil Perrin sizeof (itx_async_node_t), 16595002558fSNeil Perrin offsetof(itx_async_node_t, ia_node)); 16605002558fSNeil Perrin } 16615002558fSNeil Perrin if (itx->itx_sync) { 16625002558fSNeil Perrin list_insert_tail(&itxs->i_sync_list, itx); 16635002558fSNeil Perrin } else { 16645002558fSNeil Perrin avl_tree_t *t = &itxs->i_async_tree; 16655002558fSNeil Perrin uint64_t foid = ((lr_ooo_t *)&itx->itx_lr)->lr_foid; 16665002558fSNeil Perrin itx_async_node_t *ian; 16675002558fSNeil Perrin avl_index_t where; 16685002558fSNeil Perrin 16695002558fSNeil Perrin ian = avl_find(t, &foid, &where); 16705002558fSNeil Perrin if (ian == NULL) { 16715002558fSNeil Perrin ian = kmem_alloc(sizeof (itx_async_node_t), KM_SLEEP); 16725002558fSNeil Perrin list_create(&ian->ia_list, sizeof (itx_t), 16735002558fSNeil Perrin offsetof(itx_t, itx_node)); 16745002558fSNeil Perrin ian->ia_foid = foid; 16755002558fSNeil Perrin avl_insert(t, ian, where); 16765002558fSNeil Perrin } 16775002558fSNeil Perrin list_insert_tail(&ian->ia_list, itx); 16785002558fSNeil Perrin } 16795002558fSNeil Perrin 16805002558fSNeil Perrin itx->itx_lr.lrc_txg = dmu_tx_get_txg(tx); 1681*1271e4b1SPrakash Surya 1682*1271e4b1SPrakash Surya /* 1683*1271e4b1SPrakash Surya * We don't want to dirty the ZIL using ZILTEST_TXG, because 1684*1271e4b1SPrakash Surya * zil_clean() will never be called using ZILTEST_TXG. Thus, we 1685*1271e4b1SPrakash Surya * need to be careful to always dirty the ZIL using the "real" 1686*1271e4b1SPrakash Surya * TXG (not itxg_txg) even when the SPA is frozen. 1687*1271e4b1SPrakash Surya */ 1688*1271e4b1SPrakash Surya zilog_dirty(zilog, dmu_tx_get_txg(tx)); 16895002558fSNeil Perrin mutex_exit(&itxg->itxg_lock); 16905002558fSNeil Perrin 16915002558fSNeil Perrin /* Release the old itxs now we've dropped the lock */ 16925002558fSNeil Perrin if (clean != NULL) 16935002558fSNeil Perrin zil_itxg_clean(clean); 16945002558fSNeil Perrin } 16955002558fSNeil Perrin 1696b19a79ecSperrin /* 169767bd71c6Sperrin * If there are any in-memory intent log transactions which have now been 1698ce636f8bSMatthew Ahrens * synced then start up a taskq to free them. We should only do this after we 1699ce636f8bSMatthew Ahrens * have written out the uberblocks (i.e. the txg has been committed) so that we 1700ce636f8bSMatthew Ahrens * don't inadvertently clean out in-memory log records that would be required 1701ce636f8bSMatthew Ahrens * by zil_commit().
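 *
 * For example, the assumed per-txg ordering is: spa_sync(txg)
 * writes out the uberblocks, and only then is zil_clean(zilog,
 * txg) invoked, so the in-memory itxs for that txg are never
 * freed while a concurrent zil_commit() could still need them.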
1702b19a79ecSperrin */ 1703fa9e4066Sahrens void 17045002558fSNeil Perrin zil_clean(zilog_t *zilog, uint64_t synced_txg) 1705fa9e4066Sahrens { 17065002558fSNeil Perrin itxg_t *itxg = &zilog->zl_itxg[synced_txg & TXG_MASK]; 17075002558fSNeil Perrin itxs_t *clean_me; 170867bd71c6Sperrin 1709*1271e4b1SPrakash Surya ASSERT3U(synced_txg, <, ZILTEST_TXG); 1710*1271e4b1SPrakash Surya 17115002558fSNeil Perrin mutex_enter(&itxg->itxg_lock); 17125002558fSNeil Perrin if (itxg->itxg_itxs == NULL || itxg->itxg_txg == ZILTEST_TXG) { 17135002558fSNeil Perrin mutex_exit(&itxg->itxg_lock); 17145002558fSNeil Perrin return; 17155002558fSNeil Perrin } 17165002558fSNeil Perrin ASSERT3U(itxg->itxg_txg, <=, synced_txg); 1717216d7723SPrakash Surya ASSERT3U(itxg->itxg_txg, !=, 0); 17185002558fSNeil Perrin clean_me = itxg->itxg_itxs; 17195002558fSNeil Perrin itxg->itxg_itxs = NULL; 17205002558fSNeil Perrin itxg->itxg_txg = 0; 17215002558fSNeil Perrin mutex_exit(&itxg->itxg_lock); 17225002558fSNeil Perrin /* 17235002558fSNeil Perrin * Preferably start a task queue to free up the old itxs but 17245002558fSNeil Perrin * if taskq_dispatch can't allocate resources to do that then 17255002558fSNeil Perrin * free it in-line. This should be rare. Note, using TQ_SLEEP 17265002558fSNeil Perrin * created a bad performance problem. 17275002558fSNeil Perrin */ 1728216d7723SPrakash Surya ASSERT3P(zilog->zl_dmu_pool, !=, NULL); 1729216d7723SPrakash Surya ASSERT3P(zilog->zl_dmu_pool->dp_zil_clean_taskq, !=, NULL); 1730216d7723SPrakash Surya if (taskq_dispatch(zilog->zl_dmu_pool->dp_zil_clean_taskq, 17315002558fSNeil Perrin (void (*)(void *))zil_itxg_clean, clean_me, TQ_NOSLEEP) == NULL) 17325002558fSNeil Perrin zil_itxg_clean(clean_me); 17335002558fSNeil Perrin } 17345002558fSNeil Perrin 17355002558fSNeil Perrin /* 1736*1271e4b1SPrakash Surya * This function will traverse the queue of itxs that need to be 1737*1271e4b1SPrakash Surya * committed, and move them onto the ZIL's zl_itx_commit_list. 17385002558fSNeil Perrin */ 173991de656bSNeil Perrin static void 17405002558fSNeil Perrin zil_get_commit_list(zilog_t *zilog) 17415002558fSNeil Perrin { 17425002558fSNeil Perrin uint64_t otxg, txg; 17435002558fSNeil Perrin list_t *commit_list = &zilog->zl_itx_commit_list; 17445002558fSNeil Perrin 1745*1271e4b1SPrakash Surya ASSERT(MUTEX_HELD(&zilog->zl_writer_lock)); 1746*1271e4b1SPrakash Surya 17475002558fSNeil Perrin if (spa_freeze_txg(zilog->zl_spa) != UINT64_MAX) /* ziltest support */ 17485002558fSNeil Perrin otxg = ZILTEST_TXG; 17495002558fSNeil Perrin else 17505002558fSNeil Perrin otxg = spa_last_synced_txg(zilog->zl_spa) + 1; 17515002558fSNeil Perrin 175243297f97SGeorge Wilson /* 175343297f97SGeorge Wilson * This is inherently racy, since there is nothing to prevent 175443297f97SGeorge Wilson * the last synced txg from changing. That's okay since we'll 175543297f97SGeorge Wilson * only commit things in the future. 
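 *
 * For example (hypothetical): if the last synced txg is 100,
 * otxg is 101 and the loop below visits txgs 101 through
 * 101 + TXG_CONCURRENT_STATES - 1; a txg that syncs during the
 * scan at worst causes an unnecessary commit, never a missed one.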
175643297f97SGeorge Wilson */ 17575002558fSNeil Perrin for (txg = otxg; txg < (otxg + TXG_CONCURRENT_STATES); txg++) { 17585002558fSNeil Perrin itxg_t *itxg = &zilog->zl_itxg[txg & TXG_MASK]; 17595002558fSNeil Perrin 17605002558fSNeil Perrin mutex_enter(&itxg->itxg_lock); 17615002558fSNeil Perrin if (itxg->itxg_txg != txg) { 17625002558fSNeil Perrin mutex_exit(&itxg->itxg_lock); 17635002558fSNeil Perrin continue; 17645002558fSNeil Perrin } 17655002558fSNeil Perrin 176643297f97SGeorge Wilson /* 176743297f97SGeorge Wilson * If we're adding itx records to the zl_itx_commit_list, 176843297f97SGeorge Wilson * then the zil better be dirty in this "txg". We can assert 176943297f97SGeorge Wilson * that here since we're holding the itxg_lock which will 177043297f97SGeorge Wilson * prevent spa_sync from cleaning it. Once we add the itxs 177143297f97SGeorge Wilson * to the zl_itx_commit_list we must commit it to disk even 177243297f97SGeorge Wilson * if it's unnecessary (i.e. the txg was synced). 177343297f97SGeorge Wilson */ 177443297f97SGeorge Wilson ASSERT(zilog_is_dirty_in_txg(zilog, txg) || 177543297f97SGeorge Wilson spa_freeze_txg(zilog->zl_spa) != UINT64_MAX); 17765002558fSNeil Perrin list_move_tail(commit_list, &itxg->itxg_itxs->i_sync_list); 17775002558fSNeil Perrin 17785002558fSNeil Perrin mutex_exit(&itxg->itxg_lock); 17795002558fSNeil Perrin } 17805002558fSNeil Perrin } 17815002558fSNeil Perrin 17825002558fSNeil Perrin /* 17835002558fSNeil Perrin * Move the async itxs for a specified object to commit into sync lists. 17845002558fSNeil Perrin */ 178591de656bSNeil Perrin static void 17865002558fSNeil Perrin zil_async_to_sync(zilog_t *zilog, uint64_t foid) 17875002558fSNeil Perrin { 17885002558fSNeil Perrin uint64_t otxg, txg; 17895002558fSNeil Perrin itx_async_node_t *ian; 17905002558fSNeil Perrin avl_tree_t *t; 17915002558fSNeil Perrin avl_index_t where; 17925002558fSNeil Perrin 17935002558fSNeil Perrin if (spa_freeze_txg(zilog->zl_spa) != UINT64_MAX) /* ziltest support */ 17945002558fSNeil Perrin otxg = ZILTEST_TXG; 17955002558fSNeil Perrin else 17965002558fSNeil Perrin otxg = spa_last_synced_txg(zilog->zl_spa) + 1; 17975002558fSNeil Perrin 179843297f97SGeorge Wilson /* 179943297f97SGeorge Wilson * This is inherently racy, since there is nothing to prevent 180043297f97SGeorge Wilson * the last synced txg from changing. 180143297f97SGeorge Wilson */ 18025002558fSNeil Perrin for (txg = otxg; txg < (otxg + TXG_CONCURRENT_STATES); txg++) { 18035002558fSNeil Perrin itxg_t *itxg = &zilog->zl_itxg[txg & TXG_MASK]; 18045002558fSNeil Perrin 18055002558fSNeil Perrin mutex_enter(&itxg->itxg_lock); 18065002558fSNeil Perrin if (itxg->itxg_txg != txg) { 18075002558fSNeil Perrin mutex_exit(&itxg->itxg_lock); 18085002558fSNeil Perrin continue; 18095002558fSNeil Perrin } 18105002558fSNeil Perrin 18115002558fSNeil Perrin /* 18125002558fSNeil Perrin * If a foid is specified then find that node and append its 18135002558fSNeil Perrin * list. Otherwise walk the tree appending all the lists 18145002558fSNeil Perrin * to the sync list. We add to the end rather than the 18155002558fSNeil Perrin * beginning to ensure the create has happened. 
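 *
 * For example (hypothetical): an fsync of object 42 passes
 * foid == 42 and moves only that object's async list, whereas a
 * caller passing foid == 0 (e.g. a whole-dataset commit) drains
 * every async list in the tree onto the sync list.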
18165002558fSNeil Perrin */ 18175002558fSNeil Perrin t = &itxg->itxg_itxs->i_async_tree; 18185002558fSNeil Perrin if (foid != 0) { 18195002558fSNeil Perrin ian = avl_find(t, &foid, &where); 18205002558fSNeil Perrin if (ian != NULL) { 18215002558fSNeil Perrin list_move_tail(&itxg->itxg_itxs->i_sync_list, 18225002558fSNeil Perrin &ian->ia_list); 18235002558fSNeil Perrin } 18245002558fSNeil Perrin } else { 18255002558fSNeil Perrin void *cookie = NULL; 18265002558fSNeil Perrin 18275002558fSNeil Perrin while ((ian = avl_destroy_nodes(t, &cookie)) != NULL) { 18285002558fSNeil Perrin list_move_tail(&itxg->itxg_itxs->i_sync_list, 18295002558fSNeil Perrin &ian->ia_list); 18305002558fSNeil Perrin list_destroy(&ian->ia_list); 18315002558fSNeil Perrin kmem_free(ian, sizeof (itx_async_node_t)); 18325002558fSNeil Perrin } 18335002558fSNeil Perrin } 18345002558fSNeil Perrin mutex_exit(&itxg->itxg_lock); 183567bd71c6Sperrin } 1836fa9e4066Sahrens } 1837fa9e4066Sahrens 1838*1271e4b1SPrakash Surya /* 1839*1271e4b1SPrakash Surya * This function will prune commit itxs that are at the head of the 1840*1271e4b1SPrakash Surya * commit list (it won't prune past the first non-commit itx), and 1841*1271e4b1SPrakash Surya * either: a) attach them to the last lwb that's still pending 1842*1271e4b1SPrakash Surya * completion, or b) skip them altogether. 1843*1271e4b1SPrakash Surya * 1844*1271e4b1SPrakash Surya * This is used as a performance optimization to prevent commit itxs 1845*1271e4b1SPrakash Surya * from generating new lwbs when it's unnecessary to do so. 1846*1271e4b1SPrakash Surya */ 1847e14bb325SJeff Bonwick static void 1848*1271e4b1SPrakash Surya zil_prune_commit_list(zilog_t *zilog) 1849fa9e4066Sahrens { 18505002558fSNeil Perrin itx_t *itx; 1851fa9e4066Sahrens 1852*1271e4b1SPrakash Surya ASSERT(MUTEX_HELD(&zilog->zl_writer_lock)); 18535002558fSNeil Perrin 1854*1271e4b1SPrakash Surya while (itx = list_head(&zilog->zl_itx_commit_list)) { 1855*1271e4b1SPrakash Surya lr_t *lrc = &itx->itx_lr; 1856*1271e4b1SPrakash Surya if (lrc->lrc_txtype != TX_COMMIT) 1857*1271e4b1SPrakash Surya break; 18585002558fSNeil Perrin 1859*1271e4b1SPrakash Surya mutex_enter(&zilog->zl_lock); 1860*1271e4b1SPrakash Surya 1861*1271e4b1SPrakash Surya lwb_t *last_lwb = zilog->zl_last_lwb_opened; 1862*1271e4b1SPrakash Surya if (last_lwb == NULL || last_lwb->lwb_state == LWB_STATE_DONE) { 1863*1271e4b1SPrakash Surya /* 1864*1271e4b1SPrakash Surya * All of the itxs this waiter was waiting on 1865*1271e4b1SPrakash Surya * must have already completed (or there were 1866*1271e4b1SPrakash Surya * never any itx's for it to wait on), so it's 1867*1271e4b1SPrakash Surya * safe to skip this waiter and mark it done. 
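 *
 * Illustrative scenario: if the commit list holds
 *	[C1, C2, W1, C3]	(C = TX_COMMIT itx, W = TX_WRITE itx)
 * then C1 and C2 are pruned (linked to the last opened lwb, or
 * skipped if none is pending), the scan stops at W1, and C3 is
 * left for zil_process_commit_list() to handle.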
1868*1271e4b1SPrakash Surya */ 1869*1271e4b1SPrakash Surya zil_commit_waiter_skip(itx->itx_private); 1870*1271e4b1SPrakash Surya } else { 1871*1271e4b1SPrakash Surya zil_commit_waiter_link_lwb(itx->itx_private, last_lwb); 1872*1271e4b1SPrakash Surya itx->itx_private = NULL; 1873*1271e4b1SPrakash Surya } 1874*1271e4b1SPrakash Surya 1875*1271e4b1SPrakash Surya mutex_exit(&zilog->zl_lock); 1876*1271e4b1SPrakash Surya 1877*1271e4b1SPrakash Surya list_remove(&zilog->zl_itx_commit_list, itx); 1878*1271e4b1SPrakash Surya zil_itx_destroy(itx); 1879*1271e4b1SPrakash Surya } 1880*1271e4b1SPrakash Surya 1881*1271e4b1SPrakash Surya IMPLY(itx != NULL, itx->itx_lr.lrc_txtype != TX_COMMIT); 1882*1271e4b1SPrakash Surya } 1883*1271e4b1SPrakash Surya 1884*1271e4b1SPrakash Surya static void 1885*1271e4b1SPrakash Surya zil_commit_writer_stall(zilog_t *zilog) 1886*1271e4b1SPrakash Surya { 1887*1271e4b1SPrakash Surya /* 1888*1271e4b1SPrakash Surya * When zio_alloc_zil() fails to allocate the next lwb block on 1889*1271e4b1SPrakash Surya * disk, we must call txg_wait_synced() to ensure all of the 1890*1271e4b1SPrakash Surya * lwbs in the zilog's zl_lwb_list are synced and then freed (in 1891*1271e4b1SPrakash Surya * zil_sync()), such that any subsequent ZIL writer (i.e. a call 1892*1271e4b1SPrakash Surya * to zil_process_commit_list()) will have to call zil_create(), 1893*1271e4b1SPrakash Surya * and start a new ZIL chain. 1894*1271e4b1SPrakash Surya * 1895*1271e4b1SPrakash Surya * Since zio_alloc_zil() failed, the lwb that was previously 1896*1271e4b1SPrakash Surya * issued does not have a pointer to the "next" lwb on disk. 1897*1271e4b1SPrakash Surya * Thus, if another ZIL writer thread was to allocate the "next" 1898*1271e4b1SPrakash Surya * on-disk lwb, that block could be leaked in the event of a 1899*1271e4b1SPrakash Surya * crash (because the previous lwb on-disk would not point to 1900*1271e4b1SPrakash Surya * it). 1901*1271e4b1SPrakash Surya * 1902*1271e4b1SPrakash Surya * We must hold the zilog's zl_writer_lock while we do this, to 1903*1271e4b1SPrakash Surya * ensure no new threads enter zil_process_commit_list() until 1904*1271e4b1SPrakash Surya * all lwbs in the zl_lwb_list have been synced and freed 1905*1271e4b1SPrakash Surya * (which is achieved via the txg_wait_synced() call). 1906*1271e4b1SPrakash Surya */ 1907*1271e4b1SPrakash Surya ASSERT(MUTEX_HELD(&zilog->zl_writer_lock)); 1908*1271e4b1SPrakash Surya txg_wait_synced(zilog->zl_dmu_pool, 0); 1909*1271e4b1SPrakash Surya ASSERT3P(list_tail(&zilog->zl_lwb_list), ==, NULL); 1910*1271e4b1SPrakash Surya } 1911*1271e4b1SPrakash Surya 1912*1271e4b1SPrakash Surya /* 1913*1271e4b1SPrakash Surya * This function will traverse the commit list, creating new lwbs as 1914*1271e4b1SPrakash Surya * needed, and committing the itxs from the commit list to these newly 1915*1271e4b1SPrakash Surya * created lwbs. Additionally, as a new lwb is created, the previous 1916*1271e4b1SPrakash Surya * lwb will be issued to the zio layer to be written to disk.
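 *
 * An illustrative summary of the per-itx decisions made below:
 *
 *	txg not yet synced (or frozen)	-> commit itx into the lwb
 *	txg synced, TX_COMMIT itx	-> skip the waiter; its data
 *					   is already safe on disk
 *	txg synced, any other itx	-> drop it; spa_sync() wrote it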
1917*1271e4b1SPrakash Surya */ 1918*1271e4b1SPrakash Surya static void 1919*1271e4b1SPrakash Surya zil_process_commit_list(zilog_t *zilog) 1920*1271e4b1SPrakash Surya { 1921*1271e4b1SPrakash Surya spa_t *spa = zilog->zl_spa; 1922*1271e4b1SPrakash Surya list_t nolwb_waiters; 1923*1271e4b1SPrakash Surya lwb_t *lwb; 1924*1271e4b1SPrakash Surya itx_t *itx; 1925*1271e4b1SPrakash Surya 1926*1271e4b1SPrakash Surya ASSERT(MUTEX_HELD(&zilog->zl_writer_lock)); 19275002558fSNeil Perrin 19285002558fSNeil Perrin /* 19295002558fSNeil Perrin * Return if there's nothing to commit before we dirty the fs by 19305002558fSNeil Perrin * calling zil_create(). 19315002558fSNeil Perrin */ 1932*1271e4b1SPrakash Surya if (list_head(&zilog->zl_itx_commit_list) == NULL) 19335002558fSNeil Perrin return; 1934fa9e4066Sahrens 1935*1271e4b1SPrakash Surya list_create(&nolwb_waiters, sizeof (zil_commit_waiter_t), 1936*1271e4b1SPrakash Surya offsetof(zil_commit_waiter_t, zcw_node)); 1937*1271e4b1SPrakash Surya 1938*1271e4b1SPrakash Surya lwb = list_tail(&zilog->zl_lwb_list); 1939*1271e4b1SPrakash Surya if (lwb == NULL) { 1940*1271e4b1SPrakash Surya lwb = zil_create(zilog); 1941fa9e4066Sahrens } else { 1942*1271e4b1SPrakash Surya ASSERT3S(lwb->lwb_state, !=, LWB_STATE_ISSUED); 1943*1271e4b1SPrakash Surya ASSERT3S(lwb->lwb_state, !=, LWB_STATE_DONE); 1944fa9e4066Sahrens } 1945fa9e4066Sahrens 19465002558fSNeil Perrin while (itx = list_head(&zilog->zl_itx_commit_list)) { 1947*1271e4b1SPrakash Surya lr_t *lrc = &itx->itx_lr; 1948*1271e4b1SPrakash Surya uint64_t txg = lrc->lrc_txg; 1949*1271e4b1SPrakash Surya 195043297f97SGeorge Wilson ASSERT3U(txg, !=, 0); 1951fa9e4066Sahrens 1952*1271e4b1SPrakash Surya if (lrc->lrc_txtype == TX_COMMIT) { 1953*1271e4b1SPrakash Surya DTRACE_PROBE2(zil__process__commit__itx, 1954*1271e4b1SPrakash Surya zilog_t *, zilog, itx_t *, itx); 1955*1271e4b1SPrakash Surya } else { 1956*1271e4b1SPrakash Surya DTRACE_PROBE2(zil__process__normal__itx, 1957*1271e4b1SPrakash Surya zilog_t *, zilog, itx_t *, itx); 1958*1271e4b1SPrakash Surya } 1959*1271e4b1SPrakash Surya 196043297f97SGeorge Wilson /* 196143297f97SGeorge Wilson * This is inherently racy and may result in us writing 1962*1271e4b1SPrakash Surya * out a log block for a txg that was just synced. This 1963*1271e4b1SPrakash Surya * is ok since we'll end up cleaning up that log block the 1964*1271e4b1SPrakash Surya * next time we call zil_sync(). 196543297f97SGeorge Wilson */ 1966*1271e4b1SPrakash Surya boolean_t synced = txg <= spa_last_synced_txg(spa); 1967*1271e4b1SPrakash Surya boolean_t frozen = txg > spa_freeze_txg(spa); 1968*1271e4b1SPrakash Surya 1969*1271e4b1SPrakash Surya if (!synced || frozen) { 1970*1271e4b1SPrakash Surya if (lwb != NULL) { 1971*1271e4b1SPrakash Surya lwb = zil_lwb_commit(zilog, itx, lwb); 1972*1271e4b1SPrakash Surya } else if (lrc->lrc_txtype == TX_COMMIT) { 1973*1271e4b1SPrakash Surya ASSERT3P(lwb, ==, NULL); 1974*1271e4b1SPrakash Surya zil_commit_waiter_link_nolwb( 1975*1271e4b1SPrakash Surya itx->itx_private, &nolwb_waiters); 1976*1271e4b1SPrakash Surya } 1977*1271e4b1SPrakash Surya } else if (lrc->lrc_txtype == TX_COMMIT) { 1978*1271e4b1SPrakash Surya ASSERT3B(synced, ==, B_TRUE); 1979*1271e4b1SPrakash Surya ASSERT3B(frozen, ==, B_FALSE); 1980*1271e4b1SPrakash Surya 1981*1271e4b1SPrakash Surya /* 1982*1271e4b1SPrakash Surya * If this is a commit itx, then there will be a 1983*1271e4b1SPrakash Surya * thread that is either already waiting for 1984*1271e4b1SPrakash Surya * it, or soon will be waiting.
1985*1271e4b1SPrakash Surya * 1986*1271e4b1SPrakash Surya * This itx has already been committed to disk 1987*1271e4b1SPrakash Surya * via spa_sync() so we don't bother committing 1988*1271e4b1SPrakash Surya * it to an lwb. As a result, we cannot use the 1989*1271e4b1SPrakash Surya * lwb zio callback to signal the waiter and 1990*1271e4b1SPrakash Surya * mark it as done, so we must do that here. 1991*1271e4b1SPrakash Surya */ 1992*1271e4b1SPrakash Surya zil_commit_waiter_skip(itx->itx_private); 1993*1271e4b1SPrakash Surya } 1994*1271e4b1SPrakash Surya 19955002558fSNeil Perrin list_remove(&zilog->zl_itx_commit_list, itx); 1996*1271e4b1SPrakash Surya zil_itx_destroy(itx); 1997fa9e4066Sahrens } 1998b19a79ecSperrin DTRACE_PROBE1(zil__cw2, zilog_t *, zilog); 1999fa9e4066Sahrens 2000*1271e4b1SPrakash Surya if (lwb == NULL) { 2001*1271e4b1SPrakash Surya /* 2002*1271e4b1SPrakash Surya * This indicates zio_alloc_zil() failed to allocate the 2003*1271e4b1SPrakash Surya * "next" lwb on-disk. When this happens, we must stall 2004*1271e4b1SPrakash Surya * the ZIL write pipeline; see the comment within 2005*1271e4b1SPrakash Surya * zil_commit_writer_stall() for more details. 2006*1271e4b1SPrakash Surya */ 2007*1271e4b1SPrakash Surya zil_commit_writer_stall(zilog); 2008fa9e4066Sahrens 2009*1271e4b1SPrakash Surya /* 2010*1271e4b1SPrakash Surya * Additionally, we have to signal and mark the "nolwb" 2011*1271e4b1SPrakash Surya * waiters as "done" here, since without an lwb, we 2012*1271e4b1SPrakash Surya * can't do this via zil_lwb_flush_vdevs_done() like 2013*1271e4b1SPrakash Surya * normal. 2014*1271e4b1SPrakash Surya */ 2015*1271e4b1SPrakash Surya zil_commit_waiter_t *zcw; 2016*1271e4b1SPrakash Surya while (zcw = list_head(&nolwb_waiters)) { 2017*1271e4b1SPrakash Surya zil_commit_waiter_skip(zcw); 2018*1271e4b1SPrakash Surya list_remove(&nolwb_waiters, zcw); 2019*1271e4b1SPrakash Surya } 2020*1271e4b1SPrakash Surya } else { 2021*1271e4b1SPrakash Surya ASSERT(list_is_empty(&nolwb_waiters)); 2022*1271e4b1SPrakash Surya ASSERT3P(lwb, !=, NULL); 2023*1271e4b1SPrakash Surya ASSERT3S(lwb->lwb_state, !=, LWB_STATE_ISSUED); 2024*1271e4b1SPrakash Surya ASSERT3S(lwb->lwb_state, !=, LWB_STATE_DONE); 2025*1271e4b1SPrakash Surya 2026*1271e4b1SPrakash Surya /* 2027*1271e4b1SPrakash Surya * At this point, the ZIL block pointed at by the "lwb" 2028*1271e4b1SPrakash Surya * variable is in one of the following states: "closed" 2029*1271e4b1SPrakash Surya * or "open". 2030*1271e4b1SPrakash Surya * 2031*1271e4b1SPrakash Surya * If it's "closed", then no itxs have been committed to 2032*1271e4b1SPrakash Surya * it, so there's no point in issuing its zio (i.e. 2033*1271e4b1SPrakash Surya * it's "empty"). 2034*1271e4b1SPrakash Surya * 2035*1271e4b1SPrakash Surya * If it's in the "open" state, then it contains one or more 2036*1271e4b1SPrakash Surya * itxs that eventually need to be committed to stable 2037*1271e4b1SPrakash Surya * storage. In this case we intentionally do not issue 2038*1271e4b1SPrakash Surya * the lwb's zio to disk yet, and instead rely on one of 2039*1271e4b1SPrakash Surya * the following two mechanisms for issuing the zio: 2040*1271e4b1SPrakash Surya * 2041*1271e4b1SPrakash Surya * 1. Ideally, there will be more ZIL activity occurring 2042*1271e4b1SPrakash Surya * on the system, such that this function will be 2043*1271e4b1SPrakash Surya * immediately called again (not necessarily by the same 2044*1271e4b1SPrakash Surya * thread) and this lwb's zio will be issued via 2045*1271e4b1SPrakash Surya * zil_lwb_commit(). This way, the lwb is guaranteed to 2046*1271e4b1SPrakash Surya * be "full" when it is issued to disk, and we'll make 2047*1271e4b1SPrakash Surya * use of the lwb's size the best we can. 2048*1271e4b1SPrakash Surya * 2049*1271e4b1SPrakash Surya * 2. If there isn't sufficient ZIL activity occurring on 2050*1271e4b1SPrakash Surya * the system, such that this lwb's zio isn't issued via 2051*1271e4b1SPrakash Surya * zil_lwb_commit(), zil_commit_waiter() will issue the 2052*1271e4b1SPrakash Surya * lwb's zio. If this occurs, the lwb is not guaranteed 2053*1271e4b1SPrakash Surya * to be "full" by the time its zio is issued, which means 2054*1271e4b1SPrakash Surya * the size of the lwb was "too large" given the amount 2055*1271e4b1SPrakash Surya * of ZIL activity occurring on the system at that time. 2056*1271e4b1SPrakash Surya * 2057*1271e4b1SPrakash Surya * We do this for a couple of reasons: 2058*1271e4b1SPrakash Surya * 2059*1271e4b1SPrakash Surya * 1. To try and reduce the number of IOPs needed to 2060*1271e4b1SPrakash Surya * write the same number of itxs. If an lwb has space 2061*1271e4b1SPrakash Surya * available in its buffer for more itxs, and more itxs 2062*1271e4b1SPrakash Surya * will be committed relatively soon (relative to the 2063*1271e4b1SPrakash Surya * latency of performing a write), then it's beneficial 2064*1271e4b1SPrakash Surya * to wait for these "next" itxs. This way, more itxs 2065*1271e4b1SPrakash Surya * can be committed to stable storage with fewer writes. 2066*1271e4b1SPrakash Surya * 2067*1271e4b1SPrakash Surya * 2. To try and use the largest lwb block size that the 2068*1271e4b1SPrakash Surya * incoming rate of itxs can support. Again, this is to 2069*1271e4b1SPrakash Surya * try and pack as many itxs into as few lwbs as 2070*1271e4b1SPrakash Surya * possible, without significantly impacting the latency 2071*1271e4b1SPrakash Surya * of each individual itx. 2072*1271e4b1SPrakash Surya */ 2073*1271e4b1SPrakash Surya } 2074*1271e4b1SPrakash Surya } 2075*1271e4b1SPrakash Surya 2076*1271e4b1SPrakash Surya /* 2077*1271e4b1SPrakash Surya * This function is responsible for ensuring the passed in commit waiter 2078*1271e4b1SPrakash Surya * (and associated commit itx) is committed to an lwb. If the waiter is 2079*1271e4b1SPrakash Surya * not already committed to an lwb, all itxs in the zilog's queue of 2080*1271e4b1SPrakash Surya * itxs will be processed. The assumption is the passed in waiter's 2081*1271e4b1SPrakash Surya * commit itx will be found in the queue just like the other non-commit 2082*1271e4b1SPrakash Surya * itxs, such that when the entire queue is processed, the waiter will 2083*1271e4b1SPrakash Surya * have been committed to an lwb. 2084*1271e4b1SPrakash Surya * 2085*1271e4b1SPrakash Surya * The lwb associated with the passed in waiter is not guaranteed to 2086*1271e4b1SPrakash Surya * have been issued by the time this function completes. If the lwb is 2087*1271e4b1SPrakash Surya * not issued, we rely on future calls to zil_commit_writer() to issue 2088*1271e4b1SPrakash Surya * the lwb, or the timeout mechanism found in zil_commit_waiter().
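 *
 * An illustrative (assumed) call sequence for a synchronous
 * write would be:
 *
 *	zil_commit()
 *	    zil_commit_writer(zilog, zcw);	<-- batch itxs into lwbs
 *	    zil_commit_waiter(zilog, zcw);	<-- block until zcw_done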
2089*1271e4b1SPrakash Surya */ 2090*1271e4b1SPrakash Surya static void 2091*1271e4b1SPrakash Surya zil_commit_writer(zilog_t *zilog, zil_commit_waiter_t *zcw) 2092*1271e4b1SPrakash Surya { 2093*1271e4b1SPrakash Surya ASSERT(!MUTEX_HELD(&zilog->zl_lock)); 2094*1271e4b1SPrakash Surya ASSERT(spa_writeable(zilog->zl_spa)); 2095*1271e4b1SPrakash Surya ASSERT0(zilog->zl_suspend); 2096*1271e4b1SPrakash Surya 2097*1271e4b1SPrakash Surya mutex_enter(&zilog->zl_writer_lock); 2098*1271e4b1SPrakash Surya 2099*1271e4b1SPrakash Surya if (zcw->zcw_lwb != NULL || zcw->zcw_done) { 2100*1271e4b1SPrakash Surya /* 2101*1271e4b1SPrakash Surya * It's possible that, while we were waiting to acquire 2102*1271e4b1SPrakash Surya * the "zl_writer_lock", another thread committed this 2103*1271e4b1SPrakash Surya * waiter to an lwb. If that occurs, we bail out early, 2104*1271e4b1SPrakash Surya * without processing any of the zilog's queue of itxs. 2105*1271e4b1SPrakash Surya * 2106*1271e4b1SPrakash Surya * On certain workloads and system configurations, the 2107*1271e4b1SPrakash Surya * "zl_writer_lock" can become highly contended. In an 2108*1271e4b1SPrakash Surya * attempt to reduce this contention, we immediately drop 2109*1271e4b1SPrakash Surya * the lock if the waiter has already been processed. 2110*1271e4b1SPrakash Surya * 2111*1271e4b1SPrakash Surya * We've measured this optimization to reduce CPU spent 2112*1271e4b1SPrakash Surya * contending on this lock by up to 5%, using a system 2113*1271e4b1SPrakash Surya * with 32 CPUs, low latency storage (~50 usec writes), 2114*1271e4b1SPrakash Surya * and 1024 threads performing sync writes. 2115*1271e4b1SPrakash Surya */ 2116*1271e4b1SPrakash Surya goto out; 2117*1271e4b1SPrakash Surya } 2118*1271e4b1SPrakash Surya 2119*1271e4b1SPrakash Surya zil_get_commit_list(zilog); 2120*1271e4b1SPrakash Surya zil_prune_commit_list(zilog); 2121*1271e4b1SPrakash Surya zil_process_commit_list(zilog); 2122*1271e4b1SPrakash Surya 2123*1271e4b1SPrakash Surya out: 2124*1271e4b1SPrakash Surya mutex_exit(&zilog->zl_writer_lock); 2125*1271e4b1SPrakash Surya } 2126*1271e4b1SPrakash Surya 2127*1271e4b1SPrakash Surya static void 2128*1271e4b1SPrakash Surya zil_commit_waiter_timeout(zilog_t *zilog, zil_commit_waiter_t *zcw) 2129*1271e4b1SPrakash Surya { 2130*1271e4b1SPrakash Surya ASSERT(!MUTEX_HELD(&zilog->zl_writer_lock)); 2131*1271e4b1SPrakash Surya ASSERT(MUTEX_HELD(&zcw->zcw_lock)); 2132*1271e4b1SPrakash Surya ASSERT3B(zcw->zcw_done, ==, B_FALSE); 2133*1271e4b1SPrakash Surya 2134*1271e4b1SPrakash Surya lwb_t *lwb = zcw->zcw_lwb; 2135*1271e4b1SPrakash Surya ASSERT3P(lwb, !=, NULL); 2136*1271e4b1SPrakash Surya ASSERT3S(lwb->lwb_state, !=, LWB_STATE_CLOSED); 2137*1271e4b1SPrakash Surya 2138*1271e4b1SPrakash Surya /* 2139*1271e4b1SPrakash Surya * If the lwb has already been issued by another thread, we can 2140*1271e4b1SPrakash Surya * immediately return since there's no work to be done (the 2141*1271e4b1SPrakash Surya * point of this function is to issue the lwb). Additionally, we 2142*1271e4b1SPrakash Surya * do this prior to acquiring the zl_writer_lock, to avoid 2143*1271e4b1SPrakash Surya * acquiring it when it's not necessary to do so. 
2144*1271e4b1SPrakash Surya */ 2145*1271e4b1SPrakash Surya if (lwb->lwb_state == LWB_STATE_ISSUED || 2146*1271e4b1SPrakash Surya lwb->lwb_state == LWB_STATE_DONE) 2147*1271e4b1SPrakash Surya return; 2148*1271e4b1SPrakash Surya 2149*1271e4b1SPrakash Surya /* 2150*1271e4b1SPrakash Surya * In order to call zil_lwb_write_issue() we must hold the 2151*1271e4b1SPrakash Surya * zilog's "zl_writer_lock". We can't simply acquire that lock, 2152*1271e4b1SPrakash Surya * since we're already holding the commit waiter's "zcw_lock", 2153*1271e4b1SPrakash Surya * and those two locks are acquired in the opposite order 2154*1271e4b1SPrakash Surya * elsewhere. 2155*1271e4b1SPrakash Surya */ 2156*1271e4b1SPrakash Surya mutex_exit(&zcw->zcw_lock); 2157*1271e4b1SPrakash Surya mutex_enter(&zilog->zl_writer_lock); 2158*1271e4b1SPrakash Surya mutex_enter(&zcw->zcw_lock); 2159fa9e4066Sahrens 2160fa9e4066Sahrens /* 2161*1271e4b1SPrakash Surya * Since we just dropped and re-acquired the commit waiter's 2162*1271e4b1SPrakash Surya * lock, we have to re-check to see if the waiter was marked 2163*1271e4b1SPrakash Surya * "done" during that process. If the waiter was marked "done", 2164*1271e4b1SPrakash Surya * the "lwb" pointer is no longer valid (it can be free'd after 2165*1271e4b1SPrakash Surya * the waiter is marked "done"), so without this check we could 2166*1271e4b1SPrakash Surya * wind up with a use-after-free error below. 2167*1271e4b1SPrakash Surya */ 2168*1271e4b1SPrakash Surya if (zcw->zcw_done) 2169*1271e4b1SPrakash Surya goto out; 2170*1271e4b1SPrakash Surya 2171*1271e4b1SPrakash Surya ASSERT3P(lwb, ==, zcw->zcw_lwb); 2172*1271e4b1SPrakash Surya 2173*1271e4b1SPrakash Surya /* 2174*1271e4b1SPrakash Surya * We've already checked this above, but since we hadn't 2175*1271e4b1SPrakash Surya * acquired the zilog's zl_writer_lock, we have to perform this 2176*1271e4b1SPrakash Surya * check a second time while holding the lock. We can't call 2177*1271e4b1SPrakash Surya * zil_lwb_write_issue() if the lwb had already been issued. 2178*1271e4b1SPrakash Surya */ 2179*1271e4b1SPrakash Surya if (lwb->lwb_state == LWB_STATE_ISSUED || 2180*1271e4b1SPrakash Surya lwb->lwb_state == LWB_STATE_DONE) 2181*1271e4b1SPrakash Surya goto out; 2182*1271e4b1SPrakash Surya 2183*1271e4b1SPrakash Surya ASSERT3S(lwb->lwb_state, ==, LWB_STATE_OPENED); 2184*1271e4b1SPrakash Surya 2185*1271e4b1SPrakash Surya /* 2186*1271e4b1SPrakash Surya * As described in the comments above zil_commit_waiter() and 2187*1271e4b1SPrakash Surya * zil_process_commit_list(), we need to issue this lwb's zio 2188*1271e4b1SPrakash Surya * since we've reached the commit waiter's timeout and it still 2189*1271e4b1SPrakash Surya * hasn't been issued. 2190*1271e4b1SPrakash Surya */ 2191*1271e4b1SPrakash Surya lwb_t *nlwb = zil_lwb_write_issue(zilog, lwb); 2192*1271e4b1SPrakash Surya 2193*1271e4b1SPrakash Surya ASSERT3S(lwb->lwb_state, !=, LWB_STATE_OPENED); 2194*1271e4b1SPrakash Surya 2195*1271e4b1SPrakash Surya /* 2196*1271e4b1SPrakash Surya * Since the lwb's zio hadn't been issued by the time this thread 2197*1271e4b1SPrakash Surya * reached its timeout, we reset the zilog's "zl_cur_used" field 2198*1271e4b1SPrakash Surya * to influence the zil block size selection algorithm. 2199*1271e4b1SPrakash Surya * 2200*1271e4b1SPrakash Surya * By having to issue the lwb's zio here, it means the size of the 2201*1271e4b1SPrakash Surya * lwb was "too large", given the incoming throughput of itxs. By 2202*1271e4b1SPrakash Surya * setting "zl_cur_used" to zero, we communicate this fact to the 2203*1271e4b1SPrakash Surya * block size selection algorithm, so it can take this information 2204*1271e4b1SPrakash Surya * into account, and potentially select a smaller size for the 2205*1271e4b1SPrakash Surya * next lwb block that is allocated. 2206fa9e4066Sahrens */ 2207*1271e4b1SPrakash Surya zilog->zl_cur_used = 0; 2208*1271e4b1SPrakash Surya 2209*1271e4b1SPrakash Surya if (nlwb == NULL) { 2210*1271e4b1SPrakash Surya /* 2211*1271e4b1SPrakash Surya * When zil_lwb_write_issue() returns NULL, this 2212*1271e4b1SPrakash Surya * indicates zio_alloc_zil() failed to allocate the 2213*1271e4b1SPrakash Surya * "next" lwb on-disk. When this occurs, the ZIL write 2214*1271e4b1SPrakash Surya * pipeline must be stalled; see the comment within the 2215*1271e4b1SPrakash Surya * zil_commit_writer_stall() function for more details. 2216*1271e4b1SPrakash Surya * 2217*1271e4b1SPrakash Surya * We must drop the commit waiter's lock prior to 2218*1271e4b1SPrakash Surya * calling zil_commit_writer_stall() or else we can wind 2219*1271e4b1SPrakash Surya * up with the following deadlock: 2220*1271e4b1SPrakash Surya * 2221*1271e4b1SPrakash Surya * - This thread is waiting for the txg to sync while 2222*1271e4b1SPrakash Surya * holding the waiter's lock; txg_wait_synced() is 2223*1271e4b1SPrakash Surya * used within zil_commit_writer_stall(). 2224*1271e4b1SPrakash Surya * 2225*1271e4b1SPrakash Surya * - The txg can't sync because it is waiting for this 2226*1271e4b1SPrakash Surya * lwb's zio callback to call dmu_tx_commit(). 2227*1271e4b1SPrakash Surya * 2228*1271e4b1SPrakash Surya * - The lwb's zio callback can't call dmu_tx_commit() 2229*1271e4b1SPrakash Surya * because it's blocked trying to acquire the waiter's 2230*1271e4b1SPrakash Surya * lock, which occurs prior to calling dmu_tx_commit(). 2231*1271e4b1SPrakash Surya */ 2232*1271e4b1SPrakash Surya mutex_exit(&zcw->zcw_lock); 2233*1271e4b1SPrakash Surya zil_commit_writer_stall(zilog); 2234*1271e4b1SPrakash Surya mutex_enter(&zcw->zcw_lock); 2235fa9e4066Sahrens } 223622ac5be4Sperrin 2237*1271e4b1SPrakash Surya out: 2238*1271e4b1SPrakash Surya mutex_exit(&zilog->zl_writer_lock); 2239*1271e4b1SPrakash Surya ASSERT(MUTEX_HELD(&zcw->zcw_lock)); 2240*1271e4b1SPrakash Surya } 224167bd71c6Sperrin 2242*1271e4b1SPrakash Surya /* 2243*1271e4b1SPrakash Surya * This function is responsible for performing the following two tasks: 2244*1271e4b1SPrakash Surya * 2245*1271e4b1SPrakash Surya * 1. its primary responsibility is to block until the given "commit 2246*1271e4b1SPrakash Surya * waiter" is considered "done". 2247*1271e4b1SPrakash Surya * 2248*1271e4b1SPrakash Surya * 2. its secondary responsibility is to issue the zio for the lwb that 2249*1271e4b1SPrakash Surya * the given "commit waiter" is waiting on, if this function has 2250*1271e4b1SPrakash Surya * waited "long enough" and the lwb is still in the "open" state. 2251*1271e4b1SPrakash Surya * 2252*1271e4b1SPrakash Surya * Given a sufficient amount of itxs being generated and written using 2253*1271e4b1SPrakash Surya * the ZIL, the lwb's zio will be issued via the zil_lwb_commit() 2254*1271e4b1SPrakash Surya * function. If this does not occur, this secondary responsibility will 2255*1271e4b1SPrakash Surya * ensure the lwb is issued even if there is no other synchronous 2256*1271e4b1SPrakash Surya * activity on the system.
2257*1271e4b1SPrakash Surya * 2258*1271e4b1SPrakash Surya * For more details, see zil_process_commit_list(); more specifically, 2259*1271e4b1SPrakash Surya * the comment at the bottom of that function. 2260*1271e4b1SPrakash Surya */ 2261*1271e4b1SPrakash Surya static void 2262*1271e4b1SPrakash Surya zil_commit_waiter(zilog_t *zilog, zil_commit_waiter_t *zcw) 2263*1271e4b1SPrakash Surya { 2264*1271e4b1SPrakash Surya ASSERT(!MUTEX_HELD(&zilog->zl_lock)); 2265*1271e4b1SPrakash Surya ASSERT(!MUTEX_HELD(&zilog->zl_writer_lock)); 2266*1271e4b1SPrakash Surya ASSERT(spa_writeable(zilog->zl_spa)); 2267*1271e4b1SPrakash Surya ASSERT0(zilog->zl_suspend); 2268*1271e4b1SPrakash Surya 2269*1271e4b1SPrakash Surya mutex_enter(&zcw->zcw_lock); 2270b24ab676SJeff Bonwick 2271b24ab676SJeff Bonwick /* 2272*1271e4b1SPrakash Surya * The timeout is scaled based on the lwb latency to avoid 2273*1271e4b1SPrakash Surya * significantly impacting the latency of each individual itx. 2274*1271e4b1SPrakash Surya * For more details, see the comment at the bottom of the 2275*1271e4b1SPrakash Surya * zil_process_commit_list() function. 2276b24ab676SJeff Bonwick */ 2277*1271e4b1SPrakash Surya int pct = MAX(zfs_commit_timeout_pct, 1); 2278*1271e4b1SPrakash Surya hrtime_t sleep = (zilog->zl_last_lwb_latency * pct) / 100; 2279*1271e4b1SPrakash Surya hrtime_t wakeup = gethrtime() + sleep; 2280*1271e4b1SPrakash Surya boolean_t timedout = B_FALSE; 2281*1271e4b1SPrakash Surya 2282*1271e4b1SPrakash Surya while (!zcw->zcw_done) { 2283*1271e4b1SPrakash Surya ASSERT(MUTEX_HELD(&zcw->zcw_lock)); 2284*1271e4b1SPrakash Surya 2285*1271e4b1SPrakash Surya lwb_t *lwb = zcw->zcw_lwb; 2286*1271e4b1SPrakash Surya 2287*1271e4b1SPrakash Surya /* 2288*1271e4b1SPrakash Surya * Usually, the waiter will have a non-NULL lwb field here, 2289*1271e4b1SPrakash Surya * but it's possible for it to be NULL as a result of 2290*1271e4b1SPrakash Surya * zil_commit() racing with spa_sync(). 2291*1271e4b1SPrakash Surya * 2292*1271e4b1SPrakash Surya * When zil_clean() is called, it's possible for the itxg 2293*1271e4b1SPrakash Surya * list (which may be cleaned via a taskq) to contain 2294*1271e4b1SPrakash Surya * commit itxs. When this occurs, the commit waiters linked 2295*1271e4b1SPrakash Surya * off of these commit itxs will not be committed to an 2296*1271e4b1SPrakash Surya * lwb. Additionally, these commit waiters will not be 2297*1271e4b1SPrakash Surya * marked done until zil_commit_waiter_skip() is called via 2298*1271e4b1SPrakash Surya * zil_itxg_clean(). 2299*1271e4b1SPrakash Surya * 2300*1271e4b1SPrakash Surya * Thus, it's possible for this commit waiter (i.e. the 2301*1271e4b1SPrakash Surya * "zcw" variable) to be found in this "in between" state; 2302*1271e4b1SPrakash Surya * where it's "zcw_lwb" field is NULL, and it hasn't yet 2303*1271e4b1SPrakash Surya * been skipped, so it's "zcw_done" field is still B_FALSE. 
2304*1271e4b1SPrakash Surya */ 2305*1271e4b1SPrakash Surya IMPLY(lwb != NULL, lwb->lwb_state != LWB_STATE_CLOSED); 2306*1271e4b1SPrakash Surya 2307*1271e4b1SPrakash Surya if (lwb != NULL && lwb->lwb_state == LWB_STATE_OPENED) { 2308*1271e4b1SPrakash Surya ASSERT3B(timedout, ==, B_FALSE); 2309*1271e4b1SPrakash Surya 2310*1271e4b1SPrakash Surya /* 2311*1271e4b1SPrakash Surya * If the lwb hasn't been issued yet, then we 2312*1271e4b1SPrakash Surya * need to wait with a timeout, in case this 2313*1271e4b1SPrakash Surya * function needs to issue the lwb after the 2314*1271e4b1SPrakash Surya * timeout is reached; responsibility (2) from 2315*1271e4b1SPrakash Surya * the comment above this function. 2316*1271e4b1SPrakash Surya */ 2317*1271e4b1SPrakash Surya clock_t timeleft = cv_timedwait_hires(&zcw->zcw_cv, 2318*1271e4b1SPrakash Surya &zcw->zcw_lock, wakeup, USEC2NSEC(1), 2319*1271e4b1SPrakash Surya CALLOUT_FLAG_ABSOLUTE); 2320*1271e4b1SPrakash Surya 2321*1271e4b1SPrakash Surya if (timeleft >= 0 || zcw->zcw_done) 2322*1271e4b1SPrakash Surya continue; 2323*1271e4b1SPrakash Surya 2324*1271e4b1SPrakash Surya timedout = B_TRUE; 2325*1271e4b1SPrakash Surya zil_commit_waiter_timeout(zilog, zcw); 2326*1271e4b1SPrakash Surya 2327*1271e4b1SPrakash Surya if (!zcw->zcw_done) { 2328*1271e4b1SPrakash Surya /* 2329*1271e4b1SPrakash Surya * If the commit waiter has already been 2330*1271e4b1SPrakash Surya * marked "done", it's possible for the 2331*1271e4b1SPrakash Surya * waiter's lwb structure to have already 2332*1271e4b1SPrakash Surya * been freed. Thus, we can only reliably 2333*1271e4b1SPrakash Surya * make these assertions if the waiter 2334*1271e4b1SPrakash Surya * isn't done. 2335*1271e4b1SPrakash Surya */ 2336*1271e4b1SPrakash Surya ASSERT3P(lwb, ==, zcw->zcw_lwb); 2337*1271e4b1SPrakash Surya ASSERT3S(lwb->lwb_state, !=, LWB_STATE_OPENED); 2338*1271e4b1SPrakash Surya } 2339*1271e4b1SPrakash Surya } else { 2340*1271e4b1SPrakash Surya /* 2341*1271e4b1SPrakash Surya * If the lwb isn't open, then it must have already 2342*1271e4b1SPrakash Surya * been issued. In that case, there's no need to 2343*1271e4b1SPrakash Surya * use a timeout when waiting for the lwb to 2344*1271e4b1SPrakash Surya * complete. 2345*1271e4b1SPrakash Surya * 2346*1271e4b1SPrakash Surya * Additionally, if the lwb is NULL, the waiter 2347*1271e4b1SPrakash Surya * will soon be signalled and marked done via 2348*1271e4b1SPrakash Surya * zil_clean() and zil_itxg_clean(), so no timeout 2349*1271e4b1SPrakash Surya * is required. 
2350*1271e4b1SPrakash Surya */
2351*1271e4b1SPrakash Surya 
2352*1271e4b1SPrakash Surya IMPLY(lwb != NULL,
2353*1271e4b1SPrakash Surya lwb->lwb_state == LWB_STATE_ISSUED ||
2354*1271e4b1SPrakash Surya lwb->lwb_state == LWB_STATE_DONE);
2355*1271e4b1SPrakash Surya cv_wait(&zcw->zcw_cv, &zcw->zcw_lock);
2356*1271e4b1SPrakash Surya }
2357*1271e4b1SPrakash Surya }
2358*1271e4b1SPrakash Surya 
2359*1271e4b1SPrakash Surya mutex_exit(&zcw->zcw_lock);
2360*1271e4b1SPrakash Surya }
2361*1271e4b1SPrakash Surya 
2362*1271e4b1SPrakash Surya static zil_commit_waiter_t *
2363*1271e4b1SPrakash Surya zil_alloc_commit_waiter(void)
2364*1271e4b1SPrakash Surya {
2365*1271e4b1SPrakash Surya zil_commit_waiter_t *zcw = kmem_cache_alloc(zil_zcw_cache, KM_SLEEP);
2366*1271e4b1SPrakash Surya 
2367*1271e4b1SPrakash Surya cv_init(&zcw->zcw_cv, NULL, CV_DEFAULT, NULL);
2368*1271e4b1SPrakash Surya mutex_init(&zcw->zcw_lock, NULL, MUTEX_DEFAULT, NULL);
2369*1271e4b1SPrakash Surya list_link_init(&zcw->zcw_node);
2370*1271e4b1SPrakash Surya zcw->zcw_lwb = NULL;
2371*1271e4b1SPrakash Surya zcw->zcw_done = B_FALSE;
2372*1271e4b1SPrakash Surya zcw->zcw_zio_error = 0;
2373*1271e4b1SPrakash Surya 
2374*1271e4b1SPrakash Surya return (zcw);
2375*1271e4b1SPrakash Surya }
2376*1271e4b1SPrakash Surya 
2377*1271e4b1SPrakash Surya static void
2378*1271e4b1SPrakash Surya zil_free_commit_waiter(zil_commit_waiter_t *zcw)
2379*1271e4b1SPrakash Surya {
2380*1271e4b1SPrakash Surya ASSERT(!list_link_active(&zcw->zcw_node));
2381*1271e4b1SPrakash Surya ASSERT3P(zcw->zcw_lwb, ==, NULL);
2382*1271e4b1SPrakash Surya ASSERT3B(zcw->zcw_done, ==, B_TRUE);
2383*1271e4b1SPrakash Surya mutex_destroy(&zcw->zcw_lock);
2384*1271e4b1SPrakash Surya cv_destroy(&zcw->zcw_cv);
2385*1271e4b1SPrakash Surya kmem_cache_free(zil_zcw_cache, zcw);
2386*1271e4b1SPrakash Surya }
2387*1271e4b1SPrakash Surya 
2388*1271e4b1SPrakash Surya /*
2389*1271e4b1SPrakash Surya * This function is used to create a TX_COMMIT itx and assign it. This
2390*1271e4b1SPrakash Surya * way, it will be linked into the ZIL's list of synchronous itxs, and
2391*1271e4b1SPrakash Surya * then later committed to an lwb (or skipped) when
2392*1271e4b1SPrakash Surya * zil_process_commit_list() is called.
2393*1271e4b1SPrakash Surya */
2394*1271e4b1SPrakash Surya static void
2395*1271e4b1SPrakash Surya zil_commit_itx_assign(zilog_t *zilog, zil_commit_waiter_t *zcw)
2396*1271e4b1SPrakash Surya {
2397*1271e4b1SPrakash Surya dmu_tx_t *tx = dmu_tx_create(zilog->zl_os);
2398*1271e4b1SPrakash Surya VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
2399*1271e4b1SPrakash Surya 
2400*1271e4b1SPrakash Surya itx_t *itx = zil_itx_create(TX_COMMIT, sizeof (lr_t));
2401*1271e4b1SPrakash Surya itx->itx_sync = B_TRUE;
2402*1271e4b1SPrakash Surya itx->itx_private = zcw;
2403*1271e4b1SPrakash Surya 
2404*1271e4b1SPrakash Surya zil_itx_assign(zilog, itx, tx);
2405*1271e4b1SPrakash Surya 
2406*1271e4b1SPrakash Surya dmu_tx_commit(tx);
2407b19a79ecSperrin }
2408b19a79ecSperrin 
2409b19a79ecSperrin /*
2410*1271e4b1SPrakash Surya * Commit ZFS Intent Log transactions (itxs) to stable storage.
2411*1271e4b1SPrakash Surya *
2412*1271e4b1SPrakash Surya * When writing ZIL transactions to the on-disk representation of the
2413*1271e4b1SPrakash Surya * ZIL, the itxs are committed to a Log Write Block (lwb). Multiple
2414*1271e4b1SPrakash Surya * itxs can be committed to a single lwb. Once an lwb is written and
2415*1271e4b1SPrakash Surya * committed to stable storage (i.e. the lwb is written, and vdevs have
2416*1271e4b1SPrakash Surya * been flushed), each itx that was committed to that lwb is also
2417*1271e4b1SPrakash Surya * considered to be committed to stable storage.
2418*1271e4b1SPrakash Surya *
2419*1271e4b1SPrakash Surya * When an itx is committed to an lwb, the log record (lr_t) contained
2420*1271e4b1SPrakash Surya * by the itx is copied into the lwb's zio buffer, and once this buffer
2421*1271e4b1SPrakash Surya * is written to disk, it becomes an on-disk ZIL block.
2422*1271e4b1SPrakash Surya *
2423*1271e4b1SPrakash Surya * As itxs are generated, they're inserted into the ZIL's queue of
2424*1271e4b1SPrakash Surya * uncommitted itxs. The semantics of zil_commit() are such that it will
2425*1271e4b1SPrakash Surya * block until all itxs that were in the queue when it was called, are
2426*1271e4b1SPrakash Surya * committed to stable storage.
2427*1271e4b1SPrakash Surya *
2428*1271e4b1SPrakash Surya * If "foid" is zero, this means all "synchronous" and "asynchronous"
2429*1271e4b1SPrakash Surya * itxs, for all objects in the dataset, will be committed to stable
2430*1271e4b1SPrakash Surya * storage prior to zil_commit() returning. If "foid" is non-zero, all
2431*1271e4b1SPrakash Surya * "synchronous" itxs for all objects, but only "asynchronous" itxs
2432*1271e4b1SPrakash Surya * that correspond to the foid passed in, will be committed to stable
2433*1271e4b1SPrakash Surya * storage prior to zil_commit() returning.
2434*1271e4b1SPrakash Surya *
2435*1271e4b1SPrakash Surya * Generally speaking, when zil_commit() is called, the consumer doesn't
2436*1271e4b1SPrakash Surya * actually care about _all_ of the uncommitted itxs. Instead, they're
2437*1271e4b1SPrakash Surya * simply trying to wait for a specific itx to be committed to disk,
2438*1271e4b1SPrakash Surya * but the interface(s) for interacting with the ZIL don't allow such
2439*1271e4b1SPrakash Surya * fine-grained communication. A better interface would allow a consumer
2440*1271e4b1SPrakash Surya * to create and assign an itx, and then pass a reference to this itx to
2441*1271e4b1SPrakash Surya * zil_commit(); such that zil_commit() would return as soon as that
2442*1271e4b1SPrakash Surya * specific itx was committed to disk (instead of waiting for _all_
2443*1271e4b1SPrakash Surya * itxs to be committed).
2444*1271e4b1SPrakash Surya *
2445*1271e4b1SPrakash Surya * When a thread calls zil_commit() a special "commit itx" will be
2446*1271e4b1SPrakash Surya * generated, along with a corresponding "waiter" for this commit itx.
2447*1271e4b1SPrakash Surya * zil_commit() will wait on this waiter's CV, such that when the waiter
2448*1271e4b1SPrakash Surya * is marked done, and signalled, zil_commit() will return.
2449*1271e4b1SPrakash Surya *
2450*1271e4b1SPrakash Surya * This commit itx is inserted into the queue of uncommitted itxs. This
2451*1271e4b1SPrakash Surya * provides an easy mechanism for determining which itxs were in the
2452*1271e4b1SPrakash Surya * queue prior to zil_commit() having been called, and which itxs were
2453*1271e4b1SPrakash Surya * added after zil_commit() was called.
24545002558fSNeil Perrin *
2455*1271e4b1SPrakash Surya * The commit itx is special; it doesn't have any on-disk representation.
2456*1271e4b1SPrakash Surya * When a commit itx is "committed" to an lwb, the waiter associated
2457*1271e4b1SPrakash Surya * with it is linked onto the lwb's list of waiters.
Then, when that lwb 2458*1271e4b1SPrakash Surya * completes, each waiter on the lwb's list is marked done and signalled 2459*1271e4b1SPrakash Surya * -- allowing the thread waiting on the waiter to return from zil_commit(). 24605002558fSNeil Perrin * 2461*1271e4b1SPrakash Surya * It's important to point out a few critical factors that allow us 2462*1271e4b1SPrakash Surya * to make use of the commit itxs, commit waiters, per-lwb lists of 2463*1271e4b1SPrakash Surya * commit waiters, and zio completion callbacks like we're doing: 24645002558fSNeil Perrin * 2465*1271e4b1SPrakash Surya * 1. The list of waiters for each lwb is traversed, and each commit 2466*1271e4b1SPrakash Surya * waiter is marked "done" and signalled, in the zio completion 2467*1271e4b1SPrakash Surya * callback of the lwb's zio[*]. 24685002558fSNeil Perrin * 2469*1271e4b1SPrakash Surya * * Actually, the waiters are signalled in the zio completion 2470*1271e4b1SPrakash Surya * callback of the root zio for the DKIOCFLUSHWRITECACHE commands 2471*1271e4b1SPrakash Surya * that are sent to the vdevs upon completion of the lwb zio. 2472*1271e4b1SPrakash Surya * 2473*1271e4b1SPrakash Surya * 2. When the itxs are inserted into the ZIL's queue of uncommitted 2474*1271e4b1SPrakash Surya * itxs, the order in which they are inserted is preserved[*]; as 2475*1271e4b1SPrakash Surya * itxs are added to the queue, they are added to the tail of 2476*1271e4b1SPrakash Surya * in-memory linked lists. 2477*1271e4b1SPrakash Surya * 2478*1271e4b1SPrakash Surya * When committing the itxs to lwbs (to be written to disk), they 2479*1271e4b1SPrakash Surya * are committed in the same order in which the itxs were added to 2480*1271e4b1SPrakash Surya * the uncommitted queue's linked list(s); i.e. the linked list of 2481*1271e4b1SPrakash Surya * itxs to commit is traversed from head to tail, and each itx is 2482*1271e4b1SPrakash Surya * committed to an lwb in that order. 2483*1271e4b1SPrakash Surya * 2484*1271e4b1SPrakash Surya * * To clarify: 2485*1271e4b1SPrakash Surya * 2486*1271e4b1SPrakash Surya * - the order of "sync" itxs is preserved w.r.t. other 2487*1271e4b1SPrakash Surya * "sync" itxs, regardless of the corresponding objects. 2488*1271e4b1SPrakash Surya * - the order of "async" itxs is preserved w.r.t. other 2489*1271e4b1SPrakash Surya * "async" itxs corresponding to the same object. 2490*1271e4b1SPrakash Surya * - the order of "async" itxs is *not* preserved w.r.t. other 2491*1271e4b1SPrakash Surya * "async" itxs corresponding to different objects. 2492*1271e4b1SPrakash Surya * - the order of "sync" itxs w.r.t. "async" itxs (or vice 2493*1271e4b1SPrakash Surya * versa) is *not* preserved, even for itxs that correspond 2494*1271e4b1SPrakash Surya * to the same object. 2495*1271e4b1SPrakash Surya * 2496*1271e4b1SPrakash Surya * For more details, see: zil_itx_assign(), zil_async_to_sync(), 2497*1271e4b1SPrakash Surya * zil_get_commit_list(), and zil_process_commit_list(). 2498*1271e4b1SPrakash Surya * 2499*1271e4b1SPrakash Surya * 3. The lwbs represent a linked list of blocks on disk. Thus, any 2500*1271e4b1SPrakash Surya * lwb cannot be considered committed to stable storage, until its 2501*1271e4b1SPrakash Surya * "previous" lwb is also committed to stable storage. This fact, 2502*1271e4b1SPrakash Surya * coupled with the fact described above, means that itxs are 2503*1271e4b1SPrakash Surya * committed in (roughly) the order in which they were generated. 2504*1271e4b1SPrakash Surya * This is essential because itxs are dependent on prior itxs. 
2505*1271e4b1SPrakash Surya * Thus, we *must not* deem an itx as being committed to stable
2506*1271e4b1SPrakash Surya * storage, until *all* prior itxs have also been committed to
2507*1271e4b1SPrakash Surya * stable storage.
2508*1271e4b1SPrakash Surya *
2509*1271e4b1SPrakash Surya * To enforce this ordering of lwb zio's, while still leveraging as
2510*1271e4b1SPrakash Surya * much of the underlying storage performance as possible, we rely
2511*1271e4b1SPrakash Surya * on two fundamental concepts:
2512*1271e4b1SPrakash Surya *
2513*1271e4b1SPrakash Surya * 1. The creation and issuance of lwb zio's is protected by
2514*1271e4b1SPrakash Surya * the zilog's "zl_writer_lock", which ensures only a single
2515*1271e4b1SPrakash Surya * thread is creating and/or issuing lwb's at a time
2516*1271e4b1SPrakash Surya * 2. The "previous" lwb is a child of the "current" lwb
2517*1271e4b1SPrakash Surya * (leveraging the zio parent-child dependency graph)
2518*1271e4b1SPrakash Surya *
2519*1271e4b1SPrakash Surya * By relying on this parent-child zio relationship, we can have
2520*1271e4b1SPrakash Surya * many lwb zio's concurrently issued to the underlying storage,
2521*1271e4b1SPrakash Surya * but the order in which they complete will be the same order in
2522*1271e4b1SPrakash Surya * which they were created.
2523b19a79ecSperrin */
2524b19a79ecSperrin void
25255002558fSNeil Perrin zil_commit(zilog_t *zilog, uint64_t foid)
2526b19a79ecSperrin {
2527*1271e4b1SPrakash Surya /*
2528*1271e4b1SPrakash Surya * We should never attempt to call zil_commit() on a snapshot for
2529*1271e4b1SPrakash Surya * a couple of reasons:
2530*1271e4b1SPrakash Surya *
2531*1271e4b1SPrakash Surya * 1. A snapshot may never be modified, thus it cannot have any
2532*1271e4b1SPrakash Surya * in-flight itxs that would have modified the dataset.
2533*1271e4b1SPrakash Surya *
2534*1271e4b1SPrakash Surya * 2. By design, when zil_commit() is called, a commit itx will
2535*1271e4b1SPrakash Surya * be assigned to this zilog; as a result, the zilog will be
2536*1271e4b1SPrakash Surya * dirtied. We must not dirty the zilog of a snapshot; there are
2537*1271e4b1SPrakash Surya * checks in the code that enforce this invariant, and they will
2538*1271e4b1SPrakash Surya * cause a panic if it's not upheld.
2539*1271e4b1SPrakash Surya */
2540*1271e4b1SPrakash Surya ASSERT3B(dmu_objset_is_snapshot(zilog->zl_os), ==, B_FALSE);
2541b19a79ecSperrin 
25425002558fSNeil Perrin if (zilog->zl_sync == ZFS_SYNC_DISABLED)
25435002558fSNeil Perrin return;
2544b19a79ecSperrin 
2545*1271e4b1SPrakash Surya if (!spa_writeable(zilog->zl_spa)) {
2546*1271e4b1SPrakash Surya /*
2547*1271e4b1SPrakash Surya * If the SPA is not writable, there should never be any
2548*1271e4b1SPrakash Surya * pending itxs waiting to be committed to disk. If that
2549*1271e4b1SPrakash Surya * weren't true, we'd skip writing those itxs out, and
2550*1271e4b1SPrakash Surya * would break the semantics of zil_commit(); thus, we're
2551*1271e4b1SPrakash Surya * verifying that truth before we return to the caller.
2552*1271e4b1SPrakash Surya */
2553*1271e4b1SPrakash Surya ASSERT(list_is_empty(&zilog->zl_lwb_list));
2554*1271e4b1SPrakash Surya ASSERT3P(zilog->zl_last_lwb_opened, ==, NULL);
2555*1271e4b1SPrakash Surya for (int i = 0; i < TXG_SIZE; i++)
2556*1271e4b1SPrakash Surya ASSERT3P(zilog->zl_itxg[i].itxg_itxs, ==, NULL);
2557*1271e4b1SPrakash Surya return;
2558*1271e4b1SPrakash Surya }
2559b19a79ecSperrin 
2560*1271e4b1SPrakash Surya /*
2561*1271e4b1SPrakash Surya * If the ZIL is suspended, we don't want to dirty it by calling
2562*1271e4b1SPrakash Surya * zil_commit_itx_assign() below, nor can we write out
2563*1271e4b1SPrakash Surya * lwbs as would be done in zil_commit_writer(). Thus, we
2564*1271e4b1SPrakash Surya * simply rely on txg_wait_synced() to maintain the necessary
2565*1271e4b1SPrakash Surya * semantics, and avoid calling those functions altogether.
2566*1271e4b1SPrakash Surya */
2567*1271e4b1SPrakash Surya if (zilog->zl_suspend > 0) {
2568*1271e4b1SPrakash Surya txg_wait_synced(zilog->zl_dmu_pool, 0);
2569*1271e4b1SPrakash Surya return;
257067bd71c6Sperrin }
2571b24ab676SJeff Bonwick 
2572*1271e4b1SPrakash Surya /*
2573*1271e4b1SPrakash Surya * Move the "async" itxs for the specified foid to the "sync"
2574*1271e4b1SPrakash Surya * queues, such that they will be later committed (or skipped)
2575*1271e4b1SPrakash Surya * to an lwb when zil_process_commit_list() is called.
2576*1271e4b1SPrakash Surya *
2577*1271e4b1SPrakash Surya * Since these "async" itxs must be committed prior to this
2578*1271e4b1SPrakash Surya * call to zil_commit() returning, we must perform this operation
2579*1271e4b1SPrakash Surya * before we call zil_commit_itx_assign().
2580*1271e4b1SPrakash Surya */
2581*1271e4b1SPrakash Surya zil_async_to_sync(zilog, foid);
2582b24ab676SJeff Bonwick 
2583*1271e4b1SPrakash Surya /*
2584*1271e4b1SPrakash Surya * We allocate a new "waiter" structure which will initially be
2585*1271e4b1SPrakash Surya * linked to the commit itx using the itx's "itx_private" field.
2586*1271e4b1SPrakash Surya * Since the commit itx doesn't represent any on-disk state,
2587*1271e4b1SPrakash Surya * when it's committed to an lwb, rather than copying its
2588*1271e4b1SPrakash Surya * lr_t into the lwb's buffer, the commit itx's "waiter" will be
2589*1271e4b1SPrakash Surya * added to the lwb's list of waiters. Then, when the lwb is
2590*1271e4b1SPrakash Surya * committed to stable storage, each waiter in the lwb's list of
2591*1271e4b1SPrakash Surya * waiters will be marked "done", and signalled.
2592*1271e4b1SPrakash Surya *
2593*1271e4b1SPrakash Surya * We must create the waiter and assign the commit itx prior to
2594*1271e4b1SPrakash Surya * calling zil_commit_writer(), or else our specific commit itx
2595*1271e4b1SPrakash Surya * is not guaranteed to be committed to an lwb prior to calling
2596*1271e4b1SPrakash Surya * zil_commit_waiter().
2597*1271e4b1SPrakash Surya */
2598*1271e4b1SPrakash Surya zil_commit_waiter_t *zcw = zil_alloc_commit_waiter();
2599*1271e4b1SPrakash Surya zil_commit_itx_assign(zilog, zcw);
2600*1271e4b1SPrakash Surya 
2601*1271e4b1SPrakash Surya zil_commit_writer(zilog, zcw);
2602*1271e4b1SPrakash Surya zil_commit_waiter(zilog, zcw);
2603b24ab676SJeff Bonwick 
2604*1271e4b1SPrakash Surya if (zcw->zcw_zio_error != 0) {
2605*1271e4b1SPrakash Surya /*
2606*1271e4b1SPrakash Surya * If there was an error writing out the ZIL blocks that
2607*1271e4b1SPrakash Surya * this thread is waiting on, then we fall back to
2608*1271e4b1SPrakash Surya * relying on spa_sync() to write out the data this
2609*1271e4b1SPrakash Surya * thread is waiting on. Obviously this has performance
2610*1271e4b1SPrakash Surya * implications, but the expectation is for this to be
2611*1271e4b1SPrakash Surya * an exceptional case, and it shouldn't occur often.
2612*1271e4b1SPrakash Surya */
2613*1271e4b1SPrakash Surya DTRACE_PROBE2(zil__commit__io__error,
2614*1271e4b1SPrakash Surya zilog_t *, zilog, zil_commit_waiter_t *, zcw);
2615*1271e4b1SPrakash Surya txg_wait_synced(zilog->zl_dmu_pool, 0);
2616*1271e4b1SPrakash Surya }
2617*1271e4b1SPrakash Surya 
2618*1271e4b1SPrakash Surya zil_free_commit_waiter(zcw);
2619b24ab676SJeff Bonwick }
2620b24ab676SJeff Bonwick 
2621fa9e4066Sahrens /*
2622fa9e4066Sahrens * Called in syncing context to free committed log blocks and update the log header.
2623fa9e4066Sahrens */
2624fa9e4066Sahrens void
2625fa9e4066Sahrens zil_sync(zilog_t *zilog, dmu_tx_t *tx)
2626fa9e4066Sahrens {
2627d80c45e0Sbonwick zil_header_t *zh = zil_header_in_syncing_context(zilog);
2628fa9e4066Sahrens uint64_t txg = dmu_tx_get_txg(tx);
2629fa9e4066Sahrens spa_t *spa = zilog->zl_spa;
2630b24ab676SJeff Bonwick uint64_t *replayed_seq = &zilog->zl_replayed_seq[txg & TXG_MASK];
2631fa9e4066Sahrens lwb_t *lwb;
2632fa9e4066Sahrens 
263314843421SMatthew Ahrens /*
263414843421SMatthew Ahrens * We don't zero out zl_destroy_txg, so make sure we don't try
263514843421SMatthew Ahrens * to destroy it twice.
263614843421SMatthew Ahrens */
263714843421SMatthew Ahrens if (spa_sync_pass(spa) != 1)
263814843421SMatthew Ahrens return;
263914843421SMatthew Ahrens 
2640d80c45e0Sbonwick mutex_enter(&zilog->zl_lock);
2641d80c45e0Sbonwick 
2642fa9e4066Sahrens ASSERT(zilog->zl_stop_sync == 0);
2643fa9e4066Sahrens 
2644b24ab676SJeff Bonwick if (*replayed_seq != 0) {
2645b24ab676SJeff Bonwick ASSERT(zh->zh_replay_seq < *replayed_seq);
2646b24ab676SJeff Bonwick zh->zh_replay_seq = *replayed_seq;
2647b24ab676SJeff Bonwick *replayed_seq = 0;
2648b24ab676SJeff Bonwick }
2649fa9e4066Sahrens 
2650fa9e4066Sahrens if (zilog->zl_destroy_txg == txg) {
2651d80c45e0Sbonwick blkptr_t blk = zh->zh_log;
2652d80c45e0Sbonwick 
2653d80c45e0Sbonwick ASSERT(list_head(&zilog->zl_lwb_list) == NULL);
2654d80c45e0Sbonwick 
2655d80c45e0Sbonwick bzero(zh, sizeof (zil_header_t));
26561209a471SNeil Perrin bzero(zilog->zl_replayed_seq, sizeof (zilog->zl_replayed_seq));
2657d80c45e0Sbonwick 
2658d80c45e0Sbonwick if (zilog->zl_keep_first) {
2659d80c45e0Sbonwick /*
2660d80c45e0Sbonwick * If this block was part of a log chain that couldn't
2661d80c45e0Sbonwick * be claimed because a device was missing during
2662d80c45e0Sbonwick * zil_claim(), but that device later returns,
2663d80c45e0Sbonwick * then this block could erroneously appear valid.
2664d80c45e0Sbonwick * To guard against this, assign a new GUID to the new
2665d80c45e0Sbonwick * log chain so it doesn't matter what blk points to.
2666d80c45e0Sbonwick */ 2667d80c45e0Sbonwick zil_init_log_chain(zilog, &blk); 2668d80c45e0Sbonwick zh->zh_log = blk; 2669d80c45e0Sbonwick } 2670fa9e4066Sahrens } 2671fa9e4066Sahrens 2672e6ca193dSGeorge Wilson while ((lwb = list_head(&zilog->zl_lwb_list)) != NULL) { 2673b19a79ecSperrin zh->zh_log = lwb->lwb_blk; 2674fa9e4066Sahrens if (lwb->lwb_buf != NULL || lwb->lwb_max_txg > txg) 2675fa9e4066Sahrens break; 2676fa9e4066Sahrens list_remove(&zilog->zl_lwb_list, lwb); 2677*1271e4b1SPrakash Surya zio_free(spa, txg, &lwb->lwb_blk); 2678*1271e4b1SPrakash Surya zil_free_lwb(zilog, lwb); 2679d63d470bSgw 2680d63d470bSgw /* 2681d63d470bSgw * If we don't have anything left in the lwb list then 2682d63d470bSgw * we've had an allocation failure and we need to zero 2683d63d470bSgw * out the zil_header blkptr so that we don't end 2684d63d470bSgw * up freeing the same block twice. 2685d63d470bSgw */ 2686d63d470bSgw if (list_head(&zilog->zl_lwb_list) == NULL) 2687d63d470bSgw BP_ZERO(&zh->zh_log); 2688fa9e4066Sahrens } 2689fa9e4066Sahrens mutex_exit(&zilog->zl_lock); 2690fa9e4066Sahrens } 2691fa9e4066Sahrens 2692*1271e4b1SPrakash Surya /* ARGSUSED */ 2693*1271e4b1SPrakash Surya static int 2694*1271e4b1SPrakash Surya zil_lwb_cons(void *vbuf, void *unused, int kmflag) 2695*1271e4b1SPrakash Surya { 2696*1271e4b1SPrakash Surya lwb_t *lwb = vbuf; 2697*1271e4b1SPrakash Surya list_create(&lwb->lwb_waiters, sizeof (zil_commit_waiter_t), 2698*1271e4b1SPrakash Surya offsetof(zil_commit_waiter_t, zcw_node)); 2699*1271e4b1SPrakash Surya avl_create(&lwb->lwb_vdev_tree, zil_lwb_vdev_compare, 2700*1271e4b1SPrakash Surya sizeof (zil_vdev_node_t), offsetof(zil_vdev_node_t, zv_node)); 2701*1271e4b1SPrakash Surya mutex_init(&lwb->lwb_vdev_lock, NULL, MUTEX_DEFAULT, NULL); 2702*1271e4b1SPrakash Surya return (0); 2703*1271e4b1SPrakash Surya } 2704*1271e4b1SPrakash Surya 2705*1271e4b1SPrakash Surya /* ARGSUSED */ 2706*1271e4b1SPrakash Surya static void 2707*1271e4b1SPrakash Surya zil_lwb_dest(void *vbuf, void *unused) 2708*1271e4b1SPrakash Surya { 2709*1271e4b1SPrakash Surya lwb_t *lwb = vbuf; 2710*1271e4b1SPrakash Surya mutex_destroy(&lwb->lwb_vdev_lock); 2711*1271e4b1SPrakash Surya avl_destroy(&lwb->lwb_vdev_tree); 2712*1271e4b1SPrakash Surya list_destroy(&lwb->lwb_waiters); 2713*1271e4b1SPrakash Surya } 2714*1271e4b1SPrakash Surya 2715fa9e4066Sahrens void 2716fa9e4066Sahrens zil_init(void) 2717fa9e4066Sahrens { 2718fa9e4066Sahrens zil_lwb_cache = kmem_cache_create("zil_lwb_cache", 2719*1271e4b1SPrakash Surya sizeof (lwb_t), 0, zil_lwb_cons, zil_lwb_dest, NULL, NULL, NULL, 0); 2720*1271e4b1SPrakash Surya 2721*1271e4b1SPrakash Surya zil_zcw_cache = kmem_cache_create("zil_zcw_cache", 2722*1271e4b1SPrakash Surya sizeof (zil_commit_waiter_t), 0, NULL, NULL, NULL, NULL, NULL, 0); 2723fa9e4066Sahrens } 2724fa9e4066Sahrens 2725fa9e4066Sahrens void 2726fa9e4066Sahrens zil_fini(void) 2727fa9e4066Sahrens { 2728*1271e4b1SPrakash Surya kmem_cache_destroy(zil_zcw_cache); 2729fa9e4066Sahrens kmem_cache_destroy(zil_lwb_cache); 2730fa9e4066Sahrens } 2731fa9e4066Sahrens 273255da60b9SMark J Musante void 273355da60b9SMark J Musante zil_set_sync(zilog_t *zilog, uint64_t sync) 273455da60b9SMark J Musante { 273555da60b9SMark J Musante zilog->zl_sync = sync; 273655da60b9SMark J Musante } 273755da60b9SMark J Musante 2738e09fa4daSNeil Perrin void 2739e09fa4daSNeil Perrin zil_set_logbias(zilog_t *zilog, uint64_t logbias) 2740e09fa4daSNeil Perrin { 2741e09fa4daSNeil Perrin zilog->zl_logbias = logbias; 2742e09fa4daSNeil Perrin } 2743e09fa4daSNeil Perrin 
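
/*
 * [Editorial aside, not part of the original file: the
 * zil_lwb_cons()/zil_lwb_dest() pair above follows the standard kmem
 * object-cache constructor/destructor pattern, where members that are
 * expensive to initialize -- locks, lists, AVL trees -- are set up once
 * when a buffer first enters the cache and survive any number of
 * kmem_cache_alloc()/kmem_cache_free() cycles. The sketch below, kept
 * disabled, shows a minimal userland analogue built on libumem, whose
 * interfaces mirror the kernel kmem ones; "my_obj_t", "my_obj_cons",
 * "my_obj_dest", and "my_obj_cache" are hypothetical names.]
 */
#if 0	/* illustrative sketch only; build with -lumem */
#include <umem.h>
#include <pthread.h>

typedef struct my_obj {
	pthread_mutex_t	mo_lock;	/* costly to (re)initialize */
	int		mo_value;	/* cheap; set by each consumer */
} my_obj_t;

/* ARGSUSED */
static int
my_obj_cons(void *vbuf, void *unused, int kmflag)
{
	my_obj_t *obj = vbuf;

	/* Runs once per cached buffer, not once per allocation. */
	(void) pthread_mutex_init(&obj->mo_lock, NULL);
	return (0);
}

/* ARGSUSED */
static void
my_obj_dest(void *vbuf, void *unused)
{
	my_obj_t *obj = vbuf;

	/* Runs only when the buffer finally leaves the cache. */
	(void) pthread_mutex_destroy(&obj->mo_lock);
}

int
main(void)
{
	umem_cache_t *cache = umem_cache_create("my_obj_cache",
	    sizeof (my_obj_t), 0, my_obj_cons, my_obj_dest,
	    NULL, NULL, NULL, 0);

	my_obj_t *obj = umem_cache_alloc(cache, UMEM_NOFAIL);
	obj->mo_value = 42;		/* mo_lock is already initialized */
	umem_cache_free(cache, obj);	/* mo_lock stays initialized */

	umem_cache_destroy(cache);
	return (0);
}
#endif
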
2744fa9e4066Sahrens zilog_t * 2745fa9e4066Sahrens zil_alloc(objset_t *os, zil_header_t *zh_phys) 2746fa9e4066Sahrens { 2747fa9e4066Sahrens zilog_t *zilog; 2748fa9e4066Sahrens 2749fa9e4066Sahrens zilog = kmem_zalloc(sizeof (zilog_t), KM_SLEEP); 2750fa9e4066Sahrens 2751fa9e4066Sahrens zilog->zl_header = zh_phys; 2752fa9e4066Sahrens zilog->zl_os = os; 2753fa9e4066Sahrens zilog->zl_spa = dmu_objset_spa(os); 2754fa9e4066Sahrens zilog->zl_dmu_pool = dmu_objset_pool(os); 2755d80c45e0Sbonwick zilog->zl_destroy_txg = TXG_INITIAL - 1; 2756e09fa4daSNeil Perrin zilog->zl_logbias = dmu_objset_logbias(os); 275755da60b9SMark J Musante zilog->zl_sync = dmu_objset_syncprop(os); 2758*1271e4b1SPrakash Surya zilog->zl_dirty_max_txg = 0; 2759*1271e4b1SPrakash Surya zilog->zl_last_lwb_opened = NULL; 2760*1271e4b1SPrakash Surya zilog->zl_last_lwb_latency = 0; 2761fa9e4066Sahrens 27625ad82045Snd mutex_init(&zilog->zl_lock, NULL, MUTEX_DEFAULT, NULL); 2763*1271e4b1SPrakash Surya mutex_init(&zilog->zl_writer_lock, NULL, MUTEX_DEFAULT, NULL); 27645ad82045Snd 27655002558fSNeil Perrin for (int i = 0; i < TXG_SIZE; i++) { 27665002558fSNeil Perrin mutex_init(&zilog->zl_itxg[i].itxg_lock, NULL, 27675002558fSNeil Perrin MUTEX_DEFAULT, NULL); 27685002558fSNeil Perrin } 2769fa9e4066Sahrens 2770fa9e4066Sahrens list_create(&zilog->zl_lwb_list, sizeof (lwb_t), 2771fa9e4066Sahrens offsetof(lwb_t, lwb_node)); 2772fa9e4066Sahrens 27735002558fSNeil Perrin list_create(&zilog->zl_itx_commit_list, sizeof (itx_t), 27745002558fSNeil Perrin offsetof(itx_t, itx_node)); 27755002558fSNeil Perrin 2776b7b97454Sperrin cv_init(&zilog->zl_cv_suspend, NULL, CV_DEFAULT, NULL); 2777b7b97454Sperrin 2778fa9e4066Sahrens return (zilog); 2779fa9e4066Sahrens } 2780fa9e4066Sahrens 2781fa9e4066Sahrens void 2782fa9e4066Sahrens zil_free(zilog_t *zilog) 2783fa9e4066Sahrens { 2784fa9e4066Sahrens zilog->zl_stop_sync = 1; 2785fa9e4066Sahrens 27863b2aab18SMatthew Ahrens ASSERT0(zilog->zl_suspend); 27873b2aab18SMatthew Ahrens ASSERT0(zilog->zl_suspending); 27883b2aab18SMatthew Ahrens 2789c9ba2a43SEric Schrock ASSERT(list_is_empty(&zilog->zl_lwb_list)); 2790fa9e4066Sahrens list_destroy(&zilog->zl_lwb_list); 2791fa9e4066Sahrens 27925002558fSNeil Perrin ASSERT(list_is_empty(&zilog->zl_itx_commit_list)); 27935002558fSNeil Perrin list_destroy(&zilog->zl_itx_commit_list); 27945002558fSNeil Perrin 27955002558fSNeil Perrin for (int i = 0; i < TXG_SIZE; i++) { 27965002558fSNeil Perrin /* 27975002558fSNeil Perrin * It's possible for an itx to be generated that doesn't dirty 27985002558fSNeil Perrin * a txg (e.g. ztest TX_TRUNCATE). So there's no zil_clean() 27995002558fSNeil Perrin * callback to remove the entry. We remove those here. 28005002558fSNeil Perrin * 28015002558fSNeil Perrin * Also free up the ziltest itxs. 28025002558fSNeil Perrin */ 28035002558fSNeil Perrin if (zilog->zl_itxg[i].itxg_itxs) 28045002558fSNeil Perrin zil_itxg_clean(zilog->zl_itxg[i].itxg_itxs); 28055002558fSNeil Perrin mutex_destroy(&zilog->zl_itxg[i].itxg_lock); 28065002558fSNeil Perrin } 28075002558fSNeil Perrin 2808*1271e4b1SPrakash Surya mutex_destroy(&zilog->zl_writer_lock); 28095ad82045Snd mutex_destroy(&zilog->zl_lock); 2810fa9e4066Sahrens 2811b7b97454Sperrin cv_destroy(&zilog->zl_cv_suspend); 2812b7b97454Sperrin 2813fa9e4066Sahrens kmem_free(zilog, sizeof (zilog_t)); 2814fa9e4066Sahrens } 2815fa9e4066Sahrens 2816fa9e4066Sahrens /* 2817fa9e4066Sahrens * Open an intent log. 
2818fa9e4066Sahrens */ 2819fa9e4066Sahrens zilog_t * 2820fa9e4066Sahrens zil_open(objset_t *os, zil_get_data_t *get_data) 2821fa9e4066Sahrens { 2822fa9e4066Sahrens zilog_t *zilog = dmu_objset_zil(os); 2823fa9e4066Sahrens 2824*1271e4b1SPrakash Surya ASSERT3P(zilog->zl_get_data, ==, NULL); 2825*1271e4b1SPrakash Surya ASSERT3P(zilog->zl_last_lwb_opened, ==, NULL); 2826c9ba2a43SEric Schrock ASSERT(list_is_empty(&zilog->zl_lwb_list)); 2827c9ba2a43SEric Schrock 2828fa9e4066Sahrens zilog->zl_get_data = get_data; 2829fa9e4066Sahrens 2830fa9e4066Sahrens return (zilog); 2831fa9e4066Sahrens } 2832fa9e4066Sahrens 2833fa9e4066Sahrens /* 2834fa9e4066Sahrens * Close an intent log. 2835fa9e4066Sahrens */ 2836fa9e4066Sahrens void 2837fa9e4066Sahrens zil_close(zilog_t *zilog) 2838fa9e4066Sahrens { 2839c9ba2a43SEric Schrock lwb_t *lwb; 2840*1271e4b1SPrakash Surya uint64_t txg; 28415002558fSNeil Perrin 2842*1271e4b1SPrakash Surya if (!dmu_objset_is_snapshot(zilog->zl_os)) { 2843*1271e4b1SPrakash Surya zil_commit(zilog, 0); 2844*1271e4b1SPrakash Surya } else { 2845*1271e4b1SPrakash Surya ASSERT3P(list_tail(&zilog->zl_lwb_list), ==, NULL); 2846*1271e4b1SPrakash Surya ASSERT0(zilog->zl_dirty_max_txg); 2847*1271e4b1SPrakash Surya ASSERT3B(zilog_is_dirty(zilog), ==, B_FALSE); 2848*1271e4b1SPrakash Surya } 28495002558fSNeil Perrin 28505002558fSNeil Perrin mutex_enter(&zilog->zl_lock); 2851c9ba2a43SEric Schrock lwb = list_tail(&zilog->zl_lwb_list); 2852*1271e4b1SPrakash Surya if (lwb == NULL) 2853*1271e4b1SPrakash Surya txg = zilog->zl_dirty_max_txg; 2854*1271e4b1SPrakash Surya else 2855*1271e4b1SPrakash Surya txg = MAX(zilog->zl_dirty_max_txg, lwb->lwb_max_txg); 28565002558fSNeil Perrin mutex_exit(&zilog->zl_lock); 2857*1271e4b1SPrakash Surya 2858*1271e4b1SPrakash Surya /* 2859*1271e4b1SPrakash Surya * We need to use txg_wait_synced() to wait long enough for the 2860*1271e4b1SPrakash Surya * ZIL to be clean, and to wait for all pending lwbs to be 2861*1271e4b1SPrakash Surya * written out. 2862*1271e4b1SPrakash Surya */ 2863*1271e4b1SPrakash Surya if (txg != 0) 2864d80c45e0Sbonwick txg_wait_synced(zilog->zl_dmu_pool, txg); 286543297f97SGeorge Wilson 286643297f97SGeorge Wilson if (zilog_is_dirty(zilog)) 286743297f97SGeorge Wilson zfs_dbgmsg("zil (%p) is dirty, txg %llu", zilog, txg); 286843297f97SGeorge Wilson VERIFY(!zilog_is_dirty(zilog)); 2869d80c45e0Sbonwick 2870fa9e4066Sahrens zilog->zl_get_data = NULL; 2871c9ba2a43SEric Schrock 2872c9ba2a43SEric Schrock /* 2873*1271e4b1SPrakash Surya * We should have only one lwb left on the list; remove it now. 2874c9ba2a43SEric Schrock */ 2875c9ba2a43SEric Schrock mutex_enter(&zilog->zl_lock); 2876c9ba2a43SEric Schrock lwb = list_head(&zilog->zl_lwb_list); 2877c9ba2a43SEric Schrock if (lwb != NULL) { 2878*1271e4b1SPrakash Surya ASSERT3P(lwb, ==, list_tail(&zilog->zl_lwb_list)); 2879*1271e4b1SPrakash Surya ASSERT3S(lwb->lwb_state, !=, LWB_STATE_ISSUED); 2880c9ba2a43SEric Schrock list_remove(&zilog->zl_lwb_list, lwb); 2881c9ba2a43SEric Schrock zio_buf_free(lwb->lwb_buf, lwb->lwb_sz); 2882*1271e4b1SPrakash Surya zil_free_lwb(zilog, lwb); 2883c9ba2a43SEric Schrock } 2884c9ba2a43SEric Schrock mutex_exit(&zilog->zl_lock); 2885fa9e4066Sahrens } 2886fa9e4066Sahrens 28873b2aab18SMatthew Ahrens static char *suspend_tag = "zil suspending"; 28883b2aab18SMatthew Ahrens 2889fa9e4066Sahrens /* 2890fa9e4066Sahrens * Suspend an intent log. While in suspended mode, we still honor 2891fa9e4066Sahrens * synchronous semantics, but we rely on txg_wait_synced() to do it. 
28923b2aab18SMatthew Ahrens * On old version pools, we suspend the log briefly when taking a 28933b2aab18SMatthew Ahrens * snapshot so that it will have an empty intent log. 28943b2aab18SMatthew Ahrens * 28953b2aab18SMatthew Ahrens * Long holds are not really intended to be used the way we do here -- 28963b2aab18SMatthew Ahrens * held for such a short time. A concurrent caller of dsl_dataset_long_held() 28973b2aab18SMatthew Ahrens * could fail. Therefore we take pains to only put a long hold if it is 28983b2aab18SMatthew Ahrens * actually necessary. Fortunately, it will only be necessary if the 28993b2aab18SMatthew Ahrens * objset is currently mounted (or the ZVOL equivalent). In that case it 29003b2aab18SMatthew Ahrens * will already have a long hold, so we are not really making things any worse. 29013b2aab18SMatthew Ahrens * 29023b2aab18SMatthew Ahrens * Ideally, we would locate the existing long-holder (i.e. the zfsvfs_t or 29033b2aab18SMatthew Ahrens * zvol_state_t), and use their mechanism to prevent their hold from being 29043b2aab18SMatthew Ahrens * dropped (e.g. VFS_HOLD()). However, that would be even more pain for 29053b2aab18SMatthew Ahrens * very little gain. 29063b2aab18SMatthew Ahrens * 29073b2aab18SMatthew Ahrens * if cookiep == NULL, this does both the suspend & resume. 29083b2aab18SMatthew Ahrens * Otherwise, it returns with the dataset "long held", and the cookie 29093b2aab18SMatthew Ahrens * should be passed into zil_resume(). 2910fa9e4066Sahrens */ 2911fa9e4066Sahrens int 29123b2aab18SMatthew Ahrens zil_suspend(const char *osname, void **cookiep) 2913fa9e4066Sahrens { 29143b2aab18SMatthew Ahrens objset_t *os; 29153b2aab18SMatthew Ahrens zilog_t *zilog; 29163b2aab18SMatthew Ahrens const zil_header_t *zh; 29173b2aab18SMatthew Ahrens int error; 29183b2aab18SMatthew Ahrens 29193b2aab18SMatthew Ahrens error = dmu_objset_hold(osname, suspend_tag, &os); 29203b2aab18SMatthew Ahrens if (error != 0) 29213b2aab18SMatthew Ahrens return (error); 29223b2aab18SMatthew Ahrens zilog = dmu_objset_zil(os); 2923fa9e4066Sahrens 2924fa9e4066Sahrens mutex_enter(&zilog->zl_lock); 29253b2aab18SMatthew Ahrens zh = zilog->zl_header; 29263b2aab18SMatthew Ahrens 29273589c4f0SNeil Perrin if (zh->zh_flags & ZIL_REPLAY_NEEDED) { /* unplayed log */ 2928fa9e4066Sahrens mutex_exit(&zilog->zl_lock); 29293b2aab18SMatthew Ahrens dmu_objset_rele(os, suspend_tag); 2930be6fd75aSMatthew Ahrens return (SET_ERROR(EBUSY)); 2931fa9e4066Sahrens } 29323b2aab18SMatthew Ahrens 29333b2aab18SMatthew Ahrens /* 29343b2aab18SMatthew Ahrens * Don't put a long hold in the cases where we can avoid it. This 29353b2aab18SMatthew Ahrens * is when there is no cookie so we are doing a suspend & resume 29363b2aab18SMatthew Ahrens * (i.e. called from zil_vdev_offline()), and there's nothing to do 29373b2aab18SMatthew Ahrens * for the suspend because it's already suspended, or there's no ZIL. 
29383b2aab18SMatthew Ahrens */ 29393b2aab18SMatthew Ahrens if (cookiep == NULL && !zilog->zl_suspending && 29403b2aab18SMatthew Ahrens (zilog->zl_suspend > 0 || BP_IS_HOLE(&zh->zh_log))) { 29413b2aab18SMatthew Ahrens mutex_exit(&zilog->zl_lock); 29423b2aab18SMatthew Ahrens dmu_objset_rele(os, suspend_tag); 29433b2aab18SMatthew Ahrens return (0); 29443b2aab18SMatthew Ahrens } 29453b2aab18SMatthew Ahrens 29463b2aab18SMatthew Ahrens dsl_dataset_long_hold(dmu_objset_ds(os), suspend_tag); 29473b2aab18SMatthew Ahrens dsl_pool_rele(dmu_objset_pool(os), suspend_tag); 29483b2aab18SMatthew Ahrens 29493b2aab18SMatthew Ahrens zilog->zl_suspend++; 29503b2aab18SMatthew Ahrens 29513b2aab18SMatthew Ahrens if (zilog->zl_suspend > 1) { 2952d80c45e0Sbonwick /* 29533b2aab18SMatthew Ahrens * Someone else is already suspending it. 2954d80c45e0Sbonwick * Just wait for them to finish. 2955d80c45e0Sbonwick */ 29563b2aab18SMatthew Ahrens 2957d80c45e0Sbonwick while (zilog->zl_suspending) 2958d80c45e0Sbonwick cv_wait(&zilog->zl_cv_suspend, &zilog->zl_lock); 2959d80c45e0Sbonwick mutex_exit(&zilog->zl_lock); 29603b2aab18SMatthew Ahrens 29613b2aab18SMatthew Ahrens if (cookiep == NULL) 29623b2aab18SMatthew Ahrens zil_resume(os); 29633b2aab18SMatthew Ahrens else 29643b2aab18SMatthew Ahrens *cookiep = os; 29653b2aab18SMatthew Ahrens return (0); 29663b2aab18SMatthew Ahrens } 29673b2aab18SMatthew Ahrens 29683b2aab18SMatthew Ahrens /* 29693b2aab18SMatthew Ahrens * If there is no pointer to an on-disk block, this ZIL must not 29703b2aab18SMatthew Ahrens * be active (e.g. filesystem not mounted), so there's nothing 29713b2aab18SMatthew Ahrens * to clean up. 29723b2aab18SMatthew Ahrens */ 29733b2aab18SMatthew Ahrens if (BP_IS_HOLE(&zh->zh_log)) { 29743b2aab18SMatthew Ahrens ASSERT(cookiep != NULL); /* fast path already handled */ 29753b2aab18SMatthew Ahrens 29763b2aab18SMatthew Ahrens *cookiep = os; 29773b2aab18SMatthew Ahrens mutex_exit(&zilog->zl_lock); 2978d80c45e0Sbonwick return (0); 2979d80c45e0Sbonwick } 29803b2aab18SMatthew Ahrens 2981d80c45e0Sbonwick zilog->zl_suspending = B_TRUE; 2982fa9e4066Sahrens mutex_exit(&zilog->zl_lock); 2983fa9e4066Sahrens 29845002558fSNeil Perrin zil_commit(zilog, 0); 2985fa9e4066Sahrens 2986d80c45e0Sbonwick zil_destroy(zilog, B_FALSE); 2987d80c45e0Sbonwick 2988d80c45e0Sbonwick mutex_enter(&zilog->zl_lock); 2989d80c45e0Sbonwick zilog->zl_suspending = B_FALSE; 2990d80c45e0Sbonwick cv_broadcast(&zilog->zl_cv_suspend); 2991d80c45e0Sbonwick mutex_exit(&zilog->zl_lock); 2992fa9e4066Sahrens 29933b2aab18SMatthew Ahrens if (cookiep == NULL) 29943b2aab18SMatthew Ahrens zil_resume(os); 29953b2aab18SMatthew Ahrens else 29963b2aab18SMatthew Ahrens *cookiep = os; 2997fa9e4066Sahrens return (0); 2998fa9e4066Sahrens } 2999fa9e4066Sahrens 3000fa9e4066Sahrens void 30013b2aab18SMatthew Ahrens zil_resume(void *cookie) 3002fa9e4066Sahrens { 30033b2aab18SMatthew Ahrens objset_t *os = cookie; 30043b2aab18SMatthew Ahrens zilog_t *zilog = dmu_objset_zil(os); 30053b2aab18SMatthew Ahrens 3006fa9e4066Sahrens mutex_enter(&zilog->zl_lock); 3007fa9e4066Sahrens ASSERT(zilog->zl_suspend != 0); 3008fa9e4066Sahrens zilog->zl_suspend--; 3009fa9e4066Sahrens mutex_exit(&zilog->zl_lock); 30103b2aab18SMatthew Ahrens dsl_dataset_long_rele(dmu_objset_ds(os), suspend_tag); 30113b2aab18SMatthew Ahrens dsl_dataset_rele(dmu_objset_ds(os), suspend_tag); 3012fa9e4066Sahrens } 3013fa9e4066Sahrens 3014fa9e4066Sahrens typedef struct zil_replay_arg { 3015fa9e4066Sahrens zil_replay_func_t **zr_replay; 3016fa9e4066Sahrens void *zr_arg; 
3017fa9e4066Sahrens boolean_t zr_byteswap; 3018b24ab676SJeff Bonwick char *zr_lr; 3019fa9e4066Sahrens } zil_replay_arg_t; 3020fa9e4066Sahrens 3021b24ab676SJeff Bonwick static int 3022b24ab676SJeff Bonwick zil_replay_error(zilog_t *zilog, lr_t *lr, int error) 3023b24ab676SJeff Bonwick { 30249adfa60dSMatthew Ahrens char name[ZFS_MAX_DATASET_NAME_LEN]; 3025b24ab676SJeff Bonwick 3026b24ab676SJeff Bonwick zilog->zl_replaying_seq--; /* didn't actually replay this one */ 3027b24ab676SJeff Bonwick 3028b24ab676SJeff Bonwick dmu_objset_name(zilog->zl_os, name); 3029b24ab676SJeff Bonwick 3030b24ab676SJeff Bonwick cmn_err(CE_WARN, "ZFS replay transaction error %d, " 3031b24ab676SJeff Bonwick "dataset %s, seq 0x%llx, txtype %llu %s\n", error, name, 3032b24ab676SJeff Bonwick (u_longlong_t)lr->lrc_seq, 3033b24ab676SJeff Bonwick (u_longlong_t)(lr->lrc_txtype & ~TX_CI), 3034b24ab676SJeff Bonwick (lr->lrc_txtype & TX_CI) ? "CI" : ""); 3035b24ab676SJeff Bonwick 3036b24ab676SJeff Bonwick return (error); 3037b24ab676SJeff Bonwick } 3038b24ab676SJeff Bonwick 3039b24ab676SJeff Bonwick static int 3040fa9e4066Sahrens zil_replay_log_record(zilog_t *zilog, lr_t *lr, void *zra, uint64_t claim_txg) 3041fa9e4066Sahrens { 3042fa9e4066Sahrens zil_replay_arg_t *zr = zra; 3043d80c45e0Sbonwick const zil_header_t *zh = zilog->zl_header; 3044fa9e4066Sahrens uint64_t reclen = lr->lrc_reclen; 3045fa9e4066Sahrens uint64_t txtype = lr->lrc_txtype; 3046b24ab676SJeff Bonwick int error = 0; 3047fa9e4066Sahrens 3048b24ab676SJeff Bonwick zilog->zl_replaying_seq = lr->lrc_seq; 3049fa9e4066Sahrens 3050fa9e4066Sahrens if (lr->lrc_seq <= zh->zh_replay_seq) /* already replayed */ 3051b24ab676SJeff Bonwick return (0); 3052b24ab676SJeff Bonwick 3053b24ab676SJeff Bonwick if (lr->lrc_txg < claim_txg) /* already committed */ 3054b24ab676SJeff Bonwick return (0); 3055fa9e4066Sahrens 3056da6c28aaSamw /* Strip case-insensitive bit, still present in log record */ 3057da6c28aaSamw txtype &= ~TX_CI; 3058da6c28aaSamw 3059b24ab676SJeff Bonwick if (txtype == 0 || txtype >= TX_MAX_TYPE) 3060b24ab676SJeff Bonwick return (zil_replay_error(zilog, lr, EINVAL)); 3061b24ab676SJeff Bonwick 3062b24ab676SJeff Bonwick /* 3063b24ab676SJeff Bonwick * If this record type can be logged out of order, the object 3064b24ab676SJeff Bonwick * (lr_foid) may no longer exist. That's legitimate, not an error. 3065b24ab676SJeff Bonwick */ 3066b24ab676SJeff Bonwick if (TX_OOO(txtype)) { 3067b24ab676SJeff Bonwick error = dmu_object_info(zilog->zl_os, 3068b24ab676SJeff Bonwick ((lr_ooo_t *)lr)->lr_foid, NULL); 3069b24ab676SJeff Bonwick if (error == ENOENT || error == EEXIST) 3070b24ab676SJeff Bonwick return (0); 30711209a471SNeil Perrin } 30721209a471SNeil Perrin 3073fa9e4066Sahrens /* 3074fa9e4066Sahrens * Make a copy of the data so we can revise and extend it. 3075fa9e4066Sahrens */ 3076b24ab676SJeff Bonwick bcopy(lr, zr->zr_lr, reclen); 3077b24ab676SJeff Bonwick 3078b24ab676SJeff Bonwick /* 3079b24ab676SJeff Bonwick * If this is a TX_WRITE with a blkptr, suck in the data. 
3080b24ab676SJeff Bonwick */ 3081b24ab676SJeff Bonwick if (txtype == TX_WRITE && reclen == sizeof (lr_write_t)) { 3082b24ab676SJeff Bonwick error = zil_read_log_data(zilog, (lr_write_t *)lr, 3083b24ab676SJeff Bonwick zr->zr_lr + reclen); 30843b2aab18SMatthew Ahrens if (error != 0) 3085b24ab676SJeff Bonwick return (zil_replay_error(zilog, lr, error)); 3086b24ab676SJeff Bonwick } 3087fa9e4066Sahrens 3088fa9e4066Sahrens /* 3089fa9e4066Sahrens * The log block containing this lr may have been byteswapped 3090fa9e4066Sahrens * so that we can easily examine common fields like lrc_txtype. 3091b24ab676SJeff Bonwick * However, the log is a mix of different record types, and only the 3092fa9e4066Sahrens * replay vectors know how to byteswap their records. Therefore, if 3093fa9e4066Sahrens * the lr was byteswapped, undo it before invoking the replay vector. 3094fa9e4066Sahrens */ 3095fa9e4066Sahrens if (zr->zr_byteswap) 3096b24ab676SJeff Bonwick byteswap_uint64_array(zr->zr_lr, reclen); 3097fa9e4066Sahrens 3098fa9e4066Sahrens /* 3099fa9e4066Sahrens * We must now do two things atomically: replay this log record, 31001209a471SNeil Perrin * and update the log header sequence number to reflect the fact that 31011209a471SNeil Perrin * we did so. At the end of each replay function the sequence number 31021209a471SNeil Perrin * is updated if we are in replay mode. 3103fa9e4066Sahrens */ 3104b24ab676SJeff Bonwick error = zr->zr_replay[txtype](zr->zr_arg, zr->zr_lr, zr->zr_byteswap); 31053b2aab18SMatthew Ahrens if (error != 0) { 310667bd71c6Sperrin /* 310767bd71c6Sperrin * The DMU's dnode layer doesn't see removes until the txg 310867bd71c6Sperrin * commits, so a subsequent claim can spuriously fail with 31091209a471SNeil Perrin * EEXIST. So if we receive any error we try syncing out 3110b24ab676SJeff Bonwick * any removes then retry the transaction. Note that we 3111b24ab676SJeff Bonwick * specify B_FALSE for byteswap now, so we don't do it twice. 311267bd71c6Sperrin */ 3113b24ab676SJeff Bonwick txg_wait_synced(spa_get_dsl(zilog->zl_spa), 0); 3114b24ab676SJeff Bonwick error = zr->zr_replay[txtype](zr->zr_arg, zr->zr_lr, B_FALSE); 31153b2aab18SMatthew Ahrens if (error != 0) 3116b24ab676SJeff Bonwick return (zil_replay_error(zilog, lr, error)); 3117fa9e4066Sahrens } 3118b24ab676SJeff Bonwick return (0); 311967bd71c6Sperrin } 3120fa9e4066Sahrens 312167bd71c6Sperrin /* ARGSUSED */ 3122b24ab676SJeff Bonwick static int 312367bd71c6Sperrin zil_incr_blks(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg) 312467bd71c6Sperrin { 312567bd71c6Sperrin zilog->zl_replay_blks++; 3126b24ab676SJeff Bonwick 3127b24ab676SJeff Bonwick return (0); 3128fa9e4066Sahrens } 3129fa9e4066Sahrens 3130fa9e4066Sahrens /* 313113f5297eSperrin * If this dataset has a non-empty intent log, replay it and destroy it. 
3132fa9e4066Sahrens */ 3133fa9e4066Sahrens void 31341209a471SNeil Perrin zil_replay(objset_t *os, void *arg, zil_replay_func_t *replay_func[TX_MAX_TYPE]) 3135fa9e4066Sahrens { 3136fa9e4066Sahrens zilog_t *zilog = dmu_objset_zil(os); 3137d80c45e0Sbonwick const zil_header_t *zh = zilog->zl_header; 3138d80c45e0Sbonwick zil_replay_arg_t zr; 313913f5297eSperrin 31403589c4f0SNeil Perrin if ((zh->zh_flags & ZIL_REPLAY_NEEDED) == 0) { 3141d80c45e0Sbonwick zil_destroy(zilog, B_TRUE); 314213f5297eSperrin return; 314313f5297eSperrin } 3144fa9e4066Sahrens 3145fa9e4066Sahrens zr.zr_replay = replay_func; 3146fa9e4066Sahrens zr.zr_arg = arg; 3147d80c45e0Sbonwick zr.zr_byteswap = BP_SHOULD_BYTESWAP(&zh->zh_log); 3148b24ab676SJeff Bonwick zr.zr_lr = kmem_alloc(2 * SPA_MAXBLOCKSIZE, KM_SLEEP); 3149fa9e4066Sahrens 3150fa9e4066Sahrens /* 3151fa9e4066Sahrens * Wait for in-progress removes to sync before starting replay. 3152fa9e4066Sahrens */ 3153fa9e4066Sahrens txg_wait_synced(zilog->zl_dmu_pool, 0); 3154fa9e4066Sahrens 31551209a471SNeil Perrin zilog->zl_replay = B_TRUE; 3156d3d50737SRafael Vanoni zilog->zl_replay_time = ddi_get_lbolt(); 315767bd71c6Sperrin ASSERT(zilog->zl_replay_blks == 0); 315867bd71c6Sperrin (void) zil_parse(zilog, zil_incr_blks, zil_replay_log_record, &zr, 3159d80c45e0Sbonwick zh->zh_claim_txg); 3160b24ab676SJeff Bonwick kmem_free(zr.zr_lr, 2 * SPA_MAXBLOCKSIZE); 3161fa9e4066Sahrens 3162d80c45e0Sbonwick zil_destroy(zilog, B_FALSE); 3163a4611edeSahrens txg_wait_synced(zilog->zl_dmu_pool, zilog->zl_destroy_txg); 31641209a471SNeil Perrin zilog->zl_replay = B_FALSE; 3165fa9e4066Sahrens } 3166436b2950Sperrin 3167b24ab676SJeff Bonwick boolean_t 3168b24ab676SJeff Bonwick zil_replaying(zilog_t *zilog, dmu_tx_t *tx) 3169436b2950Sperrin { 317055da60b9SMark J Musante if (zilog->zl_sync == ZFS_SYNC_DISABLED) 3171b24ab676SJeff Bonwick return (B_TRUE); 3172436b2950Sperrin 3173b24ab676SJeff Bonwick if (zilog->zl_replay) { 3174b24ab676SJeff Bonwick dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx); 3175b24ab676SJeff Bonwick zilog->zl_replayed_seq[dmu_tx_get_txg(tx) & TXG_MASK] = 3176b24ab676SJeff Bonwick zilog->zl_replaying_seq; 3177b24ab676SJeff Bonwick return (B_TRUE); 3178b19a79ecSperrin } 3179b19a79ecSperrin 3180b24ab676SJeff Bonwick return (B_FALSE); 3181436b2950Sperrin } 3182e6ca193dSGeorge Wilson 3183e6ca193dSGeorge Wilson /* ARGSUSED */ 3184e6ca193dSGeorge Wilson int 3185fd136879SMatthew Ahrens zil_vdev_offline(const char *osname, void *arg) 3186e6ca193dSGeorge Wilson { 3187e6ca193dSGeorge Wilson int error; 3188e6ca193dSGeorge Wilson 31893b2aab18SMatthew Ahrens error = zil_suspend(osname, NULL); 31903b2aab18SMatthew Ahrens if (error != 0) 3191be6fd75aSMatthew Ahrens return (SET_ERROR(EEXIST)); 31923b2aab18SMatthew Ahrens return (0); 3193e6ca193dSGeorge Wilson } 3194
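
/*
 * [Editorial aside, not part of the original file: zil_replaying()
 * above is consulted by the zfs_log_*() routines that build itxs, so
 * that an operation which is itself the product of log replay is not
 * logged a second time (the call also records the replayed sequence
 * number via zl_replayed_seq). The disabled sketch below shows the
 * shape such a consumer takes; "zil_example_log_op" is a hypothetical
 * name, and lr_ooo_t is borrowed only because it is a small record
 * type carrying an object id -- a real consumer would fill in its own
 * lr_*_t and transaction type.]
 */
#if 0	/* illustrative sketch only */
static void
zil_example_log_op(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
    uint64_t foid)
{
	itx_t *itx;
	lr_ooo_t *lr;

	/*
	 * During replay, generating a new record for this operation
	 * would duplicate the one being replayed; zil_replaying()
	 * detects that case (and bumps zl_replayed_seq), so simply
	 * return.
	 */
	if (zil_replaying(zilog, tx))
		return;

	itx = zil_itx_create(txtype, sizeof (*lr));
	lr = (lr_ooo_t *)&itx->itx_lr;
	lr->lr_foid = foid;

	zil_itx_assign(zilog, itx, tx);
}
#endif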