/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
 * Copyright (c) 2014 Integros [integros.com]
 */

/* Portions Copyright 2010 Robert Milkowski */

#include <sys/zfs_context.h>
#include <sys/spa.h>
#include <sys/spa_impl.h>
#include <sys/dmu.h>
#include <sys/zap.h>
#include <sys/arc.h>
#include <sys/stat.h>
#include <sys/resource.h>
#include <sys/zil.h>
#include <sys/zil_impl.h>
#include <sys/dsl_dataset.h>
#include <sys/vdev_impl.h>
#include <sys/dmu_tx.h>
#include <sys/dsl_pool.h>
#include <sys/abd.h>

/*
 * The ZFS Intent Log (ZIL) saves "transaction records" (itxs) of system
 * calls that change the file system. Each itx has enough information to
 * be able to replay them after a system crash, power loss, or
 * equivalent failure mode. These are stored in memory until either:
 *
 *   1. they are committed to the pool by the DMU transaction group
 *      (txg), at which point they can be discarded; or
 *   2. they are committed to the on-disk ZIL for the dataset being
 *      modified (e.g. due to an fsync, O_DSYNC, or other synchronous
 *      requirement).
 *
 * In the event of a crash or power loss, the itxs contained by each
 * dataset's on-disk ZIL will be replayed when that dataset is first
 * instantiated (e.g. if the dataset is a normal filesystem, when it is
 * first mounted).
 *
 * As hinted at above, there is one ZIL per dataset (both the in-memory
 * representation, and the on-disk representation). The on-disk format
 * consists of 3 parts:
 *
 *	- a single, per-dataset, ZIL header; which points to a chain of
 *	- zero or more ZIL blocks; each of which contains
 *	- zero or more ZIL records
 *
 * A ZIL record holds the information necessary to replay a single
 * system call transaction. A ZIL block can hold many ZIL records, and
 * the blocks are chained together, similarly to a singly linked list.
 *
 * Each ZIL block contains a block pointer (blkptr_t) to the next ZIL
 * block in the chain, and the ZIL header points to the first block in
 * the chain.
 *
 * Note that there is no fixed place in the pool to hold these ZIL
 * blocks; they are dynamically allocated and freed as needed from the
 * blocks available on the pool, though they can be preferentially
 * allocated from a dedicated "log" vdev.
 */
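/*
 * Pictorially, with each arrow representing the blkptr_t embedded in
 * the previous element:
 *
 *	zil_header_t --> ZIL block --> ZIL block --> ...
 *	(zh_log)         [records]     [records]
 */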
/*
 * This controls the amount of time that a ZIL block (lwb) will remain
 * "open" when it isn't "full", and it has a thread waiting for it to be
 * committed to stable storage. Please refer to the zil_commit_waiter()
 * function (and the comments within it) for more details.
 */
int zfs_commit_timeout_pct = 5;

/*
 * Disable intent logging replay. This global ZIL switch affects all pools.
 */
int zil_replay_disable = 0;

/*
 * Disable the DKIOCFLUSHWRITECACHE commands that are normally sent to
 * the disk(s) by the ZIL after an LWB write has completed. Setting this
 * will cause ZIL corruption on power loss if a volatile out-of-order
 * write cache is enabled.
 */
boolean_t zil_nocacheflush = B_FALSE;

/*
 * Limit SLOG write size per commit executed with synchronous priority.
 * Any writes above that will be executed with lower (asynchronous)
 * priority to limit potential SLOG device abuse by a single active ZIL
 * writer.
 */
uint64_t zil_slog_bulk = 768 * 1024;
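/*
 * kmem caches backing the log write block (lwb_t) and commit waiter
 * (zil_commit_waiter_t) allocations used throughout this file; see
 * zil_alloc_lwb() below.
 */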
static kmem_cache_t *zil_lwb_cache;
static kmem_cache_t *zil_zcw_cache;

static void zil_async_to_sync(zilog_t *zilog, uint64_t foid);

#define	LWB_EMPTY(lwb) ((BP_GET_LSIZE(&lwb->lwb_blk) - \
    sizeof (zil_chain_t)) == (lwb->lwb_sz - lwb->lwb_nused))

static int
zil_bp_compare(const void *x1, const void *x2)
{
	const dva_t *dva1 = &((zil_bp_node_t *)x1)->zn_dva;
	const dva_t *dva2 = &((zil_bp_node_t *)x2)->zn_dva;

	if (DVA_GET_VDEV(dva1) < DVA_GET_VDEV(dva2))
		return (-1);
	if (DVA_GET_VDEV(dva1) > DVA_GET_VDEV(dva2))
		return (1);

	if (DVA_GET_OFFSET(dva1) < DVA_GET_OFFSET(dva2))
		return (-1);
	if (DVA_GET_OFFSET(dva1) > DVA_GET_OFFSET(dva2))
		return (1);

	return (0);
}

static void
zil_bp_tree_init(zilog_t *zilog)
{
	avl_create(&zilog->zl_bp_tree, zil_bp_compare,
	    sizeof (zil_bp_node_t), offsetof(zil_bp_node_t, zn_node));
}

static void
zil_bp_tree_fini(zilog_t *zilog)
{
	avl_tree_t *t = &zilog->zl_bp_tree;
	zil_bp_node_t *zn;
	void *cookie = NULL;

	while ((zn = avl_destroy_nodes(t, &cookie)) != NULL)
		kmem_free(zn, sizeof (zil_bp_node_t));

	avl_destroy(t);
}

int
zil_bp_tree_add(zilog_t *zilog, const blkptr_t *bp)
{
	avl_tree_t *t = &zilog->zl_bp_tree;
	const dva_t *dva;
	zil_bp_node_t *zn;
	avl_index_t where;

	if (BP_IS_EMBEDDED(bp))
		return (0);

	dva = BP_IDENTITY(bp);

	if (avl_find(t, dva, &where) != NULL)
		return (SET_ERROR(EEXIST));

	zn = kmem_alloc(sizeof (zil_bp_node_t), KM_SLEEP);
	zn->zn_dva = *dva;
	avl_insert(t, zn, where);

	return (0);
}

static zil_header_t *
zil_header_in_syncing_context(zilog_t *zilog)
{
	return ((zil_header_t *)zilog->zl_header);
}
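/*
 * Seed the checksum words embedded in the first block of a new log
 * chain: two random GUID words identify the chain, the objset ID ties
 * it to its dataset, and the block sequence number starts at 1.
 */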
static void
zil_init_log_chain(zilog_t *zilog, blkptr_t *bp)
{
	zio_cksum_t *zc = &bp->blk_cksum;

	zc->zc_word[ZIL_ZC_GUID_0] = spa_get_random(-1ULL);
	zc->zc_word[ZIL_ZC_GUID_1] = spa_get_random(-1ULL);
	zc->zc_word[ZIL_ZC_OBJSET] = dmu_objset_id(zilog->zl_os);
	zc->zc_word[ZIL_ZC_SEQ] = 1ULL;
}

/*
 * Read a log block and make sure it's valid.
 */
static int
zil_read_log_block(zilog_t *zilog, const blkptr_t *bp, blkptr_t *nbp, void *dst,
    char **end)
{
	enum zio_flag zio_flags = ZIO_FLAG_CANFAIL;
	arc_flags_t aflags = ARC_FLAG_WAIT;
	arc_buf_t *abuf = NULL;
	zbookmark_phys_t zb;
	int error;

	if (zilog->zl_header->zh_claim_txg == 0)
		zio_flags |= ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SCRUB;

	if (!(zilog->zl_header->zh_flags & ZIL_CLAIM_LR_SEQ_VALID))
		zio_flags |= ZIO_FLAG_SPECULATIVE;

	SET_BOOKMARK(&zb, bp->blk_cksum.zc_word[ZIL_ZC_OBJSET],
	    ZB_ZIL_OBJECT, ZB_ZIL_LEVEL, bp->blk_cksum.zc_word[ZIL_ZC_SEQ]);

	error = arc_read(NULL, zilog->zl_spa, bp, arc_getbuf_func, &abuf,
	    ZIO_PRIORITY_SYNC_READ, zio_flags, &aflags, &zb);

	if (error == 0) {
		zio_cksum_t cksum = bp->blk_cksum;

		/*
		 * Validate the checksummed log block.
		 *
		 * Sequence numbers should be... sequential. The checksum
		 * verifier for the next block should be bp's checksum plus 1.
		 *
		 * Also check the log chain linkage and size used.
		 */
		cksum.zc_word[ZIL_ZC_SEQ]++;

		if (BP_GET_CHECKSUM(bp) == ZIO_CHECKSUM_ZILOG2) {
			zil_chain_t *zilc = abuf->b_data;
			char *lr = (char *)(zilc + 1);
			uint64_t len = zilc->zc_nused - sizeof (zil_chain_t);

			if (bcmp(&cksum, &zilc->zc_next_blk.blk_cksum,
			    sizeof (cksum)) || BP_IS_HOLE(&zilc->zc_next_blk)) {
				error = SET_ERROR(ECKSUM);
			} else {
				ASSERT3U(len, <=, SPA_OLD_MAXBLOCKSIZE);
				bcopy(lr, dst, len);
				*end = (char *)dst + len;
				*nbp = zilc->zc_next_blk;
			}
		} else {
			char *lr = abuf->b_data;
			uint64_t size = BP_GET_LSIZE(bp);
			zil_chain_t *zilc = (zil_chain_t *)(lr + size) - 1;

			if (bcmp(&cksum, &zilc->zc_next_blk.blk_cksum,
			    sizeof (cksum)) || BP_IS_HOLE(&zilc->zc_next_blk) ||
			    (zilc->zc_nused > (size - sizeof (*zilc)))) {
				error = SET_ERROR(ECKSUM);
			} else {
				ASSERT3U(zilc->zc_nused, <=,
				    SPA_OLD_MAXBLOCKSIZE);
				bcopy(lr, dst, zilc->zc_nused);
				*end = (char *)dst + zilc->zc_nused;
				*nbp = zilc->zc_next_blk;
			}
		}

		arc_buf_destroy(abuf, &abuf);
	}

	return (error);
}
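/*
 * The two block layouts handled above differ only in where the
 * zil_chain_t lives:
 *
 *	ZIO_CHECKSUM_ZILOG2:	[ zil_chain_t | records ... ]
 *	ZIO_CHECKSUM_ZILOG:	[ records ... | zil_chain_t ]
 *
 * In both cases zc_next_blk is the blkptr_t of the next block in the
 * chain, whose embedded checksum words must match this block's, with
 * the sequence word incremented by one.
 */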
/*
 * Read a TX_WRITE log data block.
 */
static int
zil_read_log_data(zilog_t *zilog, const lr_write_t *lr, void *wbuf)
{
	enum zio_flag zio_flags = ZIO_FLAG_CANFAIL;
	const blkptr_t *bp = &lr->lr_blkptr;
	arc_flags_t aflags = ARC_FLAG_WAIT;
	arc_buf_t *abuf = NULL;
	zbookmark_phys_t zb;
	int error;

	if (BP_IS_HOLE(bp)) {
		if (wbuf != NULL)
			bzero(wbuf, MAX(BP_GET_LSIZE(bp), lr->lr_length));
		return (0);
	}

	if (zilog->zl_header->zh_claim_txg == 0)
		zio_flags |= ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SCRUB;

	SET_BOOKMARK(&zb, dmu_objset_id(zilog->zl_os), lr->lr_foid,
	    ZB_ZIL_LEVEL, lr->lr_offset / BP_GET_LSIZE(bp));

	error = arc_read(NULL, zilog->zl_spa, bp, arc_getbuf_func, &abuf,
	    ZIO_PRIORITY_SYNC_READ, zio_flags, &aflags, &zb);

	if (error == 0) {
		if (wbuf != NULL)
			bcopy(abuf->b_data, wbuf, arc_buf_size(abuf));
		arc_buf_destroy(abuf, &abuf);
	}

	return (error);
}

/*
 * Parse the intent log, and call parse_func for each valid record within.
 */
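/*
 * parse_blk_func is invoked once per block in the chain, and
 * parse_lr_func once per record within each valid block; a nonzero
 * return from either callback ends the walk. Callers pair them up,
 * e.g. zil_claim() passes zil_claim_log_block()/zil_claim_log_record()
 * and zil_destroy_sync() passes the zil_free_log_* equivalents.
 */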
int
zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func,
    zil_parse_lr_func_t *parse_lr_func, void *arg, uint64_t txg)
{
	const zil_header_t *zh = zilog->zl_header;
	boolean_t claimed = !!zh->zh_claim_txg;
	uint64_t claim_blk_seq = claimed ? zh->zh_claim_blk_seq : UINT64_MAX;
	uint64_t claim_lr_seq = claimed ? zh->zh_claim_lr_seq : UINT64_MAX;
	uint64_t max_blk_seq = 0;
	uint64_t max_lr_seq = 0;
	uint64_t blk_count = 0;
	uint64_t lr_count = 0;
	blkptr_t blk, next_blk;
	char *lrbuf, *lrp;
	int error = 0;

	/*
	 * Old logs didn't record the maximum zh_claim_lr_seq.
	 */
	if (!(zh->zh_flags & ZIL_CLAIM_LR_SEQ_VALID))
		claim_lr_seq = UINT64_MAX;

	/*
	 * Starting at the block pointed to by zh_log we read the log chain.
	 * For each block in the chain we strongly check that block to
	 * ensure its validity. We stop when an invalid block is found.
	 * For each block pointer in the chain we call parse_blk_func().
	 * For each record in each valid block we call parse_lr_func().
	 * If the log has been claimed, stop if we encounter a sequence
	 * number greater than the highest claimed sequence number.
	 */
	lrbuf = zio_buf_alloc(SPA_OLD_MAXBLOCKSIZE);
	zil_bp_tree_init(zilog);

	for (blk = zh->zh_log; !BP_IS_HOLE(&blk); blk = next_blk) {
		uint64_t blk_seq = blk.blk_cksum.zc_word[ZIL_ZC_SEQ];
		int reclen;
		char *end;

		if (blk_seq > claim_blk_seq)
			break;
		if ((error = parse_blk_func(zilog, &blk, arg, txg)) != 0)
			break;
		ASSERT3U(max_blk_seq, <, blk_seq);
		max_blk_seq = blk_seq;
		blk_count++;

		if (max_lr_seq == claim_lr_seq && max_blk_seq == claim_blk_seq)
			break;

		error = zil_read_log_block(zilog, &blk, &next_blk, lrbuf, &end);
		if (error != 0)
			break;

		for (lrp = lrbuf; lrp < end; lrp += reclen) {
			lr_t *lr = (lr_t *)lrp;
			reclen = lr->lrc_reclen;
			ASSERT3U(reclen, >=, sizeof (lr_t));
			if (lr->lrc_seq > claim_lr_seq)
				goto done;
			if ((error = parse_lr_func(zilog, lr, arg, txg)) != 0)
				goto done;
			ASSERT3U(max_lr_seq, <, lr->lrc_seq);
			max_lr_seq = lr->lrc_seq;
			lr_count++;
		}
	}
done:
	zilog->zl_parse_error = error;
	zilog->zl_parse_blk_seq = max_blk_seq;
	zilog->zl_parse_lr_seq = max_lr_seq;
	zilog->zl_parse_blk_count = blk_count;
	zilog->zl_parse_lr_count = lr_count;

	ASSERT(!claimed || !(zh->zh_flags & ZIL_CLAIM_LR_SEQ_VALID) ||
	    (max_blk_seq == claim_blk_seq && max_lr_seq == claim_lr_seq));

	zil_bp_tree_fini(zilog);
	zio_buf_free(lrbuf, SPA_OLD_MAXBLOCKSIZE);

	return (error);
}

/* ARGSUSED */
static int
zil_clear_log_block(zilog_t *zilog, blkptr_t *bp, void *tx, uint64_t first_txg)
{
	ASSERT(!BP_IS_HOLE(bp));

	/*
	 * As we call this function from the context of a rewind to a
	 * checkpoint, each ZIL block whose txg is later than the txg
	 * that we rewind to is invalid. Thus, we return -1 so
	 * zil_parse() doesn't attempt to read it.
	 */
	if (bp->blk_birth >= first_txg)
		return (-1);

	if (zil_bp_tree_add(zilog, bp) != 0)
		return (0);

	zio_free(zilog->zl_spa, first_txg, bp);
	return (0);
}

/* ARGSUSED */
static int
zil_noop_log_record(zilog_t *zilog, lr_t *lrc, void *tx, uint64_t first_txg)
{
	return (0);
}

static int
zil_claim_log_block(zilog_t *zilog, blkptr_t *bp, void *tx, uint64_t first_txg)
{
	/*
	 * Claim log block if not already committed and not already claimed.
	 * If tx == NULL, just verify that the block is claimable.
	 */
	if (BP_IS_HOLE(bp) || bp->blk_birth < first_txg ||
	    zil_bp_tree_add(zilog, bp) != 0)
		return (0);

	return (zio_wait(zio_claim(NULL, zilog->zl_spa,
	    tx == NULL ? 0 : first_txg, bp, spa_claim_notify, NULL,
	    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SCRUB)));
}

static int
zil_claim_log_record(zilog_t *zilog, lr_t *lrc, void *tx, uint64_t first_txg)
{
	lr_write_t *lr = (lr_write_t *)lrc;
	int error;

	if (lrc->lrc_txtype != TX_WRITE)
		return (0);

	/*
	 * If the block is not readable, don't claim it. This can happen
	 * in normal operation when a log block is written to disk before
	 * some of the dmu_sync() blocks it points to. In this case, the
	 * transaction cannot have been committed to anyone (we would have
	 * waited for all writes to be stable first), so it is semantically
	 * correct to declare this the end of the log.
	 */
	if (lr->lr_blkptr.blk_birth >= first_txg &&
	    (error = zil_read_log_data(zilog, lr, NULL)) != 0)
		return (error);
	return (zil_claim_log_block(zilog, &lr->lr_blkptr, tx, first_txg));
}

/* ARGSUSED */
static int
zil_free_log_block(zilog_t *zilog, blkptr_t *bp, void *tx, uint64_t claim_txg)
{
	zio_free(zilog->zl_spa, dmu_tx_get_txg(tx), bp);

	return (0);
}

static int
zil_free_log_record(zilog_t *zilog, lr_t *lrc, void *tx, uint64_t claim_txg)
{
	lr_write_t *lr = (lr_write_t *)lrc;
	blkptr_t *bp = &lr->lr_blkptr;

	/*
	 * If we previously claimed it, we need to free it.
	 */
	if (claim_txg != 0 && lrc->lrc_txtype == TX_WRITE &&
	    bp->blk_birth >= claim_txg && zil_bp_tree_add(zilog, bp) == 0 &&
	    !BP_IS_HOLE(bp))
		zio_free(zilog->zl_spa, dmu_tx_get_txg(tx), bp);

	return (0);
}
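/*
 * AVL comparator for an lwb's lwb_vdev_tree, which records (by vdev
 * id) the vdevs an lwb has written to, so they can be flushed once
 * the lwb's write completes.
 */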
static int
zil_lwb_vdev_compare(const void *x1, const void *x2)
{
	const uint64_t v1 = ((zil_vdev_node_t *)x1)->zv_vdev;
	const uint64_t v2 = ((zil_vdev_node_t *)x2)->zv_vdev;

	if (v1 < v2)
		return (-1);
	if (v1 > v2)
		return (1);

	return (0);
}

static lwb_t *
zil_alloc_lwb(zilog_t *zilog, blkptr_t *bp, boolean_t slog, uint64_t txg)
{
	lwb_t *lwb;

	lwb = kmem_cache_alloc(zil_lwb_cache, KM_SLEEP);
	lwb->lwb_zilog = zilog;
	lwb->lwb_blk = *bp;
	lwb->lwb_slog = slog;
	lwb->lwb_state = LWB_STATE_CLOSED;
	lwb->lwb_buf = zio_buf_alloc(BP_GET_LSIZE(bp));
	lwb->lwb_max_txg = txg;
	lwb->lwb_write_zio = NULL;
	lwb->lwb_root_zio = NULL;
	lwb->lwb_tx = NULL;
	lwb->lwb_issued_timestamp = 0;
	if (BP_GET_CHECKSUM(bp) == ZIO_CHECKSUM_ZILOG2) {
		lwb->lwb_nused = sizeof (zil_chain_t);
		lwb->lwb_sz = BP_GET_LSIZE(bp);
	} else {
		lwb->lwb_nused = 0;
		lwb->lwb_sz = BP_GET_LSIZE(bp) - sizeof (zil_chain_t);
	}

	mutex_enter(&zilog->zl_lock);
	list_insert_tail(&zilog->zl_lwb_list, lwb);
	mutex_exit(&zilog->zl_lock);

	ASSERT(!MUTEX_HELD(&lwb->lwb_vdev_lock));
	ASSERT(avl_is_empty(&lwb->lwb_vdev_tree));
	VERIFY(list_is_empty(&lwb->lwb_waiters));

	return (lwb);
}
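/*
 * Free an lwb that is either still CLOSED (never issued) or has fully
 * completed (FLUSH_DONE). The caller must hold the zilog's zl_lock.
 */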
static void
zil_free_lwb(zilog_t *zilog, lwb_t *lwb)
{
	ASSERT(MUTEX_HELD(&zilog->zl_lock));
	ASSERT(!MUTEX_HELD(&lwb->lwb_vdev_lock));
	VERIFY(list_is_empty(&lwb->lwb_waiters));
	ASSERT(avl_is_empty(&lwb->lwb_vdev_tree));
	ASSERT3P(lwb->lwb_write_zio, ==, NULL);
	ASSERT3P(lwb->lwb_root_zio, ==, NULL);
	ASSERT3U(lwb->lwb_max_txg, <=, spa_syncing_txg(zilog->zl_spa));
	ASSERT(lwb->lwb_state == LWB_STATE_CLOSED ||
	    lwb->lwb_state == LWB_STATE_FLUSH_DONE);

	/*
	 * Clear the zilog's field to indicate this lwb is no longer
	 * valid, and prevent use-after-free errors.
	 */
	if (zilog->zl_last_lwb_opened == lwb)
		zilog->zl_last_lwb_opened = NULL;

	kmem_cache_free(zil_lwb_cache, lwb);
}

/*
 * Called when we create in-memory log transactions so that we know
 * to clean up the itxs at the end of spa_sync().
 */
void
zilog_dirty(zilog_t *zilog, uint64_t txg)
{
	dsl_pool_t *dp = zilog->zl_dmu_pool;
	dsl_dataset_t *ds = dmu_objset_ds(zilog->zl_os);

	ASSERT(spa_writeable(zilog->zl_spa));

	if (ds->ds_is_snapshot)
		panic("dirtying snapshot!");

	if (txg_list_add(&dp->dp_dirty_zilogs, zilog, txg)) {
		/* up the hold count until we can be written out */
		dmu_buf_add_ref(ds->ds_dbuf, zilog);

		zilog->zl_dirty_max_txg = MAX(txg, zilog->zl_dirty_max_txg);
	}
}

/*
 * Determine if the zil is dirty in the specified txg. Callers wanting to
 * ensure that the dirty state does not change must hold the itxg_lock for
 * the specified txg. Holding the lock will ensure that the zil cannot be
 * dirtied (zil_itx_assign) or cleaned (zil_clean) while we check its current
 * state.
 */
boolean_t
zilog_is_dirty_in_txg(zilog_t *zilog, uint64_t txg)
{
	dsl_pool_t *dp = zilog->zl_dmu_pool;

	if (txg_list_member(&dp->dp_dirty_zilogs, zilog, txg & TXG_MASK))
		return (B_TRUE);
	return (B_FALSE);
}

/*
 * Determine if the zil is dirty. The zil is considered dirty if it has
 * any pending itx records that have not been cleaned by zil_clean().
 */
boolean_t
zilog_is_dirty(zilog_t *zilog)
{
	dsl_pool_t *dp = zilog->zl_dmu_pool;

	for (int t = 0; t < TXG_SIZE; t++) {
		if (txg_list_member(&dp->dp_dirty_zilogs, zilog, t))
			return (B_TRUE);
	}
	return (B_FALSE);
}

/*
 * Create an on-disk intent log.
 */
static lwb_t *
zil_create(zilog_t *zilog)
{
	const zil_header_t *zh = zilog->zl_header;
	lwb_t *lwb = NULL;
	uint64_t txg = 0;
	dmu_tx_t *tx = NULL;
	blkptr_t blk;
	int error = 0;
	boolean_t slog = FALSE;

	/*
	 * Wait for any previous destroy to complete.
	 */
	txg_wait_synced(zilog->zl_dmu_pool, zilog->zl_destroy_txg);

	ASSERT(zh->zh_claim_txg == 0);
	ASSERT(zh->zh_replay_seq == 0);

	blk = zh->zh_log;

	/*
	 * Allocate an initial log block if:
	 * - there isn't one already
	 * - the existing block is the wrong endianness
	 */
	if (BP_IS_HOLE(&blk) || BP_SHOULD_BYTESWAP(&blk)) {
		tx = dmu_tx_create(zilog->zl_os);
		VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
		dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx);
		txg = dmu_tx_get_txg(tx);

		if (!BP_IS_HOLE(&blk)) {
			zio_free(zilog->zl_spa, txg, &blk);
			BP_ZERO(&blk);
		}

		error = zio_alloc_zil(zilog->zl_spa,
		    zilog->zl_os->os_dsl_dataset->ds_object, txg, &blk, NULL,
		    ZIL_MIN_BLKSZ, &slog);

		if (error == 0)
			zil_init_log_chain(zilog, &blk);
	}

	/*
	 * Allocate a log write block (lwb) for the first log block.
	 */
	if (error == 0)
		lwb = zil_alloc_lwb(zilog, &blk, slog, txg);

	/*
	 * If we just allocated the first log block, commit our transaction
	 * and wait for zil_sync() to stuff the block pointer into zh_log.
	 * (zh is part of the MOS, so we cannot modify it in open context.)
	 */
	if (tx != NULL) {
		dmu_tx_commit(tx);
		txg_wait_synced(zilog->zl_dmu_pool, txg);
	}

	ASSERT(bcmp(&blk, &zh->zh_log, sizeof (blk)) == 0);

	return (lwb);
}

/*
 * In one tx, free all log blocks and clear the log header. If keep_first
 * is set, then we're replaying a log with no content. We want to keep the
 * first block, however, so that the first synchronous transaction doesn't
 * require a txg_wait_synced() in zil_create(). We don't need to
 * txg_wait_synced() here either when keep_first is set, because both
 * zil_create() and zil_destroy() will wait for any in-progress destroys
 * to complete.
 */
void
zil_destroy(zilog_t *zilog, boolean_t keep_first)
{
	const zil_header_t *zh = zilog->zl_header;
	lwb_t *lwb;
	dmu_tx_t *tx;
	uint64_t txg;

	/*
	 * Wait for any previous destroy to complete.
	 */
	txg_wait_synced(zilog->zl_dmu_pool, zilog->zl_destroy_txg);

	zilog->zl_old_header = *zh;		/* debugging aid */

	if (BP_IS_HOLE(&zh->zh_log))
		return;

	tx = dmu_tx_create(zilog->zl_os);
	VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
	dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx);
	txg = dmu_tx_get_txg(tx);

	mutex_enter(&zilog->zl_lock);

	ASSERT3U(zilog->zl_destroy_txg, <, txg);
	zilog->zl_destroy_txg = txg;
	zilog->zl_keep_first = keep_first;

	if (!list_is_empty(&zilog->zl_lwb_list)) {
		ASSERT(zh->zh_claim_txg == 0);
		VERIFY(!keep_first);
		while ((lwb = list_head(&zilog->zl_lwb_list)) != NULL) {
			list_remove(&zilog->zl_lwb_list, lwb);
			if (lwb->lwb_buf != NULL)
				zio_buf_free(lwb->lwb_buf, lwb->lwb_sz);
			zio_free(zilog->zl_spa, txg, &lwb->lwb_blk);
			zil_free_lwb(zilog, lwb);
		}
	} else if (!keep_first) {
		zil_destroy_sync(zilog, tx);
	}
	mutex_exit(&zilog->zl_lock);

	dmu_tx_commit(tx);
}

void
zil_destroy_sync(zilog_t *zilog, dmu_tx_t *tx)
{
	ASSERT(list_is_empty(&zilog->zl_lwb_list));
	(void) zil_parse(zilog, zil_free_log_block,
	    zil_free_log_record, tx, zilog->zl_header->zh_claim_txg);
}
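/*
 * Called for each dataset during pool load to claim (or, when rewinding
 * to a checkpoint, clear) the blocks of the dataset's intent log.
 */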
int
zil_claim(dsl_pool_t *dp, dsl_dataset_t *ds, void *txarg)
{
	dmu_tx_t *tx = txarg;
	zilog_t *zilog;
	uint64_t first_txg;
	zil_header_t *zh;
	objset_t *os;
	int error;

	error = dmu_objset_own_obj(dp, ds->ds_object,
	    DMU_OST_ANY, B_FALSE, FTAG, &os);
	if (error != 0) {
		/*
		 * EBUSY indicates that the objset is inconsistent, in which
		 * case it cannot have a ZIL.
		 */
		if (error != EBUSY) {
			cmn_err(CE_WARN, "can't open objset for %llu, error %u",
			    (unsigned long long)ds->ds_object, error);
		}
		return (0);
	}

	zilog = dmu_objset_zil(os);
	zh = zil_header_in_syncing_context(zilog);
	ASSERT3U(tx->tx_txg, ==, spa_first_txg(zilog->zl_spa));
	first_txg = spa_min_claim_txg(zilog->zl_spa);

	/*
	 * If the spa_log_state is not set to be cleared, check whether
	 * the current uberblock is a checkpoint one and if the current
	 * header has been claimed before moving on.
	 *
	 * If the current uberblock is a checkpointed uberblock then
	 * one of the following scenarios took place:
	 *
	 * 1] We are currently rewinding to the checkpoint of the pool.
	 * 2] We crashed in the middle of a checkpoint rewind but we
	 *    did manage to write the checkpointed uberblock to the
	 *    vdev labels, so when we tried to import the pool again
	 *    the checkpointed uberblock was selected by the import
	 *    procedure.
	 *
	 * In both cases we want to zero out all the ZIL blocks, except
	 * the ones that have been claimed at the time of the checkpoint
	 * (their zh_claim_txg != 0). The reason is that these blocks
	 * may be corrupted since we may have reused their locations on
	 * disk after we took the checkpoint.
	 *
	 * We could try to set spa_log_state to SPA_LOG_CLEAR earlier
	 * when we first figure out whether the current uberblock is
	 * checkpointed or not. Unfortunately, that would discard all
	 * the logs, including the ones that are claimed, and we would
	 * leak space.
	 */
	if (spa_get_log_state(zilog->zl_spa) == SPA_LOG_CLEAR ||
	    (zilog->zl_spa->spa_uberblock.ub_checkpoint_txg != 0 &&
	    zh->zh_claim_txg == 0)) {
		if (!BP_IS_HOLE(&zh->zh_log)) {
			(void) zil_parse(zilog, zil_clear_log_block,
			    zil_noop_log_record, tx, first_txg);
		}
		BP_ZERO(&zh->zh_log);
		dsl_dataset_dirty(dmu_objset_ds(os), tx);
		dmu_objset_disown(os, FTAG);
		return (0);
	}

	/*
	 * If we are not rewinding and opening the pool normally, then
	 * the min_claim_txg should be equal to the first txg of the pool.
	 */
	ASSERT3U(first_txg, ==, spa_first_txg(zilog->zl_spa));

	/*
	 * Claim all log blocks if we haven't already done so, and remember
	 * the highest claimed sequence number. This ensures that if we can
	 * read only part of the log now (e.g. due to a missing device),
	 * but we can read the entire log later, we will not try to replay
	 * or destroy beyond the last block we successfully claimed.
	 */
	ASSERT3U(zh->zh_claim_txg, <=, first_txg);
	if (zh->zh_claim_txg == 0 && !BP_IS_HOLE(&zh->zh_log)) {
		(void) zil_parse(zilog, zil_claim_log_block,
		    zil_claim_log_record, tx, first_txg);
		zh->zh_claim_txg = first_txg;
		zh->zh_claim_blk_seq = zilog->zl_parse_blk_seq;
		zh->zh_claim_lr_seq = zilog->zl_parse_lr_seq;
		if (zilog->zl_parse_lr_count || zilog->zl_parse_blk_count > 1)
			zh->zh_flags |= ZIL_REPLAY_NEEDED;
		zh->zh_flags |= ZIL_CLAIM_LR_SEQ_VALID;
		dsl_dataset_dirty(dmu_objset_ds(os), tx);
	}

	ASSERT3U(first_txg, ==, (spa_last_synced_txg(zilog->zl_spa) + 1));
	dmu_objset_disown(os, FTAG);
	return (0);
}

/*
 * Check the log by walking the log chain.
 * Checksum errors are ok as they indicate the end of the chain.
 * Any other error (no device or read failure) returns an error.
 */
/* ARGSUSED */
int
zil_check_log_chain(dsl_pool_t *dp, dsl_dataset_t *ds, void *tx)
{
	zilog_t *zilog;
	objset_t *os;
	blkptr_t *bp;
	int error;

	ASSERT(tx == NULL);

	error = dmu_objset_from_ds(ds, &os);
	if (error != 0) {
		cmn_err(CE_WARN, "can't open objset %llu, error %d",
		    (unsigned long long)ds->ds_object, error);
		return (0);
	}

	zilog = dmu_objset_zil(os);
	bp = (blkptr_t *)&zilog->zl_header->zh_log;

	if (!BP_IS_HOLE(bp)) {
		vdev_t *vd;
		boolean_t valid = B_TRUE;

		/*
		 * Check the first block and determine if it's on a log device
		 * which may have been removed or faulted prior to loading this
		 * pool. If so, there's no point in checking the rest of the
		 * log as its content should have already been synced to the
		 * pool.
		 */
		spa_config_enter(os->os_spa, SCL_STATE, FTAG, RW_READER);
		vd = vdev_lookup_top(os->os_spa, DVA_GET_VDEV(&bp->blk_dva[0]));
		if (vd->vdev_islog && vdev_is_dead(vd))
			valid = vdev_log_state_valid(vd);
		spa_config_exit(os->os_spa, SCL_STATE, FTAG);

		if (!valid)
			return (0);

		/*
		 * Check whether the current uberblock is checkpointed (e.g.
		 * we are rewinding) and whether the current header has been
		 * claimed or not. If it hasn't then skip verifying it. We
		 * do this because its ZIL blocks may be part of the pool's
		 * state before the rewind, which is no longer valid.
		 */
		zil_header_t *zh = zil_header_in_syncing_context(zilog);
		if (zilog->zl_spa->spa_uberblock.ub_checkpoint_txg != 0 &&
		    zh->zh_claim_txg == 0)
			return (0);
	}

	/*
	 * Because tx == NULL, zil_claim_log_block() will not actually claim
	 * any blocks, but just determine whether it is possible to do so.
	 * In addition to checking the log chain, zil_claim_log_block()
	 * will invoke zio_claim() with a done func of spa_claim_notify(),
	 * which will update spa_max_claim_txg. See spa_load() for details.
	 */
	error = zil_parse(zilog, zil_claim_log_block, zil_claim_log_record, tx,
	    zilog->zl_header->zh_claim_txg ? -1ULL :
	    spa_min_claim_txg(os->os_spa));

	return ((error == ECKSUM || error == ENOENT) ? 0 : error);
}

/*
 * When an itx is "skipped", this function is used to properly mark the
 * waiter as "done", and signal any thread(s) waiting on it. An itx can
 * be skipped (and not committed to an lwb) for a variety of reasons,
 * one of them being that the itx was committed via spa_sync(), prior to
 * it being committed to an lwb; this can happen if a thread calling
 * zil_commit() is racing with spa_sync().
 */
static void
zil_commit_waiter_skip(zil_commit_waiter_t *zcw)
{
	mutex_enter(&zcw->zcw_lock);
	ASSERT3B(zcw->zcw_done, ==, B_FALSE);
	zcw->zcw_done = B_TRUE;
	cv_broadcast(&zcw->zcw_cv);
	mutex_exit(&zcw->zcw_lock);
}
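/*
 * For reference, a minimal sketch of the consuming side of this
 * handshake (the actual logic lives in zil_commit_waiter(), outside
 * this section): a thread that issued a commit itx blocks on the
 * waiter until zil_commit_waiter_skip() or zil_lwb_flush_vdevs_done()
 * marks it done:
 *
 *	mutex_enter(&zcw->zcw_lock);
 *	while (!zcw->zcw_done)
 *		cv_wait(&zcw->zcw_cv, &zcw->zcw_lock);
 *	mutex_exit(&zcw->zcw_lock);
 */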
/*
 * This function is used when the given waiter is to be linked into an
 * lwb's "lwb_waiters" list; i.e. when the itx is committed to the lwb.
 * At this point, the waiter will no longer be referenced by the itx,
 * and instead, will be referenced by the lwb.
 */
static void
zil_commit_waiter_link_lwb(zil_commit_waiter_t *zcw, lwb_t *lwb)
{
	/*
	 * The lwb_waiters field of the lwb is protected by the zilog's
	 * zl_lock, thus it must be held when calling this function.
	 */
	ASSERT(MUTEX_HELD(&lwb->lwb_zilog->zl_lock));

	mutex_enter(&zcw->zcw_lock);
	ASSERT(!list_link_active(&zcw->zcw_node));
	ASSERT3P(zcw->zcw_lwb, ==, NULL);
	ASSERT3P(lwb, !=, NULL);
	ASSERT(lwb->lwb_state == LWB_STATE_OPENED ||
	    lwb->lwb_state == LWB_STATE_ISSUED ||
	    lwb->lwb_state == LWB_STATE_WRITE_DONE);

	list_insert_tail(&lwb->lwb_waiters, zcw);
	zcw->zcw_lwb = lwb;
	mutex_exit(&zcw->zcw_lock);
}

/*
 * This function is used when zio_alloc_zil() fails to allocate a ZIL
 * block, and the given waiter must be linked to the "nolwb waiters"
 * list inside of zil_process_commit_list().
 */
static void
zil_commit_waiter_link_nolwb(zil_commit_waiter_t *zcw, list_t *nolwb)
{
	mutex_enter(&zcw->zcw_lock);
	ASSERT(!list_link_active(&zcw->zcw_node));
	ASSERT3P(zcw->zcw_lwb, ==, NULL);
	list_insert_tail(nolwb, zcw);
	mutex_exit(&zcw->zcw_lock);
}

void
zil_lwb_add_block(lwb_t *lwb, const blkptr_t *bp)
{
	avl_tree_t *t = &lwb->lwb_vdev_tree;
	avl_index_t where;
	zil_vdev_node_t *zv, zvsearch;
	int ndvas = BP_GET_NDVAS(bp);
	int i;

	if (zil_nocacheflush)
		return;

	mutex_enter(&lwb->lwb_vdev_lock);
	for (i = 0; i < ndvas; i++) {
		zvsearch.zv_vdev = DVA_GET_VDEV(&bp->blk_dva[i]);
		if (avl_find(t, &zvsearch, &where) == NULL) {
			zv = kmem_alloc(sizeof (*zv), KM_SLEEP);
			zv->zv_vdev = zvsearch.zv_vdev;
			avl_insert(t, zv, where);
		}
	}
	mutex_exit(&lwb->lwb_vdev_lock);
}

static void
zil_lwb_flush_defer(lwb_t *lwb, lwb_t *nlwb)
{
	avl_tree_t *src = &lwb->lwb_vdev_tree;
	avl_tree_t *dst = &nlwb->lwb_vdev_tree;
	void *cookie = NULL;
	zil_vdev_node_t *zv;

	ASSERT3S(lwb->lwb_state, ==, LWB_STATE_WRITE_DONE);
	ASSERT3S(nlwb->lwb_state, !=, LWB_STATE_WRITE_DONE);
	ASSERT3S(nlwb->lwb_state, !=, LWB_STATE_FLUSH_DONE);

	/*
	 * At this point in its lifetime, 'lwb' no longer needs
	 * lwb_vdev_lock to protect its lwb_vdev_tree (the tree will only
	 * be modified while holding zilog->zl_lock), as its writes and
	 * those of its children have all completed. The younger 'nlwb',
	 * however, may still be waiting on future writes to additional
	 * vdevs.
	 */
	mutex_enter(&nlwb->lwb_vdev_lock);
	/*
	 * Tear down the 'lwb' vdev tree, ensuring that entries which do not
	 * exist in 'nlwb' are moved to it, freeing any would-be duplicates.
	 */
	while ((zv = avl_destroy_nodes(src, &cookie)) != NULL) {
		avl_index_t where;

		if (avl_find(dst, zv, &where) == NULL) {
			avl_insert(dst, zv, where);
		} else {
			kmem_free(zv, sizeof (*zv));
		}
	}
	mutex_exit(&nlwb->lwb_vdev_lock);
}

void
zil_lwb_add_txg(lwb_t *lwb, uint64_t txg)
{
	lwb->lwb_max_txg = MAX(lwb->lwb_max_txg, txg);
}

/*
 * This function is called after all vdevs associated with a given lwb
 * write have completed their DKIOCFLUSHWRITECACHE command; or as soon
 * as the lwb write completes, if "zil_nocacheflush" is set. Further,
 * all "previous" lwbs will have completed before this function is
 * called; i.e. this function is called for all previous lwbs before
 * it's called for "this" lwb (enforced via the zio dependencies
 * configured in zil_lwb_set_zio_dependency()).
 *
 * The intention is for this function to be called as soon as the
 * contents of an lwb are considered "stable" on disk, and will survive
 * any sudden loss of power. At this point, any threads waiting for the
 * lwb to reach this state are signalled, and the "waiter" structures
 * are marked "done".
 */
static void
zil_lwb_flush_vdevs_done(zio_t *zio)
{
	lwb_t *lwb = zio->io_private;
	zilog_t *zilog = lwb->lwb_zilog;
	dmu_tx_t *tx = lwb->lwb_tx;
	zil_commit_waiter_t *zcw;

	spa_config_exit(zilog->zl_spa, SCL_STATE, lwb);

	zio_buf_free(lwb->lwb_buf, lwb->lwb_sz);

	mutex_enter(&zilog->zl_lock);

	/*
	 * Ensure the lwb buffer pointer is cleared before releasing the
	 * txg. If we have had an allocation failure and the txg is
	 * waiting to sync then we want zil_sync() to remove the lwb so
	 * that it's not picked up as the next new one in
	 * zil_process_commit_list(). zil_sync() will only remove the
	 * lwb if lwb_buf is null.
	 */
108917f17c2dSbonwick */ 10901271e4b1SPrakash Surya lwb->lwb_buf = NULL; 10911271e4b1SPrakash Surya lwb->lwb_tx = NULL; 109217f17c2dSbonwick 10931271e4b1SPrakash Surya ASSERT3U(lwb->lwb_issued_timestamp, >, 0); 10941271e4b1SPrakash Surya zilog->zl_last_lwb_latency = gethrtime() - lwb->lwb_issued_timestamp; 109517f17c2dSbonwick 10961271e4b1SPrakash Surya lwb->lwb_root_zio = NULL; 1097cab3a55eSPrakash Surya 1098cab3a55eSPrakash Surya ASSERT3S(lwb->lwb_state, ==, LWB_STATE_WRITE_DONE); 1099cab3a55eSPrakash Surya lwb->lwb_state = LWB_STATE_FLUSH_DONE; 1100fa9e4066Sahrens 11011271e4b1SPrakash Surya if (zilog->zl_last_lwb_opened == lwb) { 11021271e4b1SPrakash Surya /* 11031271e4b1SPrakash Surya * Remember the highest committed log sequence number 11041271e4b1SPrakash Surya * for ztest. We only update this value when all the log 11051271e4b1SPrakash Surya * writes succeeded, because ztest wants to ASSERT that 11061271e4b1SPrakash Surya * it got the whole log chain. 11071271e4b1SPrakash Surya */ 11081271e4b1SPrakash Surya zilog->zl_commit_lr_seq = zilog->zl_lr_seq; 11091271e4b1SPrakash Surya } 11101271e4b1SPrakash Surya 11111271e4b1SPrakash Surya while ((zcw = list_head(&lwb->lwb_waiters)) != NULL) { 11121271e4b1SPrakash Surya mutex_enter(&zcw->zcw_lock); 11131271e4b1SPrakash Surya 11141271e4b1SPrakash Surya ASSERT(list_link_active(&zcw->zcw_node)); 11151271e4b1SPrakash Surya list_remove(&lwb->lwb_waiters, zcw); 11161271e4b1SPrakash Surya 11171271e4b1SPrakash Surya ASSERT3P(zcw->zcw_lwb, ==, lwb); 11181271e4b1SPrakash Surya zcw->zcw_lwb = NULL; 11191271e4b1SPrakash Surya 11201271e4b1SPrakash Surya zcw->zcw_zio_error = zio->io_error; 11211271e4b1SPrakash Surya 11221271e4b1SPrakash Surya ASSERT3B(zcw->zcw_done, ==, B_FALSE); 11231271e4b1SPrakash Surya zcw->zcw_done = B_TRUE; 11241271e4b1SPrakash Surya cv_broadcast(&zcw->zcw_cv); 11251271e4b1SPrakash Surya 11261271e4b1SPrakash Surya mutex_exit(&zcw->zcw_lock); 112767bd71c6Sperrin } 112817f17c2dSbonwick 11291271e4b1SPrakash Surya mutex_exit(&zilog->zl_lock); 11301271e4b1SPrakash Surya 1131fa9e4066Sahrens /* 11321271e4b1SPrakash Surya * Now that we've written this log block, we have a stable pointer 11331271e4b1SPrakash Surya * to the next block in the chain, so it's OK to let the txg in 11341271e4b1SPrakash Surya * which we allocated the next block sync. 1135fa9e4066Sahrens */ 11361271e4b1SPrakash Surya dmu_tx_commit(tx); 1137fa9e4066Sahrens } 1138fa9e4066Sahrens 1139fa9e4066Sahrens /* 1140cab3a55eSPrakash Surya * This is called when an lwb's write zio completes. The callback's 1141cab3a55eSPrakash Surya * purpose is to issue the DKIOCFLUSHWRITECACHE commands for the vdevs 1142cab3a55eSPrakash Surya * in the lwb's lwb_vdev_tree. The tree will contain the vdevs involved 1143cab3a55eSPrakash Surya * in writing out this specific lwb's data, and in the case that cache 1144cab3a55eSPrakash Surya * flushes have been deferred, vdevs involved in writing the data for 1145cab3a55eSPrakash Surya * previous lwbs. The writes corresponding to all the vdevs in the 1146cab3a55eSPrakash Surya * lwb_vdev_tree will have completed by the time this is called, due to 1147cab3a55eSPrakash Surya * the zio dependencies configured in zil_lwb_set_zio_dependency(), 1148cab3a55eSPrakash Surya * which takes deferred flushes into account. The lwb will be "done" 1149cab3a55eSPrakash Surya * once zil_lwb_flush_vdevs_done() is called, which occurs in the zio 1150cab3a55eSPrakash Surya * completion callback for the lwb's root zio. 
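 *
 * In terms of lwb_state, a healthy lwb thus progresses roughly as
 * follows (a summary sketch; the LWB_STATE_* assertions throughout
 * this file are authoritative):
 *
 *	CLOSED -> OPENED -> ISSUED -> WRITE_DONE -> FLUSH_DONE
 *
 * where the function below performs the ISSUED to WRITE_DONE
 * transition, and zil_lwb_flush_vdevs_done() later performs the
 * WRITE_DONE to FLUSH_DONE transition.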
1151fa9e4066Sahrens  */
1152fa9e4066Sahrens static void
1153fa9e4066Sahrens zil_lwb_write_done(zio_t *zio)
1154fa9e4066Sahrens {
1155fa9e4066Sahrens 	lwb_t *lwb = zio->io_private;
11561271e4b1SPrakash Surya 	spa_t *spa = zio->io_spa;
1157fa9e4066Sahrens 	zilog_t *zilog = lwb->lwb_zilog;
11581271e4b1SPrakash Surya 	avl_tree_t *t = &lwb->lwb_vdev_tree;
11591271e4b1SPrakash Surya 	void *cookie = NULL;
11601271e4b1SPrakash Surya 	zil_vdev_node_t *zv;
1161cab3a55eSPrakash Surya 	lwb_t *nlwb;
11621271e4b1SPrakash Surya 
11631271e4b1SPrakash Surya 	ASSERT3S(spa_config_held(spa, SCL_STATE, RW_READER), !=, 0);
1164fa9e4066Sahrens 
1165e14bb325SJeff Bonwick 	ASSERT(BP_GET_COMPRESS(zio->io_bp) == ZIO_COMPRESS_OFF);
1166e14bb325SJeff Bonwick 	ASSERT(BP_GET_TYPE(zio->io_bp) == DMU_OT_INTENT_LOG);
1167e14bb325SJeff Bonwick 	ASSERT(BP_GET_LEVEL(zio->io_bp) == 0);
1168e14bb325SJeff Bonwick 	ASSERT(BP_GET_BYTEORDER(zio->io_bp) == ZFS_HOST_BYTEORDER);
1169e14bb325SJeff Bonwick 	ASSERT(!BP_IS_GANG(zio->io_bp));
1170e14bb325SJeff Bonwick 	ASSERT(!BP_IS_HOLE(zio->io_bp));
11715d7b4d43SMatthew Ahrens 	ASSERT(BP_GET_FILL(zio->io_bp) == 0);
1172e14bb325SJeff Bonwick 
1173770499e1SDan Kimmel 	abd_put(zio->io_abd);
11741271e4b1SPrakash Surya 
1175fa9e4066Sahrens 	mutex_enter(&zilog->zl_lock);
1176cab3a55eSPrakash Surya 	ASSERT3S(lwb->lwb_state, ==, LWB_STATE_ISSUED);
1177cab3a55eSPrakash Surya 	lwb->lwb_state = LWB_STATE_WRITE_DONE;
11781271e4b1SPrakash Surya 	lwb->lwb_write_zio = NULL;
1179cab3a55eSPrakash Surya 	nlwb = list_next(&zilog->zl_lwb_list, lwb);
1180b24ab676SJeff Bonwick 	mutex_exit(&zilog->zl_lock);
1181ef0d8e11SNeil Perrin 
11821271e4b1SPrakash Surya 	if (avl_numnodes(t) == 0)
11831271e4b1SPrakash Surya 		return;
11841271e4b1SPrakash Surya 
1185ef0d8e11SNeil Perrin 	/*
11861271e4b1SPrakash Surya 	 * If there was an IO error, we're not going to call zio_flush()
11871271e4b1SPrakash Surya 	 * on these vdevs, so we simply empty the tree and free the
11881271e4b1SPrakash Surya 	 * nodes. We avoid calling zio_flush() since there isn't any
11891271e4b1SPrakash Surya 	 * good reason for doing so, after the lwb block failed to be
11901271e4b1SPrakash Surya 	 * written out.
1191ef0d8e11SNeil Perrin 	 */
11921271e4b1SPrakash Surya 	if (zio->io_error != 0) {
11931271e4b1SPrakash Surya 		while ((zv = avl_destroy_nodes(t, &cookie)) != NULL)
11941271e4b1SPrakash Surya 			kmem_free(zv, sizeof (*zv));
11951271e4b1SPrakash Surya 		return;
11961271e4b1SPrakash Surya 	}
11971271e4b1SPrakash Surya 
1198cab3a55eSPrakash Surya 	/*
1199cab3a55eSPrakash Surya 	 * If this lwb does not have any threads waiting for it to
1200cab3a55eSPrakash Surya 	 * complete, we want to defer issuing the DKIOCFLUSHWRITECACHE
1201cab3a55eSPrakash Surya 	 * command to the vdevs written to by "this" lwb, and instead
1202cab3a55eSPrakash Surya 	 * rely on the "next" lwb to handle the DKIOCFLUSHWRITECACHE
1203cab3a55eSPrakash Surya 	 * command for those vdevs. Thus, we merge the vdev tree of
1204cab3a55eSPrakash Surya 	 * "this" lwb with the vdev tree of the "next" lwb in the list,
1205cab3a55eSPrakash Surya 	 * and assume the "next" lwb will handle flushing the vdevs (or
1206cab3a55eSPrakash Surya 	 * deferring the flush(es) again).
1207cab3a55eSPrakash Surya 	 *
1208cab3a55eSPrakash Surya 	 * This is a useful performance optimization, especially for
1209cab3a55eSPrakash Surya 	 * workloads with lots of async write activity and few sync
1210cab3a55eSPrakash Surya 	 * write and/or fsync activity, as it has the potential to
1211cab3a55eSPrakash Surya 	 * coalesce multiple flush commands to a vdev into one.
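	 *
	 * For example, if lwbs A, B, and C each wrote to vdev 0 and
	 * only C has a waiter, then A and B can each defer their
	 * flush to their successor, and vdev 0 ultimately receives a
	 * single DKIOCFLUSHWRITECACHE covering all three writes (an
	 * illustrative sketch of the merging that zil_lwb_flush_defer()
	 * performs).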
1212cab3a55eSPrakash Surya 	 */
1213cab3a55eSPrakash Surya 	if (list_head(&lwb->lwb_waiters) == NULL && nlwb != NULL) {
1214cab3a55eSPrakash Surya 		zil_lwb_flush_defer(lwb, nlwb);
1215cab3a55eSPrakash Surya 		ASSERT(avl_is_empty(&lwb->lwb_vdev_tree));
1216cab3a55eSPrakash Surya 		return;
1217cab3a55eSPrakash Surya 	}
1218cab3a55eSPrakash Surya 
12191271e4b1SPrakash Surya 	while ((zv = avl_destroy_nodes(t, &cookie)) != NULL) {
12201271e4b1SPrakash Surya 		vdev_t *vd = vdev_lookup_top(spa, zv->zv_vdev);
12211271e4b1SPrakash Surya 		if (vd != NULL)
12221271e4b1SPrakash Surya 			zio_flush(lwb->lwb_root_zio, vd);
12231271e4b1SPrakash Surya 		kmem_free(zv, sizeof (*zv));
12241271e4b1SPrakash Surya 	}
1225fa9e4066Sahrens }
1226fa9e4066Sahrens 
1227cab3a55eSPrakash Surya static void
1228cab3a55eSPrakash Surya zil_lwb_set_zio_dependency(zilog_t *zilog, lwb_t *lwb)
1229cab3a55eSPrakash Surya {
1230cab3a55eSPrakash Surya 	lwb_t *last_lwb_opened = zilog->zl_last_lwb_opened;
1231cab3a55eSPrakash Surya 
1232cab3a55eSPrakash Surya 	ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock));
1233cab3a55eSPrakash Surya 	ASSERT(MUTEX_HELD(&zilog->zl_lock));
1234cab3a55eSPrakash Surya 
1235cab3a55eSPrakash Surya 	/*
1236cab3a55eSPrakash Surya 	 * The zilog's "zl_last_lwb_opened" field is used to build the
1237cab3a55eSPrakash Surya 	 * lwb/zio dependency chain, which is used to preserve the
1238cab3a55eSPrakash Surya 	 * ordering of lwb completions that is required by the semantics
1239cab3a55eSPrakash Surya 	 * of the ZIL. Each new lwb zio becomes a parent of the
1240cab3a55eSPrakash Surya 	 * "previous" lwb zio, such that the new lwb's zio cannot
1241cab3a55eSPrakash Surya 	 * complete until the "previous" lwb's zio completes.
1242cab3a55eSPrakash Surya 	 *
1243cab3a55eSPrakash Surya 	 * This is required by the semantics of zil_commit(); the commit
1244cab3a55eSPrakash Surya 	 * waiters attached to the lwbs will be woken in the lwb zio's
1245cab3a55eSPrakash Surya 	 * completion callback, so this zio dependency graph ensures the
1246cab3a55eSPrakash Surya 	 * waiters are woken in the correct order (the same order the
1247cab3a55eSPrakash Surya 	 * lwbs were created).
1248cab3a55eSPrakash Surya 	 */
1249cab3a55eSPrakash Surya 	if (last_lwb_opened != NULL &&
1250cab3a55eSPrakash Surya 	    last_lwb_opened->lwb_state != LWB_STATE_FLUSH_DONE) {
1251cab3a55eSPrakash Surya 		ASSERT(last_lwb_opened->lwb_state == LWB_STATE_OPENED ||
1252cab3a55eSPrakash Surya 		    last_lwb_opened->lwb_state == LWB_STATE_ISSUED ||
1253cab3a55eSPrakash Surya 		    last_lwb_opened->lwb_state == LWB_STATE_WRITE_DONE);
1254cab3a55eSPrakash Surya 
1255cab3a55eSPrakash Surya 		ASSERT3P(last_lwb_opened->lwb_root_zio, !=, NULL);
1256cab3a55eSPrakash Surya 		zio_add_child(lwb->lwb_root_zio,
1257cab3a55eSPrakash Surya 		    last_lwb_opened->lwb_root_zio);
1258cab3a55eSPrakash Surya 
1259cab3a55eSPrakash Surya 		/*
1260cab3a55eSPrakash Surya 		 * If the previous lwb's write hasn't already completed,
1261cab3a55eSPrakash Surya 		 * we also want to order the completion of the lwb write
1262cab3a55eSPrakash Surya 		 * zios (above, we only order the completion of the lwb
1263cab3a55eSPrakash Surya 		 * root zios). This is required because of how we can
1264cab3a55eSPrakash Surya 		 * defer the DKIOCFLUSHWRITECACHE commands for each lwb.
1265cab3a55eSPrakash Surya 		 *
1266cab3a55eSPrakash Surya 		 * When the DKIOCFLUSHWRITECACHE commands are deferred,
1267cab3a55eSPrakash Surya 		 * the previous lwb will rely on this lwb to flush the
1268cab3a55eSPrakash Surya 		 * vdevs written to by that previous lwb.
Thus, we need 1269cab3a55eSPrakash Surya * to ensure this lwb doesn't issue the flush until 1270cab3a55eSPrakash Surya * after the previous lwb's write completes. We ensure 1271cab3a55eSPrakash Surya * this ordering by setting the zio parent/child 1272cab3a55eSPrakash Surya * relationship here. 1273cab3a55eSPrakash Surya * 1274cab3a55eSPrakash Surya * Without this relationship on the lwb's write zio, 1275cab3a55eSPrakash Surya * it's possible for this lwb's write to complete prior 1276cab3a55eSPrakash Surya * to the previous lwb's write completing; and thus, the 1277cab3a55eSPrakash Surya * vdevs for the previous lwb would be flushed prior to 1278cab3a55eSPrakash Surya * that lwb's data being written to those vdevs (the 1279cab3a55eSPrakash Surya * vdevs are flushed in the lwb write zio's completion 1280cab3a55eSPrakash Surya * handler, zil_lwb_write_done()). 1281cab3a55eSPrakash Surya */ 1282cab3a55eSPrakash Surya if (last_lwb_opened->lwb_state != LWB_STATE_WRITE_DONE) { 1283cab3a55eSPrakash Surya ASSERT(last_lwb_opened->lwb_state == LWB_STATE_OPENED || 1284cab3a55eSPrakash Surya last_lwb_opened->lwb_state == LWB_STATE_ISSUED); 1285cab3a55eSPrakash Surya 1286cab3a55eSPrakash Surya ASSERT3P(last_lwb_opened->lwb_write_zio, !=, NULL); 1287cab3a55eSPrakash Surya zio_add_child(lwb->lwb_write_zio, 1288cab3a55eSPrakash Surya last_lwb_opened->lwb_write_zio); 1289cab3a55eSPrakash Surya } 1290cab3a55eSPrakash Surya } 1291cab3a55eSPrakash Surya } 1292cab3a55eSPrakash Surya 1293cab3a55eSPrakash Surya 1294c5c6ffa0Smaybee /* 12951271e4b1SPrakash Surya * This function's purpose is to "open" an lwb such that it is ready to 12961271e4b1SPrakash Surya * accept new itxs being committed to it. To do this, the lwb's zio 12971271e4b1SPrakash Surya * structures are created, and linked to the lwb. This function is 12981271e4b1SPrakash Surya * idempotent; if the passed in lwb has already been opened, this 12991271e4b1SPrakash Surya * function is essentially a no-op. 
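 *
 * For example, the following sequence is safe; the second call
 * finds the lwb already OPENED and returns without side effects:
 *
 *	zil_lwb_write_open(zilog, lwb);
 *	zil_lwb_write_open(zilog, lwb);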
1300c5c6ffa0Smaybee */ 1301c5c6ffa0Smaybee static void 13021271e4b1SPrakash Surya zil_lwb_write_open(zilog_t *zilog, lwb_t *lwb) 1303c5c6ffa0Smaybee { 13047802d7bfSMatthew Ahrens zbookmark_phys_t zb; 1305c5ee4681SAlexander Motin zio_priority_t prio; 1306c5c6ffa0Smaybee 1307cf07d3daSPrakash Surya ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock)); 13081271e4b1SPrakash Surya ASSERT3P(lwb, !=, NULL); 13091271e4b1SPrakash Surya EQUIV(lwb->lwb_root_zio == NULL, lwb->lwb_state == LWB_STATE_CLOSED); 13101271e4b1SPrakash Surya EQUIV(lwb->lwb_root_zio != NULL, lwb->lwb_state == LWB_STATE_OPENED); 13111271e4b1SPrakash Surya 1312b24ab676SJeff Bonwick SET_BOOKMARK(&zb, lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_OBJSET], 1313b24ab676SJeff Bonwick ZB_ZIL_OBJECT, ZB_ZIL_LEVEL, 1314b24ab676SJeff Bonwick lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_SEQ]); 1315c5c6ffa0Smaybee 13161271e4b1SPrakash Surya if (lwb->lwb_root_zio == NULL) { 1317770499e1SDan Kimmel abd_t *lwb_abd = abd_get_from_buf(lwb->lwb_buf, 1318770499e1SDan Kimmel BP_GET_LSIZE(&lwb->lwb_blk)); 13191271e4b1SPrakash Surya 1320c5ee4681SAlexander Motin if (!lwb->lwb_slog || zilog->zl_cur_used <= zil_slog_bulk) 1321c5ee4681SAlexander Motin prio = ZIO_PRIORITY_SYNC_WRITE; 1322c5ee4681SAlexander Motin else 1323c5ee4681SAlexander Motin prio = ZIO_PRIORITY_ASYNC_WRITE; 13241271e4b1SPrakash Surya 13251271e4b1SPrakash Surya lwb->lwb_root_zio = zio_root(zilog->zl_spa, 13261271e4b1SPrakash Surya zil_lwb_flush_vdevs_done, lwb, ZIO_FLAG_CANFAIL); 13271271e4b1SPrakash Surya ASSERT3P(lwb->lwb_root_zio, !=, NULL); 13281271e4b1SPrakash Surya 13291271e4b1SPrakash Surya lwb->lwb_write_zio = zio_rewrite(lwb->lwb_root_zio, 13301271e4b1SPrakash Surya zilog->zl_spa, 0, &lwb->lwb_blk, lwb_abd, 13311271e4b1SPrakash Surya BP_GET_LSIZE(&lwb->lwb_blk), zil_lwb_write_done, lwb, 13321271e4b1SPrakash Surya prio, ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE, &zb); 13331271e4b1SPrakash Surya ASSERT3P(lwb->lwb_write_zio, !=, NULL); 13341271e4b1SPrakash Surya 13351271e4b1SPrakash Surya lwb->lwb_state = LWB_STATE_OPENED; 13361271e4b1SPrakash Surya 13371271e4b1SPrakash Surya mutex_enter(&zilog->zl_lock); 1338cab3a55eSPrakash Surya zil_lwb_set_zio_dependency(zilog, lwb); 13391271e4b1SPrakash Surya zilog->zl_last_lwb_opened = lwb; 13401271e4b1SPrakash Surya mutex_exit(&zilog->zl_lock); 134167bd71c6Sperrin } 13421271e4b1SPrakash Surya 13431271e4b1SPrakash Surya ASSERT3P(lwb->lwb_root_zio, !=, NULL); 13441271e4b1SPrakash Surya ASSERT3P(lwb->lwb_write_zio, !=, NULL); 13451271e4b1SPrakash Surya ASSERT3S(lwb->lwb_state, ==, LWB_STATE_OPENED); 1346c5c6ffa0Smaybee } 1347c5c6ffa0Smaybee 13486e1f5caaSNeil Perrin /* 13496e1f5caaSNeil Perrin * Define a limited set of intent log block sizes. 1350f7170741SWill Andrews * 13516e1f5caaSNeil Perrin * These must be a multiple of 4KB. Note only the amount used (again 13526e1f5caaSNeil Perrin * aligned to 4KB) actually gets written. However, we can't always just 1353b5152584SMatthew Ahrens * allocate SPA_OLD_MAXBLOCKSIZE as the slog space could be exhausted. 13546e1f5caaSNeil Perrin */ 13556e1f5caaSNeil Perrin uint64_t zil_block_buckets[] = { 13566e1f5caaSNeil Perrin 4096, /* non TX_WRITE */ 13576e1f5caaSNeil Perrin 8192+4096, /* data base */ 1358*54811da5SToomas Soome 32*1024 + 4096, /* NFS writes */ 13596e1f5caaSNeil Perrin UINT64_MAX 13606e1f5caaSNeil Perrin }; 13616e1f5caaSNeil Perrin 1362fa9e4066Sahrens /* 1363fa9e4066Sahrens * Start a log block write and advance to the next log block. 1364fa9e4066Sahrens * Calls are serialized. 
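 *
 * As a worked example of the block-size selection done below
 * (hypothetical numbers): with roughly 20KB of zl_cur_used, the
 * smallest zil_block_buckets[] entry that fits is 32K + 4K; if no
 * bucket fits (the UINT64_MAX sentinel is reached), the size is
 * capped at SPA_OLD_MAXBLOCKSIZE. The chosen size is then raised
 * to the maximum of the last ZIL_PREV_BLKS selections to dampen
 * the "picket fence" effect described below.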
1365fa9e4066Sahrens */ 1366fa9e4066Sahrens static lwb_t * 13671271e4b1SPrakash Surya zil_lwb_write_issue(zilog_t *zilog, lwb_t *lwb) 1368fa9e4066Sahrens { 13696e1f5caaSNeil Perrin lwb_t *nlwb = NULL; 13706e1f5caaSNeil Perrin zil_chain_t *zilc; 1371d80c45e0Sbonwick spa_t *spa = zilog->zl_spa; 13726e1f5caaSNeil Perrin blkptr_t *bp; 1373b24ab676SJeff Bonwick dmu_tx_t *tx; 1374fa9e4066Sahrens uint64_t txg; 1375ada693c4SNeil Perrin uint64_t zil_blksz, wsz; 13766e1f5caaSNeil Perrin int i, error; 1377c5ee4681SAlexander Motin boolean_t slog; 13786e1f5caaSNeil Perrin 1379cf07d3daSPrakash Surya ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock)); 13801271e4b1SPrakash Surya ASSERT3P(lwb->lwb_root_zio, !=, NULL); 13811271e4b1SPrakash Surya ASSERT3P(lwb->lwb_write_zio, !=, NULL); 13821271e4b1SPrakash Surya ASSERT3S(lwb->lwb_state, ==, LWB_STATE_OPENED); 13831271e4b1SPrakash Surya 13846e1f5caaSNeil Perrin if (BP_GET_CHECKSUM(&lwb->lwb_blk) == ZIO_CHECKSUM_ZILOG2) { 13856e1f5caaSNeil Perrin zilc = (zil_chain_t *)lwb->lwb_buf; 13866e1f5caaSNeil Perrin bp = &zilc->zc_next_blk; 13876e1f5caaSNeil Perrin } else { 13886e1f5caaSNeil Perrin zilc = (zil_chain_t *)(lwb->lwb_buf + lwb->lwb_sz); 13896e1f5caaSNeil Perrin bp = &zilc->zc_next_blk; 13906e1f5caaSNeil Perrin } 1391fa9e4066Sahrens 13926e1f5caaSNeil Perrin ASSERT(lwb->lwb_nused <= lwb->lwb_sz); 1393fa9e4066Sahrens 1394fa9e4066Sahrens /* 1395fa9e4066Sahrens * Allocate the next block and save its address in this block 1396fa9e4066Sahrens * before writing it in order to establish the log chain. 1397fa9e4066Sahrens * Note that if the allocation of nlwb synced before we wrote 1398fa9e4066Sahrens * the block that points at it (lwb), we'd leak it if we crashed. 1399b24ab676SJeff Bonwick * Therefore, we don't do dmu_tx_commit() until zil_lwb_write_done(). 1400b24ab676SJeff Bonwick * We dirty the dataset to ensure that zil_sync() will be called 1401b24ab676SJeff Bonwick * to clean up in the event of allocation failure or I/O failure. 1402fa9e4066Sahrens */ 14031271e4b1SPrakash Surya 1404b24ab676SJeff Bonwick tx = dmu_tx_create(zilog->zl_os); 1405d28671a3SAndriy Gapon 1406d28671a3SAndriy Gapon /* 1407f864f99eSPrakash Surya * Since we are not going to create any new dirty data, and we 1408f864f99eSPrakash Surya * can even help with clearing the existing dirty data, we 1409f864f99eSPrakash Surya * should not be subject to the dirty data based delays. We 1410f864f99eSPrakash Surya * use TXG_NOTHROTTLE to bypass the delay mechanism. 1411d28671a3SAndriy Gapon */ 1412f864f99eSPrakash Surya VERIFY0(dmu_tx_assign(tx, TXG_WAIT | TXG_NOTHROTTLE)); 1413f864f99eSPrakash Surya 1414b24ab676SJeff Bonwick dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx); 1415b24ab676SJeff Bonwick txg = dmu_tx_get_txg(tx); 1416b24ab676SJeff Bonwick 1417b24ab676SJeff Bonwick lwb->lwb_tx = tx; 1418fa9e4066Sahrens 1419fa9e4066Sahrens /* 14206e1f5caaSNeil Perrin * Log blocks are pre-allocated. Here we select the size of the next 14216e1f5caaSNeil Perrin * block, based on size used in the last block. 14226e1f5caaSNeil Perrin * - first find the smallest bucket that will fit the block from a 14236e1f5caaSNeil Perrin * limited set of block sizes. This is because it's faster to write 14246e1f5caaSNeil Perrin * blocks allocated from the same metaslab as they are adjacent or 14256e1f5caaSNeil Perrin * close. 14266e1f5caaSNeil Perrin * - next find the maximum from the new suggested size and an array of 14276e1f5caaSNeil Perrin * previous sizes. 
This lessens a picket fence effect of wrongly
14286e1f5caaSNeil Perrin 	 *   guessing the size if we have a stream of say 2k, 64k, 2k, 64k
14296e1f5caaSNeil Perrin 	 *   requests.
14306e1f5caaSNeil Perrin 	 *
14316e1f5caaSNeil Perrin 	 * Note we only write what is used, but we can't just allocate
14326e1f5caaSNeil Perrin 	 * the maximum block size because we can exhaust the available
14336e1f5caaSNeil Perrin 	 * pool log space.
1434fa9e4066Sahrens 	 */
14356e1f5caaSNeil Perrin 	zil_blksz = zilog->zl_cur_used + sizeof (zil_chain_t);
14366e1f5caaSNeil Perrin 	for (i = 0; zil_blksz > zil_block_buckets[i]; i++)
14376e1f5caaSNeil Perrin 		continue;
14386e1f5caaSNeil Perrin 	zil_blksz = zil_block_buckets[i];
14396e1f5caaSNeil Perrin 	if (zil_blksz == UINT64_MAX)
1440b5152584SMatthew Ahrens 		zil_blksz = SPA_OLD_MAXBLOCKSIZE;
14416e1f5caaSNeil Perrin 	zilog->zl_prev_blks[zilog->zl_prev_rotor] = zil_blksz;
14426e1f5caaSNeil Perrin 	for (i = 0; i < ZIL_PREV_BLKS; i++)
14436e1f5caaSNeil Perrin 		zil_blksz = MAX(zil_blksz, zilog->zl_prev_blks[i]);
14446e1f5caaSNeil Perrin 	zilog->zl_prev_rotor = (zilog->zl_prev_rotor + 1) & (ZIL_PREV_BLKS - 1);
1445fa9e4066Sahrens 
144667bd71c6Sperrin 	BP_ZERO(bp);
14471271e4b1SPrakash Surya 
144867bd71c6Sperrin 	/* pass the old blkptr in order to spread log blocks across devs */
1449f78cdc34SPaul Dagnelie 	error = zio_alloc_zil(spa, zilog->zl_os->os_dsl_dataset->ds_object,
1450f78cdc34SPaul Dagnelie 	    txg, bp, &lwb->lwb_blk, zil_blksz, &slog);
14513b2aab18SMatthew Ahrens 	if (error == 0) {
14526e1f5caaSNeil Perrin 		ASSERT3U(bp->blk_birth, ==, txg);
14536e1f5caaSNeil Perrin 		bp->blk_cksum = lwb->lwb_blk.blk_cksum;
14546e1f5caaSNeil Perrin 		bp->blk_cksum.zc_word[ZIL_ZC_SEQ]++;
1455d63d470bSgw 
1456ea8dc4b6Seschrock 		/*
14571271e4b1SPrakash Surya 		 * Allocate a new log write block (lwb).
1458ea8dc4b6Seschrock 		 */
1459c5ee4681SAlexander Motin 		nlwb = zil_alloc_lwb(zilog, bp, slog, txg);
1460fa9e4066Sahrens 	}
1461fa9e4066Sahrens 
14626e1f5caaSNeil Perrin 	if (BP_GET_CHECKSUM(&lwb->lwb_blk) == ZIO_CHECKSUM_ZILOG2) {
14636e1f5caaSNeil Perrin 		/* For Slim ZIL only write what is used. */
1464ada693c4SNeil Perrin 		wsz = P2ROUNDUP_TYPED(lwb->lwb_nused, ZIL_MIN_BLKSZ, uint64_t);
1465ada693c4SNeil Perrin 		ASSERT3U(wsz, <=, lwb->lwb_sz);
14661271e4b1SPrakash Surya 		zio_shrink(lwb->lwb_write_zio, wsz);
1467fa9e4066Sahrens 
1468ada693c4SNeil Perrin 	} else {
1469ada693c4SNeil Perrin 		wsz = lwb->lwb_sz;
14706e1f5caaSNeil Perrin 	}
1471ada693c4SNeil Perrin 
14726e1f5caaSNeil Perrin 	zilc->zc_pad = 0;
14736e1f5caaSNeil Perrin 	zilc->zc_nused = lwb->lwb_nused;
14746e1f5caaSNeil Perrin 	zilc->zc_eck.zec_cksum = lwb->lwb_blk.blk_cksum;
1475fa9e4066Sahrens 
1476ada693c4SNeil Perrin 	/*
1477ada693c4SNeil Perrin 	 * clear unused data for security
1478ada693c4SNeil Perrin 	 */
1479ada693c4SNeil Perrin 	bzero(lwb->lwb_buf + lwb->lwb_nused, wsz - lwb->lwb_nused);
1480ada693c4SNeil Perrin 
14811271e4b1SPrakash Surya 	spa_config_enter(zilog->zl_spa, SCL_STATE, lwb, RW_READER);
14821271e4b1SPrakash Surya 
14831271e4b1SPrakash Surya 	zil_lwb_add_block(lwb, &lwb->lwb_blk);
14841271e4b1SPrakash Surya 	lwb->lwb_issued_timestamp = gethrtime();
14851271e4b1SPrakash Surya 	lwb->lwb_state = LWB_STATE_ISSUED;
14861271e4b1SPrakash Surya 
14871271e4b1SPrakash Surya 	zio_nowait(lwb->lwb_root_zio);
14881271e4b1SPrakash Surya 	zio_nowait(lwb->lwb_write_zio);
148967bd71c6Sperrin 
1490fa9e4066Sahrens 	/*
14916e1f5caaSNeil Perrin 	 * If there was an allocation failure then nlwb will be null which
14926e1f5caaSNeil Perrin 	 * forces a txg_wait_synced().
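	 *
	 * A caller therefore handles our return value roughly as
	 * follows (a sketch; see zil_process_commit_list() for the
	 * real handling):
	 *
	 *	lwb = zil_lwb_write_issue(zilog, lwb);
	 *	if (lwb == NULL)
	 *		zil_commit_writer_stall(zilog);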
1493fa9e4066Sahrens 	 */
1494fa9e4066Sahrens 	return (nlwb);
1495fa9e4066Sahrens }
1496fa9e4066Sahrens 
1497fa9e4066Sahrens static lwb_t *
1498fa9e4066Sahrens zil_lwb_commit(zilog_t *zilog, itx_t *itx, lwb_t *lwb)
1499fa9e4066Sahrens {
1500c5ee4681SAlexander Motin 	lr_t *lrcb, *lrc;
1501c5ee4681SAlexander Motin 	lr_write_t *lrwb, *lrw;
1502b24ab676SJeff Bonwick 	char *lr_buf;
1503c5ee4681SAlexander Motin 	uint64_t dlen, dnow, lwb_sp, reclen, txg;
1504fa9e4066Sahrens 
1505cf07d3daSPrakash Surya 	ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock));
15061271e4b1SPrakash Surya 	ASSERT3P(lwb, !=, NULL);
15071271e4b1SPrakash Surya 	ASSERT3P(lwb->lwb_buf, !=, NULL);
15081271e4b1SPrakash Surya 
15091271e4b1SPrakash Surya 	zil_lwb_write_open(zilog, lwb);
1510b24ab676SJeff Bonwick 
15111271e4b1SPrakash Surya 	lrc = &itx->itx_lr;
15121271e4b1SPrakash Surya 	lrw = (lr_write_t *)lrc;
15131271e4b1SPrakash Surya 
15141271e4b1SPrakash Surya 	/*
15151271e4b1SPrakash Surya 	 * A commit itx doesn't represent any on-disk state; instead
15161271e4b1SPrakash Surya 	 * it's simply used as a placeholder on the commit list, and
15171271e4b1SPrakash Surya 	 * provides a mechanism for attaching a "commit waiter" onto the
15181271e4b1SPrakash Surya 	 * correct lwb (such that the waiter can be signalled upon
15191271e4b1SPrakash Surya 	 * completion of that lwb). Thus, we don't process this itx's
15201271e4b1SPrakash Surya 	 * log record if it's a commit itx (these itx's don't have log
15211271e4b1SPrakash Surya 	 * records), and instead link the itx's waiter onto the lwb's
15221271e4b1SPrakash Surya 	 * list of waiters.
15231271e4b1SPrakash Surya 	 *
15241271e4b1SPrakash Surya 	 * For more details, see the comment above zil_commit().
15251271e4b1SPrakash Surya 	 */
15261271e4b1SPrakash Surya 	if (lrc->lrc_txtype == TX_COMMIT) {
152794ddd090SPrakash Surya 		mutex_enter(&zilog->zl_lock);
15281271e4b1SPrakash Surya 		zil_commit_waiter_link_lwb(itx->itx_private, lwb);
15291271e4b1SPrakash Surya 		itx->itx_private = NULL;
153094ddd090SPrakash Surya 		mutex_exit(&zilog->zl_lock);
15311271e4b1SPrakash Surya 		return (lwb);
15321271e4b1SPrakash Surya 	}
1533fa9e4066Sahrens 
1534c5ee4681SAlexander Motin 	if (lrc->lrc_txtype == TX_WRITE && itx->itx_wr_state == WR_NEED_COPY) {
1535c5c6ffa0Smaybee 		dlen = P2ROUNDUP_TYPED(
1536b24ab676SJeff Bonwick 		    lrw->lr_length, sizeof (uint64_t), uint64_t);
1537c5ee4681SAlexander Motin 	} else {
1538c5ee4681SAlexander Motin 		dlen = 0;
1539c5ee4681SAlexander Motin 	}
1540c5ee4681SAlexander Motin 	reclen = lrc->lrc_reclen;
1541104e2ed7Sperrin 	zilog->zl_cur_used += (reclen + dlen);
1542c5ee4681SAlexander Motin 	txg = lrc->lrc_txg;
154322ac5be4Sperrin 
15441271e4b1SPrakash Surya 	ASSERT3U(zilog->zl_cur_used, <, UINT64_MAX - (reclen + dlen));
154567bd71c6Sperrin 
1546c5ee4681SAlexander Motin cont:
1547fa9e4066Sahrens 	/*
1548fa9e4066Sahrens 	 * If this record won't fit in the current log block, start a new one.
1549c5ee4681SAlexander Motin 	 * For WR_NEED_COPY, optimize the layout for a minimal number of chunks.
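	 *
	 * For example (hypothetical sizes), a WR_NEED_COPY write
	 * carrying more data than fits in the current lwb is split
	 * into chunks: each pass through the code below copies up to
	 * "dnow" bytes, prefixed by its own copy of the record header,
	 * issues the filled lwb, and loops back via the "cont" label
	 * for the remainder.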
1550fa9e4066Sahrens */ 1551c5ee4681SAlexander Motin lwb_sp = lwb->lwb_sz - lwb->lwb_nused; 1552c5ee4681SAlexander Motin if (reclen > lwb_sp || (reclen + dlen > lwb_sp && 1553c5ee4681SAlexander Motin lwb_sp < ZIL_MAX_WASTE_SPACE && (dlen % ZIL_MAX_LOG_DATA == 0 || 1554c5ee4681SAlexander Motin lwb_sp < reclen + dlen % ZIL_MAX_LOG_DATA))) { 15551271e4b1SPrakash Surya lwb = zil_lwb_write_issue(zilog, lwb); 1556c5c6ffa0Smaybee if (lwb == NULL) 1557fa9e4066Sahrens return (NULL); 15581271e4b1SPrakash Surya zil_lwb_write_open(zilog, lwb); 15596e1f5caaSNeil Perrin ASSERT(LWB_EMPTY(lwb)); 1560c5ee4681SAlexander Motin lwb_sp = lwb->lwb_sz - lwb->lwb_nused; 1561c5ee4681SAlexander Motin ASSERT3U(reclen + MIN(dlen, sizeof (uint64_t)), <=, lwb_sp); 1562fa9e4066Sahrens } 1563fa9e4066Sahrens 1564c5ee4681SAlexander Motin dnow = MIN(dlen, lwb_sp - reclen); 1565b24ab676SJeff Bonwick lr_buf = lwb->lwb_buf + lwb->lwb_nused; 1566b24ab676SJeff Bonwick bcopy(lrc, lr_buf, reclen); 1567c5ee4681SAlexander Motin lrcb = (lr_t *)lr_buf; /* Like lrc, but inside lwb. */ 1568c5ee4681SAlexander Motin lrwb = (lr_write_t *)lrcb; /* Like lrw, but inside lwb. */ 1569c5c6ffa0Smaybee 1570c5c6ffa0Smaybee /* 1571c5c6ffa0Smaybee * If it's a write, fetch the data or get its blkptr as appropriate. 1572c5c6ffa0Smaybee */ 1573c5c6ffa0Smaybee if (lrc->lrc_txtype == TX_WRITE) { 1574c5c6ffa0Smaybee if (txg > spa_freeze_txg(zilog->zl_spa)) 1575c5c6ffa0Smaybee txg_wait_synced(zilog->zl_dmu_pool, txg); 1576c5c6ffa0Smaybee if (itx->itx_wr_state != WR_COPIED) { 1577c5c6ffa0Smaybee char *dbuf; 1578c5c6ffa0Smaybee int error; 1579c5c6ffa0Smaybee 1580c5ee4681SAlexander Motin if (itx->itx_wr_state == WR_NEED_COPY) { 1581b24ab676SJeff Bonwick dbuf = lr_buf + reclen; 1582c5ee4681SAlexander Motin lrcb->lrc_reclen += dnow; 1583c5ee4681SAlexander Motin if (lrwb->lr_length > dnow) 1584c5ee4681SAlexander Motin lrwb->lr_length = dnow; 1585c5ee4681SAlexander Motin lrw->lr_offset += dnow; 1586c5ee4681SAlexander Motin lrw->lr_length -= dnow; 1587c5c6ffa0Smaybee } else { 1588c5c6ffa0Smaybee ASSERT(itx->itx_wr_state == WR_INDIRECT); 1589c5c6ffa0Smaybee dbuf = NULL; 1590c5c6ffa0Smaybee } 15911271e4b1SPrakash Surya 15921271e4b1SPrakash Surya /* 15931271e4b1SPrakash Surya * We pass in the "lwb_write_zio" rather than 15941271e4b1SPrakash Surya * "lwb_root_zio" so that the "lwb_write_zio" 15951271e4b1SPrakash Surya * becomes the parent of any zio's created by 15961271e4b1SPrakash Surya * the "zl_get_data" callback. The vdevs are 15971271e4b1SPrakash Surya * flushed after the "lwb_write_zio" completes, 15981271e4b1SPrakash Surya * so we want to make sure that completion 15991271e4b1SPrakash Surya * callback waits for these additional zio's, 16001271e4b1SPrakash Surya * such that the vdevs used by those zio's will 16011271e4b1SPrakash Surya * be included in the lwb's vdev tree, and those 16021271e4b1SPrakash Surya * vdevs will be properly flushed. If we passed 16031271e4b1SPrakash Surya * in "lwb_root_zio" here, then these additional 16041271e4b1SPrakash Surya * vdevs may not be flushed; e.g. if these zio's 16051271e4b1SPrakash Surya * completed after "lwb_write_zio" completed. 
16061271e4b1SPrakash Surya */ 16071271e4b1SPrakash Surya error = zilog->zl_get_data(itx->itx_private, 16081271e4b1SPrakash Surya lrwb, dbuf, lwb, lwb->lwb_write_zio); 16091271e4b1SPrakash Surya 1610c87b8fc5SMark J Musante if (error == EIO) { 1611c87b8fc5SMark J Musante txg_wait_synced(zilog->zl_dmu_pool, txg); 1612c87b8fc5SMark J Musante return (lwb); 1613c87b8fc5SMark J Musante } 16143b2aab18SMatthew Ahrens if (error != 0) { 1615c5c6ffa0Smaybee ASSERT(error == ENOENT || error == EEXIST || 1616c5c6ffa0Smaybee error == EALREADY); 1617c5c6ffa0Smaybee return (lwb); 1618c5c6ffa0Smaybee } 1619c5c6ffa0Smaybee } 1620104e2ed7Sperrin } 1621c5c6ffa0Smaybee 1622b24ab676SJeff Bonwick /* 1623b24ab676SJeff Bonwick * We're actually making an entry, so update lrc_seq to be the 1624b24ab676SJeff Bonwick * log record sequence number. Note that this is generally not 1625b24ab676SJeff Bonwick * equal to the itx sequence number because not all transactions 1626b24ab676SJeff Bonwick * are synchronous, and sometimes spa_sync() gets there first. 1627b24ab676SJeff Bonwick */ 16281271e4b1SPrakash Surya lrcb->lrc_seq = ++zilog->zl_lr_seq; 1629c5ee4681SAlexander Motin lwb->lwb_nused += reclen + dnow; 16301271e4b1SPrakash Surya 16311271e4b1SPrakash Surya zil_lwb_add_txg(lwb, txg); 16321271e4b1SPrakash Surya 16336e1f5caaSNeil Perrin ASSERT3U(lwb->lwb_nused, <=, lwb->lwb_sz); 1634fb09f5aaSMadhav Suresh ASSERT0(P2PHASE(lwb->lwb_nused, sizeof (uint64_t))); 1635fa9e4066Sahrens 1636c5ee4681SAlexander Motin dlen -= dnow; 1637c5ee4681SAlexander Motin if (dlen > 0) { 1638c5ee4681SAlexander Motin zilog->zl_cur_used += reclen; 1639c5ee4681SAlexander Motin goto cont; 1640c5ee4681SAlexander Motin } 1641c5ee4681SAlexander Motin 1642fa9e4066Sahrens return (lwb); 1643fa9e4066Sahrens } 1644fa9e4066Sahrens 1645fa9e4066Sahrens itx_t * 1646da6c28aaSamw zil_itx_create(uint64_t txtype, size_t lrsize) 1647fa9e4066Sahrens { 1648fa9e4066Sahrens itx_t *itx; 1649fa9e4066Sahrens 1650b4d654b0Sperrin lrsize = P2ROUNDUP_TYPED(lrsize, sizeof (uint64_t), size_t); 1651fa9e4066Sahrens 1652fa9e4066Sahrens itx = kmem_alloc(offsetof(itx_t, itx_lr) + lrsize, KM_SLEEP); 1653fa9e4066Sahrens itx->itx_lr.lrc_txtype = txtype; 1654fa9e4066Sahrens itx->itx_lr.lrc_reclen = lrsize; 1655fa9e4066Sahrens itx->itx_lr.lrc_seq = 0; /* defensive */ 16565002558fSNeil Perrin itx->itx_sync = B_TRUE; /* default is synchronous */ 1657fa9e4066Sahrens 1658fa9e4066Sahrens return (itx); 1659fa9e4066Sahrens } 1660fa9e4066Sahrens 1661b24ab676SJeff Bonwick void 1662b24ab676SJeff Bonwick zil_itx_destroy(itx_t *itx) 1663b24ab676SJeff Bonwick { 1664b24ab676SJeff Bonwick kmem_free(itx, offsetof(itx_t, itx_lr) + itx->itx_lr.lrc_reclen); 1665b24ab676SJeff Bonwick } 1666b24ab676SJeff Bonwick 16675002558fSNeil Perrin /* 16685002558fSNeil Perrin * Free up the sync and async itxs. The itxs_t has already been detached 16695002558fSNeil Perrin * so no locks are needed. 
16705002558fSNeil Perrin */ 16715002558fSNeil Perrin static void 16725002558fSNeil Perrin zil_itxg_clean(itxs_t *itxs) 1673fa9e4066Sahrens { 16745002558fSNeil Perrin itx_t *itx; 16755002558fSNeil Perrin list_t *list; 16765002558fSNeil Perrin avl_tree_t *t; 16775002558fSNeil Perrin void *cookie; 16785002558fSNeil Perrin itx_async_node_t *ian; 16795002558fSNeil Perrin 16805002558fSNeil Perrin list = &itxs->i_sync_list; 16815002558fSNeil Perrin while ((itx = list_head(list)) != NULL) { 16821271e4b1SPrakash Surya /* 16831271e4b1SPrakash Surya * In the general case, commit itxs will not be found 16841271e4b1SPrakash Surya * here, as they'll be committed to an lwb via 16851271e4b1SPrakash Surya * zil_lwb_commit(), and free'd in that function. Having 16861271e4b1SPrakash Surya * said that, it is still possible for commit itxs to be 16871271e4b1SPrakash Surya * found here, due to the following race: 16881271e4b1SPrakash Surya * 16891271e4b1SPrakash Surya * - a thread calls zil_commit() which assigns the 16901271e4b1SPrakash Surya * commit itx to a per-txg i_sync_list 16911271e4b1SPrakash Surya * - zil_itxg_clean() is called (e.g. via spa_sync()) 16921271e4b1SPrakash Surya * while the waiter is still on the i_sync_list 16931271e4b1SPrakash Surya * 16941271e4b1SPrakash Surya * There's nothing to prevent syncing the txg while the 16951271e4b1SPrakash Surya * waiter is on the i_sync_list. This normally doesn't 16961271e4b1SPrakash Surya * happen because spa_sync() is slower than zil_commit(), 16971271e4b1SPrakash Surya * but if zil_commit() calls txg_wait_synced() (e.g. 16981271e4b1SPrakash Surya * because zil_create() or zil_commit_writer_stall() is 16991271e4b1SPrakash Surya * called) we will hit this case. 17001271e4b1SPrakash Surya */ 17011271e4b1SPrakash Surya if (itx->itx_lr.lrc_txtype == TX_COMMIT) 17021271e4b1SPrakash Surya zil_commit_waiter_skip(itx->itx_private); 17031271e4b1SPrakash Surya 17045002558fSNeil Perrin list_remove(list, itx); 17051271e4b1SPrakash Surya zil_itx_destroy(itx); 17065002558fSNeil Perrin } 1707fa9e4066Sahrens 17085002558fSNeil Perrin cookie = NULL; 17095002558fSNeil Perrin t = &itxs->i_async_tree; 17105002558fSNeil Perrin while ((ian = avl_destroy_nodes(t, &cookie)) != NULL) { 17115002558fSNeil Perrin list = &ian->ia_list; 17125002558fSNeil Perrin while ((itx = list_head(list)) != NULL) { 17135002558fSNeil Perrin list_remove(list, itx); 17141271e4b1SPrakash Surya /* commit itxs should never be on the async lists. 
*/
17151271e4b1SPrakash Surya 			ASSERT3U(itx->itx_lr.lrc_txtype, !=, TX_COMMIT);
17161271e4b1SPrakash Surya 			zil_itx_destroy(itx);
17175002558fSNeil Perrin 		}
17185002558fSNeil Perrin 		list_destroy(list);
17195002558fSNeil Perrin 		kmem_free(ian, sizeof (itx_async_node_t));
17205002558fSNeil Perrin 	}
17215002558fSNeil Perrin 	avl_destroy(t);
1722fa9e4066Sahrens 
17235002558fSNeil Perrin 	kmem_free(itxs, sizeof (itxs_t));
17245002558fSNeil Perrin }
17255002558fSNeil Perrin 
17265002558fSNeil Perrin static int
17275002558fSNeil Perrin zil_aitx_compare(const void *x1, const void *x2)
17285002558fSNeil Perrin {
17295002558fSNeil Perrin 	const uint64_t o1 = ((itx_async_node_t *)x1)->ia_foid;
17305002558fSNeil Perrin 	const uint64_t o2 = ((itx_async_node_t *)x2)->ia_foid;
1731fa9e4066Sahrens 
17325002558fSNeil Perrin 	if (o1 < o2)
17335002558fSNeil Perrin 		return (-1);
17345002558fSNeil Perrin 	if (o1 > o2)
17355002558fSNeil Perrin 		return (1);
17365002558fSNeil Perrin 
17375002558fSNeil Perrin 	return (0);
1738fa9e4066Sahrens }
1739fa9e4066Sahrens 
1740fa9e4066Sahrens /*
17415002558fSNeil Perrin  * Remove all async itxs with the given oid.
1742fa9e4066Sahrens  */
174391de656bSNeil Perrin static void
17445002558fSNeil Perrin zil_remove_async(zilog_t *zilog, uint64_t oid)
1745fa9e4066Sahrens {
17465002558fSNeil Perrin 	uint64_t otxg, txg;
17475002558fSNeil Perrin 	itx_async_node_t *ian;
17485002558fSNeil Perrin 	avl_tree_t *t;
17495002558fSNeil Perrin 	avl_index_t where;
1750a584ef65Sjohansen 	list_t clean_list;
1751fa9e4066Sahrens 	itx_t *itx;
1752fa9e4066Sahrens 
17535002558fSNeil Perrin 	ASSERT(oid != 0);
1754a584ef65Sjohansen 	list_create(&clean_list, sizeof (itx_t), offsetof(itx_t, itx_node));
1755a584ef65Sjohansen 
17565002558fSNeil Perrin 	if (spa_freeze_txg(zilog->zl_spa) != UINT64_MAX) /* ziltest support */
17575002558fSNeil Perrin 		otxg = ZILTEST_TXG;
17585002558fSNeil Perrin 	else
17595002558fSNeil Perrin 		otxg = spa_last_synced_txg(zilog->zl_spa) + 1;
1760a584ef65Sjohansen 
17615002558fSNeil Perrin 	for (txg = otxg; txg < (otxg + TXG_CONCURRENT_STATES); txg++) {
17625002558fSNeil Perrin 		itxg_t *itxg = &zilog->zl_itxg[txg & TXG_MASK];
17635002558fSNeil Perrin 
17645002558fSNeil Perrin 		mutex_enter(&itxg->itxg_lock);
17655002558fSNeil Perrin 		if (itxg->itxg_txg != txg) {
17665002558fSNeil Perrin 			mutex_exit(&itxg->itxg_lock);
17675002558fSNeil Perrin 			continue;
17685002558fSNeil Perrin 		}
1769a584ef65Sjohansen 
17705002558fSNeil Perrin 		/*
17715002558fSNeil Perrin 		 * Locate the object node and append its list.
17725002558fSNeil Perrin 		 */
17735002558fSNeil Perrin 		t = &itxg->itxg_itxs->i_async_tree;
17745002558fSNeil Perrin 		ian = avl_find(t, &oid, &where);
17755002558fSNeil Perrin 		if (ian != NULL)
17765002558fSNeil Perrin 			list_move_tail(&clean_list, &ian->ia_list);
17775002558fSNeil Perrin 		mutex_exit(&itxg->itxg_lock);
17785002558fSNeil Perrin 	}
1779a584ef65Sjohansen 	while ((itx = list_head(&clean_list)) != NULL) {
1780a584ef65Sjohansen 		list_remove(&clean_list, itx);
17811271e4b1SPrakash Surya 		/* commit itxs should never be on the async lists.
*/ 17821271e4b1SPrakash Surya ASSERT3U(itx->itx_lr.lrc_txtype, !=, TX_COMMIT); 17831271e4b1SPrakash Surya zil_itx_destroy(itx); 1784a584ef65Sjohansen } 1785a584ef65Sjohansen list_destroy(&clean_list); 1786fa9e4066Sahrens } 1787fa9e4066Sahrens 17885002558fSNeil Perrin void 17895002558fSNeil Perrin zil_itx_assign(zilog_t *zilog, itx_t *itx, dmu_tx_t *tx) 17905002558fSNeil Perrin { 17915002558fSNeil Perrin uint64_t txg; 17925002558fSNeil Perrin itxg_t *itxg; 17935002558fSNeil Perrin itxs_t *itxs, *clean = NULL; 17945002558fSNeil Perrin 17955002558fSNeil Perrin /* 179691de656bSNeil Perrin * Object ids can be re-instantiated in the next txg so 17975002558fSNeil Perrin * remove any async transactions to avoid future leaks. 17985002558fSNeil Perrin * This can happen if a fsync occurs on the re-instantiated 17995002558fSNeil Perrin * object for a WR_INDIRECT or WR_NEED_COPY write, which gets 18005002558fSNeil Perrin * the new file data and flushes a write record for the old object. 18015002558fSNeil Perrin */ 18025002558fSNeil Perrin if ((itx->itx_lr.lrc_txtype & ~TX_CI) == TX_REMOVE) 180351bd2f97SNeil Perrin zil_remove_async(zilog, itx->itx_oid); 18045002558fSNeil Perrin 180591de656bSNeil Perrin /* 180691de656bSNeil Perrin * Ensure the data of a renamed file is committed before the rename. 180791de656bSNeil Perrin */ 180891de656bSNeil Perrin if ((itx->itx_lr.lrc_txtype & ~TX_CI) == TX_RENAME) 180991de656bSNeil Perrin zil_async_to_sync(zilog, itx->itx_oid); 181091de656bSNeil Perrin 1811ce636f8bSMatthew Ahrens if (spa_freeze_txg(zilog->zl_spa) != UINT64_MAX) 18125002558fSNeil Perrin txg = ZILTEST_TXG; 18135002558fSNeil Perrin else 18145002558fSNeil Perrin txg = dmu_tx_get_txg(tx); 18155002558fSNeil Perrin 18165002558fSNeil Perrin itxg = &zilog->zl_itxg[txg & TXG_MASK]; 18175002558fSNeil Perrin mutex_enter(&itxg->itxg_lock); 18185002558fSNeil Perrin itxs = itxg->itxg_itxs; 18195002558fSNeil Perrin if (itxg->itxg_txg != txg) { 18205002558fSNeil Perrin if (itxs != NULL) { 18215002558fSNeil Perrin /* 18225002558fSNeil Perrin * The zil_clean callback hasn't got around to cleaning 18235002558fSNeil Perrin * this itxg. Save the itxs for release below. 18245002558fSNeil Perrin * This should be rare. 
18255002558fSNeil Perrin 			 */
182643297f97SGeorge Wilson 			zfs_dbgmsg("zil_itx_assign: missed itx cleanup for "
182743297f97SGeorge Wilson 			    "txg %llu", itxg->itxg_txg);
18285002558fSNeil Perrin 			clean = itxg->itxg_itxs;
18295002558fSNeil Perrin 		}
18305002558fSNeil Perrin 		itxg->itxg_txg = txg;
18315002558fSNeil Perrin 		itxs = itxg->itxg_itxs = kmem_zalloc(sizeof (itxs_t), KM_SLEEP);
18325002558fSNeil Perrin 
18335002558fSNeil Perrin 		list_create(&itxs->i_sync_list, sizeof (itx_t),
18345002558fSNeil Perrin 		    offsetof(itx_t, itx_node));
18355002558fSNeil Perrin 		avl_create(&itxs->i_async_tree, zil_aitx_compare,
18365002558fSNeil Perrin 		    sizeof (itx_async_node_t),
18375002558fSNeil Perrin 		    offsetof(itx_async_node_t, ia_node));
18385002558fSNeil Perrin 	}
18395002558fSNeil Perrin 	if (itx->itx_sync) {
18405002558fSNeil Perrin 		list_insert_tail(&itxs->i_sync_list, itx);
18415002558fSNeil Perrin 	} else {
18425002558fSNeil Perrin 		avl_tree_t *t = &itxs->i_async_tree;
1843*54811da5SToomas Soome 		uint64_t foid =
1844*54811da5SToomas Soome 		    LR_FOID_GET_OBJ(((lr_ooo_t *)&itx->itx_lr)->lr_foid);
18455002558fSNeil Perrin 		itx_async_node_t *ian;
18465002558fSNeil Perrin 		avl_index_t where;
18475002558fSNeil Perrin 
18485002558fSNeil Perrin 		ian = avl_find(t, &foid, &where);
18495002558fSNeil Perrin 		if (ian == NULL) {
18505002558fSNeil Perrin 			ian = kmem_alloc(sizeof (itx_async_node_t), KM_SLEEP);
18515002558fSNeil Perrin 			list_create(&ian->ia_list, sizeof (itx_t),
18525002558fSNeil Perrin 			    offsetof(itx_t, itx_node));
18535002558fSNeil Perrin 			ian->ia_foid = foid;
18545002558fSNeil Perrin 			avl_insert(t, ian, where);
18555002558fSNeil Perrin 		}
18565002558fSNeil Perrin 		list_insert_tail(&ian->ia_list, itx);
18575002558fSNeil Perrin 	}
18585002558fSNeil Perrin 
18595002558fSNeil Perrin 	itx->itx_lr.lrc_txg = dmu_tx_get_txg(tx);
18601271e4b1SPrakash Surya 
18611271e4b1SPrakash Surya 	/*
18621271e4b1SPrakash Surya 	 * We don't want to dirty the ZIL using ZILTEST_TXG, because
18631271e4b1SPrakash Surya 	 * zil_clean() will never be called using ZILTEST_TXG. Thus, we
18641271e4b1SPrakash Surya 	 * need to be careful to always dirty the ZIL using the "real"
18651271e4b1SPrakash Surya 	 * TXG (not itxg_txg) even when the SPA is frozen.
18661271e4b1SPrakash Surya 	 */
18671271e4b1SPrakash Surya 	zilog_dirty(zilog, dmu_tx_get_txg(tx));
18685002558fSNeil Perrin 	mutex_exit(&itxg->itxg_lock);
18695002558fSNeil Perrin 
18705002558fSNeil Perrin 	/* Release the old itxs now we've dropped the lock */
18715002558fSNeil Perrin 	if (clean != NULL)
18725002558fSNeil Perrin 		zil_itxg_clean(clean);
18735002558fSNeil Perrin }
18745002558fSNeil Perrin 
1875b19a79ecSperrin /*
187667bd71c6Sperrin  * If there are any in-memory intent log transactions which have now been
1877ce636f8bSMatthew Ahrens  * synced then start up a taskq to free them. We should only do this after we
1878ce636f8bSMatthew Ahrens  * have written out the uberblocks (i.e. txg has been committed) so that we
1879ce636f8bSMatthew Ahrens  * don't inadvertently clean out in-memory log records that would be required
1880ce636f8bSMatthew Ahrens  * by zil_commit().
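 *
 * For example (a sketch of the intended call pattern, not a verbatim
 * caller): once txg N is known to be fully on disk, the sync path
 * would invoke
 *
 *	zil_clean(zilog, N);
 *
 * which dispatches zil_itxg_clean() for the itxs that were assigned
 * to txg N.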
1881b19a79ecSperrin */ 1882fa9e4066Sahrens void 18835002558fSNeil Perrin zil_clean(zilog_t *zilog, uint64_t synced_txg) 1884fa9e4066Sahrens { 18855002558fSNeil Perrin itxg_t *itxg = &zilog->zl_itxg[synced_txg & TXG_MASK]; 18865002558fSNeil Perrin itxs_t *clean_me; 188767bd71c6Sperrin 18881271e4b1SPrakash Surya ASSERT3U(synced_txg, <, ZILTEST_TXG); 18891271e4b1SPrakash Surya 18905002558fSNeil Perrin mutex_enter(&itxg->itxg_lock); 18915002558fSNeil Perrin if (itxg->itxg_itxs == NULL || itxg->itxg_txg == ZILTEST_TXG) { 18925002558fSNeil Perrin mutex_exit(&itxg->itxg_lock); 18935002558fSNeil Perrin return; 18945002558fSNeil Perrin } 18955002558fSNeil Perrin ASSERT3U(itxg->itxg_txg, <=, synced_txg); 1896216d7723SPrakash Surya ASSERT3U(itxg->itxg_txg, !=, 0); 18975002558fSNeil Perrin clean_me = itxg->itxg_itxs; 18985002558fSNeil Perrin itxg->itxg_itxs = NULL; 18995002558fSNeil Perrin itxg->itxg_txg = 0; 19005002558fSNeil Perrin mutex_exit(&itxg->itxg_lock); 19015002558fSNeil Perrin /* 19025002558fSNeil Perrin * Preferably start a task queue to free up the old itxs but 19035002558fSNeil Perrin * if taskq_dispatch can't allocate resources to do that then 19045002558fSNeil Perrin * free it in-line. This should be rare. Note, using TQ_SLEEP 19055002558fSNeil Perrin * created a bad performance problem. 19065002558fSNeil Perrin */ 1907216d7723SPrakash Surya ASSERT3P(zilog->zl_dmu_pool, !=, NULL); 1908216d7723SPrakash Surya ASSERT3P(zilog->zl_dmu_pool->dp_zil_clean_taskq, !=, NULL); 1909216d7723SPrakash Surya if (taskq_dispatch(zilog->zl_dmu_pool->dp_zil_clean_taskq, 19105002558fSNeil Perrin (void (*)(void *))zil_itxg_clean, clean_me, TQ_NOSLEEP) == NULL) 19115002558fSNeil Perrin zil_itxg_clean(clean_me); 19125002558fSNeil Perrin } 19135002558fSNeil Perrin 19145002558fSNeil Perrin /* 19151271e4b1SPrakash Surya * This function will traverse the queue of itxs that need to be 19161271e4b1SPrakash Surya * committed, and move them onto the ZIL's zl_itx_commit_list. 19175002558fSNeil Perrin */ 191891de656bSNeil Perrin static void 19195002558fSNeil Perrin zil_get_commit_list(zilog_t *zilog) 19205002558fSNeil Perrin { 19215002558fSNeil Perrin uint64_t otxg, txg; 19225002558fSNeil Perrin list_t *commit_list = &zilog->zl_itx_commit_list; 19235002558fSNeil Perrin 1924cf07d3daSPrakash Surya ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock)); 19251271e4b1SPrakash Surya 19265002558fSNeil Perrin if (spa_freeze_txg(zilog->zl_spa) != UINT64_MAX) /* ziltest support */ 19275002558fSNeil Perrin otxg = ZILTEST_TXG; 19285002558fSNeil Perrin else 19295002558fSNeil Perrin otxg = spa_last_synced_txg(zilog->zl_spa) + 1; 19305002558fSNeil Perrin 193143297f97SGeorge Wilson /* 193243297f97SGeorge Wilson * This is inherently racy, since there is nothing to prevent 193343297f97SGeorge Wilson * the last synced txg from changing. That's okay since we'll 193443297f97SGeorge Wilson * only commit things in the future. 
193543297f97SGeorge Wilson */ 19365002558fSNeil Perrin for (txg = otxg; txg < (otxg + TXG_CONCURRENT_STATES); txg++) { 19375002558fSNeil Perrin itxg_t *itxg = &zilog->zl_itxg[txg & TXG_MASK]; 19385002558fSNeil Perrin 19395002558fSNeil Perrin mutex_enter(&itxg->itxg_lock); 19405002558fSNeil Perrin if (itxg->itxg_txg != txg) { 19415002558fSNeil Perrin mutex_exit(&itxg->itxg_lock); 19425002558fSNeil Perrin continue; 19435002558fSNeil Perrin } 19445002558fSNeil Perrin 194543297f97SGeorge Wilson /* 194643297f97SGeorge Wilson * If we're adding itx records to the zl_itx_commit_list, 194743297f97SGeorge Wilson * then the zil better be dirty in this "txg". We can assert 194843297f97SGeorge Wilson * that here since we're holding the itxg_lock which will 194943297f97SGeorge Wilson * prevent spa_sync from cleaning it. Once we add the itxs 195043297f97SGeorge Wilson * to the zl_itx_commit_list we must commit it to disk even 195143297f97SGeorge Wilson * if it's unnecessary (i.e. the txg was synced). 195243297f97SGeorge Wilson */ 195343297f97SGeorge Wilson ASSERT(zilog_is_dirty_in_txg(zilog, txg) || 195443297f97SGeorge Wilson spa_freeze_txg(zilog->zl_spa) != UINT64_MAX); 19555002558fSNeil Perrin list_move_tail(commit_list, &itxg->itxg_itxs->i_sync_list); 19565002558fSNeil Perrin 19575002558fSNeil Perrin mutex_exit(&itxg->itxg_lock); 19585002558fSNeil Perrin } 19595002558fSNeil Perrin } 19605002558fSNeil Perrin 19615002558fSNeil Perrin /* 19625002558fSNeil Perrin * Move the async itxs for a specified object to commit into sync lists. 19635002558fSNeil Perrin */ 196491de656bSNeil Perrin static void 19655002558fSNeil Perrin zil_async_to_sync(zilog_t *zilog, uint64_t foid) 19665002558fSNeil Perrin { 19675002558fSNeil Perrin uint64_t otxg, txg; 19685002558fSNeil Perrin itx_async_node_t *ian; 19695002558fSNeil Perrin avl_tree_t *t; 19705002558fSNeil Perrin avl_index_t where; 19715002558fSNeil Perrin 19725002558fSNeil Perrin if (spa_freeze_txg(zilog->zl_spa) != UINT64_MAX) /* ziltest support */ 19735002558fSNeil Perrin otxg = ZILTEST_TXG; 19745002558fSNeil Perrin else 19755002558fSNeil Perrin otxg = spa_last_synced_txg(zilog->zl_spa) + 1; 19765002558fSNeil Perrin 197743297f97SGeorge Wilson /* 197843297f97SGeorge Wilson * This is inherently racy, since there is nothing to prevent 197943297f97SGeorge Wilson * the last synced txg from changing. 198043297f97SGeorge Wilson */ 19815002558fSNeil Perrin for (txg = otxg; txg < (otxg + TXG_CONCURRENT_STATES); txg++) { 19825002558fSNeil Perrin itxg_t *itxg = &zilog->zl_itxg[txg & TXG_MASK]; 19835002558fSNeil Perrin 19845002558fSNeil Perrin mutex_enter(&itxg->itxg_lock); 19855002558fSNeil Perrin if (itxg->itxg_txg != txg) { 19865002558fSNeil Perrin mutex_exit(&itxg->itxg_lock); 19875002558fSNeil Perrin continue; 19885002558fSNeil Perrin } 19895002558fSNeil Perrin 19905002558fSNeil Perrin /* 19915002558fSNeil Perrin * If a foid is specified then find that node and append its 19925002558fSNeil Perrin * list. Otherwise walk the tree appending all the lists 19935002558fSNeil Perrin * to the sync list. We add to the end rather than the 19945002558fSNeil Perrin * beginning to ensure the create has happened. 
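		 *
		 * In other words (an illustrative summary):
		 *
		 *	zil_async_to_sync(zilog, foid);	(one object)
		 *	zil_async_to_sync(zilog, 0);	(every object)
		 *
		 * where a foid of 0 moves the async itxs of all objects
		 * in this itxg onto its i_sync_list.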
19955002558fSNeil Perrin */ 19965002558fSNeil Perrin t = &itxg->itxg_itxs->i_async_tree; 19975002558fSNeil Perrin if (foid != 0) { 19985002558fSNeil Perrin ian = avl_find(t, &foid, &where); 19995002558fSNeil Perrin if (ian != NULL) { 20005002558fSNeil Perrin list_move_tail(&itxg->itxg_itxs->i_sync_list, 20015002558fSNeil Perrin &ian->ia_list); 20025002558fSNeil Perrin } 20035002558fSNeil Perrin } else { 20045002558fSNeil Perrin void *cookie = NULL; 20055002558fSNeil Perrin 20065002558fSNeil Perrin while ((ian = avl_destroy_nodes(t, &cookie)) != NULL) { 20075002558fSNeil Perrin list_move_tail(&itxg->itxg_itxs->i_sync_list, 20085002558fSNeil Perrin &ian->ia_list); 20095002558fSNeil Perrin list_destroy(&ian->ia_list); 20105002558fSNeil Perrin kmem_free(ian, sizeof (itx_async_node_t)); 20115002558fSNeil Perrin } 20125002558fSNeil Perrin } 20135002558fSNeil Perrin mutex_exit(&itxg->itxg_lock); 201467bd71c6Sperrin } 2015fa9e4066Sahrens } 2016fa9e4066Sahrens 20171271e4b1SPrakash Surya /* 20181271e4b1SPrakash Surya * This function will prune commit itxs that are at the head of the 20191271e4b1SPrakash Surya * commit list (it won't prune past the first non-commit itx), and 20201271e4b1SPrakash Surya * either: a) attach them to the last lwb that's still pending 20211271e4b1SPrakash Surya * completion, or b) skip them altogether. 20221271e4b1SPrakash Surya * 20231271e4b1SPrakash Surya * This is used as a performance optimization to prevent commit itxs 20241271e4b1SPrakash Surya * from generating new lwbs when it's unnecessary to do so. 20251271e4b1SPrakash Surya */ 2026e14bb325SJeff Bonwick static void 20271271e4b1SPrakash Surya zil_prune_commit_list(zilog_t *zilog) 2028fa9e4066Sahrens { 20295002558fSNeil Perrin itx_t *itx; 2030fa9e4066Sahrens 2031cf07d3daSPrakash Surya ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock)); 20325002558fSNeil Perrin 20331271e4b1SPrakash Surya while (itx = list_head(&zilog->zl_itx_commit_list)) { 20341271e4b1SPrakash Surya lr_t *lrc = &itx->itx_lr; 20351271e4b1SPrakash Surya if (lrc->lrc_txtype != TX_COMMIT) 20361271e4b1SPrakash Surya break; 20375002558fSNeil Perrin 20381271e4b1SPrakash Surya mutex_enter(&zilog->zl_lock); 20391271e4b1SPrakash Surya 20401271e4b1SPrakash Surya lwb_t *last_lwb = zilog->zl_last_lwb_opened; 2041cab3a55eSPrakash Surya if (last_lwb == NULL || 2042cab3a55eSPrakash Surya last_lwb->lwb_state == LWB_STATE_FLUSH_DONE) { 20431271e4b1SPrakash Surya /* 20441271e4b1SPrakash Surya * All of the itxs this waiter was waiting on 20451271e4b1SPrakash Surya * must have already completed (or there were 20461271e4b1SPrakash Surya * never any itx's for it to wait on), so it's 20471271e4b1SPrakash Surya * safe to skip this waiter and mark it done. 
20481271e4b1SPrakash Surya 			 */
20491271e4b1SPrakash Surya 			zil_commit_waiter_skip(itx->itx_private);
20501271e4b1SPrakash Surya 		} else {
20511271e4b1SPrakash Surya 			zil_commit_waiter_link_lwb(itx->itx_private, last_lwb);
20521271e4b1SPrakash Surya 			itx->itx_private = NULL;
20531271e4b1SPrakash Surya 		}
20541271e4b1SPrakash Surya 
20551271e4b1SPrakash Surya 		mutex_exit(&zilog->zl_lock);
20561271e4b1SPrakash Surya 
20571271e4b1SPrakash Surya 		list_remove(&zilog->zl_itx_commit_list, itx);
20581271e4b1SPrakash Surya 		zil_itx_destroy(itx);
20591271e4b1SPrakash Surya 	}
20601271e4b1SPrakash Surya 
20611271e4b1SPrakash Surya 	IMPLY(itx != NULL, itx->itx_lr.lrc_txtype != TX_COMMIT);
20621271e4b1SPrakash Surya }
20631271e4b1SPrakash Surya 
20641271e4b1SPrakash Surya static void
20651271e4b1SPrakash Surya zil_commit_writer_stall(zilog_t *zilog)
20661271e4b1SPrakash Surya {
20671271e4b1SPrakash Surya 	/*
20681271e4b1SPrakash Surya 	 * When zio_alloc_zil() fails to allocate the next lwb block on
20691271e4b1SPrakash Surya 	 * disk, we must call txg_wait_synced() to ensure all of the
20701271e4b1SPrakash Surya 	 * lwbs in the zilog's zl_lwb_list are synced and then freed (in
20711271e4b1SPrakash Surya 	 * zil_sync()), such that any subsequent ZIL writer (i.e. a call
20721271e4b1SPrakash Surya 	 * to zil_process_commit_list()) will have to call zil_create(),
20731271e4b1SPrakash Surya 	 * and start a new ZIL chain.
20741271e4b1SPrakash Surya 	 *
20751271e4b1SPrakash Surya 	 * Since zio_alloc_zil() failed, the lwb that was previously
20761271e4b1SPrakash Surya 	 * issued does not have a pointer to the "next" lwb on disk.
20771271e4b1SPrakash Surya 	 * Thus, if another ZIL writer thread was to allocate the "next"
20781271e4b1SPrakash Surya 	 * on-disk lwb, that block could be leaked in the event of a
20791271e4b1SPrakash Surya 	 * crash (because the previous lwb on-disk would not point to
20801271e4b1SPrakash Surya 	 * it).
20811271e4b1SPrakash Surya 	 *
2082cf07d3daSPrakash Surya 	 * We must hold the zilog's zl_issuer_lock while we do this, to
20831271e4b1SPrakash Surya 	 * ensure no new threads enter zil_process_commit_list() until
20841271e4b1SPrakash Surya 	 * all lwb's in the zl_lwb_list have been synced and freed
20851271e4b1SPrakash Surya 	 * (which is achieved via the txg_wait_synced() call).
20861271e4b1SPrakash Surya 	 */
2087cf07d3daSPrakash Surya 	ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock));
20881271e4b1SPrakash Surya 	txg_wait_synced(zilog->zl_dmu_pool, 0);
20891271e4b1SPrakash Surya 	ASSERT3P(list_tail(&zilog->zl_lwb_list), ==, NULL);
20901271e4b1SPrakash Surya }
20911271e4b1SPrakash Surya 
20921271e4b1SPrakash Surya /*
20931271e4b1SPrakash Surya  * This function will traverse the commit list, creating new lwbs as
20941271e4b1SPrakash Surya  * needed, and committing the itxs from the commit list to these newly
20951271e4b1SPrakash Surya  * created lwbs. Additionally, as a new lwb is created, the previous
20961271e4b1SPrakash Surya  * lwb will be issued to the zio layer to be written to disk.
20971271e4b1SPrakash Surya */ 20981271e4b1SPrakash Surya static void 20991271e4b1SPrakash Surya zil_process_commit_list(zilog_t *zilog) 21001271e4b1SPrakash Surya { 21011271e4b1SPrakash Surya spa_t *spa = zilog->zl_spa; 21021271e4b1SPrakash Surya list_t nolwb_waiters; 21031271e4b1SPrakash Surya lwb_t *lwb; 21041271e4b1SPrakash Surya itx_t *itx; 21051271e4b1SPrakash Surya 2106cf07d3daSPrakash Surya ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock)); 21075002558fSNeil Perrin 21085002558fSNeil Perrin /* 21095002558fSNeil Perrin * Return if there's nothing to commit before we dirty the fs by 21105002558fSNeil Perrin * calling zil_create(). 21115002558fSNeil Perrin */ 21121271e4b1SPrakash Surya if (list_head(&zilog->zl_itx_commit_list) == NULL) 21135002558fSNeil Perrin return; 2114fa9e4066Sahrens 21151271e4b1SPrakash Surya list_create(&nolwb_waiters, sizeof (zil_commit_waiter_t), 21161271e4b1SPrakash Surya offsetof(zil_commit_waiter_t, zcw_node)); 21171271e4b1SPrakash Surya 21181271e4b1SPrakash Surya lwb = list_tail(&zilog->zl_lwb_list); 21191271e4b1SPrakash Surya if (lwb == NULL) { 21201271e4b1SPrakash Surya lwb = zil_create(zilog); 2121fa9e4066Sahrens } else { 21221271e4b1SPrakash Surya ASSERT3S(lwb->lwb_state, !=, LWB_STATE_ISSUED); 2123cab3a55eSPrakash Surya ASSERT3S(lwb->lwb_state, !=, LWB_STATE_WRITE_DONE); 2124cab3a55eSPrakash Surya ASSERT3S(lwb->lwb_state, !=, LWB_STATE_FLUSH_DONE); 2125fa9e4066Sahrens } 2126fa9e4066Sahrens 21275002558fSNeil Perrin while (itx = list_head(&zilog->zl_itx_commit_list)) { 21281271e4b1SPrakash Surya lr_t *lrc = &itx->itx_lr; 21291271e4b1SPrakash Surya uint64_t txg = lrc->lrc_txg; 21301271e4b1SPrakash Surya 213143297f97SGeorge Wilson ASSERT3U(txg, !=, 0); 2132fa9e4066Sahrens 21331271e4b1SPrakash Surya if (lrc->lrc_txtype == TX_COMMIT) { 21341271e4b1SPrakash Surya DTRACE_PROBE2(zil__process__commit__itx, 21351271e4b1SPrakash Surya zilog_t *, zilog, itx_t *, itx); 21361271e4b1SPrakash Surya } else { 21371271e4b1SPrakash Surya DTRACE_PROBE2(zil__process__normal__itx, 21381271e4b1SPrakash Surya zilog_t *, zilog, itx_t *, itx); 21391271e4b1SPrakash Surya } 21401271e4b1SPrakash Surya 21411271e4b1SPrakash Surya boolean_t synced = txg <= spa_last_synced_txg(spa); 21421271e4b1SPrakash Surya boolean_t frozen = txg > spa_freeze_txg(spa); 21431271e4b1SPrakash Surya 214494ddd090SPrakash Surya /* 214594ddd090SPrakash Surya * If the txg of this itx has already been synced out, then 214694ddd090SPrakash Surya * we don't need to commit this itx to an lwb. This is 214794ddd090SPrakash Surya * because the data of this itx will have already been 214894ddd090SPrakash Surya * written to the main pool. This is inherently racy, and 214994ddd090SPrakash Surya * it's still ok to commit an itx whose txg has already 215094ddd090SPrakash Surya * been synced; this will result in a write that's 215194ddd090SPrakash Surya * unnecessary, but will do no harm. 215294ddd090SPrakash Surya * 215394ddd090SPrakash Surya * With that said, we always want to commit TX_COMMIT itxs 215494ddd090SPrakash Surya * to an lwb, regardless of whether or not that itx's txg 215594ddd090SPrakash Surya * has been synced out. We do this to ensure any OPENED lwb 215694ddd090SPrakash Surya * will always have at least one zil_commit_waiter_t linked 215794ddd090SPrakash Surya * to the lwb. 
215894ddd090SPrakash Surya *
215994ddd090SPrakash Surya * As a counter-example, if we skipped TX_COMMIT itx's
216094ddd090SPrakash Surya * whose txg had already been synced, the following
216194ddd090SPrakash Surya * situation could occur if we happened to be racing with
216294ddd090SPrakash Surya * spa_sync:
216394ddd090SPrakash Surya *
216494ddd090SPrakash Surya * 1. we commit a non-TX_COMMIT itx to an lwb, where the
216594ddd090SPrakash Surya * itx's txg is 10 and the last synced txg is 9.
216694ddd090SPrakash Surya * 2. spa_sync finishes syncing out txg 10.
216794ddd090SPrakash Surya * 3. we move to the next itx in the list, it's a TX_COMMIT
216894ddd090SPrakash Surya * whose txg is 10, so we skip it rather than committing
216994ddd090SPrakash Surya * it to the lwb used in (1).
217094ddd090SPrakash Surya *
217194ddd090SPrakash Surya * If the itx that is skipped in (3) is the last TX_COMMIT
217294ddd090SPrakash Surya * itx in the commit list, then it's possible for the lwb
217394ddd090SPrakash Surya * used in (1) to remain in the OPENED state indefinitely.
217494ddd090SPrakash Surya *
217594ddd090SPrakash Surya * To prevent the above scenario from occurring, and to
217694ddd090SPrakash Surya * ensure that once an lwb is OPENED it will transition to
217794ddd090SPrakash Surya * ISSUED and eventually DONE, we always commit TX_COMMIT
217894ddd090SPrakash Surya * itx's to an lwb here, even if that itx's txg has already
217994ddd090SPrakash Surya * been synced.
218094ddd090SPrakash Surya *
218194ddd090SPrakash Surya * Finally, if the pool is frozen, we _always_ commit the
218294ddd090SPrakash Surya * itx. The point of freezing the pool is to prevent data
218394ddd090SPrakash Surya * from being written to the main pool via spa_sync, and
218494ddd090SPrakash Surya * instead rely solely on the ZIL to persistently store the
218594ddd090SPrakash Surya * data; i.e. when the pool is frozen, the last synced txg
218694ddd090SPrakash Surya * value can't be trusted.
218794ddd090SPrakash Surya */
218894ddd090SPrakash Surya if (frozen || !synced || lrc->lrc_txtype == TX_COMMIT) {
21891271e4b1SPrakash Surya if (lwb != NULL) {
21901271e4b1SPrakash Surya lwb = zil_lwb_commit(zilog, itx, lwb);
21911271e4b1SPrakash Surya } else if (lrc->lrc_txtype == TX_COMMIT) {
21921271e4b1SPrakash Surya ASSERT3P(lwb, ==, NULL);
21931271e4b1SPrakash Surya zil_commit_waiter_link_nolwb(
21941271e4b1SPrakash Surya itx->itx_private, &nolwb_waiters);
21951271e4b1SPrakash Surya }
21961271e4b1SPrakash Surya }
21971271e4b1SPrakash Surya
21985002558fSNeil Perrin list_remove(&zilog->zl_itx_commit_list, itx);
21991271e4b1SPrakash Surya zil_itx_destroy(itx);
2200fa9e4066Sahrens }
2201fa9e4066Sahrens
22021271e4b1SPrakash Surya if (lwb == NULL) {
22031271e4b1SPrakash Surya /*
22041271e4b1SPrakash Surya * This indicates zio_alloc_zil() failed to allocate the
22051271e4b1SPrakash Surya * "next" lwb on-disk. When this happens, we must stall
22061271e4b1SPrakash Surya * the ZIL write pipeline; see the comment within
22071271e4b1SPrakash Surya * zil_commit_writer_stall() for more details.
22081271e4b1SPrakash Surya */
22091271e4b1SPrakash Surya zil_commit_writer_stall(zilog);
22101271e4b1SPrakash Surya
22111271e4b1SPrakash Surya /*
22121271e4b1SPrakash Surya * Additionally, we have to signal and mark the "nolwb"
22131271e4b1SPrakash Surya * waiters as "done" here, since without an lwb, we
22141271e4b1SPrakash Surya * can't do this via zil_lwb_flush_vdevs_done() like
22151271e4b1SPrakash Surya * normal.
22161271e4b1SPrakash Surya */
22171271e4b1SPrakash Surya zil_commit_waiter_t *zcw;
22181271e4b1SPrakash Surya while (zcw = list_head(&nolwb_waiters)) {
22191271e4b1SPrakash Surya zil_commit_waiter_skip(zcw);
22201271e4b1SPrakash Surya list_remove(&nolwb_waiters, zcw);
22211271e4b1SPrakash Surya }
22221271e4b1SPrakash Surya } else {
22231271e4b1SPrakash Surya ASSERT(list_is_empty(&nolwb_waiters));
22241271e4b1SPrakash Surya ASSERT3P(lwb, !=, NULL);
22251271e4b1SPrakash Surya ASSERT3S(lwb->lwb_state, !=, LWB_STATE_ISSUED);
2226cab3a55eSPrakash Surya ASSERT3S(lwb->lwb_state, !=, LWB_STATE_WRITE_DONE);
2227cab3a55eSPrakash Surya ASSERT3S(lwb->lwb_state, !=, LWB_STATE_FLUSH_DONE);
22281271e4b1SPrakash Surya
22291271e4b1SPrakash Surya /*
22301271e4b1SPrakash Surya * At this point, the ZIL block pointed at by the "lwb"
22311271e4b1SPrakash Surya * variable is in one of the following states: "closed"
22321271e4b1SPrakash Surya * or "open".
22331271e4b1SPrakash Surya *
22341271e4b1SPrakash Surya * If it's "closed", then no itxs have been committed to
22351271e4b1SPrakash Surya * it, so there's no point in issuing its zio (i.e.
22361271e4b1SPrakash Surya * it's "empty").
22371271e4b1SPrakash Surya *
22381271e4b1SPrakash Surya * If it's "open", then it contains one or more
22391271e4b1SPrakash Surya * itxs that eventually need to be committed to stable
22401271e4b1SPrakash Surya * storage. In this case we intentionally do not issue
22411271e4b1SPrakash Surya * the lwb's zio to disk yet, and instead rely on one of
22421271e4b1SPrakash Surya * the following two mechanisms for issuing the zio:
22431271e4b1SPrakash Surya *
22441271e4b1SPrakash Surya * 1. Ideally, there will be more ZIL activity occurring
22451271e4b1SPrakash Surya * on the system, such that this function will be
22461271e4b1SPrakash Surya * immediately called again (not necessarily by the same
22471271e4b1SPrakash Surya * thread) and this lwb's zio will be issued via
22481271e4b1SPrakash Surya * zil_lwb_commit(). This way, the lwb is guaranteed to
22491271e4b1SPrakash Surya * be "full" when it is issued to disk, and we'll make
22501271e4b1SPrakash Surya * use of the lwb's size the best we can.
22511271e4b1SPrakash Surya *
22521271e4b1SPrakash Surya * 2. If there isn't sufficient ZIL activity occurring on
22531271e4b1SPrakash Surya * the system, such that this lwb's zio isn't issued via
22541271e4b1SPrakash Surya * zil_lwb_commit(), zil_commit_waiter() will issue the
22551271e4b1SPrakash Surya * lwb's zio. If this occurs, the lwb is not guaranteed
22561271e4b1SPrakash Surya * to be "full" by the time its zio is issued, which means
22571271e4b1SPrakash Surya * the size of the lwb was "too large" given the amount
22581271e4b1SPrakash Surya * of ZIL activity occurring on the system at that time.
22591271e4b1SPrakash Surya *
22601271e4b1SPrakash Surya * We do this for a couple of reasons:
22611271e4b1SPrakash Surya *
22621271e4b1SPrakash Surya * 1. To try and reduce the number of IOPs needed to
22631271e4b1SPrakash Surya * write the same number of itxs. If an lwb has space
22641271e4b1SPrakash Surya * available in its buffer for more itxs, and more itxs
22651271e4b1SPrakash Surya * will be committed relatively soon (relative to the
22661271e4b1SPrakash Surya * latency of performing a write), then it's beneficial
22671271e4b1SPrakash Surya * to wait for these "next" itxs. This way, more itxs
22681271e4b1SPrakash Surya * can be committed to stable storage with fewer writes.
22691271e4b1SPrakash Surya *
22701271e4b1SPrakash Surya * 2. To try and use the largest lwb block size that the
22711271e4b1SPrakash Surya * incoming rate of itxs can support. Again, this is to
22721271e4b1SPrakash Surya * try and pack as many itxs into as few lwbs as
22731271e4b1SPrakash Surya * possible, without significantly impacting the latency
22741271e4b1SPrakash Surya * of each individual itx.
22751271e4b1SPrakash Surya */
22761271e4b1SPrakash Surya }
22771271e4b1SPrakash Surya }
22781271e4b1SPrakash Surya
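/*
 * To make the filtering above concrete: the per-itx test in the loop
 * boils down to the predicate sketched below. This is illustrative
 * only -- the helper name is hypothetical and the real code open-codes
 * the test -- so the block is compiled out.
 */
#if 0
static boolean_t
zil_itx_needs_commit(spa_t *spa, const lr_t *lrc)
{
	uint64_t txg = lrc->lrc_txg;

	/* TX_COMMIT itxs are always committed, so OPENED lwbs progress */
	if (lrc->lrc_txtype == TX_COMMIT)
		return (B_TRUE);

	/* On a frozen pool, the last synced txg value can't be trusted */
	if (txg > spa_freeze_txg(spa))
		return (B_TRUE);

	/* Otherwise, skip the itx only if its txg has already synced out */
	return (txg > spa_last_synced_txg(spa));
}
#endif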
22791271e4b1SPrakash Surya /*
22801271e4b1SPrakash Surya * This function is responsible for ensuring the passed in commit waiter
22811271e4b1SPrakash Surya * (and associated commit itx) is committed to an lwb. If the waiter is
22821271e4b1SPrakash Surya * not already committed to an lwb, all itxs in the zilog's queue of
22831271e4b1SPrakash Surya * itxs will be processed. The assumption is the passed in waiter's
22841271e4b1SPrakash Surya * commit itx will be found in the queue just like the other non-commit
22851271e4b1SPrakash Surya * itxs, such that when the entire queue is processed, the waiter will
22861271e4b1SPrakash Surya * have been committed to an lwb.
22871271e4b1SPrakash Surya *
22881271e4b1SPrakash Surya * The lwb associated with the passed in waiter is not guaranteed to
22891271e4b1SPrakash Surya * have been issued by the time this function completes. If the lwb is
22901271e4b1SPrakash Surya * not issued, we rely on future calls to zil_commit_writer() to issue
22911271e4b1SPrakash Surya * the lwb, or the timeout mechanism found in zil_commit_waiter().
22921271e4b1SPrakash Surya */
22931271e4b1SPrakash Surya static void
22941271e4b1SPrakash Surya zil_commit_writer(zilog_t *zilog, zil_commit_waiter_t *zcw)
22951271e4b1SPrakash Surya {
22961271e4b1SPrakash Surya ASSERT(!MUTEX_HELD(&zilog->zl_lock));
22971271e4b1SPrakash Surya ASSERT(spa_writeable(zilog->zl_spa));
22981271e4b1SPrakash Surya
2299cf07d3daSPrakash Surya mutex_enter(&zilog->zl_issuer_lock);
23001271e4b1SPrakash Surya
23011271e4b1SPrakash Surya if (zcw->zcw_lwb != NULL || zcw->zcw_done) {
23021271e4b1SPrakash Surya /*
23031271e4b1SPrakash Surya * It's possible that, while we were waiting to acquire
2304cf07d3daSPrakash Surya * the "zl_issuer_lock", another thread committed this
23051271e4b1SPrakash Surya * waiter to an lwb. If that occurs, we bail out early,
23061271e4b1SPrakash Surya * without processing any of the zilog's queue of itxs.
23071271e4b1SPrakash Surya *
23081271e4b1SPrakash Surya * On certain workloads and system configurations, the
2309cf07d3daSPrakash Surya * "zl_issuer_lock" can become highly contended. In an
23101271e4b1SPrakash Surya * attempt to reduce this contention, we immediately drop
23111271e4b1SPrakash Surya * the lock if the waiter has already been processed.
23121271e4b1SPrakash Surya *
23131271e4b1SPrakash Surya * We've measured this optimization to reduce CPU spent
23141271e4b1SPrakash Surya * contending on this lock by up to 5%, using a system
23151271e4b1SPrakash Surya * with 32 CPUs, low latency storage (~50 usec writes),
23161271e4b1SPrakash Surya * and 1024 threads performing sync writes.
23171271e4b1SPrakash Surya */
23181271e4b1SPrakash Surya goto out;
23191271e4b1SPrakash Surya }
23201271e4b1SPrakash Surya
23211271e4b1SPrakash Surya zil_get_commit_list(zilog);
23221271e4b1SPrakash Surya zil_prune_commit_list(zilog);
23231271e4b1SPrakash Surya zil_process_commit_list(zilog);
23241271e4b1SPrakash Surya
23251271e4b1SPrakash Surya out:
2326cf07d3daSPrakash Surya mutex_exit(&zilog->zl_issuer_lock);
23271271e4b1SPrakash Surya }
23281271e4b1SPrakash Surya
23291271e4b1SPrakash Surya static void
23301271e4b1SPrakash Surya zil_commit_waiter_timeout(zilog_t *zilog, zil_commit_waiter_t *zcw)
23311271e4b1SPrakash Surya {
2332cf07d3daSPrakash Surya ASSERT(!MUTEX_HELD(&zilog->zl_issuer_lock));
23331271e4b1SPrakash Surya ASSERT(MUTEX_HELD(&zcw->zcw_lock));
23341271e4b1SPrakash Surya ASSERT3B(zcw->zcw_done, ==, B_FALSE);
23351271e4b1SPrakash Surya
23361271e4b1SPrakash Surya lwb_t *lwb = zcw->zcw_lwb;
23371271e4b1SPrakash Surya ASSERT3P(lwb, !=, NULL);
23381271e4b1SPrakash Surya ASSERT3S(lwb->lwb_state, !=, LWB_STATE_CLOSED);
23391271e4b1SPrakash Surya
23401271e4b1SPrakash Surya /*
23411271e4b1SPrakash Surya * If the lwb has already been issued by another thread, we can
23421271e4b1SPrakash Surya * immediately return since there's no work to be done (the
23431271e4b1SPrakash Surya * point of this function is to issue the lwb). Additionally, we
2344cf07d3daSPrakash Surya * do this prior to acquiring the zl_issuer_lock, to avoid
23451271e4b1SPrakash Surya * acquiring it when it's not necessary to do so.
23461271e4b1SPrakash Surya */
23471271e4b1SPrakash Surya if (lwb->lwb_state == LWB_STATE_ISSUED ||
2348cab3a55eSPrakash Surya lwb->lwb_state == LWB_STATE_WRITE_DONE ||
2349cab3a55eSPrakash Surya lwb->lwb_state == LWB_STATE_FLUSH_DONE)
23501271e4b1SPrakash Surya return;
23511271e4b1SPrakash Surya
23521271e4b1SPrakash Surya /*
23531271e4b1SPrakash Surya * In order to call zil_lwb_write_issue() we must hold the
2354cf07d3daSPrakash Surya * zilog's "zl_issuer_lock". We can't simply acquire that lock,
23551271e4b1SPrakash Surya * since we're already holding the commit waiter's "zcw_lock",
23561271e4b1SPrakash Surya * and those two locks are acquired in the opposite order
23571271e4b1SPrakash Surya * elsewhere.
23581271e4b1SPrakash Surya */
23591271e4b1SPrakash Surya mutex_exit(&zcw->zcw_lock);
2360cf07d3daSPrakash Surya mutex_enter(&zilog->zl_issuer_lock);
23611271e4b1SPrakash Surya mutex_enter(&zcw->zcw_lock);
2362fa9e4066Sahrens
2363fa9e4066Sahrens /*
23641271e4b1SPrakash Surya * Since we just dropped and re-acquired the commit waiter's
23651271e4b1SPrakash Surya * lock, we have to re-check to see if the waiter was marked
23661271e4b1SPrakash Surya * "done" during that process. If the waiter was marked "done",
23671271e4b1SPrakash Surya * the "lwb" pointer is no longer valid (it can be free'd after
23681271e4b1SPrakash Surya * the waiter is marked "done"), so without this check we could
23691271e4b1SPrakash Surya * wind up with a use-after-free error below.
23701271e4b1SPrakash Surya */
23711271e4b1SPrakash Surya if (zcw->zcw_done)
23721271e4b1SPrakash Surya goto out;
23731271e4b1SPrakash Surya
23741271e4b1SPrakash Surya ASSERT3P(lwb, ==, zcw->zcw_lwb);
23751271e4b1SPrakash Surya
23761271e4b1SPrakash Surya /*
237794ddd090SPrakash Surya * We've already checked this above, but since we hadn't acquired
237894ddd090SPrakash Surya * the zilog's zl_issuer_lock, we have to perform this check a
237994ddd090SPrakash Surya * second time while holding the lock.
238094ddd090SPrakash Surya *
238194ddd090SPrakash Surya * We don't need to hold the zl_lock since the lwb cannot transition
238294ddd090SPrakash Surya * from OPENED to ISSUED while we hold the zl_issuer_lock. The lwb
238394ddd090SPrakash Surya * _can_ transition from ISSUED to DONE, but it's OK to race with
238494ddd090SPrakash Surya * that transition since we treat the lwb the same, whether it's in
238594ddd090SPrakash Surya * the ISSUED or DONE states.
238694ddd090SPrakash Surya *
238794ddd090SPrakash Surya * The important thing is that we treat the lwb differently depending
238894ddd090SPrakash Surya * on whether it's ISSUED or OPENED, and block any other threads that
238994ddd090SPrakash Surya * might attempt to issue this lwb. For that reason we hold the
239094ddd090SPrakash Surya * zl_issuer_lock when checking the lwb_state; we must not call
23911271e4b1SPrakash Surya * zil_lwb_write_issue() if the lwb had already been issued.
239294ddd090SPrakash Surya *
239394ddd090SPrakash Surya * See the comment above the lwb_state_t structure definition for
239494ddd090SPrakash Surya * more details on the lwb states, and locking requirements.
23951271e4b1SPrakash Surya */
23961271e4b1SPrakash Surya if (lwb->lwb_state == LWB_STATE_ISSUED ||
2397cab3a55eSPrakash Surya lwb->lwb_state == LWB_STATE_WRITE_DONE ||
2398cab3a55eSPrakash Surya lwb->lwb_state == LWB_STATE_FLUSH_DONE)
23991271e4b1SPrakash Surya goto out;
24001271e4b1SPrakash Surya
24011271e4b1SPrakash Surya ASSERT3S(lwb->lwb_state, ==, LWB_STATE_OPENED);
24021271e4b1SPrakash Surya
24031271e4b1SPrakash Surya /*
24041271e4b1SPrakash Surya * As described in the comments above zil_commit_waiter() and
24051271e4b1SPrakash Surya * zil_process_commit_list(), we need to issue this lwb's zio
24061271e4b1SPrakash Surya * since we've reached the commit waiter's timeout and it still
24071271e4b1SPrakash Surya * hasn't been issued.
24081271e4b1SPrakash Surya */
24091271e4b1SPrakash Surya lwb_t *nlwb = zil_lwb_write_issue(zilog, lwb);
24101271e4b1SPrakash Surya
2411b6031810SPrakash Surya IMPLY(nlwb != NULL, lwb->lwb_state != LWB_STATE_OPENED);
24121271e4b1SPrakash Surya
24131271e4b1SPrakash Surya /*
24141271e4b1SPrakash Surya * Since the lwb's zio hadn't been issued by the time this thread
24151271e4b1SPrakash Surya * reached its timeout, we reset the zilog's "zl_cur_used" field
24161271e4b1SPrakash Surya * to influence the zil block size selection algorithm.
24171271e4b1SPrakash Surya *
24181271e4b1SPrakash Surya * By having to issue the lwb's zio here, it means the size of the
24191271e4b1SPrakash Surya * lwb was too large, given the incoming throughput of itxs. By
24201271e4b1SPrakash Surya * setting "zl_cur_used" to zero, we communicate this fact to the
24211271e4b1SPrakash Surya * block size selection algorithm, so it can take this information
24221271e4b1SPrakash Surya * into account, and potentially select a smaller size for the
24231271e4b1SPrakash Surya * next lwb block that is allocated.
2424fa9e4066Sahrens */
24251271e4b1SPrakash Surya zilog->zl_cur_used = 0;
24261271e4b1SPrakash Surya
24271271e4b1SPrakash Surya if (nlwb == NULL) {
24281271e4b1SPrakash Surya /*
24291271e4b1SPrakash Surya * When zil_lwb_write_issue() returns NULL, this
24301271e4b1SPrakash Surya * indicates zio_alloc_zil() failed to allocate the
24311271e4b1SPrakash Surya * "next" lwb on-disk. When this occurs, the ZIL write
24321271e4b1SPrakash Surya * pipeline must be stalled; see the comment within the
24331271e4b1SPrakash Surya * zil_commit_writer_stall() function for more details.
24341271e4b1SPrakash Surya *
24351271e4b1SPrakash Surya * We must drop the commit waiter's lock prior to
24361271e4b1SPrakash Surya * calling zil_commit_writer_stall() or else we can wind
24371271e4b1SPrakash Surya * up with the following deadlock:
24381271e4b1SPrakash Surya *
24391271e4b1SPrakash Surya * - This thread is waiting for the txg to sync while
24401271e4b1SPrakash Surya * holding the waiter's lock; txg_wait_synced() is
24411271e4b1SPrakash Surya * used within zil_commit_writer_stall().
24421271e4b1SPrakash Surya *
24431271e4b1SPrakash Surya * - The txg can't sync because it is waiting for this
24441271e4b1SPrakash Surya * lwb's zio callback to call dmu_tx_commit().
24451271e4b1SPrakash Surya *
24461271e4b1SPrakash Surya * - The lwb's zio callback can't call dmu_tx_commit()
24471271e4b1SPrakash Surya * because it's blocked trying to acquire the waiter's
24481271e4b1SPrakash Surya * lock, which occurs prior to calling dmu_tx_commit().
24491271e4b1SPrakash Surya */
24501271e4b1SPrakash Surya mutex_exit(&zcw->zcw_lock);
24511271e4b1SPrakash Surya zil_commit_writer_stall(zilog);
24521271e4b1SPrakash Surya mutex_enter(&zcw->zcw_lock);
2453fa9e4066Sahrens }
245422ac5be4Sperrin
24551271e4b1SPrakash Surya out:
2456cf07d3daSPrakash Surya mutex_exit(&zilog->zl_issuer_lock);
24571271e4b1SPrakash Surya ASSERT(MUTEX_HELD(&zcw->zcw_lock));
24581271e4b1SPrakash Surya }
245967bd71c6Sperrin
24601271e4b1SPrakash Surya /*
24611271e4b1SPrakash Surya * This function is responsible for performing the following two tasks:
24621271e4b1SPrakash Surya *
24631271e4b1SPrakash Surya * 1. its primary responsibility is to block until the given "commit
24641271e4b1SPrakash Surya * waiter" is considered "done".
24651271e4b1SPrakash Surya *
24661271e4b1SPrakash Surya * 2. its secondary responsibility is to issue the zio for the lwb that
24671271e4b1SPrakash Surya * the given "commit waiter" is waiting on, if this function has
24681271e4b1SPrakash Surya * waited "long enough" and the lwb is still in the "open" state.
24691271e4b1SPrakash Surya *
24701271e4b1SPrakash Surya * Given a sufficient amount of itxs being generated and written using
24711271e4b1SPrakash Surya * the ZIL, the lwb's zio will be issued via the zil_lwb_commit()
24721271e4b1SPrakash Surya * function. If this does not occur, this secondary responsibility will
24731271e4b1SPrakash Surya * ensure the lwb is issued even if there is no other synchronous
24741271e4b1SPrakash Surya * activity on the system.
24751271e4b1SPrakash Surya *
24761271e4b1SPrakash Surya * For more details, see zil_process_commit_list(); more specifically,
24771271e4b1SPrakash Surya * the comment at the bottom of that function.
24781271e4b1SPrakash Surya */
24791271e4b1SPrakash Surya static void
24801271e4b1SPrakash Surya zil_commit_waiter(zilog_t *zilog, zil_commit_waiter_t *zcw)
24811271e4b1SPrakash Surya {
24821271e4b1SPrakash Surya ASSERT(!MUTEX_HELD(&zilog->zl_lock));
2483cf07d3daSPrakash Surya ASSERT(!MUTEX_HELD(&zilog->zl_issuer_lock));
24841271e4b1SPrakash Surya ASSERT(spa_writeable(zilog->zl_spa));
24851271e4b1SPrakash Surya
24861271e4b1SPrakash Surya mutex_enter(&zcw->zcw_lock);
2487b24ab676SJeff Bonwick
2488b24ab676SJeff Bonwick /*
24891271e4b1SPrakash Surya * The timeout is scaled based on the lwb latency to avoid
24901271e4b1SPrakash Surya * significantly impacting the latency of each individual itx.
24911271e4b1SPrakash Surya * For more details, see the comment at the bottom of the
24921271e4b1SPrakash Surya * zil_process_commit_list() function.
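 *
 * As a worked example (with assumed, illustrative numbers): if the
 * recently observed lwb latency (zl_last_lwb_latency) is 1 ms and
 * zfs_commit_timeout_pct has its default value of 5, the computed
 * timeout is (1000000 ns * 5) / 100 = 50000 ns; i.e. the waiter
 * sleeps roughly 50 usec before issuing the lwb itself.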
2493b24ab676SJeff Bonwick */
24941271e4b1SPrakash Surya int pct = MAX(zfs_commit_timeout_pct, 1);
24951271e4b1SPrakash Surya hrtime_t sleep = (zilog->zl_last_lwb_latency * pct) / 100;
24961271e4b1SPrakash Surya hrtime_t wakeup = gethrtime() + sleep;
24971271e4b1SPrakash Surya boolean_t timedout = B_FALSE;
24981271e4b1SPrakash Surya
24991271e4b1SPrakash Surya while (!zcw->zcw_done) {
25001271e4b1SPrakash Surya ASSERT(MUTEX_HELD(&zcw->zcw_lock));
25011271e4b1SPrakash Surya
25021271e4b1SPrakash Surya lwb_t *lwb = zcw->zcw_lwb;
25031271e4b1SPrakash Surya
25041271e4b1SPrakash Surya /*
25051271e4b1SPrakash Surya * Usually, the waiter will have a non-NULL lwb field here,
25061271e4b1SPrakash Surya * but it's possible for it to be NULL as a result of
25071271e4b1SPrakash Surya * zil_commit() racing with spa_sync().
25081271e4b1SPrakash Surya *
25091271e4b1SPrakash Surya * When zil_clean() is called, it's possible for the itxg
25101271e4b1SPrakash Surya * list (which may be cleaned via a taskq) to contain
25111271e4b1SPrakash Surya * commit itxs. When this occurs, the commit waiters linked
25121271e4b1SPrakash Surya * off of these commit itxs will not be committed to an
25131271e4b1SPrakash Surya * lwb. Additionally, these commit waiters will not be
25141271e4b1SPrakash Surya * marked done until zil_commit_waiter_skip() is called via
25151271e4b1SPrakash Surya * zil_itxg_clean().
25161271e4b1SPrakash Surya *
25171271e4b1SPrakash Surya * Thus, it's possible for this commit waiter (i.e. the
25181271e4b1SPrakash Surya * "zcw" variable) to be found in this "in between" state;
25191271e4b1SPrakash Surya * where its "zcw_lwb" field is NULL, and it hasn't yet
25201271e4b1SPrakash Surya * been skipped, so its "zcw_done" field is still B_FALSE.
25211271e4b1SPrakash Surya */
25221271e4b1SPrakash Surya IMPLY(lwb != NULL, lwb->lwb_state != LWB_STATE_CLOSED);
25231271e4b1SPrakash Surya
25241271e4b1SPrakash Surya if (lwb != NULL && lwb->lwb_state == LWB_STATE_OPENED) {
25251271e4b1SPrakash Surya ASSERT3B(timedout, ==, B_FALSE);
25261271e4b1SPrakash Surya
25271271e4b1SPrakash Surya /*
25281271e4b1SPrakash Surya * If the lwb hasn't been issued yet, then we
25291271e4b1SPrakash Surya * need to wait with a timeout, in case this
25301271e4b1SPrakash Surya * function needs to issue the lwb after the
25311271e4b1SPrakash Surya * timeout is reached; responsibility (2) from
25321271e4b1SPrakash Surya * the comment above this function.
25331271e4b1SPrakash Surya */
25341271e4b1SPrakash Surya clock_t timeleft = cv_timedwait_hires(&zcw->zcw_cv,
25351271e4b1SPrakash Surya &zcw->zcw_lock, wakeup, USEC2NSEC(1),
25361271e4b1SPrakash Surya CALLOUT_FLAG_ABSOLUTE);
25371271e4b1SPrakash Surya
25381271e4b1SPrakash Surya if (timeleft >= 0 || zcw->zcw_done)
25391271e4b1SPrakash Surya continue;
25401271e4b1SPrakash Surya
25411271e4b1SPrakash Surya timedout = B_TRUE;
25421271e4b1SPrakash Surya zil_commit_waiter_timeout(zilog, zcw);
25431271e4b1SPrakash Surya
25441271e4b1SPrakash Surya if (!zcw->zcw_done) {
25451271e4b1SPrakash Surya /*
25461271e4b1SPrakash Surya * If the commit waiter has already been
25471271e4b1SPrakash Surya * marked "done", it's possible for the
25481271e4b1SPrakash Surya * waiter's lwb structure to have already
25491271e4b1SPrakash Surya * been freed. Thus, we can only reliably
25501271e4b1SPrakash Surya * make these assertions if the waiter
25511271e4b1SPrakash Surya * isn't done.
25521271e4b1SPrakash Surya */
25531271e4b1SPrakash Surya ASSERT3P(lwb, ==, zcw->zcw_lwb);
25541271e4b1SPrakash Surya ASSERT3S(lwb->lwb_state, !=, LWB_STATE_OPENED);
25551271e4b1SPrakash Surya }
25561271e4b1SPrakash Surya } else {
25571271e4b1SPrakash Surya /*
25581271e4b1SPrakash Surya * If the lwb isn't open, then it must have already
25591271e4b1SPrakash Surya * been issued. In that case, there's no need to
25601271e4b1SPrakash Surya * use a timeout when waiting for the lwb to
25611271e4b1SPrakash Surya * complete.
25621271e4b1SPrakash Surya *
25631271e4b1SPrakash Surya * Additionally, if the lwb is NULL, the waiter
25641271e4b1SPrakash Surya * will soon be signalled and marked done via
25651271e4b1SPrakash Surya * zil_clean() and zil_itxg_clean(), so no timeout
25661271e4b1SPrakash Surya * is required.
25671271e4b1SPrakash Surya */
25681271e4b1SPrakash Surya
25691271e4b1SPrakash Surya IMPLY(lwb != NULL,
25701271e4b1SPrakash Surya lwb->lwb_state == LWB_STATE_ISSUED ||
2571cab3a55eSPrakash Surya lwb->lwb_state == LWB_STATE_WRITE_DONE ||
2572cab3a55eSPrakash Surya lwb->lwb_state == LWB_STATE_FLUSH_DONE);
25731271e4b1SPrakash Surya cv_wait(&zcw->zcw_cv, &zcw->zcw_lock);
25741271e4b1SPrakash Surya }
25751271e4b1SPrakash Surya }
25761271e4b1SPrakash Surya
25771271e4b1SPrakash Surya mutex_exit(&zcw->zcw_lock);
25781271e4b1SPrakash Surya }
25791271e4b1SPrakash Surya
25801271e4b1SPrakash Surya static zil_commit_waiter_t *
25811271e4b1SPrakash Surya zil_alloc_commit_waiter(void)
25821271e4b1SPrakash Surya {
25831271e4b1SPrakash Surya zil_commit_waiter_t *zcw = kmem_cache_alloc(zil_zcw_cache, KM_SLEEP);
25841271e4b1SPrakash Surya
25851271e4b1SPrakash Surya cv_init(&zcw->zcw_cv, NULL, CV_DEFAULT, NULL);
25861271e4b1SPrakash Surya mutex_init(&zcw->zcw_lock, NULL, MUTEX_DEFAULT, NULL);
25871271e4b1SPrakash Surya list_link_init(&zcw->zcw_node);
25881271e4b1SPrakash Surya zcw->zcw_lwb = NULL;
25891271e4b1SPrakash Surya zcw->zcw_done = B_FALSE;
25901271e4b1SPrakash Surya zcw->zcw_zio_error = 0;
25911271e4b1SPrakash Surya
25921271e4b1SPrakash Surya return (zcw);
25931271e4b1SPrakash Surya }
25941271e4b1SPrakash Surya
25951271e4b1SPrakash Surya static void
25961271e4b1SPrakash Surya zil_free_commit_waiter(zil_commit_waiter_t *zcw)
25971271e4b1SPrakash Surya {
25981271e4b1SPrakash Surya ASSERT(!list_link_active(&zcw->zcw_node));
25991271e4b1SPrakash Surya ASSERT3P(zcw->zcw_lwb, ==, NULL);
26001271e4b1SPrakash Surya ASSERT3B(zcw->zcw_done, ==, B_TRUE);
26011271e4b1SPrakash Surya mutex_destroy(&zcw->zcw_lock);
26021271e4b1SPrakash Surya cv_destroy(&zcw->zcw_cv);
26031271e4b1SPrakash Surya kmem_cache_free(zil_zcw_cache, zcw);
26041271e4b1SPrakash Surya }
26051271e4b1SPrakash Surya
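/*
 * Taken together, a commit waiter moves through the lifecycle sketched
 * below. This simply mirrors what zil_commit_impl() does further down
 * in this file (minus the error fallback), so it is illustrative only
 * and compiled out.
 */
#if 0
static void
example_commit_sequence(zilog_t *zilog)
{
	zil_commit_waiter_t *zcw = zil_alloc_commit_waiter();

	zil_commit_itx_assign(zilog, zcw);	/* link zcw to a commit itx */
	zil_commit_writer(zilog, zcw);		/* commit queued itxs to lwbs */
	zil_commit_waiter(zilog, zcw);		/* block until zcw is "done" */

	zil_free_commit_waiter(zcw);
}
#endif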
26061271e4b1SPrakash Surya /*
26071271e4b1SPrakash Surya * This function is used to create a TX_COMMIT itx and assign it. This
26081271e4b1SPrakash Surya * way, it will be linked into the ZIL's list of synchronous itxs, and
26091271e4b1SPrakash Surya * then later committed to an lwb (or skipped) when
26101271e4b1SPrakash Surya * zil_process_commit_list() is called.
26111271e4b1SPrakash Surya */
26121271e4b1SPrakash Surya static void
26131271e4b1SPrakash Surya zil_commit_itx_assign(zilog_t *zilog, zil_commit_waiter_t *zcw)
26141271e4b1SPrakash Surya {
26151271e4b1SPrakash Surya dmu_tx_t *tx = dmu_tx_create(zilog->zl_os);
26161271e4b1SPrakash Surya VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
26171271e4b1SPrakash Surya
26181271e4b1SPrakash Surya itx_t *itx = zil_itx_create(TX_COMMIT, sizeof (lr_t));
26191271e4b1SPrakash Surya itx->itx_sync = B_TRUE;
26201271e4b1SPrakash Surya itx->itx_private = zcw;
26211271e4b1SPrakash Surya
26221271e4b1SPrakash Surya zil_itx_assign(zilog, itx, tx);
26231271e4b1SPrakash Surya
26241271e4b1SPrakash Surya dmu_tx_commit(tx);
2625b19a79ecSperrin }
2626b19a79ecSperrin
2627b19a79ecSperrin /*
26281271e4b1SPrakash Surya * Commit ZFS Intent Log transactions (itxs) to stable storage.
26291271e4b1SPrakash Surya *
26301271e4b1SPrakash Surya * When writing ZIL transactions to the on-disk representation of the
26311271e4b1SPrakash Surya * ZIL, the itxs are committed to a Log Write Block (lwb). Multiple
26321271e4b1SPrakash Surya * itxs can be committed to a single lwb. Once a lwb is written and
26331271e4b1SPrakash Surya * committed to stable storage (i.e. the lwb is written, and vdevs have
26341271e4b1SPrakash Surya * been flushed), each itx that was committed to that lwb is also
26351271e4b1SPrakash Surya * considered to be committed to stable storage.
26361271e4b1SPrakash Surya *
26371271e4b1SPrakash Surya * When an itx is committed to an lwb, the log record (lr_t) contained
26381271e4b1SPrakash Surya * by the itx is copied into the lwb's zio buffer, and once this buffer
26391271e4b1SPrakash Surya * is written to disk, it becomes an on-disk ZIL block.
26401271e4b1SPrakash Surya *
26411271e4b1SPrakash Surya * As itxs are generated, they're inserted into the ZIL's queue of
26421271e4b1SPrakash Surya * uncommitted itxs. The semantics of zil_commit() are such that it will
26431271e4b1SPrakash Surya * block until all itxs that were in the queue when it was called are
26441271e4b1SPrakash Surya * committed to stable storage.
26451271e4b1SPrakash Surya *
26461271e4b1SPrakash Surya * If "foid" is zero, this means all "synchronous" and "asynchronous"
26471271e4b1SPrakash Surya * itxs, for all objects in the dataset, will be committed to stable
26481271e4b1SPrakash Surya * storage prior to zil_commit() returning. If "foid" is non-zero, all
26491271e4b1SPrakash Surya * "synchronous" itxs for all objects, but only "asynchronous" itxs
26501271e4b1SPrakash Surya * that correspond to the foid passed in, will be committed to stable
26511271e4b1SPrakash Surya * storage prior to zil_commit() returning.
26521271e4b1SPrakash Surya *
26531271e4b1SPrakash Surya * Generally speaking, when zil_commit() is called, the consumer doesn't
26541271e4b1SPrakash Surya * actually care about _all_ of the uncommitted itxs. Instead, they're
26551271e4b1SPrakash Surya * simply trying to wait for a specific itx to be committed to disk,
26561271e4b1SPrakash Surya * but the interface(s) for interacting with the ZIL don't allow such
26571271e4b1SPrakash Surya * fine-grained communication. A better interface would allow a consumer
26581271e4b1SPrakash Surya * to create and assign an itx, and then pass a reference to this itx to
26591271e4b1SPrakash Surya * zil_commit(); such that zil_commit() would return as soon as that
26601271e4b1SPrakash Surya * specific itx was committed to disk (instead of waiting for _all_
26611271e4b1SPrakash Surya * itxs to be committed).
26621271e4b1SPrakash Surya *
26631271e4b1SPrakash Surya * When a thread calls zil_commit(), a special "commit itx" will be
26641271e4b1SPrakash Surya * generated, along with a corresponding "waiter" for this commit itx.
26651271e4b1SPrakash Surya * zil_commit() will wait on this waiter's CV, such that when the waiter
26661271e4b1SPrakash Surya * is marked done, and signalled, zil_commit() will return.
26671271e4b1SPrakash Surya *
26681271e4b1SPrakash Surya * This commit itx is inserted into the queue of uncommitted itxs. This
26691271e4b1SPrakash Surya * provides an easy mechanism for determining which itxs were in the
26701271e4b1SPrakash Surya * queue prior to zil_commit() having been called, and which itxs were
26711271e4b1SPrakash Surya * added after zil_commit() was called.
26725002558fSNeil Perrin *
26731271e4b1SPrakash Surya * The commit itx is special; it doesn't have any on-disk representation.
26741271e4b1SPrakash Surya * When a commit itx is "committed" to an lwb, the waiter associated
26751271e4b1SPrakash Surya * with it is linked onto the lwb's list of waiters. Then, when that lwb
26761271e4b1SPrakash Surya * completes, each waiter on the lwb's list is marked done and signalled
26771271e4b1SPrakash Surya * -- allowing the thread waiting on the waiter to return from zil_commit().
26785002558fSNeil Perrin *
26791271e4b1SPrakash Surya * It's important to point out a few critical factors that allow us
26801271e4b1SPrakash Surya * to make use of the commit itxs, commit waiters, per-lwb lists of
26811271e4b1SPrakash Surya * commit waiters, and zio completion callbacks like we're doing:
26825002558fSNeil Perrin *
26831271e4b1SPrakash Surya * 1. The list of waiters for each lwb is traversed, and each commit
26841271e4b1SPrakash Surya * waiter is marked "done" and signalled, in the zio completion
26851271e4b1SPrakash Surya * callback of the lwb's zio[*].
26865002558fSNeil Perrin *
26871271e4b1SPrakash Surya * * Actually, the waiters are signalled in the zio completion
26881271e4b1SPrakash Surya * callback of the root zio for the DKIOCFLUSHWRITECACHE commands
26891271e4b1SPrakash Surya * that are sent to the vdevs upon completion of the lwb zio.
26901271e4b1SPrakash Surya *
26911271e4b1SPrakash Surya * 2. When the itxs are inserted into the ZIL's queue of uncommitted
26921271e4b1SPrakash Surya * itxs, the order in which they are inserted is preserved[*]; as
26931271e4b1SPrakash Surya * itxs are added to the queue, they are added to the tail of
26941271e4b1SPrakash Surya * in-memory linked lists.
26951271e4b1SPrakash Surya *
26961271e4b1SPrakash Surya * When committing the itxs to lwbs (to be written to disk), they
26971271e4b1SPrakash Surya * are committed in the same order in which the itxs were added to
26981271e4b1SPrakash Surya * the uncommitted queue's linked list(s); i.e. the linked list of
26991271e4b1SPrakash Surya * itxs to commit is traversed from head to tail, and each itx is
27001271e4b1SPrakash Surya * committed to an lwb in that order.
27011271e4b1SPrakash Surya *
27021271e4b1SPrakash Surya * * To clarify:
27031271e4b1SPrakash Surya *
27041271e4b1SPrakash Surya * - the order of "sync" itxs is preserved w.r.t. other
27051271e4b1SPrakash Surya * "sync" itxs, regardless of the corresponding objects.
27061271e4b1SPrakash Surya * - the order of "async" itxs is preserved w.r.t. other
27071271e4b1SPrakash Surya * "async" itxs corresponding to the same object.
27081271e4b1SPrakash Surya * - the order of "async" itxs is *not* preserved w.r.t. other
27091271e4b1SPrakash Surya * "async" itxs corresponding to different objects.
27101271e4b1SPrakash Surya * - the order of "sync" itxs w.r.t. "async" itxs (or vice
27111271e4b1SPrakash Surya * versa) is *not* preserved, even for itxs that correspond
27121271e4b1SPrakash Surya * to the same object.
27131271e4b1SPrakash Surya *
27141271e4b1SPrakash Surya * For more details, see: zil_itx_assign(), zil_async_to_sync(),
27151271e4b1SPrakash Surya * zil_get_commit_list(), and zil_process_commit_list().
27161271e4b1SPrakash Surya *
27171271e4b1SPrakash Surya * 3. The lwbs represent a linked list of blocks on disk. Thus, any
27181271e4b1SPrakash Surya * lwb cannot be considered committed to stable storage, until its
27191271e4b1SPrakash Surya * "previous" lwb is also committed to stable storage. This fact,
27201271e4b1SPrakash Surya * coupled with the fact described above, means that itxs are
27211271e4b1SPrakash Surya * committed in (roughly) the order in which they were generated.
27221271e4b1SPrakash Surya * This is essential because itxs are dependent on prior itxs.
27231271e4b1SPrakash Surya * Thus, we *must not* deem an itx as being committed to stable
27241271e4b1SPrakash Surya * storage, until *all* prior itxs have also been committed to
27251271e4b1SPrakash Surya * stable storage.
27261271e4b1SPrakash Surya *
27271271e4b1SPrakash Surya * To enforce this ordering of lwb zio's, while still leveraging as
27281271e4b1SPrakash Surya * much of the underlying storage performance as possible, we rely
27291271e4b1SPrakash Surya * on two fundamental concepts:
27301271e4b1SPrakash Surya *
27311271e4b1SPrakash Surya * 1. The creation and issuance of lwb zio's is protected by
2732cf07d3daSPrakash Surya * the zilog's "zl_issuer_lock", which ensures only a single
27331271e4b1SPrakash Surya * thread is creating and/or issuing lwb's at a time
27341271e4b1SPrakash Surya * 2. The "previous" lwb is a child of the "current" lwb
27351271e4b1SPrakash Surya * (leveraging the zio parent-child dependency graph)
27361271e4b1SPrakash Surya *
27371271e4b1SPrakash Surya * By relying on this parent-child zio relationship, we can have
27381271e4b1SPrakash Surya * many lwb zio's concurrently issued to the underlying storage,
27391271e4b1SPrakash Surya * but the order in which they complete will be the same order in
27401271e4b1SPrakash Surya * which they were created.
2741b19a79ecSperrin */
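/*
 * For context, the canonical consumer of zil_commit() is the
 * synchronous write path. The sketch below is modeled loosely on
 * zfs_fsync() (simplified here, and compiled out): it blocks until
 * every itx already queued for the object -- plus all "sync" itxs for
 * any object -- has reached stable storage.
 */
#if 0
static void
example_fsync(zfsvfs_t *zfsvfs, znode_t *zp)
{
	/* z_log is the dataset's zilog; z_id is the object's id */
	zil_commit(zfsvfs->z_log, zp->z_id);
}
#endif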
2742b19a79ecSperrin void
27435002558fSNeil Perrin zil_commit(zilog_t *zilog, uint64_t foid)
2744b19a79ecSperrin {
27451271e4b1SPrakash Surya /*
27461271e4b1SPrakash Surya * We should never attempt to call zil_commit on a snapshot for
27471271e4b1SPrakash Surya * a couple of reasons:
27481271e4b1SPrakash Surya *
27491271e4b1SPrakash Surya * 1. A snapshot may never be modified, thus it cannot have any
27501271e4b1SPrakash Surya * in-flight itxs that would have modified the dataset.
27511271e4b1SPrakash Surya *
27521271e4b1SPrakash Surya * 2. By design, when zil_commit() is called, a commit itx will
27531271e4b1SPrakash Surya * be assigned to this zilog; as a result, the zilog will be
27541271e4b1SPrakash Surya * dirtied. We must not dirty the zilog of a snapshot; there
27551271e4b1SPrakash Surya * are checks in the code that enforce this invariant, and will
27561271e4b1SPrakash Surya * cause a panic if it's not upheld.
27571271e4b1SPrakash Surya */
27581271e4b1SPrakash Surya ASSERT3B(dmu_objset_is_snapshot(zilog->zl_os), ==, B_FALSE);
2759b19a79ecSperrin
27605002558fSNeil Perrin if (zilog->zl_sync == ZFS_SYNC_DISABLED)
27615002558fSNeil Perrin return;
2762b19a79ecSperrin
27631271e4b1SPrakash Surya if (!spa_writeable(zilog->zl_spa)) {
27641271e4b1SPrakash Surya /*
27651271e4b1SPrakash Surya * If the SPA is not writable, there should never be any
27661271e4b1SPrakash Surya * pending itxs waiting to be committed to disk. If that
27671271e4b1SPrakash Surya * weren't true, we'd skip writing those itxs out, and
27681271e4b1SPrakash Surya * would break the semantics of zil_commit(); thus, we're
27691271e4b1SPrakash Surya * verifying that truth before we return to the caller.
27701271e4b1SPrakash Surya */
27711271e4b1SPrakash Surya ASSERT(list_is_empty(&zilog->zl_lwb_list));
27721271e4b1SPrakash Surya ASSERT3P(zilog->zl_last_lwb_opened, ==, NULL);
27731271e4b1SPrakash Surya for (int i = 0; i < TXG_SIZE; i++)
27741271e4b1SPrakash Surya ASSERT3P(zilog->zl_itxg[i].itxg_itxs, ==, NULL);
27751271e4b1SPrakash Surya return;
27761271e4b1SPrakash Surya }
2777b19a79ecSperrin
27781271e4b1SPrakash Surya /*
27791271e4b1SPrakash Surya * If the ZIL is suspended, we don't want to dirty it by calling
27801271e4b1SPrakash Surya * zil_commit_itx_assign() below, nor can we write out
27811271e4b1SPrakash Surya * lwbs as would be done in zil_commit_write(). Thus, we
27821271e4b1SPrakash Surya * simply rely on txg_wait_synced() to maintain the necessary
27831271e4b1SPrakash Surya * semantics, and avoid calling those functions altogether.
27841271e4b1SPrakash Surya */
27851271e4b1SPrakash Surya if (zilog->zl_suspend > 0) {
27861271e4b1SPrakash Surya txg_wait_synced(zilog->zl_dmu_pool, 0);
27871271e4b1SPrakash Surya return;
278867bd71c6Sperrin }
2789b24ab676SJeff Bonwick
279094ddd090SPrakash Surya zil_commit_impl(zilog, foid);
279194ddd090SPrakash Surya }
279294ddd090SPrakash Surya
279394ddd090SPrakash Surya void
279494ddd090SPrakash Surya zil_commit_impl(zilog_t *zilog, uint64_t foid)
279594ddd090SPrakash Surya {
27961271e4b1SPrakash Surya /*
27971271e4b1SPrakash Surya * Move the "async" itxs for the specified foid to the "sync"
27981271e4b1SPrakash Surya * queues, such that they will be later committed (or skipped)
27991271e4b1SPrakash Surya * to an lwb when zil_process_commit_list() is called.
28001271e4b1SPrakash Surya *
28011271e4b1SPrakash Surya * Since these "async" itxs must be committed prior to this
28021271e4b1SPrakash Surya * call to zil_commit returning, we must perform this operation
28031271e4b1SPrakash Surya * before we call zil_commit_itx_assign().
28041271e4b1SPrakash Surya */
28051271e4b1SPrakash Surya zil_async_to_sync(zilog, foid);
2806b24ab676SJeff Bonwick
28071271e4b1SPrakash Surya /*
28081271e4b1SPrakash Surya * We allocate a new "waiter" structure which will initially be
28091271e4b1SPrakash Surya * linked to the commit itx using the itx's "itx_private" field.
28101271e4b1SPrakash Surya * Since the commit itx doesn't represent any on-disk state,
28111271e4b1SPrakash Surya * when it's committed to an lwb, rather than copying its
28121271e4b1SPrakash Surya * lr_t into the lwb's buffer, the commit itx's "waiter" will be
28131271e4b1SPrakash Surya * added to the lwb's list of waiters. Then, when the lwb is
28141271e4b1SPrakash Surya * committed to stable storage, each waiter in the lwb's list of
28151271e4b1SPrakash Surya * waiters will be marked "done", and signalled.
28161271e4b1SPrakash Surya *
28171271e4b1SPrakash Surya * We must create the waiter and assign the commit itx prior to
28181271e4b1SPrakash Surya * calling zil_commit_writer(), or else our specific commit itx
28191271e4b1SPrakash Surya * is not guaranteed to be committed to an lwb prior to calling
28201271e4b1SPrakash Surya * zil_commit_waiter().
28211271e4b1SPrakash Surya */
28221271e4b1SPrakash Surya zil_commit_waiter_t *zcw = zil_alloc_commit_waiter();
28231271e4b1SPrakash Surya zil_commit_itx_assign(zilog, zcw);
28241271e4b1SPrakash Surya
28251271e4b1SPrakash Surya zil_commit_writer(zilog, zcw);
28261271e4b1SPrakash Surya zil_commit_waiter(zilog, zcw);
2827b24ab676SJeff Bonwick
28281271e4b1SPrakash Surya if (zcw->zcw_zio_error != 0) {
28291271e4b1SPrakash Surya /*
28301271e4b1SPrakash Surya * If there was an error writing out the ZIL blocks that
28311271e4b1SPrakash Surya * this thread is waiting on, then we fall back to
28321271e4b1SPrakash Surya * relying on spa_sync() to write out the data this
28331271e4b1SPrakash Surya * thread is waiting on. Obviously this has performance
28341271e4b1SPrakash Surya * implications, but the expectation is for this to be
28351271e4b1SPrakash Surya * an exceptional case, and shouldn't occur often.
28361271e4b1SPrakash Surya */
28371271e4b1SPrakash Surya DTRACE_PROBE2(zil__commit__io__error,
28381271e4b1SPrakash Surya zilog_t *, zilog, zil_commit_waiter_t *, zcw);
28391271e4b1SPrakash Surya txg_wait_synced(zilog->zl_dmu_pool, 0);
28401271e4b1SPrakash Surya }
28411271e4b1SPrakash Surya
28421271e4b1SPrakash Surya zil_free_commit_waiter(zcw);
2843b24ab676SJeff Bonwick }
2844b24ab676SJeff Bonwick
2845fa9e4066Sahrens /*
2846fa9e4066Sahrens * Called in syncing context to free committed log blocks and update log header.
2847fa9e4066Sahrens */
2848fa9e4066Sahrens void
2849fa9e4066Sahrens zil_sync(zilog_t *zilog, dmu_tx_t *tx)
2850fa9e4066Sahrens {
2851d80c45e0Sbonwick zil_header_t *zh = zil_header_in_syncing_context(zilog);
2852fa9e4066Sahrens uint64_t txg = dmu_tx_get_txg(tx);
2853fa9e4066Sahrens spa_t *spa = zilog->zl_spa;
2854b24ab676SJeff Bonwick uint64_t *replayed_seq = &zilog->zl_replayed_seq[txg & TXG_MASK];
2855fa9e4066Sahrens lwb_t *lwb;
2856fa9e4066Sahrens
285714843421SMatthew Ahrens /*
285814843421SMatthew Ahrens * We don't zero out zl_destroy_txg, so make sure we don't try
285914843421SMatthew Ahrens * to destroy it twice.
286014843421SMatthew Ahrens */ 286114843421SMatthew Ahrens if (spa_sync_pass(spa) != 1) 286214843421SMatthew Ahrens return; 286314843421SMatthew Ahrens 2864d80c45e0Sbonwick mutex_enter(&zilog->zl_lock); 2865d80c45e0Sbonwick 2866fa9e4066Sahrens ASSERT(zilog->zl_stop_sync == 0); 2867fa9e4066Sahrens 2868b24ab676SJeff Bonwick if (*replayed_seq != 0) { 2869b24ab676SJeff Bonwick ASSERT(zh->zh_replay_seq < *replayed_seq); 2870b24ab676SJeff Bonwick zh->zh_replay_seq = *replayed_seq; 2871b24ab676SJeff Bonwick *replayed_seq = 0; 2872b24ab676SJeff Bonwick } 2873fa9e4066Sahrens 2874fa9e4066Sahrens if (zilog->zl_destroy_txg == txg) { 2875d80c45e0Sbonwick blkptr_t blk = zh->zh_log; 2876d80c45e0Sbonwick 2877d80c45e0Sbonwick ASSERT(list_head(&zilog->zl_lwb_list) == NULL); 2878d80c45e0Sbonwick 2879d80c45e0Sbonwick bzero(zh, sizeof (zil_header_t)); 28801209a471SNeil Perrin bzero(zilog->zl_replayed_seq, sizeof (zilog->zl_replayed_seq)); 2881d80c45e0Sbonwick 2882d80c45e0Sbonwick if (zilog->zl_keep_first) { 2883d80c45e0Sbonwick /* 2884d80c45e0Sbonwick * If this block was part of log chain that couldn't 2885d80c45e0Sbonwick * be claimed because a device was missing during 2886d80c45e0Sbonwick * zil_claim(), but that device later returns, 2887d80c45e0Sbonwick * then this block could erroneously appear valid. 2888d80c45e0Sbonwick * To guard against this, assign a new GUID to the new 2889d80c45e0Sbonwick * log chain so it doesn't matter what blk points to. 2890d80c45e0Sbonwick */ 2891d80c45e0Sbonwick zil_init_log_chain(zilog, &blk); 2892d80c45e0Sbonwick zh->zh_log = blk; 2893d80c45e0Sbonwick } 2894fa9e4066Sahrens } 2895fa9e4066Sahrens 2896e6ca193dSGeorge Wilson while ((lwb = list_head(&zilog->zl_lwb_list)) != NULL) { 2897b19a79ecSperrin zh->zh_log = lwb->lwb_blk; 2898fa9e4066Sahrens if (lwb->lwb_buf != NULL || lwb->lwb_max_txg > txg) 2899fa9e4066Sahrens break; 2900fa9e4066Sahrens list_remove(&zilog->zl_lwb_list, lwb); 29011271e4b1SPrakash Surya zio_free(spa, txg, &lwb->lwb_blk); 29021271e4b1SPrakash Surya zil_free_lwb(zilog, lwb); 2903d63d470bSgw 2904d63d470bSgw /* 2905d63d470bSgw * If we don't have anything left in the lwb list then 2906d63d470bSgw * we've had an allocation failure and we need to zero 2907d63d470bSgw * out the zil_header blkptr so that we don't end 2908d63d470bSgw * up freeing the same block twice. 
2909d63d470bSgw */ 2910d63d470bSgw if (list_head(&zilog->zl_lwb_list) == NULL) 2911d63d470bSgw BP_ZERO(&zh->zh_log); 2912fa9e4066Sahrens } 2913fa9e4066Sahrens mutex_exit(&zilog->zl_lock); 2914fa9e4066Sahrens } 2915fa9e4066Sahrens 29161271e4b1SPrakash Surya /* ARGSUSED */ 29171271e4b1SPrakash Surya static int 29181271e4b1SPrakash Surya zil_lwb_cons(void *vbuf, void *unused, int kmflag) 29191271e4b1SPrakash Surya { 29201271e4b1SPrakash Surya lwb_t *lwb = vbuf; 29211271e4b1SPrakash Surya list_create(&lwb->lwb_waiters, sizeof (zil_commit_waiter_t), 29221271e4b1SPrakash Surya offsetof(zil_commit_waiter_t, zcw_node)); 29231271e4b1SPrakash Surya avl_create(&lwb->lwb_vdev_tree, zil_lwb_vdev_compare, 29241271e4b1SPrakash Surya sizeof (zil_vdev_node_t), offsetof(zil_vdev_node_t, zv_node)); 29251271e4b1SPrakash Surya mutex_init(&lwb->lwb_vdev_lock, NULL, MUTEX_DEFAULT, NULL); 29261271e4b1SPrakash Surya return (0); 29271271e4b1SPrakash Surya } 29281271e4b1SPrakash Surya 29291271e4b1SPrakash Surya /* ARGSUSED */ 29301271e4b1SPrakash Surya static void 29311271e4b1SPrakash Surya zil_lwb_dest(void *vbuf, void *unused) 29321271e4b1SPrakash Surya { 29331271e4b1SPrakash Surya lwb_t *lwb = vbuf; 29341271e4b1SPrakash Surya mutex_destroy(&lwb->lwb_vdev_lock); 29351271e4b1SPrakash Surya avl_destroy(&lwb->lwb_vdev_tree); 29361271e4b1SPrakash Surya list_destroy(&lwb->lwb_waiters); 29371271e4b1SPrakash Surya } 29381271e4b1SPrakash Surya 2939fa9e4066Sahrens void 2940fa9e4066Sahrens zil_init(void) 2941fa9e4066Sahrens { 2942fa9e4066Sahrens zil_lwb_cache = kmem_cache_create("zil_lwb_cache", 29431271e4b1SPrakash Surya sizeof (lwb_t), 0, zil_lwb_cons, zil_lwb_dest, NULL, NULL, NULL, 0); 29441271e4b1SPrakash Surya 29451271e4b1SPrakash Surya zil_zcw_cache = kmem_cache_create("zil_zcw_cache", 29461271e4b1SPrakash Surya sizeof (zil_commit_waiter_t), 0, NULL, NULL, NULL, NULL, NULL, 0); 2947fa9e4066Sahrens } 2948fa9e4066Sahrens 2949fa9e4066Sahrens void 2950fa9e4066Sahrens zil_fini(void) 2951fa9e4066Sahrens { 29521271e4b1SPrakash Surya kmem_cache_destroy(zil_zcw_cache); 2953fa9e4066Sahrens kmem_cache_destroy(zil_lwb_cache); 2954fa9e4066Sahrens } 2955fa9e4066Sahrens 295655da60b9SMark J Musante void 295755da60b9SMark J Musante zil_set_sync(zilog_t *zilog, uint64_t sync) 295855da60b9SMark J Musante { 295955da60b9SMark J Musante zilog->zl_sync = sync; 296055da60b9SMark J Musante } 296155da60b9SMark J Musante 2962e09fa4daSNeil Perrin void 2963e09fa4daSNeil Perrin zil_set_logbias(zilog_t *zilog, uint64_t logbias) 2964e09fa4daSNeil Perrin { 2965e09fa4daSNeil Perrin zilog->zl_logbias = logbias; 2966e09fa4daSNeil Perrin } 2967e09fa4daSNeil Perrin 2968fa9e4066Sahrens zilog_t * 2969fa9e4066Sahrens zil_alloc(objset_t *os, zil_header_t *zh_phys) 2970fa9e4066Sahrens { 2971fa9e4066Sahrens zilog_t *zilog; 2972fa9e4066Sahrens 2973fa9e4066Sahrens zilog = kmem_zalloc(sizeof (zilog_t), KM_SLEEP); 2974fa9e4066Sahrens 2975fa9e4066Sahrens zilog->zl_header = zh_phys; 2976fa9e4066Sahrens zilog->zl_os = os; 2977fa9e4066Sahrens zilog->zl_spa = dmu_objset_spa(os); 2978fa9e4066Sahrens zilog->zl_dmu_pool = dmu_objset_pool(os); 2979d80c45e0Sbonwick zilog->zl_destroy_txg = TXG_INITIAL - 1; 2980e09fa4daSNeil Perrin zilog->zl_logbias = dmu_objset_logbias(os); 298155da60b9SMark J Musante zilog->zl_sync = dmu_objset_syncprop(os); 29821271e4b1SPrakash Surya zilog->zl_dirty_max_txg = 0; 29831271e4b1SPrakash Surya zilog->zl_last_lwb_opened = NULL; 29841271e4b1SPrakash Surya zilog->zl_last_lwb_latency = 0; 2985fa9e4066Sahrens 29865ad82045Snd 
mutex_init(&zilog->zl_lock, NULL, MUTEX_DEFAULT, NULL); 2987cf07d3daSPrakash Surya mutex_init(&zilog->zl_issuer_lock, NULL, MUTEX_DEFAULT, NULL); 29885ad82045Snd 29895002558fSNeil Perrin for (int i = 0; i < TXG_SIZE; i++) { 29905002558fSNeil Perrin mutex_init(&zilog->zl_itxg[i].itxg_lock, NULL, 29915002558fSNeil Perrin MUTEX_DEFAULT, NULL); 29925002558fSNeil Perrin } 2993fa9e4066Sahrens 2994fa9e4066Sahrens list_create(&zilog->zl_lwb_list, sizeof (lwb_t), 2995fa9e4066Sahrens offsetof(lwb_t, lwb_node)); 2996fa9e4066Sahrens 29975002558fSNeil Perrin list_create(&zilog->zl_itx_commit_list, sizeof (itx_t), 29985002558fSNeil Perrin offsetof(itx_t, itx_node)); 29995002558fSNeil Perrin 3000b7b97454Sperrin cv_init(&zilog->zl_cv_suspend, NULL, CV_DEFAULT, NULL); 3001b7b97454Sperrin 3002fa9e4066Sahrens return (zilog); 3003fa9e4066Sahrens } 3004fa9e4066Sahrens 3005fa9e4066Sahrens void 3006fa9e4066Sahrens zil_free(zilog_t *zilog) 3007fa9e4066Sahrens { 3008fa9e4066Sahrens zilog->zl_stop_sync = 1; 3009fa9e4066Sahrens 30103b2aab18SMatthew Ahrens ASSERT0(zilog->zl_suspend); 30113b2aab18SMatthew Ahrens ASSERT0(zilog->zl_suspending); 30123b2aab18SMatthew Ahrens 3013c9ba2a43SEric Schrock ASSERT(list_is_empty(&zilog->zl_lwb_list)); 3014fa9e4066Sahrens list_destroy(&zilog->zl_lwb_list); 3015fa9e4066Sahrens 30165002558fSNeil Perrin ASSERT(list_is_empty(&zilog->zl_itx_commit_list)); 30175002558fSNeil Perrin list_destroy(&zilog->zl_itx_commit_list); 30185002558fSNeil Perrin 30195002558fSNeil Perrin for (int i = 0; i < TXG_SIZE; i++) { 30205002558fSNeil Perrin /* 30215002558fSNeil Perrin * It's possible for an itx to be generated that doesn't dirty 30225002558fSNeil Perrin * a txg (e.g. ztest TX_TRUNCATE). So there's no zil_clean() 30235002558fSNeil Perrin * callback to remove the entry. We remove those here. 30245002558fSNeil Perrin * 30255002558fSNeil Perrin * Also free up the ziltest itxs. 30265002558fSNeil Perrin */ 30275002558fSNeil Perrin if (zilog->zl_itxg[i].itxg_itxs) 30285002558fSNeil Perrin zil_itxg_clean(zilog->zl_itxg[i].itxg_itxs); 30295002558fSNeil Perrin mutex_destroy(&zilog->zl_itxg[i].itxg_lock); 30305002558fSNeil Perrin } 30315002558fSNeil Perrin 3032cf07d3daSPrakash Surya mutex_destroy(&zilog->zl_issuer_lock); 30335ad82045Snd mutex_destroy(&zilog->zl_lock); 3034fa9e4066Sahrens 3035b7b97454Sperrin cv_destroy(&zilog->zl_cv_suspend); 3036b7b97454Sperrin 3037fa9e4066Sahrens kmem_free(zilog, sizeof (zilog_t)); 3038fa9e4066Sahrens } 3039fa9e4066Sahrens 3040fa9e4066Sahrens /* 3041fa9e4066Sahrens * Open an intent log. 3042fa9e4066Sahrens */ 3043fa9e4066Sahrens zilog_t * 3044fa9e4066Sahrens zil_open(objset_t *os, zil_get_data_t *get_data) 3045fa9e4066Sahrens { 3046fa9e4066Sahrens zilog_t *zilog = dmu_objset_zil(os); 3047fa9e4066Sahrens 30481271e4b1SPrakash Surya ASSERT3P(zilog->zl_get_data, ==, NULL); 30491271e4b1SPrakash Surya ASSERT3P(zilog->zl_last_lwb_opened, ==, NULL); 3050c9ba2a43SEric Schrock ASSERT(list_is_empty(&zilog->zl_lwb_list)); 3051c9ba2a43SEric Schrock 3052fa9e4066Sahrens zilog->zl_get_data = get_data; 3053fa9e4066Sahrens 3054fa9e4066Sahrens return (zilog); 3055fa9e4066Sahrens } 3056fa9e4066Sahrens 3057fa9e4066Sahrens /* 3058fa9e4066Sahrens * Close an intent log. 
3059fa9e4066Sahrens */
3060fa9e4066Sahrens void
3061fa9e4066Sahrens zil_close(zilog_t *zilog)
3062fa9e4066Sahrens {
3063c9ba2a43SEric Schrock lwb_t *lwb;
30641271e4b1SPrakash Surya uint64_t txg;
30655002558fSNeil Perrin
30661271e4b1SPrakash Surya if (!dmu_objset_is_snapshot(zilog->zl_os)) {
30671271e4b1SPrakash Surya zil_commit(zilog, 0);
30681271e4b1SPrakash Surya } else {
30691271e4b1SPrakash Surya ASSERT3P(list_tail(&zilog->zl_lwb_list), ==, NULL);
30701271e4b1SPrakash Surya ASSERT0(zilog->zl_dirty_max_txg);
30711271e4b1SPrakash Surya ASSERT3B(zilog_is_dirty(zilog), ==, B_FALSE);
30721271e4b1SPrakash Surya }
30735002558fSNeil Perrin
30745002558fSNeil Perrin mutex_enter(&zilog->zl_lock);
3075c9ba2a43SEric Schrock lwb = list_tail(&zilog->zl_lwb_list);
30761271e4b1SPrakash Surya if (lwb == NULL)
30771271e4b1SPrakash Surya txg = zilog->zl_dirty_max_txg;
30781271e4b1SPrakash Surya else
30791271e4b1SPrakash Surya txg = MAX(zilog->zl_dirty_max_txg, lwb->lwb_max_txg);
30805002558fSNeil Perrin mutex_exit(&zilog->zl_lock);
30811271e4b1SPrakash Surya
30821271e4b1SPrakash Surya /*
30831271e4b1SPrakash Surya * We need to use txg_wait_synced() to wait long enough for the
30841271e4b1SPrakash Surya * ZIL to be clean, and to wait for all pending lwbs to be
30851271e4b1SPrakash Surya * written out.
30861271e4b1SPrakash Surya */
30871271e4b1SPrakash Surya if (txg != 0)
3088d80c45e0Sbonwick txg_wait_synced(zilog->zl_dmu_pool, txg);
308943297f97SGeorge Wilson
309043297f97SGeorge Wilson if (zilog_is_dirty(zilog))
309143297f97SGeorge Wilson zfs_dbgmsg("zil (%p) is dirty, txg %llu", zilog, txg);
3092*54811da5SToomas Soome if (txg < spa_freeze_txg(zilog->zl_spa))
3093*54811da5SToomas Soome VERIFY(!zilog_is_dirty(zilog));
3094d80c45e0Sbonwick
3095fa9e4066Sahrens zilog->zl_get_data = NULL;
3096c9ba2a43SEric Schrock
3097c9ba2a43SEric Schrock /*
30981271e4b1SPrakash Surya * We should have only one lwb left on the list; remove it now.
3099c9ba2a43SEric Schrock */
3100c9ba2a43SEric Schrock mutex_enter(&zilog->zl_lock);
3101c9ba2a43SEric Schrock lwb = list_head(&zilog->zl_lwb_list);
3102c9ba2a43SEric Schrock if (lwb != NULL) {
31031271e4b1SPrakash Surya ASSERT3P(lwb, ==, list_tail(&zilog->zl_lwb_list));
31041271e4b1SPrakash Surya ASSERT3S(lwb->lwb_state, !=, LWB_STATE_ISSUED);
3105c9ba2a43SEric Schrock list_remove(&zilog->zl_lwb_list, lwb);
3106c9ba2a43SEric Schrock zio_buf_free(lwb->lwb_buf, lwb->lwb_sz);
31071271e4b1SPrakash Surya zil_free_lwb(zilog, lwb);
3108c9ba2a43SEric Schrock }
3109c9ba2a43SEric Schrock mutex_exit(&zilog->zl_lock);
3110fa9e4066Sahrens }
3111fa9e4066Sahrens
31123b2aab18SMatthew Ahrens static char *suspend_tag = "zil suspending";
31133b2aab18SMatthew Ahrens
3114fa9e4066Sahrens /*
3115fa9e4066Sahrens * Suspend an intent log. While in suspended mode, we still honor
3116fa9e4066Sahrens * synchronous semantics, but we rely on txg_wait_synced() to do it.
31173b2aab18SMatthew Ahrens * On old version pools, we suspend the log briefly when taking a
31183b2aab18SMatthew Ahrens * snapshot so that it will have an empty intent log.
31193b2aab18SMatthew Ahrens *
31203b2aab18SMatthew Ahrens * Long holds are not really intended to be used the way we do here --
31213b2aab18SMatthew Ahrens * held for such a short time. A concurrent caller of dsl_dataset_long_held()
31223b2aab18SMatthew Ahrens * could fail. Therefore we take pains to only put a long hold if it is
31233b2aab18SMatthew Ahrens * actually necessary. Fortunately, it will only be necessary if the
31243b2aab18SMatthew Ahrens * objset is currently mounted (or the ZVOL equivalent). In that case it
31253b2aab18SMatthew Ahrens * will already have a long hold, so we are not really making things any worse.
31263b2aab18SMatthew Ahrens *
31273b2aab18SMatthew Ahrens * Ideally, we would locate the existing long-holder (i.e. the zfsvfs_t or
31283b2aab18SMatthew Ahrens * zvol_state_t), and use their mechanism to prevent their hold from being
31293b2aab18SMatthew Ahrens * dropped (e.g. VFS_HOLD()). However, that would be even more pain for
31303b2aab18SMatthew Ahrens * very little gain.
31313b2aab18SMatthew Ahrens *
31323b2aab18SMatthew Ahrens * If cookiep == NULL, this does both the suspend & resume.
31333b2aab18SMatthew Ahrens * Otherwise, it returns with the dataset "long held", and the cookie
31343b2aab18SMatthew Ahrens * should be passed into zil_resume().
31353b2aab18SMatthew Ahrens */
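/*
 * Typical usage of the cookie-based form described above (a sketch;
 * the wrapper function and its error handling are hypothetical, and
 * the block is compiled out):
 */
#if 0
static int
example_empty_zil_op(const char *osname)
{
	void *cookie;
	int error;

	error = zil_suspend(osname, &cookie);	/* dataset is long held */
	if (error != 0)
		return (error);

	/* ... the ZIL is now empty, and remains suspended ... */

	zil_resume(cookie);			/* drops the long hold */
	return (0);
}
#endif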
31363b2aab18SMatthew Ahrens int
31373b2aab18SMatthew Ahrens zil_suspend(const char *osname, void **cookiep)
3138fa9e4066Sahrens {
31393b2aab18SMatthew Ahrens objset_t *os;
31403b2aab18SMatthew Ahrens zilog_t *zilog;
31413b2aab18SMatthew Ahrens const zil_header_t *zh;
31423b2aab18SMatthew Ahrens int error;
31433b2aab18SMatthew Ahrens
31443b2aab18SMatthew Ahrens error = dmu_objset_hold(osname, suspend_tag, &os);
31453b2aab18SMatthew Ahrens if (error != 0)
31463b2aab18SMatthew Ahrens return (error);
31473b2aab18SMatthew Ahrens zilog = dmu_objset_zil(os);
3148fa9e4066Sahrens
3149fa9e4066Sahrens mutex_enter(&zilog->zl_lock);
31503b2aab18SMatthew Ahrens zh = zilog->zl_header;
31513b2aab18SMatthew Ahrens
31523589c4f0SNeil Perrin if (zh->zh_flags & ZIL_REPLAY_NEEDED) { /* unplayed log */
3153fa9e4066Sahrens mutex_exit(&zilog->zl_lock);
31543b2aab18SMatthew Ahrens dmu_objset_rele(os, suspend_tag);
3155be6fd75aSMatthew Ahrens return (SET_ERROR(EBUSY));
3156fa9e4066Sahrens }
31573b2aab18SMatthew Ahrens
31583b2aab18SMatthew Ahrens /*
31593b2aab18SMatthew Ahrens * Don't put a long hold in the cases where we can avoid it. This
31603b2aab18SMatthew Ahrens * is when there is no cookie so we are doing a suspend & resume
31613b2aab18SMatthew Ahrens * (i.e. called from zil_vdev_offline()), and there's nothing to do
31623b2aab18SMatthew Ahrens * for the suspend because it's already suspended, or there's no ZIL.
31633b2aab18SMatthew Ahrens */
31643b2aab18SMatthew Ahrens if (cookiep == NULL && !zilog->zl_suspending &&
31653b2aab18SMatthew Ahrens (zilog->zl_suspend > 0 || BP_IS_HOLE(&zh->zh_log))) {
31663b2aab18SMatthew Ahrens mutex_exit(&zilog->zl_lock);
31673b2aab18SMatthew Ahrens dmu_objset_rele(os, suspend_tag);
31683b2aab18SMatthew Ahrens return (0);
31693b2aab18SMatthew Ahrens }
31703b2aab18SMatthew Ahrens
31713b2aab18SMatthew Ahrens dsl_dataset_long_hold(dmu_objset_ds(os), suspend_tag);
31723b2aab18SMatthew Ahrens dsl_pool_rele(dmu_objset_pool(os), suspend_tag);
31733b2aab18SMatthew Ahrens
31743b2aab18SMatthew Ahrens zilog->zl_suspend++;
31753b2aab18SMatthew Ahrens
31763b2aab18SMatthew Ahrens if (zilog->zl_suspend > 1) {
3177d80c45e0Sbonwick /*
31783b2aab18SMatthew Ahrens * Someone else is already suspending it.
3179d80c45e0Sbonwick * Just wait for them to finish.
	dsl_dataset_long_hold(dmu_objset_ds(os), suspend_tag);
	dsl_pool_rele(dmu_objset_pool(os), suspend_tag);

	zilog->zl_suspend++;

	if (zilog->zl_suspend > 1) {
		/*
		 * Someone else is already suspending it.
		 * Just wait for them to finish.
		 */
		while (zilog->zl_suspending)
			cv_wait(&zilog->zl_cv_suspend, &zilog->zl_lock);
		mutex_exit(&zilog->zl_lock);

		if (cookiep == NULL)
			zil_resume(os);
		else
			*cookiep = os;
		return (0);
	}

	/*
	 * If there is no pointer to an on-disk block, this ZIL must not
	 * be active (e.g. filesystem not mounted), so there's nothing
	 * to clean up.
	 */
	if (BP_IS_HOLE(&zh->zh_log)) {
		ASSERT(cookiep != NULL);	/* fast path already handled */

		*cookiep = os;
		mutex_exit(&zilog->zl_lock);
		return (0);
	}

	zilog->zl_suspending = B_TRUE;
	mutex_exit(&zilog->zl_lock);

	/*
	 * We need to use zil_commit_impl() to ensure we wait for all
	 * LWB_STATE_OPENED and LWB_STATE_ISSUED lwbs to be committed
	 * to disk before proceeding.  If we used zil_commit() instead, it
	 * would just call txg_wait_synced(), because zl_suspend is set.
	 * txg_wait_synced() doesn't wait for these lwbs to be
	 * LWB_STATE_FLUSH_DONE before returning.
	 */
	zil_commit_impl(zilog, 0);

	/*
	 * Now that we've ensured all lwbs are LWB_STATE_FLUSH_DONE, we
	 * use txg_wait_synced() to ensure the data from the zilog has
	 * migrated to the main pool before calling zil_destroy().
	 */
	txg_wait_synced(zilog->zl_dmu_pool, 0);

	zil_destroy(zilog, B_FALSE);

	mutex_enter(&zilog->zl_lock);
	zilog->zl_suspending = B_FALSE;
	cv_broadcast(&zilog->zl_cv_suspend);
	mutex_exit(&zilog->zl_lock);

	if (cookiep == NULL)
		zil_resume(os);
	else
		*cookiep = os;
	return (0);
}
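/*
 * Illustrative only -- the two calling conventions described in the
 * block comment above zil_suspend() (the dataset name is hypothetical):
 *
 *	(void) zil_suspend("tank/fs", NULL);	(suspend & resume in one)
 *
 *	void *cookie;
 *	if (zil_suspend("tank/fs", &cookie) == 0) {
 *		...	(the ZIL stays empty and suspended)
 *		zil_resume(cookie);
 *	}
 */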
void
zil_resume(void *cookie)
{
	objset_t *os = cookie;
	zilog_t *zilog = dmu_objset_zil(os);

	mutex_enter(&zilog->zl_lock);
	ASSERT(zilog->zl_suspend != 0);
	zilog->zl_suspend--;
	mutex_exit(&zilog->zl_lock);
	dsl_dataset_long_rele(dmu_objset_ds(os), suspend_tag);
	dsl_dataset_rele(dmu_objset_ds(os), suspend_tag);
}

typedef struct zil_replay_arg {
	zil_replay_func_t **zr_replay;
	void *zr_arg;
	boolean_t zr_byteswap;
	char *zr_lr;
} zil_replay_arg_t;

static int
zil_replay_error(zilog_t *zilog, lr_t *lr, int error)
{
	char name[ZFS_MAX_DATASET_NAME_LEN];

	zilog->zl_replaying_seq--;	/* didn't actually replay this one */

	dmu_objset_name(zilog->zl_os, name);

	cmn_err(CE_WARN, "ZFS replay transaction error %d, "
	    "dataset %s, seq 0x%llx, txtype %llu %s\n", error, name,
	    (u_longlong_t)lr->lrc_seq,
	    (u_longlong_t)(lr->lrc_txtype & ~TX_CI),
	    (lr->lrc_txtype & TX_CI) ? "CI" : "");

	return (error);
}
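/*
 * For reference, the cmn_err() call above renders along these lines
 * (every value here is hypothetical):
 *
 *	ZFS replay transaction error 5, dataset tank/fs, seq 0x1c,
 *	    txtype 9 CI
 *
 * The trailing "CI" appears only when the record carries the
 * case-insensitive bit.
 */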
"CI" : ""); 3275b24ab676SJeff Bonwick 3276b24ab676SJeff Bonwick return (error); 3277b24ab676SJeff Bonwick } 3278b24ab676SJeff Bonwick 3279b24ab676SJeff Bonwick static int 3280fa9e4066Sahrens zil_replay_log_record(zilog_t *zilog, lr_t *lr, void *zra, uint64_t claim_txg) 3281fa9e4066Sahrens { 3282fa9e4066Sahrens zil_replay_arg_t *zr = zra; 3283d80c45e0Sbonwick const zil_header_t *zh = zilog->zl_header; 3284fa9e4066Sahrens uint64_t reclen = lr->lrc_reclen; 3285fa9e4066Sahrens uint64_t txtype = lr->lrc_txtype; 3286b24ab676SJeff Bonwick int error = 0; 3287fa9e4066Sahrens 3288b24ab676SJeff Bonwick zilog->zl_replaying_seq = lr->lrc_seq; 3289fa9e4066Sahrens 3290fa9e4066Sahrens if (lr->lrc_seq <= zh->zh_replay_seq) /* already replayed */ 3291b24ab676SJeff Bonwick return (0); 3292b24ab676SJeff Bonwick 3293b24ab676SJeff Bonwick if (lr->lrc_txg < claim_txg) /* already committed */ 3294b24ab676SJeff Bonwick return (0); 3295fa9e4066Sahrens 3296da6c28aaSamw /* Strip case-insensitive bit, still present in log record */ 3297da6c28aaSamw txtype &= ~TX_CI; 3298da6c28aaSamw 3299b24ab676SJeff Bonwick if (txtype == 0 || txtype >= TX_MAX_TYPE) 3300b24ab676SJeff Bonwick return (zil_replay_error(zilog, lr, EINVAL)); 3301b24ab676SJeff Bonwick 3302b24ab676SJeff Bonwick /* 3303b24ab676SJeff Bonwick * If this record type can be logged out of order, the object 3304b24ab676SJeff Bonwick * (lr_foid) may no longer exist. That's legitimate, not an error. 3305b24ab676SJeff Bonwick */ 3306b24ab676SJeff Bonwick if (TX_OOO(txtype)) { 3307b24ab676SJeff Bonwick error = dmu_object_info(zilog->zl_os, 3308*54811da5SToomas Soome LR_FOID_GET_OBJ(((lr_ooo_t *)lr)->lr_foid), NULL); 3309b24ab676SJeff Bonwick if (error == ENOENT || error == EEXIST) 3310b24ab676SJeff Bonwick return (0); 33111209a471SNeil Perrin } 33121209a471SNeil Perrin 3313fa9e4066Sahrens /* 3314fa9e4066Sahrens * Make a copy of the data so we can revise and extend it. 3315fa9e4066Sahrens */ 3316b24ab676SJeff Bonwick bcopy(lr, zr->zr_lr, reclen); 3317b24ab676SJeff Bonwick 3318b24ab676SJeff Bonwick /* 3319b24ab676SJeff Bonwick * If this is a TX_WRITE with a blkptr, suck in the data. 3320b24ab676SJeff Bonwick */ 3321b24ab676SJeff Bonwick if (txtype == TX_WRITE && reclen == sizeof (lr_write_t)) { 3322b24ab676SJeff Bonwick error = zil_read_log_data(zilog, (lr_write_t *)lr, 3323b24ab676SJeff Bonwick zr->zr_lr + reclen); 33243b2aab18SMatthew Ahrens if (error != 0) 3325b24ab676SJeff Bonwick return (zil_replay_error(zilog, lr, error)); 3326b24ab676SJeff Bonwick } 3327fa9e4066Sahrens 3328fa9e4066Sahrens /* 3329fa9e4066Sahrens * The log block containing this lr may have been byteswapped 3330fa9e4066Sahrens * so that we can easily examine common fields like lrc_txtype. 3331b24ab676SJeff Bonwick * However, the log is a mix of different record types, and only the 3332fa9e4066Sahrens * replay vectors know how to byteswap their records. Therefore, if 3333fa9e4066Sahrens * the lr was byteswapped, undo it before invoking the replay vector. 3334fa9e4066Sahrens */ 3335fa9e4066Sahrens if (zr->zr_byteswap) 3336b24ab676SJeff Bonwick byteswap_uint64_array(zr->zr_lr, reclen); 3337fa9e4066Sahrens 3338fa9e4066Sahrens /* 3339fa9e4066Sahrens * We must now do two things atomically: replay this log record, 33401209a471SNeil Perrin * and update the log header sequence number to reflect the fact that 33411209a471SNeil Perrin * we did so. At the end of each replay function the sequence number 33421209a471SNeil Perrin * is updated if we are in replay mode. 
/* ARGSUSED */
static int
zil_incr_blks(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg)
{
	zilog->zl_replay_blks++;

	return (0);
}

/*
 * If this dataset has a non-empty intent log, replay it and destroy it.
 */
void
zil_replay(objset_t *os, void *arg, zil_replay_func_t *replay_func[TX_MAX_TYPE])
{
	zilog_t *zilog = dmu_objset_zil(os);
	const zil_header_t *zh = zilog->zl_header;
	zil_replay_arg_t zr;

	if ((zh->zh_flags & ZIL_REPLAY_NEEDED) == 0) {
		zil_destroy(zilog, B_TRUE);
		return;
	}

	zr.zr_replay = replay_func;
	zr.zr_arg = arg;
	zr.zr_byteswap = BP_SHOULD_BYTESWAP(&zh->zh_log);
	zr.zr_lr = kmem_alloc(2 * SPA_MAXBLOCKSIZE, KM_SLEEP);

	/*
	 * Wait for in-progress removes to sync before starting replay.
	 */
	txg_wait_synced(zilog->zl_dmu_pool, 0);

	zilog->zl_replay = B_TRUE;
	zilog->zl_replay_time = ddi_get_lbolt();
	ASSERT(zilog->zl_replay_blks == 0);
	(void) zil_parse(zilog, zil_incr_blks, zil_replay_log_record, &zr,
	    zh->zh_claim_txg);
	kmem_free(zr.zr_lr, 2 * SPA_MAXBLOCKSIZE);

	zil_destroy(zilog, B_FALSE);
	txg_wait_synced(zilog->zl_dmu_pool, zilog->zl_destroy_txg);
	zilog->zl_replay = B_FALSE;
}
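/*
 * Illustrative only -- zil_replay() is invoked when a dataset is first
 * brought online after a crash; for example, the filesystem mount path
 * does roughly this (using the zfs_replay_vector table from
 * zfs_replay.c):
 *
 *	zil_replay(zfsvfs->z_os, zfsvfs, zfs_replay_vector);
 */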
boolean_t
zil_replaying(zilog_t *zilog, dmu_tx_t *tx)
{
	if (zilog->zl_sync == ZFS_SYNC_DISABLED)
		return (B_TRUE);

	if (zilog->zl_replay) {
		dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx);
		zilog->zl_replayed_seq[dmu_tx_get_txg(tx) & TXG_MASK] =
		    zilog->zl_replaying_seq;
		return (B_TRUE);
	}

	return (B_FALSE);
}

/* ARGSUSED */
int
zil_reset(const char *osname, void *arg)
{
	int error;

	error = zil_suspend(osname, NULL);
	if (error != 0)
		return (SET_ERROR(EEXIST));
	return (0);
}
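/*
 * Illustrative only -- the logging paths (e.g. the zfs_log_*() routines)
 * call zil_replaying() so that operations being replayed are not logged
 * a second time; the call also records the replayed sequence number, as
 * described above.  A sketch of the pattern (my_log_op is hypothetical):
 *
 *	void
 *	my_log_op(zilog_t *zilog, dmu_tx_t *tx, ...)
 *	{
 *		if (zil_replaying(zilog, tx))
 *			return;	(replaying -- don't re-log)
 *		...	(build and assign the itx)
 *	}
 */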