1fa9e4066Sahrens /* 2fa9e4066Sahrens * CDDL HEADER START 3fa9e4066Sahrens * 4fa9e4066Sahrens * The contents of this file are subject to the terms of the 5fe9cf88cSperrin * Common Development and Distribution License (the "License"). 6fe9cf88cSperrin * You may not use this file except in compliance with the License. 7fa9e4066Sahrens * 8fa9e4066Sahrens * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9fa9e4066Sahrens * or http://www.opensolaris.org/os/licensing. 10fa9e4066Sahrens * See the License for the specific language governing permissions 11fa9e4066Sahrens * and limitations under the License. 12fa9e4066Sahrens * 13fa9e4066Sahrens * When distributing Covered Code, include this CDDL HEADER in each 14fa9e4066Sahrens * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15fa9e4066Sahrens * If applicable, add the following below this CDDL HEADER, with the 16fa9e4066Sahrens * fields enclosed by brackets "[]" replaced with your own identifying 17fa9e4066Sahrens * information: Portions Copyright [yyyy] [name of copyright owner] 18fa9e4066Sahrens * 19fa9e4066Sahrens * CDDL HEADER END 20fa9e4066Sahrens */ 21fa9e4066Sahrens /* 2255da60b9SMark J Musante * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 
23fa9e4066Sahrens */ 24fa9e4066Sahrens 2555da60b9SMark J Musante /* Portions Copyright 2010 Robert Milkowski */ 2655da60b9SMark J Musante 27fa9e4066Sahrens #include <sys/zfs_context.h> 28fa9e4066Sahrens #include <sys/spa.h> 29fa9e4066Sahrens #include <sys/dmu.h> 30fa9e4066Sahrens #include <sys/zap.h> 31fa9e4066Sahrens #include <sys/arc.h> 32fa9e4066Sahrens #include <sys/stat.h> 33fa9e4066Sahrens #include <sys/resource.h> 34fa9e4066Sahrens #include <sys/zil.h> 35fa9e4066Sahrens #include <sys/zil_impl.h> 36fa9e4066Sahrens #include <sys/dsl_dataset.h> 37*4b964adaSGeorge Wilson #include <sys/vdev_impl.h> 38d63d470bSgw #include <sys/dmu_tx.h> 393f9d6ad7SLin Ling #include <sys/dsl_pool.h> 40fa9e4066Sahrens 41fa9e4066Sahrens /* 42fa9e4066Sahrens * The zfs intent log (ZIL) saves transaction records of system calls 43fa9e4066Sahrens * that change the file system in memory with enough information 44fa9e4066Sahrens * to be able to replay them. These are stored in memory until 45fa9e4066Sahrens * either the DMU transaction group (txg) commits them to the stable pool 46fa9e4066Sahrens * and they can be discarded, or they are flushed to the stable log 47fa9e4066Sahrens * (also in the pool) due to a fsync, O_DSYNC or other synchronous 48fa9e4066Sahrens * requirement. In the event of a panic or power fail then those log 49fa9e4066Sahrens * records (transactions) are replayed. 50fa9e4066Sahrens * 51fa9e4066Sahrens * There is one ZIL per file system. Its on-disk (pool) format consists 52fa9e4066Sahrens * of 3 parts: 53fa9e4066Sahrens * 54fa9e4066Sahrens * - ZIL header 55fa9e4066Sahrens * - ZIL blocks 56fa9e4066Sahrens * - ZIL records 57fa9e4066Sahrens * 58fa9e4066Sahrens * A log record holds a system call transaction. Log blocks can 59fa9e4066Sahrens * hold many log records and the blocks are chained together. 60fa9e4066Sahrens * Each ZIL block contains a block pointer (blkptr_t) to the next 61fa9e4066Sahrens * ZIL block in the chain. 
The ZIL header points to the first 62fa9e4066Sahrens * block in the chain. Note there is not a fixed place in the pool 63fa9e4066Sahrens * to hold blocks. They are dynamically allocated and freed as 64fa9e4066Sahrens * needed from the blocks available. Figure X shows the ZIL structure: 65fa9e4066Sahrens */ 66fa9e4066Sahrens 67fa9e4066Sahrens /* 68416e0cd8Sek * This global ZIL switch affects all pools 69fa9e4066Sahrens */ 7055da60b9SMark J Musante int zil_replay_disable = 0; /* disable intent logging replay */ 71416e0cd8Sek 72416e0cd8Sek /* 73416e0cd8Sek * Tunable parameter for debugging or performance analysis. Setting 74416e0cd8Sek * zfs_nocacheflush will cause corruption on power loss if a volatile 75416e0cd8Sek * out-of-order write cache is enabled. 76416e0cd8Sek */ 77416e0cd8Sek boolean_t zfs_nocacheflush = B_FALSE; 78fa9e4066Sahrens 79fa9e4066Sahrens static kmem_cache_t *zil_lwb_cache; 80fa9e4066Sahrens 8191de656bSNeil Perrin static void zil_async_to_sync(zilog_t *zilog, uint64_t foid); 828f18d1faSGeorge Wilson 836e1f5caaSNeil Perrin #define LWB_EMPTY(lwb) ((BP_GET_LSIZE(&lwb->lwb_blk) - \ 846e1f5caaSNeil Perrin sizeof (zil_chain_t)) == (lwb->lwb_sz - lwb->lwb_nused)) 856e1f5caaSNeil Perrin 866e1f5caaSNeil Perrin 875002558fSNeil Perrin /* 885002558fSNeil Perrin * ziltest is by and large an ugly hack, but very useful in 895002558fSNeil Perrin * checking replay without tedious work. 905002558fSNeil Perrin * When running ziltest we want to keep all itx's and so maintain 915002558fSNeil Perrin * a single list in the zl_itxg[] that uses a high txg: ZILTEST_TXG 925002558fSNeil Perrin * We subtract TXG_CONCURRENT_STATES to allow for common code. 
935002558fSNeil Perrin */ 945002558fSNeil Perrin #define ZILTEST_TXG (UINT64_MAX - TXG_CONCURRENT_STATES) 955002558fSNeil Perrin 96fa9e4066Sahrens static int 97b24ab676SJeff Bonwick zil_bp_compare(const void *x1, const void *x2) 98fa9e4066Sahrens { 99b24ab676SJeff Bonwick const dva_t *dva1 = &((zil_bp_node_t *)x1)->zn_dva; 100b24ab676SJeff Bonwick const dva_t *dva2 = &((zil_bp_node_t *)x2)->zn_dva; 101fa9e4066Sahrens 102fa9e4066Sahrens if (DVA_GET_VDEV(dva1) < DVA_GET_VDEV(dva2)) 103fa9e4066Sahrens return (-1); 104fa9e4066Sahrens if (DVA_GET_VDEV(dva1) > DVA_GET_VDEV(dva2)) 105fa9e4066Sahrens return (1); 106fa9e4066Sahrens 107fa9e4066Sahrens if (DVA_GET_OFFSET(dva1) < DVA_GET_OFFSET(dva2)) 108fa9e4066Sahrens return (-1); 109fa9e4066Sahrens if (DVA_GET_OFFSET(dva1) > DVA_GET_OFFSET(dva2)) 110fa9e4066Sahrens return (1); 111fa9e4066Sahrens 112fa9e4066Sahrens return (0); 113fa9e4066Sahrens } 114fa9e4066Sahrens 115fa9e4066Sahrens static void 116b24ab676SJeff Bonwick zil_bp_tree_init(zilog_t *zilog) 117fa9e4066Sahrens { 118b24ab676SJeff Bonwick avl_create(&zilog->zl_bp_tree, zil_bp_compare, 119b24ab676SJeff Bonwick sizeof (zil_bp_node_t), offsetof(zil_bp_node_t, zn_node)); 120fa9e4066Sahrens } 121fa9e4066Sahrens 122fa9e4066Sahrens static void 123b24ab676SJeff Bonwick zil_bp_tree_fini(zilog_t *zilog) 124fa9e4066Sahrens { 125b24ab676SJeff Bonwick avl_tree_t *t = &zilog->zl_bp_tree; 126b24ab676SJeff Bonwick zil_bp_node_t *zn; 127fa9e4066Sahrens void *cookie = NULL; 128fa9e4066Sahrens 129fa9e4066Sahrens while ((zn = avl_destroy_nodes(t, &cookie)) != NULL) 130b24ab676SJeff Bonwick kmem_free(zn, sizeof (zil_bp_node_t)); 131fa9e4066Sahrens 132fa9e4066Sahrens avl_destroy(t); 133fa9e4066Sahrens } 134fa9e4066Sahrens 135b24ab676SJeff Bonwick int 136b24ab676SJeff Bonwick zil_bp_tree_add(zilog_t *zilog, const blkptr_t *bp) 137fa9e4066Sahrens { 138b24ab676SJeff Bonwick avl_tree_t *t = &zilog->zl_bp_tree; 139b24ab676SJeff Bonwick const dva_t *dva = BP_IDENTITY(bp); 140b24ab676SJeff 
Bonwick zil_bp_node_t *zn; 141fa9e4066Sahrens avl_index_t where; 142fa9e4066Sahrens 143fa9e4066Sahrens if (avl_find(t, dva, &where) != NULL) 144fa9e4066Sahrens return (EEXIST); 145fa9e4066Sahrens 146b24ab676SJeff Bonwick zn = kmem_alloc(sizeof (zil_bp_node_t), KM_SLEEP); 147fa9e4066Sahrens zn->zn_dva = *dva; 148fa9e4066Sahrens avl_insert(t, zn, where); 149fa9e4066Sahrens 150fa9e4066Sahrens return (0); 151fa9e4066Sahrens } 152fa9e4066Sahrens 153d80c45e0Sbonwick static zil_header_t * 154d80c45e0Sbonwick zil_header_in_syncing_context(zilog_t *zilog) 155d80c45e0Sbonwick { 156d80c45e0Sbonwick return ((zil_header_t *)zilog->zl_header); 157d80c45e0Sbonwick } 158d80c45e0Sbonwick 159d80c45e0Sbonwick static void 160d80c45e0Sbonwick zil_init_log_chain(zilog_t *zilog, blkptr_t *bp) 161d80c45e0Sbonwick { 162d80c45e0Sbonwick zio_cksum_t *zc = &bp->blk_cksum; 163d80c45e0Sbonwick 164d80c45e0Sbonwick zc->zc_word[ZIL_ZC_GUID_0] = spa_get_random(-1ULL); 165d80c45e0Sbonwick zc->zc_word[ZIL_ZC_GUID_1] = spa_get_random(-1ULL); 166d80c45e0Sbonwick zc->zc_word[ZIL_ZC_OBJSET] = dmu_objset_id(zilog->zl_os); 167d80c45e0Sbonwick zc->zc_word[ZIL_ZC_SEQ] = 1ULL; 168d80c45e0Sbonwick } 169d80c45e0Sbonwick 170fa9e4066Sahrens /* 171b24ab676SJeff Bonwick * Read a log block and make sure it's valid. 
172fa9e4066Sahrens */ 173fa9e4066Sahrens static int 1746e1f5caaSNeil Perrin zil_read_log_block(zilog_t *zilog, const blkptr_t *bp, blkptr_t *nbp, void *dst, 1756e1f5caaSNeil Perrin char **end) 176fa9e4066Sahrens { 177b24ab676SJeff Bonwick enum zio_flag zio_flags = ZIO_FLAG_CANFAIL; 17813506d1eSmaybee uint32_t aflags = ARC_WAIT; 179b24ab676SJeff Bonwick arc_buf_t *abuf = NULL; 180b24ab676SJeff Bonwick zbookmark_t zb; 181fa9e4066Sahrens int error; 182fa9e4066Sahrens 183b24ab676SJeff Bonwick if (zilog->zl_header->zh_claim_txg == 0) 184b24ab676SJeff Bonwick zio_flags |= ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SCRUB; 185ea8dc4b6Seschrock 186b24ab676SJeff Bonwick if (!(zilog->zl_header->zh_flags & ZIL_CLAIM_LR_SEQ_VALID)) 187b24ab676SJeff Bonwick zio_flags |= ZIO_FLAG_SPECULATIVE; 188fa9e4066Sahrens 189b24ab676SJeff Bonwick SET_BOOKMARK(&zb, bp->blk_cksum.zc_word[ZIL_ZC_OBJSET], 190b24ab676SJeff Bonwick ZB_ZIL_OBJECT, ZB_ZIL_LEVEL, bp->blk_cksum.zc_word[ZIL_ZC_SEQ]); 191b24ab676SJeff Bonwick 1923f9d6ad7SLin Ling error = dsl_read_nolock(NULL, zilog->zl_spa, bp, arc_getbuf_func, &abuf, 193b24ab676SJeff Bonwick ZIO_PRIORITY_SYNC_READ, zio_flags, &aflags, &zb); 194fa9e4066Sahrens 195d80c45e0Sbonwick if (error == 0) { 196d80c45e0Sbonwick zio_cksum_t cksum = bp->blk_cksum; 197fa9e4066Sahrens 198d80c45e0Sbonwick /* 199f5e6e722SNeil Perrin * Validate the checksummed log block. 200f5e6e722SNeil Perrin * 201d80c45e0Sbonwick * Sequence numbers should be... sequential. The checksum 202d80c45e0Sbonwick * verifier for the next block should be bp's checksum plus 1. 203f5e6e722SNeil Perrin * 204f5e6e722SNeil Perrin * Also check the log chain linkage and size used. 
205d80c45e0Sbonwick */ 206d80c45e0Sbonwick cksum.zc_word[ZIL_ZC_SEQ]++; 207d80c45e0Sbonwick 2086e1f5caaSNeil Perrin if (BP_GET_CHECKSUM(bp) == ZIO_CHECKSUM_ZILOG2) { 2096e1f5caaSNeil Perrin zil_chain_t *zilc = abuf->b_data; 2106e1f5caaSNeil Perrin char *lr = (char *)(zilc + 1); 2116e1f5caaSNeil Perrin uint64_t len = zilc->zc_nused - sizeof (zil_chain_t); 2126e1f5caaSNeil Perrin 2136e1f5caaSNeil Perrin if (bcmp(&cksum, &zilc->zc_next_blk.blk_cksum, 2146e1f5caaSNeil Perrin sizeof (cksum)) || BP_IS_HOLE(&zilc->zc_next_blk)) { 2156e1f5caaSNeil Perrin error = ECKSUM; 2166e1f5caaSNeil Perrin } else { 2176e1f5caaSNeil Perrin bcopy(lr, dst, len); 2186e1f5caaSNeil Perrin *end = (char *)dst + len; 2196e1f5caaSNeil Perrin *nbp = zilc->zc_next_blk; 2206e1f5caaSNeil Perrin } 2216e1f5caaSNeil Perrin } else { 2226e1f5caaSNeil Perrin char *lr = abuf->b_data; 2236e1f5caaSNeil Perrin uint64_t size = BP_GET_LSIZE(bp); 2246e1f5caaSNeil Perrin zil_chain_t *zilc = (zil_chain_t *)(lr + size) - 1; 2256e1f5caaSNeil Perrin 2266e1f5caaSNeil Perrin if (bcmp(&cksum, &zilc->zc_next_blk.blk_cksum, 2276e1f5caaSNeil Perrin sizeof (cksum)) || BP_IS_HOLE(&zilc->zc_next_blk) || 2286e1f5caaSNeil Perrin (zilc->zc_nused > (size - sizeof (*zilc)))) { 2296e1f5caaSNeil Perrin error = ECKSUM; 2306e1f5caaSNeil Perrin } else { 2316e1f5caaSNeil Perrin bcopy(lr, dst, zilc->zc_nused); 2326e1f5caaSNeil Perrin *end = (char *)dst + zilc->zc_nused; 2336e1f5caaSNeil Perrin *nbp = zilc->zc_next_blk; 2346e1f5caaSNeil Perrin } 2356e1f5caaSNeil Perrin } 236fa9e4066Sahrens 237b24ab676SJeff Bonwick VERIFY(arc_buf_remove_ref(abuf, &abuf) == 1); 238fa9e4066Sahrens } 239fa9e4066Sahrens 240b24ab676SJeff Bonwick return (error); 241b24ab676SJeff Bonwick } 242b24ab676SJeff Bonwick 243b24ab676SJeff Bonwick /* 244b24ab676SJeff Bonwick * Read a TX_WRITE log data block. 
245b24ab676SJeff Bonwick */ 246b24ab676SJeff Bonwick static int 247b24ab676SJeff Bonwick zil_read_log_data(zilog_t *zilog, const lr_write_t *lr, void *wbuf) 248b24ab676SJeff Bonwick { 249b24ab676SJeff Bonwick enum zio_flag zio_flags = ZIO_FLAG_CANFAIL; 250b24ab676SJeff Bonwick const blkptr_t *bp = &lr->lr_blkptr; 251b24ab676SJeff Bonwick uint32_t aflags = ARC_WAIT; 252b24ab676SJeff Bonwick arc_buf_t *abuf = NULL; 253b24ab676SJeff Bonwick zbookmark_t zb; 254b24ab676SJeff Bonwick int error; 255b24ab676SJeff Bonwick 256b24ab676SJeff Bonwick if (BP_IS_HOLE(bp)) { 257b24ab676SJeff Bonwick if (wbuf != NULL) 258b24ab676SJeff Bonwick bzero(wbuf, MAX(BP_GET_LSIZE(bp), lr->lr_length)); 259b24ab676SJeff Bonwick return (0); 260b24ab676SJeff Bonwick } 261b24ab676SJeff Bonwick 262b24ab676SJeff Bonwick if (zilog->zl_header->zh_claim_txg == 0) 263b24ab676SJeff Bonwick zio_flags |= ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SCRUB; 264b24ab676SJeff Bonwick 265b24ab676SJeff Bonwick SET_BOOKMARK(&zb, dmu_objset_id(zilog->zl_os), lr->lr_foid, 266b24ab676SJeff Bonwick ZB_ZIL_LEVEL, lr->lr_offset / BP_GET_LSIZE(bp)); 267b24ab676SJeff Bonwick 268b24ab676SJeff Bonwick error = arc_read_nolock(NULL, zilog->zl_spa, bp, arc_getbuf_func, &abuf, 269b24ab676SJeff Bonwick ZIO_PRIORITY_SYNC_READ, zio_flags, &aflags, &zb); 270b24ab676SJeff Bonwick 271b24ab676SJeff Bonwick if (error == 0) { 272b24ab676SJeff Bonwick if (wbuf != NULL) 273b24ab676SJeff Bonwick bcopy(abuf->b_data, wbuf, arc_buf_size(abuf)); 274b24ab676SJeff Bonwick (void) arc_buf_remove_ref(abuf, &abuf); 275b24ab676SJeff Bonwick } 276fa9e4066Sahrens 277d80c45e0Sbonwick return (error); 278fa9e4066Sahrens } 279fa9e4066Sahrens 280fa9e4066Sahrens /* 281fa9e4066Sahrens * Parse the intent log, and call parse_func for each valid record within. 
282fa9e4066Sahrens */ 283b24ab676SJeff Bonwick int 284fa9e4066Sahrens zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func, 285fa9e4066Sahrens zil_parse_lr_func_t *parse_lr_func, void *arg, uint64_t txg) 286fa9e4066Sahrens { 287d80c45e0Sbonwick const zil_header_t *zh = zilog->zl_header; 288b24ab676SJeff Bonwick boolean_t claimed = !!zh->zh_claim_txg; 289b24ab676SJeff Bonwick uint64_t claim_blk_seq = claimed ? zh->zh_claim_blk_seq : UINT64_MAX; 290b24ab676SJeff Bonwick uint64_t claim_lr_seq = claimed ? zh->zh_claim_lr_seq : UINT64_MAX; 291b24ab676SJeff Bonwick uint64_t max_blk_seq = 0; 292b24ab676SJeff Bonwick uint64_t max_lr_seq = 0; 293b24ab676SJeff Bonwick uint64_t blk_count = 0; 294b24ab676SJeff Bonwick uint64_t lr_count = 0; 295b24ab676SJeff Bonwick blkptr_t blk, next_blk; 296fa9e4066Sahrens char *lrbuf, *lrp; 297b24ab676SJeff Bonwick int error = 0; 298fa9e4066Sahrens 299b24ab676SJeff Bonwick /* 300b24ab676SJeff Bonwick * Old logs didn't record the maximum zh_claim_lr_seq. 301b24ab676SJeff Bonwick */ 302b24ab676SJeff Bonwick if (!(zh->zh_flags & ZIL_CLAIM_LR_SEQ_VALID)) 303b24ab676SJeff Bonwick claim_lr_seq = UINT64_MAX; 304fa9e4066Sahrens 305fa9e4066Sahrens /* 306fa9e4066Sahrens * Starting at the block pointed to by zh_log we read the log chain. 307fa9e4066Sahrens * For each block in the chain we strongly check that block to 308fa9e4066Sahrens * ensure its validity. We stop when an invalid block is found. 309fa9e4066Sahrens * For each block pointer in the chain we call parse_blk_func(). 310fa9e4066Sahrens * For each record in each valid block we call parse_lr_func(). 311d80c45e0Sbonwick * If the log has been claimed, stop if we encounter a sequence 312d80c45e0Sbonwick * number greater than the highest claimed sequence number. 
313fa9e4066Sahrens */ 314b24ab676SJeff Bonwick lrbuf = zio_buf_alloc(SPA_MAXBLOCKSIZE); 315b24ab676SJeff Bonwick zil_bp_tree_init(zilog); 316d80c45e0Sbonwick 317b24ab676SJeff Bonwick for (blk = zh->zh_log; !BP_IS_HOLE(&blk); blk = next_blk) { 318b24ab676SJeff Bonwick uint64_t blk_seq = blk.blk_cksum.zc_word[ZIL_ZC_SEQ]; 319b24ab676SJeff Bonwick int reclen; 3206e1f5caaSNeil Perrin char *end; 321d80c45e0Sbonwick 322b24ab676SJeff Bonwick if (blk_seq > claim_blk_seq) 323b24ab676SJeff Bonwick break; 324b24ab676SJeff Bonwick if ((error = parse_blk_func(zilog, &blk, arg, txg)) != 0) 325b24ab676SJeff Bonwick break; 3266e1f5caaSNeil Perrin ASSERT3U(max_blk_seq, <, blk_seq); 327b24ab676SJeff Bonwick max_blk_seq = blk_seq; 328b24ab676SJeff Bonwick blk_count++; 329fa9e4066Sahrens 330b24ab676SJeff Bonwick if (max_lr_seq == claim_lr_seq && max_blk_seq == claim_blk_seq) 331b24ab676SJeff Bonwick break; 332fa9e4066Sahrens 3336e1f5caaSNeil Perrin error = zil_read_log_block(zilog, &blk, &next_blk, lrbuf, &end); 334fa9e4066Sahrens if (error) 335fa9e4066Sahrens break; 336fa9e4066Sahrens 3376e1f5caaSNeil Perrin for (lrp = lrbuf; lrp < end; lrp += reclen) { 338fa9e4066Sahrens lr_t *lr = (lr_t *)lrp; 339fa9e4066Sahrens reclen = lr->lrc_reclen; 340fa9e4066Sahrens ASSERT3U(reclen, >=, sizeof (lr_t)); 341b24ab676SJeff Bonwick if (lr->lrc_seq > claim_lr_seq) 342b24ab676SJeff Bonwick goto done; 343b24ab676SJeff Bonwick if ((error = parse_lr_func(zilog, lr, arg, txg)) != 0) 344b24ab676SJeff Bonwick goto done; 3456e1f5caaSNeil Perrin ASSERT3U(max_lr_seq, <, lr->lrc_seq); 346b24ab676SJeff Bonwick max_lr_seq = lr->lrc_seq; 347b24ab676SJeff Bonwick lr_count++; 348fa9e4066Sahrens } 349fa9e4066Sahrens } 350b24ab676SJeff Bonwick done: 351b24ab676SJeff Bonwick zilog->zl_parse_error = error; 352b24ab676SJeff Bonwick zilog->zl_parse_blk_seq = max_blk_seq; 353b24ab676SJeff Bonwick zilog->zl_parse_lr_seq = max_lr_seq; 354b24ab676SJeff Bonwick zilog->zl_parse_blk_count = blk_count; 355b24ab676SJeff Bonwick 
zilog->zl_parse_lr_count = lr_count; 356b24ab676SJeff Bonwick 357b24ab676SJeff Bonwick ASSERT(!claimed || !(zh->zh_flags & ZIL_CLAIM_LR_SEQ_VALID) || 358b24ab676SJeff Bonwick (max_blk_seq == claim_blk_seq && max_lr_seq == claim_lr_seq)); 359d80c45e0Sbonwick 360b24ab676SJeff Bonwick zil_bp_tree_fini(zilog); 361b24ab676SJeff Bonwick zio_buf_free(lrbuf, SPA_MAXBLOCKSIZE); 362b24ab676SJeff Bonwick 363b24ab676SJeff Bonwick return (error); 364fa9e4066Sahrens } 365fa9e4066Sahrens 366b24ab676SJeff Bonwick static int 367fa9e4066Sahrens zil_claim_log_block(zilog_t *zilog, blkptr_t *bp, void *tx, uint64_t first_txg) 368fa9e4066Sahrens { 369fa9e4066Sahrens /* 370fa9e4066Sahrens * Claim log block if not already committed and not already claimed. 371b24ab676SJeff Bonwick * If tx == NULL, just verify that the block is claimable. 372fa9e4066Sahrens */ 373b24ab676SJeff Bonwick if (bp->blk_birth < first_txg || zil_bp_tree_add(zilog, bp) != 0) 374b24ab676SJeff Bonwick return (0); 375b24ab676SJeff Bonwick 376b24ab676SJeff Bonwick return (zio_wait(zio_claim(NULL, zilog->zl_spa, 377b24ab676SJeff Bonwick tx == NULL ? 0 : first_txg, bp, spa_claim_notify, NULL, 378b24ab676SJeff Bonwick ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SCRUB))); 379fa9e4066Sahrens } 380fa9e4066Sahrens 381b24ab676SJeff Bonwick static int 382fa9e4066Sahrens zil_claim_log_record(zilog_t *zilog, lr_t *lrc, void *tx, uint64_t first_txg) 383fa9e4066Sahrens { 384b24ab676SJeff Bonwick lr_write_t *lr = (lr_write_t *)lrc; 385b24ab676SJeff Bonwick int error; 386b24ab676SJeff Bonwick 387b24ab676SJeff Bonwick if (lrc->lrc_txtype != TX_WRITE) 388b24ab676SJeff Bonwick return (0); 389b24ab676SJeff Bonwick 390b24ab676SJeff Bonwick /* 391b24ab676SJeff Bonwick * If the block is not readable, don't claim it. This can happen 392b24ab676SJeff Bonwick * in normal operation when a log block is written to disk before 393b24ab676SJeff Bonwick * some of the dmu_sync() blocks it points to. 
In this case, the 394b24ab676SJeff Bonwick * transaction cannot have been committed to anyone (we would have 395b24ab676SJeff Bonwick * waited for all writes to be stable first), so it is semantically 396b24ab676SJeff Bonwick * correct to declare this the end of the log. 397b24ab676SJeff Bonwick */ 398b24ab676SJeff Bonwick if (lr->lr_blkptr.blk_birth >= first_txg && 399b24ab676SJeff Bonwick (error = zil_read_log_data(zilog, lr, NULL)) != 0) 400b24ab676SJeff Bonwick return (error); 401b24ab676SJeff Bonwick return (zil_claim_log_block(zilog, &lr->lr_blkptr, tx, first_txg)); 402fa9e4066Sahrens } 403fa9e4066Sahrens 404fa9e4066Sahrens /* ARGSUSED */ 405b24ab676SJeff Bonwick static int 406fa9e4066Sahrens zil_free_log_block(zilog_t *zilog, blkptr_t *bp, void *tx, uint64_t claim_txg) 407fa9e4066Sahrens { 408b24ab676SJeff Bonwick zio_free_zil(zilog->zl_spa, dmu_tx_get_txg(tx), bp); 409b24ab676SJeff Bonwick 410b24ab676SJeff Bonwick return (0); 411fa9e4066Sahrens } 412fa9e4066Sahrens 413b24ab676SJeff Bonwick static int 414fa9e4066Sahrens zil_free_log_record(zilog_t *zilog, lr_t *lrc, void *tx, uint64_t claim_txg) 415fa9e4066Sahrens { 416b24ab676SJeff Bonwick lr_write_t *lr = (lr_write_t *)lrc; 417b24ab676SJeff Bonwick blkptr_t *bp = &lr->lr_blkptr; 418b24ab676SJeff Bonwick 419fa9e4066Sahrens /* 420fa9e4066Sahrens * If we previously claimed it, we need to free it. 
421fa9e4066Sahrens */ 422b24ab676SJeff Bonwick if (claim_txg != 0 && lrc->lrc_txtype == TX_WRITE && 423b24ab676SJeff Bonwick bp->blk_birth >= claim_txg && zil_bp_tree_add(zilog, bp) == 0) 424b24ab676SJeff Bonwick zio_free(zilog->zl_spa, dmu_tx_get_txg(tx), bp); 425b24ab676SJeff Bonwick 426b24ab676SJeff Bonwick return (0); 427fa9e4066Sahrens } 428fa9e4066Sahrens 4296e1f5caaSNeil Perrin static lwb_t * 4306e1f5caaSNeil Perrin zil_alloc_lwb(zilog_t *zilog, blkptr_t *bp, uint64_t txg) 4316e1f5caaSNeil Perrin { 4326e1f5caaSNeil Perrin lwb_t *lwb; 4336e1f5caaSNeil Perrin 4346e1f5caaSNeil Perrin lwb = kmem_cache_alloc(zil_lwb_cache, KM_SLEEP); 4356e1f5caaSNeil Perrin lwb->lwb_zilog = zilog; 4366e1f5caaSNeil Perrin lwb->lwb_blk = *bp; 4376e1f5caaSNeil Perrin lwb->lwb_buf = zio_buf_alloc(BP_GET_LSIZE(bp)); 4386e1f5caaSNeil Perrin lwb->lwb_max_txg = txg; 4396e1f5caaSNeil Perrin lwb->lwb_zio = NULL; 4406e1f5caaSNeil Perrin lwb->lwb_tx = NULL; 4416e1f5caaSNeil Perrin if (BP_GET_CHECKSUM(bp) == ZIO_CHECKSUM_ZILOG2) { 4426e1f5caaSNeil Perrin lwb->lwb_nused = sizeof (zil_chain_t); 4436e1f5caaSNeil Perrin lwb->lwb_sz = BP_GET_LSIZE(bp); 4446e1f5caaSNeil Perrin } else { 4456e1f5caaSNeil Perrin lwb->lwb_nused = 0; 4466e1f5caaSNeil Perrin lwb->lwb_sz = BP_GET_LSIZE(bp) - sizeof (zil_chain_t); 4476e1f5caaSNeil Perrin } 4486e1f5caaSNeil Perrin 4496e1f5caaSNeil Perrin mutex_enter(&zilog->zl_lock); 4506e1f5caaSNeil Perrin list_insert_tail(&zilog->zl_lwb_list, lwb); 4516e1f5caaSNeil Perrin mutex_exit(&zilog->zl_lock); 4526e1f5caaSNeil Perrin 4536e1f5caaSNeil Perrin return (lwb); 4546e1f5caaSNeil Perrin } 4556e1f5caaSNeil Perrin 456fa9e4066Sahrens /* 457fa9e4066Sahrens * Create an on-disk intent log. 
458fa9e4066Sahrens */ 4596e1f5caaSNeil Perrin static lwb_t * 460fa9e4066Sahrens zil_create(zilog_t *zilog) 461fa9e4066Sahrens { 462d80c45e0Sbonwick const zil_header_t *zh = zilog->zl_header; 4636e1f5caaSNeil Perrin lwb_t *lwb = NULL; 464d80c45e0Sbonwick uint64_t txg = 0; 465d80c45e0Sbonwick dmu_tx_t *tx = NULL; 466fa9e4066Sahrens blkptr_t blk; 467d80c45e0Sbonwick int error = 0; 468fa9e4066Sahrens 469fa9e4066Sahrens /* 470d80c45e0Sbonwick * Wait for any previous destroy to complete. 471fa9e4066Sahrens */ 472d80c45e0Sbonwick txg_wait_synced(zilog->zl_dmu_pool, zilog->zl_destroy_txg); 473d80c45e0Sbonwick 474d80c45e0Sbonwick ASSERT(zh->zh_claim_txg == 0); 475d80c45e0Sbonwick ASSERT(zh->zh_replay_seq == 0); 476d80c45e0Sbonwick 477d80c45e0Sbonwick blk = zh->zh_log; 478fa9e4066Sahrens 479fa9e4066Sahrens /* 4806e1f5caaSNeil Perrin * Allocate an initial log block if: 4816e1f5caaSNeil Perrin * - there isn't one already 4826e1f5caaSNeil Perrin * - the existing block is the wrong endianess 483fa9e4066Sahrens */ 484899217ddSNeil Perrin if (BP_IS_HOLE(&blk) || BP_SHOULD_BYTESWAP(&blk)) { 485d80c45e0Sbonwick tx = dmu_tx_create(zilog->zl_os); 486b24ab676SJeff Bonwick VERIFY(dmu_tx_assign(tx, TXG_WAIT) == 0); 487d80c45e0Sbonwick dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx); 488d80c45e0Sbonwick txg = dmu_tx_get_txg(tx); 489d80c45e0Sbonwick 490899217ddSNeil Perrin if (!BP_IS_HOLE(&blk)) { 491b24ab676SJeff Bonwick zio_free_zil(zilog->zl_spa, txg, &blk); 492899217ddSNeil Perrin BP_ZERO(&blk); 493899217ddSNeil Perrin } 494899217ddSNeil Perrin 495b24ab676SJeff Bonwick error = zio_alloc_zil(zilog->zl_spa, txg, &blk, NULL, 496b24ab676SJeff Bonwick ZIL_MIN_BLKSZ, zilog->zl_logbias == ZFS_LOGBIAS_LATENCY); 497d80c45e0Sbonwick 498d80c45e0Sbonwick if (error == 0) 499d80c45e0Sbonwick zil_init_log_chain(zilog, &blk); 50013f5297eSperrin } 501fa9e4066Sahrens 502d80c45e0Sbonwick /* 503d80c45e0Sbonwick * Allocate a log write buffer (lwb) for the first log block. 
504d80c45e0Sbonwick */ 5056e1f5caaSNeil Perrin if (error == 0) 5066e1f5caaSNeil Perrin lwb = zil_alloc_lwb(zilog, &blk, txg); 507fa9e4066Sahrens 508d80c45e0Sbonwick /* 509d80c45e0Sbonwick * If we just allocated the first log block, commit our transaction 510d80c45e0Sbonwick * and wait for zil_sync() to stuff the block poiner into zh_log. 511d80c45e0Sbonwick * (zh is part of the MOS, so we cannot modify it in open context.) 512d80c45e0Sbonwick */ 513d80c45e0Sbonwick if (tx != NULL) { 514d80c45e0Sbonwick dmu_tx_commit(tx); 51513f5297eSperrin txg_wait_synced(zilog->zl_dmu_pool, txg); 516d80c45e0Sbonwick } 517d80c45e0Sbonwick 518d80c45e0Sbonwick ASSERT(bcmp(&blk, &zh->zh_log, sizeof (blk)) == 0); 5196e1f5caaSNeil Perrin 5206e1f5caaSNeil Perrin return (lwb); 521fa9e4066Sahrens } 522fa9e4066Sahrens 523fa9e4066Sahrens /* 524fa9e4066Sahrens * In one tx, free all log blocks and clear the log header. 525d80c45e0Sbonwick * If keep_first is set, then we're replaying a log with no content. 526d80c45e0Sbonwick * We want to keep the first block, however, so that the first 527d80c45e0Sbonwick * synchronous transaction doesn't require a txg_wait_synced() 528d80c45e0Sbonwick * in zil_create(). We don't need to txg_wait_synced() here either 529d80c45e0Sbonwick * when keep_first is set, because both zil_create() and zil_destroy() 530d80c45e0Sbonwick * will wait for any in-progress destroys to complete. 531fa9e4066Sahrens */ 532fa9e4066Sahrens void 533d80c45e0Sbonwick zil_destroy(zilog_t *zilog, boolean_t keep_first) 534fa9e4066Sahrens { 535d80c45e0Sbonwick const zil_header_t *zh = zilog->zl_header; 536d80c45e0Sbonwick lwb_t *lwb; 537fa9e4066Sahrens dmu_tx_t *tx; 538fa9e4066Sahrens uint64_t txg; 539fa9e4066Sahrens 540d80c45e0Sbonwick /* 541d80c45e0Sbonwick * Wait for any previous destroy to complete. 
542d80c45e0Sbonwick */ 543d80c45e0Sbonwick txg_wait_synced(zilog->zl_dmu_pool, zilog->zl_destroy_txg); 544fa9e4066Sahrens 545b24ab676SJeff Bonwick zilog->zl_old_header = *zh; /* debugging aid */ 546b24ab676SJeff Bonwick 547d80c45e0Sbonwick if (BP_IS_HOLE(&zh->zh_log)) 548fa9e4066Sahrens return; 549fa9e4066Sahrens 550fa9e4066Sahrens tx = dmu_tx_create(zilog->zl_os); 551b24ab676SJeff Bonwick VERIFY(dmu_tx_assign(tx, TXG_WAIT) == 0); 552fa9e4066Sahrens dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx); 553fa9e4066Sahrens txg = dmu_tx_get_txg(tx); 554fa9e4066Sahrens 555d80c45e0Sbonwick mutex_enter(&zilog->zl_lock); 556d80c45e0Sbonwick 557d80c45e0Sbonwick ASSERT3U(zilog->zl_destroy_txg, <, txg); 558fa9e4066Sahrens zilog->zl_destroy_txg = txg; 559b24ab676SJeff Bonwick zilog->zl_keep_first = keep_first; 560d80c45e0Sbonwick 561d80c45e0Sbonwick if (!list_is_empty(&zilog->zl_lwb_list)) { 562d80c45e0Sbonwick ASSERT(zh->zh_claim_txg == 0); 563b24ab676SJeff Bonwick ASSERT(!keep_first); 564d80c45e0Sbonwick while ((lwb = list_head(&zilog->zl_lwb_list)) != NULL) { 565d80c45e0Sbonwick list_remove(&zilog->zl_lwb_list, lwb); 566d80c45e0Sbonwick if (lwb->lwb_buf != NULL) 567d80c45e0Sbonwick zio_buf_free(lwb->lwb_buf, lwb->lwb_sz); 568b24ab676SJeff Bonwick zio_free_zil(zilog->zl_spa, txg, &lwb->lwb_blk); 569d80c45e0Sbonwick kmem_cache_free(zil_lwb_cache, lwb); 570d80c45e0Sbonwick } 571b24ab676SJeff Bonwick } else if (!keep_first) { 572b24ab676SJeff Bonwick (void) zil_parse(zilog, zil_free_log_block, 573b24ab676SJeff Bonwick zil_free_log_record, tx, zh->zh_claim_txg); 574d80c45e0Sbonwick } 575b19a79ecSperrin mutex_exit(&zilog->zl_lock); 576fa9e4066Sahrens 577fa9e4066Sahrens dmu_tx_commit(tx); 578fa9e4066Sahrens } 579fa9e4066Sahrens 5801d452cf5Sahrens int 581fd136879SMatthew Ahrens zil_claim(const char *osname, void *txarg) 582fa9e4066Sahrens { 583fa9e4066Sahrens dmu_tx_t *tx = txarg; 584fa9e4066Sahrens uint64_t first_txg = dmu_tx_get_txg(tx); 585fa9e4066Sahrens zilog_t *zilog; 
586fa9e4066Sahrens zil_header_t *zh; 587fa9e4066Sahrens objset_t *os; 588fa9e4066Sahrens int error; 589fa9e4066Sahrens 590503ad85cSMatthew Ahrens error = dmu_objset_hold(osname, FTAG, &os); 591fa9e4066Sahrens if (error) { 592b87f3af3Sperrin cmn_err(CE_WARN, "can't open objset for %s", osname); 5931d452cf5Sahrens return (0); 594fa9e4066Sahrens } 595fa9e4066Sahrens 596fa9e4066Sahrens zilog = dmu_objset_zil(os); 597d80c45e0Sbonwick zh = zil_header_in_syncing_context(zilog); 598fa9e4066Sahrens 599b24ab676SJeff Bonwick if (spa_get_log_state(zilog->zl_spa) == SPA_LOG_CLEAR) { 600e6ca193dSGeorge Wilson if (!BP_IS_HOLE(&zh->zh_log)) 601b24ab676SJeff Bonwick zio_free_zil(zilog->zl_spa, first_txg, &zh->zh_log); 602e6ca193dSGeorge Wilson BP_ZERO(&zh->zh_log); 603e6ca193dSGeorge Wilson dsl_dataset_dirty(dmu_objset_ds(os), tx); 604468c413aSTim Haley dmu_objset_rele(os, FTAG); 605468c413aSTim Haley return (0); 606e6ca193dSGeorge Wilson } 607e6ca193dSGeorge Wilson 608fa9e4066Sahrens /* 609d80c45e0Sbonwick * Claim all log blocks if we haven't already done so, and remember 610d80c45e0Sbonwick * the highest claimed sequence number. This ensures that if we can 611d80c45e0Sbonwick * read only part of the log now (e.g. due to a missing device), 612d80c45e0Sbonwick * but we can read the entire log later, we will not try to replay 613d80c45e0Sbonwick * or destroy beyond the last block we successfully claimed. 
614fa9e4066Sahrens */ 615fa9e4066Sahrens ASSERT3U(zh->zh_claim_txg, <=, first_txg); 616fa9e4066Sahrens if (zh->zh_claim_txg == 0 && !BP_IS_HOLE(&zh->zh_log)) { 617b24ab676SJeff Bonwick (void) zil_parse(zilog, zil_claim_log_block, 618d80c45e0Sbonwick zil_claim_log_record, tx, first_txg); 619b24ab676SJeff Bonwick zh->zh_claim_txg = first_txg; 620b24ab676SJeff Bonwick zh->zh_claim_blk_seq = zilog->zl_parse_blk_seq; 621b24ab676SJeff Bonwick zh->zh_claim_lr_seq = zilog->zl_parse_lr_seq; 622b24ab676SJeff Bonwick if (zilog->zl_parse_lr_count || zilog->zl_parse_blk_count > 1) 623b24ab676SJeff Bonwick zh->zh_flags |= ZIL_REPLAY_NEEDED; 624b24ab676SJeff Bonwick zh->zh_flags |= ZIL_CLAIM_LR_SEQ_VALID; 625fa9e4066Sahrens dsl_dataset_dirty(dmu_objset_ds(os), tx); 626fa9e4066Sahrens } 627d80c45e0Sbonwick 628fa9e4066Sahrens ASSERT3U(first_txg, ==, (spa_last_synced_txg(zilog->zl_spa) + 1)); 629503ad85cSMatthew Ahrens dmu_objset_rele(os, FTAG); 6301d452cf5Sahrens return (0); 631b87f3af3Sperrin } 632b87f3af3Sperrin 633b87f3af3Sperrin /* 634b87f3af3Sperrin * Check the log by walking the log chain. 635b87f3af3Sperrin * Checksum errors are ok as they indicate the end of the chain. 636b87f3af3Sperrin * Any other error (no device or read failure) returns an error. 
 */
/*
 * Verify that the on-disk intent log chain for the named objset can be
 * claimed.  Called during pool import with tx == NULL (asserted below),
 * so zil_parse() only checks claimability instead of actually claiming.
 * Returns 0 when the log is absent, unreadable, or known to be already
 * synced to the pool; otherwise returns the zil_parse() error.
 */
int
zil_check_log_chain(const char *osname, void *tx)
{
	zilog_t *zilog;
	objset_t *os;
	blkptr_t *bp;
	int error;

	ASSERT(tx == NULL);

	error = dmu_objset_hold(osname, FTAG, &os);
	if (error) {
		/*
		 * Failure to open the objset is warned about but not
		 * treated as a log-chain error.
		 */
		cmn_err(CE_WARN, "can't open objset for %s", osname);
		return (0);
	}

	zilog = dmu_objset_zil(os);
	bp = (blkptr_t *)&zilog->zl_header->zh_log;

	/*
	 * Check the first block and determine if it's on a log device
	 * which may have been removed or faulted prior to loading this
	 * pool.  If so, there's no point in checking the rest of the log
	 * as its content should have already been synced to the pool.
	 */
	if (!BP_IS_HOLE(bp)) {
		vdev_t *vd;
		boolean_t valid = B_TRUE;

		spa_config_enter(os->os_spa, SCL_STATE, FTAG, RW_READER);
		vd = vdev_lookup_top(os->os_spa, DVA_GET_VDEV(&bp->blk_dva[0]));
		if (vd->vdev_islog && vdev_is_dead(vd))
			valid = vdev_log_state_valid(vd);
		spa_config_exit(os->os_spa, SCL_STATE, FTAG);

		if (!valid) {
			dmu_objset_rele(os, FTAG);
			return (0);
		}
	}

	/*
	 * Because tx == NULL, zil_claim_log_block() will not actually claim
	 * any blocks, but just determine whether it is possible to do so.
	 * In addition to checking the log chain, zil_claim_log_block()
	 * will invoke zio_claim() with a done func of spa_claim_notify(),
	 * which will update spa_max_claim_txg.  See spa_load() for details.
	 */
	error = zil_parse(zilog, zil_claim_log_block, zil_claim_log_record, tx,
	    zilog->zl_header->zh_claim_txg ? -1ULL : spa_first_txg(os->os_spa));

	dmu_objset_rele(os, FTAG);

	/* ECKSUM/ENOENT simply mark the end of the chain: not an error. */
	return ((error == ECKSUM || error == ENOENT) ? 0 : error);
}

/*
 * AVL comparator: order zil_vdev_node_t entries by vdev id.
 */
static int
zil_vdev_compare(const void *x1, const void *x2)
{
	const uint64_t v1 = ((zil_vdev_node_t *)x1)->zv_vdev;
	const uint64_t v2 = ((zil_vdev_node_t *)x2)->zv_vdev;

	if (v1 < v2)
		return (-1);
	if (v1 > v2)
		return (1);

	return (0);
}

/*
 * Record (in zl_vdev_tree) the top-level vdevs referenced by bp's DVAs so
 * that zil_flush_vdevs() can later flush their write caches.  No-op when
 * cache flushing is globally disabled via zfs_nocacheflush.
 */
void
zil_add_block(zilog_t *zilog, const blkptr_t *bp)
{
	avl_tree_t *t = &zilog->zl_vdev_tree;
	avl_index_t where;
	zil_vdev_node_t *zv, zvsearch;
	int ndvas = BP_GET_NDVAS(bp);
	int i;

	if (zfs_nocacheflush)
		return;

	ASSERT(zilog->zl_writer);

	/*
	 * Even though we're zl_writer, we still need a lock because the
	 * zl_get_data() callbacks may have dmu_sync() done callbacks
	 * that will run concurrently.
	 */
	mutex_enter(&zilog->zl_vdev_lock);
	for (i = 0; i < ndvas; i++) {
		zvsearch.zv_vdev = DVA_GET_VDEV(&bp->blk_dva[i]);
		if (avl_find(t, &zvsearch, &where) == NULL) {
			/* First reference to this vdev: insert a node. */
			zv = kmem_alloc(sizeof (*zv), KM_SLEEP);
			zv->zv_vdev = zvsearch.zv_vdev;
			avl_insert(t, zv, where);
		}
	}
	mutex_exit(&zilog->zl_vdev_lock);
}

/*
 * Flush the write cache of every vdev accumulated in zl_vdev_tree by
 * zil_add_block(), emptying the tree as we go, and wait for completion.
 */
static void
zil_flush_vdevs(zilog_t *zilog)
{
	spa_t *spa = zilog->zl_spa;
	avl_tree_t *t = &zilog->zl_vdev_tree;
	void *cookie = NULL;
	zil_vdev_node_t *zv;
	zio_t *zio;

	ASSERT(zilog->zl_writer);

	/*
	 * We don't need zl_vdev_lock here because we're the zl_writer,
	 * and all zl_get_data() callbacks are done.
	 */
	if (avl_numnodes(t) == 0)
		return;

	spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);

	zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);

	while ((zv = avl_destroy_nodes(t, &cookie)) != NULL) {
		vdev_t *vd = vdev_lookup_top(spa, zv->zv_vdev);
		if (vd != NULL)
			zio_flush(zio, vd);
		kmem_free(zv, sizeof (*zv));
	}

	/*
	 * Wait for all the flushes to complete.  Not all devices actually
	 * support the DKIOCFLUSHWRITECACHE ioctl, so it's OK if it fails.
	 */
	(void) zio_wait(zio);

	spa_config_exit(spa, SCL_STATE, FTAG);
}

/*
 * Function called when a log block write completes
 */
static void
zil_lwb_write_done(zio_t *zio)
{
	lwb_t *lwb = zio->io_private;
	zilog_t *zilog = lwb->lwb_zilog;
	dmu_tx_t *tx = lwb->lwb_tx;

	ASSERT(BP_GET_COMPRESS(zio->io_bp) == ZIO_COMPRESS_OFF);
	ASSERT(BP_GET_TYPE(zio->io_bp) == DMU_OT_INTENT_LOG);
	ASSERT(BP_GET_LEVEL(zio->io_bp) == 0);
	ASSERT(BP_GET_BYTEORDER(zio->io_bp) == ZFS_HOST_BYTEORDER);
	ASSERT(!BP_IS_GANG(zio->io_bp));
	ASSERT(!BP_IS_HOLE(zio->io_bp));
	ASSERT(zio->io_bp->blk_fill == 0);

	/*
	 * Ensure the lwb buffer pointer is cleared before releasing
	 * the txg. If we have had an allocation failure and
	 * the txg is waiting to sync then we want want zil_sync()
	 * to remove the lwb so that it's not picked up as the next new
	 * one in zil_commit_writer(). zil_sync() will only remove
	 * the lwb if lwb_buf is null.
	 */
	zio_buf_free(lwb->lwb_buf, lwb->lwb_sz);
	mutex_enter(&zilog->zl_lock);
	lwb->lwb_buf = NULL;
	lwb->lwb_tx = NULL;
	mutex_exit(&zilog->zl_lock);

	/*
	 * Now that we've written this log block, we have a stable pointer
	 * to the next block in the chain, so it's OK to let the txg in
	 * which we allocated the next block sync.
	 */
	dmu_tx_commit(tx);
}

/*
 * Initialize the io for a log block.
 */
static void
zil_lwb_write_init(zilog_t *zilog, lwb_t *lwb)
{
	zbookmark_t zb;

	SET_BOOKMARK(&zb, lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_OBJSET],
	    ZB_ZIL_OBJECT, ZB_ZIL_LEVEL,
	    lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_SEQ]);

	/* Lazily create the root zio that parents all lwb writes. */
	if (zilog->zl_root_zio == NULL) {
		zilog->zl_root_zio = zio_root(zilog->zl_spa, NULL, NULL,
		    ZIO_FLAG_CANFAIL);
	}
	if (lwb->lwb_zio == NULL) {
		lwb->lwb_zio = zio_rewrite(zilog->zl_root_zio, zilog->zl_spa,
		    0, &lwb->lwb_blk, lwb->lwb_buf, BP_GET_LSIZE(&lwb->lwb_blk),
		    zil_lwb_write_done, lwb, ZIO_PRIORITY_LOG_WRITE,
		    ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE, &zb);
	}
}

/*
 * Define a limited set of intent log block sizes.
 * These must be a multiple of 4KB. Note only the amount used (again
 * aligned to 4KB) actually gets written. However, we can't always just
 * allocate SPA_MAXBLOCKSIZE as the slog space could be exhausted.
 */
uint64_t zil_block_buckets[] = {
    4096,		/* non TX_WRITE */
    8192+4096,		/* data base */
    32*1024 + 4096, 	/* NFS writes */
    UINT64_MAX
};

/*
 * Use the slog as long as the logbias is 'latency' and the current commit size
 * is less than the limit or the total list size is less than 2X the limit.
 * Limit checking is disabled by setting zil_slog_limit to UINT64_MAX.
 */
uint64_t zil_slog_limit = 1024 * 1024;
#define	USE_SLOG(zilog) (((zilog)->zl_logbias == ZFS_LOGBIAS_LATENCY) && \
	(((zilog)->zl_cur_used < zil_slog_limit) || \
	((zilog)->zl_itx_list_sz < (zil_slog_limit << 1))))

/*
 * Start a log block write and advance to the next log block.
 * Calls are serialized.
 */
static lwb_t *
zil_lwb_write_start(zilog_t *zilog, lwb_t *lwb)
{
	lwb_t *nlwb = NULL;
	zil_chain_t *zilc;
	spa_t *spa = zilog->zl_spa;
	blkptr_t *bp;
	dmu_tx_t *tx;
	uint64_t txg;
	uint64_t zil_blksz, wsz;
	int i, error;

	/*
	 * For ZILOG2 ("Slim ZIL") blocks the chain header lives at the
	 * front of the buffer; otherwise it trails the used portion.
	 */
	if (BP_GET_CHECKSUM(&lwb->lwb_blk) == ZIO_CHECKSUM_ZILOG2) {
		zilc = (zil_chain_t *)lwb->lwb_buf;
		bp = &zilc->zc_next_blk;
	} else {
		zilc = (zil_chain_t *)(lwb->lwb_buf + lwb->lwb_sz);
		bp = &zilc->zc_next_blk;
	}

	ASSERT(lwb->lwb_nused <= lwb->lwb_sz);

	/*
	 * Allocate the next block and save its address in this block
	 * before writing it in order to establish the log chain.
	 * Note that if the allocation of nlwb synced before we wrote
	 * the block that points at it (lwb), we'd leak it if we crashed.
	 * Therefore, we don't do dmu_tx_commit() until zil_lwb_write_done().
	 * We dirty the dataset to ensure that zil_sync() will be called
	 * to clean up in the event of allocation failure or I/O failure.
	 */
	tx = dmu_tx_create(zilog->zl_os);
	VERIFY(dmu_tx_assign(tx, TXG_WAIT) == 0);
	dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx);
	txg = dmu_tx_get_txg(tx);

	lwb->lwb_tx = tx;

	/*
	 * Log blocks are pre-allocated. Here we select the size of the next
	 * block, based on size used in the last block.
	 * - first find the smallest bucket that will fit the block from a
	 *   limited set of block sizes. This is because it's faster to write
	 *   blocks allocated from the same metaslab as they are adjacent or
	 *   close.
	 * - next find the maximum from the new suggested size and an array of
	 *   previous sizes. This lessens a picket fence effect of wrongly
	 *   guesssing the size if we have a stream of say 2k, 64k, 2k, 64k
	 *   requests.
	 *
	 * Note we only write what is used, but we can't just allocate
	 * the maximum block size because we can exhaust the available
	 * pool log space.
	 */
	zil_blksz = zilog->zl_cur_used + sizeof (zil_chain_t);
	for (i = 0; zil_blksz > zil_block_buckets[i]; i++)
		continue;
	zil_blksz = zil_block_buckets[i];
	if (zil_blksz == UINT64_MAX)
		zil_blksz = SPA_MAXBLOCKSIZE;
	zilog->zl_prev_blks[zilog->zl_prev_rotor] = zil_blksz;
	for (i = 0; i < ZIL_PREV_BLKS; i++)
		zil_blksz = MAX(zil_blksz, zilog->zl_prev_blks[i]);
	zilog->zl_prev_rotor = (zilog->zl_prev_rotor + 1) & (ZIL_PREV_BLKS - 1);

	BP_ZERO(bp);
	/* pass the old blkptr in order to spread log blocks across devs */
	error = zio_alloc_zil(spa, txg, bp, &lwb->lwb_blk, zil_blksz,
	    USE_SLOG(zilog));
	if (!error) {
		ASSERT3U(bp->blk_birth, ==, txg);
		bp->blk_cksum = lwb->lwb_blk.blk_cksum;
		bp->blk_cksum.zc_word[ZIL_ZC_SEQ]++;

		/*
		 * Allocate a new log write buffer (lwb).
		 */
		nlwb = zil_alloc_lwb(zilog, bp, txg);

		/* Record the block for later vdev flushing */
		zil_add_block(zilog, &lwb->lwb_blk);
	}

	if (BP_GET_CHECKSUM(&lwb->lwb_blk) == ZIO_CHECKSUM_ZILOG2) {
		/* For Slim ZIL only write what is used. */
		wsz = P2ROUNDUP_TYPED(lwb->lwb_nused, ZIL_MIN_BLKSZ, uint64_t);
		ASSERT3U(wsz, <=, lwb->lwb_sz);
		zio_shrink(lwb->lwb_zio, wsz);

	} else {
		wsz = lwb->lwb_sz;
	}

	zilc->zc_pad = 0;
	zilc->zc_nused = lwb->lwb_nused;
	zilc->zc_eck.zec_cksum = lwb->lwb_blk.blk_cksum;

	/*
	 * clear unused data for security
	 */
	bzero(lwb->lwb_buf + lwb->lwb_nused, wsz - lwb->lwb_nused);

	zio_nowait(lwb->lwb_zio); /* Kick off the write for the old log block */

	/*
	 * If there was an allocation failure then nlwb will be null which
	 * forces a txg_wait_synced().
	 */
	return (nlwb);
}

/*
 * Append itx's log record to lwb, starting a new log block via
 * zil_lwb_write_start() when the record doesn't fit.  For TX_WRITE records
 * that aren't WR_COPIED, the data (or its blkptr) is fetched through the
 * zl_get_data callback.  Returns the lwb to continue filling (possibly a
 * new one), or NULL on allocation failure of the current block.
 */
static lwb_t *
zil_lwb_commit(zilog_t *zilog, itx_t *itx, lwb_t *lwb)
{
	lr_t *lrc = &itx->itx_lr; /* common log record */
	lr_write_t *lrw = (lr_write_t *)lrc;
	char *lr_buf;
	uint64_t txg = lrc->lrc_txg;
	uint64_t reclen = lrc->lrc_reclen;
	uint64_t dlen = 0;

	if (lwb == NULL)
		return (NULL);

	ASSERT(lwb->lwb_buf != NULL);

	/* WR_NEED_COPY writes carry their data inline after the record. */
	if (lrc->lrc_txtype == TX_WRITE && itx->itx_wr_state == WR_NEED_COPY)
		dlen = P2ROUNDUP_TYPED(
		    lrw->lr_length, sizeof (uint64_t), uint64_t);

	zilog->zl_cur_used += (reclen + dlen);

	zil_lwb_write_init(zilog, lwb);

	/*
	 * If this record won't fit in the current log block, start a new one.
	 */
	if (lwb->lwb_nused + reclen + dlen > lwb->lwb_sz) {
		lwb = zil_lwb_write_start(zilog, lwb);
		if (lwb == NULL)
			return (NULL);
		zil_lwb_write_init(zilog, lwb);
		ASSERT(LWB_EMPTY(lwb));
		if (lwb->lwb_nused + reclen + dlen > lwb->lwb_sz) {
			/* Record too large even for a fresh block. */
			txg_wait_synced(zilog->zl_dmu_pool, txg);
			return (lwb);
		}
	}

	lr_buf = lwb->lwb_buf + lwb->lwb_nused;
	bcopy(lrc, lr_buf, reclen);
	lrc = (lr_t *)lr_buf;
	lrw = (lr_write_t *)lrc;

	/*
	 * If it's a write, fetch the data or get its blkptr as appropriate.
	 */
	if (lrc->lrc_txtype == TX_WRITE) {
		if (txg > spa_freeze_txg(zilog->zl_spa))
			txg_wait_synced(zilog->zl_dmu_pool, txg);
		if (itx->itx_wr_state != WR_COPIED) {
			char *dbuf;
			int error;

			if (dlen) {
				ASSERT(itx->itx_wr_state == WR_NEED_COPY);
				dbuf = lr_buf + reclen;
				lrw->lr_common.lrc_reclen += dlen;
			} else {
				ASSERT(itx->itx_wr_state == WR_INDIRECT);
				dbuf = NULL;
			}
			error = zilog->zl_get_data(
			    itx->itx_private, lrw, dbuf, lwb->lwb_zio);
			if (error == EIO) {
				txg_wait_synced(zilog->zl_dmu_pool, txg);
				return (lwb);
			}
			if (error) {
				ASSERT(error == ENOENT || error == EEXIST ||
				    error == EALREADY);
				return (lwb);
			}
		}
	}

	/*
	 * We're actually making an entry, so update lrc_seq to be the
	 * log record sequence number.  Note that this is generally not
	 * equal to the itx sequence number because not all transactions
	 * are synchronous, and sometimes spa_sync() gets there first.
	 */
	lrc->lrc_seq = ++zilog->zl_lr_seq; /* we are single threaded */
	lwb->lwb_nused += reclen + dlen;
	lwb->lwb_max_txg = MAX(lwb->lwb_max_txg, txg);
	ASSERT3U(lwb->lwb_nused, <=, lwb->lwb_sz);
	ASSERT3U(P2PHASE(lwb->lwb_nused, sizeof (uint64_t)), ==, 0);

	return (lwb);
}

/*
 * Allocate an in-memory intent log transaction of type txtype whose log
 * record occupies lrsize bytes (rounded up to 8-byte alignment).
 */
itx_t *
zil_itx_create(uint64_t txtype, size_t lrsize)
{
	itx_t *itx;

	lrsize = P2ROUNDUP_TYPED(lrsize, sizeof (uint64_t), size_t);

	itx = kmem_alloc(offsetof(itx_t, itx_lr) + lrsize, KM_SLEEP);
	itx->itx_lr.lrc_txtype = txtype;
	itx->itx_lr.lrc_reclen = lrsize;
	itx->itx_sod = lrsize; /* if write & WR_NEED_COPY will be increased */
	itx->itx_lr.lrc_seq = 0;	/* defensive */
	itx->itx_sync = B_TRUE;		/* default is synchronous */

	return (itx);
}

/*
 * Free an itx allocated by zil_itx_create().
 */
void
zil_itx_destroy(itx_t *itx)
{
	kmem_free(itx, offsetof(itx_t, itx_lr) + itx->itx_lr.lrc_reclen);
}

/*
 * Free up the sync and async itxs. The itxs_t has already been detached
 * so no locks are needed.
 */
static void
zil_itxg_clean(itxs_t *itxs)
{
	itx_t *itx;
	list_t *list;
	avl_tree_t *t;
	void *cookie;
	itx_async_node_t *ian;

	list = &itxs->i_sync_list;
	while ((itx = list_head(list)) != NULL) {
		list_remove(list, itx);
		kmem_free(itx, offsetof(itx_t, itx_lr) +
		    itx->itx_lr.lrc_reclen);
	}

	cookie = NULL;
	t = &itxs->i_async_tree;
	while ((ian = avl_destroy_nodes(t, &cookie)) != NULL) {
		list = &ian->ia_list;
		while ((itx = list_head(list)) != NULL) {
			list_remove(list, itx);
			kmem_free(itx, offsetof(itx_t, itx_lr) +
			    itx->itx_lr.lrc_reclen);
		}
		list_destroy(list);
		kmem_free(ian, sizeof (itx_async_node_t));
	}
	avl_destroy(t);

	kmem_free(itxs, sizeof (itxs_t));
}

/*
 * AVL comparator: order itx_async_node_t entries by object id (foid).
 */
static int
zil_aitx_compare(const void *x1, const void *x2)
{
	const uint64_t o1 = ((itx_async_node_t *)x1)->ia_foid;
	const uint64_t o2 = ((itx_async_node_t *)x2)->ia_foid;

	if (o1 < o2)
		return (-1);
	if (o1 > o2)
		return (1);

	return (0);
}

/*
 * Remove all async itx with the given oid.
 */
static void
zil_remove_async(zilog_t *zilog, uint64_t oid)
{
	uint64_t otxg, txg;
	itx_async_node_t *ian;
	avl_tree_t *t;
	avl_index_t where;
	list_t clean_list;
	itx_t *itx;

	ASSERT(oid != 0);
	list_create(&clean_list, sizeof (itx_t), offsetof(itx_t, itx_node));

	if (spa_freeze_txg(zilog->zl_spa) != UINT64_MAX) /* ziltest support */
		otxg = ZILTEST_TXG;
	else
		otxg = spa_last_synced_txg(zilog->zl_spa) + 1;

	/* Scan each of the currently open txgs. */
	for (txg = otxg; txg < (otxg + TXG_CONCURRENT_STATES); txg++) {
		itxg_t *itxg = &zilog->zl_itxg[txg & TXG_MASK];

		mutex_enter(&itxg->itxg_lock);
		if (itxg->itxg_txg != txg) {
			mutex_exit(&itxg->itxg_lock);
			continue;
		}

		/*
		 * Locate the object node and append its list.
		 */
		t = &itxg->itxg_itxs->i_async_tree;
		ian = avl_find(t, &oid, &where);
		if (ian != NULL)
			list_move_tail(&clean_list, &ian->ia_list);
		mutex_exit(&itxg->itxg_lock);
	}
	/* Free the collected itxs outside the itxg locks. */
	while ((itx = list_head(&clean_list)) != NULL) {
		list_remove(&clean_list, itx);
		kmem_free(itx, offsetof(itx_t, itx_lr) +
		    itx->itx_lr.lrc_reclen);
	}
	list_destroy(&clean_list);
}

/*
 * Assign an itx to the in-memory itx state for the txg of tx.  Sync itxs
 * go on the per-txg sync list; async itxs go on a per-object list keyed
 * by foid in the async AVL tree.
 */
void
zil_itx_assign(zilog_t *zilog, itx_t *itx, dmu_tx_t *tx)
{
	uint64_t txg;
	itxg_t *itxg;
	itxs_t *itxs, *clean = NULL;

	/*
	 * Object ids can be re-instantiated in the next txg so
	 * remove any async transactions to avoid future leaks.
	 * This can happen if a fsync occurs on the re-instantiated
	 * object for a WR_INDIRECT or WR_NEED_COPY write, which gets
	 * the new file data and flushes a write record for the old object.
	 */
	if ((itx->itx_lr.lrc_txtype & ~TX_CI) == TX_REMOVE)
		zil_remove_async(zilog, itx->itx_oid);

	/*
	 * Ensure the data of a renamed file is committed before the rename.
	 */
	if ((itx->itx_lr.lrc_txtype & ~TX_CI) == TX_RENAME)
		zil_async_to_sync(zilog, itx->itx_oid);

	if (spa_freeze_txg(zilog->zl_spa) != UINT64_MAX)
		txg = ZILTEST_TXG;
	else
		txg = dmu_tx_get_txg(tx);

	itxg = &zilog->zl_itxg[txg & TXG_MASK];
	mutex_enter(&itxg->itxg_lock);
	itxs = itxg->itxg_itxs;
	if (itxg->itxg_txg != txg) {
		if (itxs != NULL) {
			/*
			 * The zil_clean callback hasn't got around to cleaning
			 * this itxg. Save the itxs for release below.
			 * This should be rare.
			 */
			atomic_add_64(&zilog->zl_itx_list_sz, -itxg->itxg_sod);
			itxg->itxg_sod = 0;
			clean = itxg->itxg_itxs;
		}
		ASSERT(itxg->itxg_sod == 0);
		itxg->itxg_txg = txg;
		itxs = itxg->itxg_itxs = kmem_zalloc(sizeof (itxs_t), KM_SLEEP);

		list_create(&itxs->i_sync_list, sizeof (itx_t),
		    offsetof(itx_t, itx_node));
		avl_create(&itxs->i_async_tree, zil_aitx_compare,
		    sizeof (itx_async_node_t),
		    offsetof(itx_async_node_t, ia_node));
	}
	if (itx->itx_sync) {
		list_insert_tail(&itxs->i_sync_list, itx);
		atomic_add_64(&zilog->zl_itx_list_sz, itx->itx_sod);
		itxg->itxg_sod += itx->itx_sod;
	} else {
		avl_tree_t *t = &itxs->i_async_tree;
		uint64_t foid = ((lr_ooo_t *)&itx->itx_lr)->lr_foid;
		itx_async_node_t *ian;
		avl_index_t where;

		ian = avl_find(t, &foid, &where);
		if (ian == NULL) {
			/* First async itx for this object: add a node. */
			ian = kmem_alloc(sizeof (itx_async_node_t), KM_SLEEP);
			list_create(&ian->ia_list, sizeof (itx_t),
			    offsetof(itx_t, itx_node));
			ian->ia_foid = foid;
			avl_insert(t, ian, where);
		}
		list_insert_tail(&ian->ia_list, itx);
	}

	itx->itx_lr.lrc_txg = dmu_tx_get_txg(tx);
	mutex_exit(&itxg->itxg_lock);

	/* Release the old itxs now we've dropped the lock */
	if (clean != NULL)
		zil_itxg_clean(clean);
}

/*
 * If there are any in-memory intent log transactions which have now been
 * synced then start up a taskq to free them.
 */
void
zil_clean(zilog_t *zilog, uint64_t synced_txg)
{
	itxg_t *itxg = &zilog->zl_itxg[synced_txg & TXG_MASK];
	itxs_t *clean_me;

	mutex_enter(&itxg->itxg_lock);
	/* Nothing staged, or ziltest itxs which must never be cleaned. */
	if (itxg->itxg_itxs == NULL || itxg->itxg_txg == ZILTEST_TXG) {
		mutex_exit(&itxg->itxg_lock);
		return;
	}
	ASSERT3U(itxg->itxg_txg, <=, synced_txg);
	ASSERT(itxg->itxg_txg != 0);
	ASSERT(zilog->zl_clean_taskq != NULL);
	/* Detach the itxs from the itxg while holding its lock. */
	atomic_add_64(&zilog->zl_itx_list_sz, -itxg->itxg_sod);
	itxg->itxg_sod = 0;
	clean_me = itxg->itxg_itxs;
	itxg->itxg_itxs = NULL;
	itxg->itxg_txg = 0;
	mutex_exit(&itxg->itxg_lock);
	/*
	 * Preferably start a task queue to free up the old itxs but
	 * if taskq_dispatch can't allocate resources to do that then
	 * free it in-line. This should be rare. Note, using TQ_SLEEP
	 * created a bad performance problem.
	 */
	if (taskq_dispatch(zilog->zl_clean_taskq,
	    (void (*)(void *))zil_itxg_clean, clean_me, TQ_NOSLEEP) == NULL)
		zil_itxg_clean(clean_me);
}

/*
 * Get the list of itxs to commit into zl_itx_commit_list.
 */
static void
zil_get_commit_list(zilog_t *zilog)
{
	uint64_t otxg, txg;
	list_t *commit_list = &zilog->zl_itx_commit_list;
	uint64_t push_sod = 0;

	if (spa_freeze_txg(zilog->zl_spa) != UINT64_MAX) /* ziltest support */
		otxg = ZILTEST_TXG;
	else
		otxg = spa_last_synced_txg(zilog->zl_spa) + 1;

	/* Drain the sync list of every currently open txg. */
	for (txg = otxg; txg < (otxg + TXG_CONCURRENT_STATES); txg++) {
		itxg_t *itxg = &zilog->zl_itxg[txg & TXG_MASK];

		mutex_enter(&itxg->itxg_lock);
		if (itxg->itxg_txg != txg) {
			mutex_exit(&itxg->itxg_lock);
			continue;
		}

		list_move_tail(commit_list, &itxg->itxg_itxs->i_sync_list);
		push_sod += itxg->itxg_sod;
		itxg->itxg_sod = 0;

		mutex_exit(&itxg->itxg_lock);
	}
	atomic_add_64(&zilog->zl_itx_list_sz, -push_sod);
}

/*
 * Move the async itxs for a specified object to commit into sync lists.
 */
static void
zil_async_to_sync(zilog_t *zilog, uint64_t foid)
{
	uint64_t otxg, txg;
	itx_async_node_t *ian;
	avl_tree_t *t;
	avl_index_t where;

	if (spa_freeze_txg(zilog->zl_spa) != UINT64_MAX) /* ziltest support */
		otxg = ZILTEST_TXG;
	else
		otxg = spa_last_synced_txg(zilog->zl_spa) + 1;

	/* Visit each of the currently open txgs. */
	for (txg = otxg; txg < (otxg + TXG_CONCURRENT_STATES); txg++) {
		itxg_t *itxg = &zilog->zl_itxg[txg & TXG_MASK];

		mutex_enter(&itxg->itxg_lock);
		if (itxg->itxg_txg != txg) {
			mutex_exit(&itxg->itxg_lock);
			continue;
		}

		/*
		 * If a foid is specified then find that node and append its
		 * list. Otherwise walk the tree appending all the lists
		 * to the sync list. We add to the end rather than the
		 * beginning to ensure the create has happened.
		 */
		t = &itxg->itxg_itxs->i_async_tree;
		if (foid != 0) {
			ian = avl_find(t, &foid, &where);
			if (ian != NULL) {
				list_move_tail(&itxg->itxg_itxs->i_sync_list,
				    &ian->ia_list);
			}
		} else {
			void *cookie = NULL;

			while ((ian = avl_destroy_nodes(t, &cookie)) != NULL) {
				list_move_tail(&itxg->itxg_itxs->i_sync_list,
				    &ian->ia_list);
				list_destroy(&ian->ia_list);
				kmem_free(ian, sizeof (itx_async_node_t));
			}
		}
		mutex_exit(&itxg->itxg_lock);
	}
}

/*
 * NOTE(review): definition continues beyond this file chunk; entered with
 * zl_lock held (dropped immediately below) — confirm against full source.
 */
static void
zil_commit_writer(zilog_t *zilog)
{
	uint64_t txg;
	itx_t *itx;
	lwb_t *lwb;
	spa_t *spa = zilog->zl_spa;
	int error = 0;

	ASSERT(zilog->zl_root_zio == NULL);

	mutex_exit(&zilog->zl_lock);

	zil_get_commit_list(zilog);

	/*
	 * Return if there's nothing to commit before we dirty the fs by
	 * calling zil_create().
14095002558fSNeil Perrin */ 14105002558fSNeil Perrin if (list_head(&zilog->zl_itx_commit_list) == NULL) { 14115002558fSNeil Perrin mutex_enter(&zilog->zl_lock); 14125002558fSNeil Perrin return; 14135002558fSNeil Perrin } 1414fa9e4066Sahrens 1415fa9e4066Sahrens if (zilog->zl_suspend) { 1416fa9e4066Sahrens lwb = NULL; 1417fa9e4066Sahrens } else { 1418fa9e4066Sahrens lwb = list_tail(&zilog->zl_lwb_list); 14195002558fSNeil Perrin if (lwb == NULL) 14206e1f5caaSNeil Perrin lwb = zil_create(zilog); 1421fa9e4066Sahrens } 1422fa9e4066Sahrens 1423b19a79ecSperrin DTRACE_PROBE1(zil__cw1, zilog_t *, zilog); 14245002558fSNeil Perrin while (itx = list_head(&zilog->zl_itx_commit_list)) { 1425fa9e4066Sahrens txg = itx->itx_lr.lrc_txg; 1426fa9e4066Sahrens ASSERT(txg); 1427fa9e4066Sahrens 14285002558fSNeil Perrin if (txg > spa_last_synced_txg(spa) || txg > spa_freeze_txg(spa)) 1429fa9e4066Sahrens lwb = zil_lwb_commit(zilog, itx, lwb); 14305002558fSNeil Perrin list_remove(&zilog->zl_itx_commit_list, itx); 14315002558fSNeil Perrin kmem_free(itx, offsetof(itx_t, itx_lr) 14325002558fSNeil Perrin + itx->itx_lr.lrc_reclen); 1433fa9e4066Sahrens } 1434b19a79ecSperrin DTRACE_PROBE1(zil__cw2, zilog_t *, zilog); 1435fa9e4066Sahrens 1436fa9e4066Sahrens /* write the last block out */ 143767bd71c6Sperrin if (lwb != NULL && lwb->lwb_zio != NULL) 1438fa9e4066Sahrens lwb = zil_lwb_write_start(zilog, lwb); 1439fa9e4066Sahrens 144022ac5be4Sperrin zilog->zl_cur_used = 0; 1441fa9e4066Sahrens 1442fa9e4066Sahrens /* 1443b19a79ecSperrin * Wait if necessary for the log blocks to be on stable storage. 
1444fa9e4066Sahrens */ 1445b19a79ecSperrin if (zilog->zl_root_zio) { 1446b24ab676SJeff Bonwick error = zio_wait(zilog->zl_root_zio); 1447e14bb325SJeff Bonwick zilog->zl_root_zio = NULL; 144817f17c2dSbonwick zil_flush_vdevs(zilog); 1449fa9e4066Sahrens } 145022ac5be4Sperrin 1451b24ab676SJeff Bonwick if (error || lwb == NULL) 1452fa9e4066Sahrens txg_wait_synced(zilog->zl_dmu_pool, 0); 145367bd71c6Sperrin 145467bd71c6Sperrin mutex_enter(&zilog->zl_lock); 1455b24ab676SJeff Bonwick 1456b24ab676SJeff Bonwick /* 1457b24ab676SJeff Bonwick * Remember the highest committed log sequence number for ztest. 1458b24ab676SJeff Bonwick * We only update this value when all the log writes succeeded, 1459b24ab676SJeff Bonwick * because ztest wants to ASSERT that it got the whole log chain. 1460b24ab676SJeff Bonwick */ 1461b24ab676SJeff Bonwick if (error == 0 && lwb != NULL) 1462b24ab676SJeff Bonwick zilog->zl_commit_lr_seq = zilog->zl_lr_seq; 1463b19a79ecSperrin } 1464b19a79ecSperrin 1465b19a79ecSperrin /* 14665002558fSNeil Perrin * Commit zfs transactions to stable storage. 1467b19a79ecSperrin * If foid is 0 push out all transactions, otherwise push only those 14685002558fSNeil Perrin * for that object or might reference that object. 14695002558fSNeil Perrin * 14705002558fSNeil Perrin * itxs are committed in batches. In a heavily stressed zil there will be 14715002558fSNeil Perrin * a commit writer thread who is writing out a bunch of itxs to the log 14725002558fSNeil Perrin * for a set of committing threads (cthreads) in the same batch as the writer. 14735002558fSNeil Perrin * Those cthreads are all waiting on the same cv for that batch. 14745002558fSNeil Perrin * 14755002558fSNeil Perrin * There will also be a different and growing batch of threads that are 14765002558fSNeil Perrin * waiting to commit (qthreads). When the committing batch completes 14775002558fSNeil Perrin * a transition occurs such that the cthreads exit and the qthreads become 14785002558fSNeil Perrin * cthreads. 
 * One of the new cthreads becomes the writer thread for the
 * batch. Any new threads arriving become new qthreads.
 *
 * Only 2 condition variables are needed and there's no transition
 * between the two cvs needed. They just flip-flop between qthreads
 * and cthreads.
 *
 * Using this scheme we can efficiently wake up only those threads
 * that have been committed.
 */
void
zil_commit(zilog_t *zilog, uint64_t foid)
{
	uint64_t mybatch;

	if (zilog->zl_sync == ZFS_SYNC_DISABLED)
		return;

	/* move the async itxs for the foid to the sync queues */
	zil_async_to_sync(zilog, foid);

	mutex_enter(&zilog->zl_lock);
	mybatch = zilog->zl_next_batch;
	/* Another thread is the writer; wait until our batch commits. */
	while (zilog->zl_writer) {
		cv_wait(&zilog->zl_cv_batch[mybatch & 1], &zilog->zl_lock);
		if (mybatch <= zilog->zl_com_batch) {
			/* Our batch was committed by another writer. */
			mutex_exit(&zilog->zl_lock);
			return;
		}
	}

	/* Become the writer for this batch. */
	zilog->zl_next_batch++;
	zilog->zl_writer = B_TRUE;
	zil_commit_writer(zilog);
	zilog->zl_com_batch = mybatch;
	zilog->zl_writer = B_FALSE;
	mutex_exit(&zilog->zl_lock);

	/* wake up one thread to become the next writer */
	cv_signal(&zilog->zl_cv_batch[(mybatch+1) & 1]);

	/* wake up all threads waiting for this batch to be committed */
	cv_broadcast(&zilog->zl_cv_batch[mybatch & 1]);
}

/*
 * Called in syncing context to free committed log blocks and
 * update the log header.
 */
void
zil_sync(zilog_t *zilog, dmu_tx_t *tx)
{
	zil_header_t *zh = zil_header_in_syncing_context(zilog);
	uint64_t txg = dmu_tx_get_txg(tx);
	spa_t *spa = zilog->zl_spa;
	uint64_t *replayed_seq = &zilog->zl_replayed_seq[txg & TXG_MASK];
	lwb_t *lwb;

	/*
	 * We don't zero out zl_destroy_txg, so make sure we don't try
	 * to destroy it twice.
	 */
	if (spa_sync_pass(spa) != 1)
		return;

	mutex_enter(&zilog->zl_lock);

	ASSERT(zilog->zl_stop_sync == 0);

	/* Publish the highest replayed sequence number to the header. */
	if (*replayed_seq != 0) {
		ASSERT(zh->zh_replay_seq < *replayed_seq);
		zh->zh_replay_seq = *replayed_seq;
		*replayed_seq = 0;
	}

	if (zilog->zl_destroy_txg == txg) {
		blkptr_t blk = zh->zh_log;

		ASSERT(list_head(&zilog->zl_lwb_list) == NULL);

		bzero(zh, sizeof (zil_header_t));
		bzero(zilog->zl_replayed_seq, sizeof (zilog->zl_replayed_seq));

		if (zilog->zl_keep_first) {
			/*
			 * If this block was part of log chain that couldn't
			 * be claimed because a device was missing during
			 * zil_claim(), but that device later returns,
			 * then this block could erroneously appear valid.
			 * To guard against this, assign a new GUID to the new
			 * log chain so it doesn't matter what blk points to.
			 */
			zil_init_log_chain(zilog, &blk);
			zh->zh_log = blk;
		}
	}

	/* Free lwbs whose data has synced; stop at the first live one. */
	while ((lwb = list_head(&zilog->zl_lwb_list)) != NULL) {
		zh->zh_log = lwb->lwb_blk;
		if (lwb->lwb_buf != NULL || lwb->lwb_max_txg > txg)
			break;
		list_remove(&zilog->zl_lwb_list, lwb);
		zio_free_zil(spa, txg, &lwb->lwb_blk);
		kmem_cache_free(zil_lwb_cache, lwb);

		/*
		 * If we don't have anything left in the lwb list then
		 * we've had an allocation failure and we need to zero
		 * out the zil_header blkptr so that we don't end
		 * up freeing the same block twice.
		 */
		if (list_head(&zilog->zl_lwb_list) == NULL)
			BP_ZERO(&zh->zh_log);
	}
	mutex_exit(&zilog->zl_lock);
}

/*
 * Module-wide initialization: create the kmem cache for log-write blocks.
 */
void
zil_init(void)
{
	zil_lwb_cache = kmem_cache_create("zil_lwb_cache",
	    sizeof (struct lwb), 0, NULL, NULL, NULL, NULL, NULL, 0);
}

/*
 * Module-wide teardown: destroy the lwb kmem cache.
 */
void
zil_fini(void)
{
	kmem_cache_destroy(zil_lwb_cache);
}

/*
 * Set the ZIL's sync policy (e.g. in response to the "sync" property).
 */
void
zil_set_sync(zilog_t *zilog, uint64_t sync)
{
	zilog->zl_sync = sync;
}

/*
 * Set the ZIL's logbias policy (in response to the "logbias" property).
 */
void
zil_set_logbias(zilog_t *zilog, uint64_t logbias)
{
	zilog->zl_logbias = logbias;
}

/*
 * Allocate and initialize a zilog for the given objset.  The caller owns
 * the returned zilog and releases it with zil_free().
 */
zilog_t *
zil_alloc(objset_t *os, zil_header_t *zh_phys)
{
	zilog_t *zilog;

	zilog = kmem_zalloc(sizeof (zilog_t), KM_SLEEP);

	zilog->zl_header = zh_phys;
	zilog->zl_os = os;
	zilog->zl_spa = dmu_objset_spa(os);
	zilog->zl_dmu_pool = dmu_objset_pool(os);
	/* Guarantee zl_destroy_txg != any real txg until a destroy occurs. */
	zilog->zl_destroy_txg = TXG_INITIAL - 1;
	zilog->zl_logbias = dmu_objset_logbias(os);
	zilog->zl_sync = dmu_objset_syncprop(os);
	zilog->zl_next_batch = 1;

	mutex_init(&zilog->zl_lock, NULL, MUTEX_DEFAULT, NULL);

	/* One itxg slot (and lock) per concurrently-open txg. */
	for (int i = 0; i < TXG_SIZE; i++) {
		mutex_init(&zilog->zl_itxg[i].itxg_lock, NULL,
		    MUTEX_DEFAULT, NULL);
	}

	list_create(&zilog->zl_lwb_list, sizeof (lwb_t),
	    offsetof(lwb_t, lwb_node));

	list_create(&zilog->zl_itx_commit_list, sizeof (itx_t),
	    offsetof(itx_t, itx_node));

	mutex_init(&zilog->zl_vdev_lock, NULL, MUTEX_DEFAULT, NULL);

	avl_create(&zilog->zl_vdev_tree, zil_vdev_compare,
	    sizeof (zil_vdev_node_t), offsetof(zil_vdev_node_t, zv_node));

	cv_init(&zilog->zl_cv_writer, NULL, CV_DEFAULT, NULL);
	cv_init(&zilog->zl_cv_suspend, NULL, CV_DEFAULT, NULL);
	cv_init(&zilog->zl_cv_batch[0], NULL, CV_DEFAULT, NULL);
	cv_init(&zilog->zl_cv_batch[1], NULL, CV_DEFAULT, NULL);

	return (zilog);
}

/*
 * Release a zilog allocated by zil_alloc().  Expects zil_close() to have
 * already run, leaving at most one buffered lwb behind.
 */
void
zil_free(zilog_t *zilog)
{
	lwb_t *head_lwb;

	zilog->zl_stop_sync = 1;

	/*
	 * After zil_close() there should only be one lwb with a buffer.
	 */
	head_lwb = list_head(&zilog->zl_lwb_list);
	if (head_lwb) {
		ASSERT(head_lwb == list_tail(&zilog->zl_lwb_list));
		list_remove(&zilog->zl_lwb_list, head_lwb);
		zio_buf_free(head_lwb->lwb_buf, head_lwb->lwb_sz);
		kmem_cache_free(zil_lwb_cache, head_lwb);
	}
	list_destroy(&zilog->zl_lwb_list);

	avl_destroy(&zilog->zl_vdev_tree);
	mutex_destroy(&zilog->zl_vdev_lock);

	ASSERT(list_is_empty(&zilog->zl_itx_commit_list));
	list_destroy(&zilog->zl_itx_commit_list);

	for (int i = 0; i < TXG_SIZE; i++) {
		/*
		 * It's possible for an itx to be generated that doesn't dirty
		 * a txg (e.g. ztest TX_TRUNCATE). So there's no zil_clean()
		 * callback to remove the entry. We remove those here.
		 *
		 * Also free up the ziltest itxs.
		 */
		if (zilog->zl_itxg[i].itxg_itxs)
			zil_itxg_clean(zilog->zl_itxg[i].itxg_itxs);
		mutex_destroy(&zilog->zl_itxg[i].itxg_lock);
	}

	mutex_destroy(&zilog->zl_lock);

	cv_destroy(&zilog->zl_cv_writer);
	cv_destroy(&zilog->zl_cv_suspend);
	cv_destroy(&zilog->zl_cv_batch[0]);
	cv_destroy(&zilog->zl_cv_batch[1]);

	kmem_free(zilog, sizeof (zilog_t));
}

/*
 * Open an intent log: install the get_data callback and create the
 * single-threaded cleanup taskq.
 */
zilog_t *
zil_open(objset_t *os, zil_get_data_t *get_data)
{
	zilog_t *zilog = dmu_objset_zil(os);

	zilog->zl_get_data = get_data;
	zilog->zl_clean_taskq = taskq_create("zil_clean", 1, minclsyspri,
	    2, 2, TASKQ_PREPOPULATE);

	return (zilog);
}

/*
 * Close an intent log: commit all outstanding itxs, wait for any
 * cleanup callbacks, then tear down the cleanup taskq.
 */
void
zil_close(zilog_t *zilog)
{
	lwb_t *tail_lwb;
	uint64_t txg = 0;

	zil_commit(zilog, 0); /* commit all itx */

	/*
	 * The lwb_max_txg for the stubby lwb will reflect the last activity
	 * for the zil. After a txg_wait_synced() on the txg we know all the
	 * callbacks have occurred that may clean the zil.
	 * Only then can we
	 * destroy the zl_clean_taskq.
	 */
	mutex_enter(&zilog->zl_lock);
	tail_lwb = list_tail(&zilog->zl_lwb_list);
	if (tail_lwb != NULL)
		txg = tail_lwb->lwb_max_txg;
	mutex_exit(&zilog->zl_lock);
	if (txg)
		txg_wait_synced(zilog->zl_dmu_pool, txg);

	taskq_destroy(zilog->zl_clean_taskq);
	zilog->zl_clean_taskq = NULL;
	zilog->zl_get_data = NULL;
}

/*
 * Suspend an intent log. While in suspended mode, we still honor
 * synchronous semantics, but we rely on txg_wait_synced() to do it.
 * We suspend the log briefly when taking a snapshot so that the snapshot
 * contains all the data it's supposed to, and has an empty intent log.
 *
 * Returns 0 on success, EBUSY if the log still needs replay.
 */
int
zil_suspend(zilog_t *zilog)
{
	const zil_header_t *zh = zilog->zl_header;

	mutex_enter(&zilog->zl_lock);
	if (zh->zh_flags & ZIL_REPLAY_NEEDED) {		/* unplayed log */
		mutex_exit(&zilog->zl_lock);
		return (EBUSY);
	}
	if (zilog->zl_suspend++ != 0) {
		/*
		 * Someone else already began a suspend.
		 * Just wait for them to finish.
		 */
		while (zilog->zl_suspending)
			cv_wait(&zilog->zl_cv_suspend, &zilog->zl_lock);
		mutex_exit(&zilog->zl_lock);
		return (0);
	}
	zilog->zl_suspending = B_TRUE;
	mutex_exit(&zilog->zl_lock);

	/* Flush all itxs, then discard the (now empty) on-disk log chain. */
	zil_commit(zilog, 0);

	zil_destroy(zilog, B_FALSE);

	mutex_enter(&zilog->zl_lock);
	zilog->zl_suspending = B_FALSE;
	cv_broadcast(&zilog->zl_cv_suspend);
	mutex_exit(&zilog->zl_lock);

	return (0);
}

/*
 * Drop one reference on a suspended log; pairs with zil_suspend().
 */
void
zil_resume(zilog_t *zilog)
{
	mutex_enter(&zilog->zl_lock);
	ASSERT(zilog->zl_suspend != 0);
	zilog->zl_suspend--;
	mutex_exit(&zilog->zl_lock);
}

/* State threaded through zil_parse() during replay. */
typedef struct zil_replay_arg {
	zil_replay_func_t **zr_replay;	/* per-txtype replay vectors */
	void *zr_arg;			/* caller cookie for the vectors */
	boolean_t zr_byteswap;		/* log was written other-endian */
	char *zr_lr;			/* scratch copy of current record */
} zil_replay_arg_t;

/*
 * Log a warning for a failed replay of one record and roll back
 * zl_replaying_seq to reflect that it was not replayed.
 */
static int
zil_replay_error(zilog_t *zilog, lr_t *lr, int error)
{
	char name[MAXNAMELEN];

	zilog->zl_replaying_seq--;	/* didn't actually replay this one */

	dmu_objset_name(zilog->zl_os, name);

	cmn_err(CE_WARN, "ZFS replay transaction error %d, "
	    "dataset %s, seq 0x%llx, txtype %llu %s\n",
	    error, name,
	    (u_longlong_t)lr->lrc_seq,
	    (u_longlong_t)(lr->lrc_txtype & ~TX_CI),
	    (lr->lrc_txtype & TX_CI) ? "CI" : "");

	return (error);
}

/*
 * zil_parse() callback: replay a single log record via the caller's
 * replay vector.  Returns 0 on success or if the record can be skipped;
 * otherwise the error from the replay vector.
 */
static int
zil_replay_log_record(zilog_t *zilog, lr_t *lr, void *zra, uint64_t claim_txg)
{
	zil_replay_arg_t *zr = zra;
	const zil_header_t *zh = zilog->zl_header;
	uint64_t reclen = lr->lrc_reclen;
	uint64_t txtype = lr->lrc_txtype;
	int error = 0;

	zilog->zl_replaying_seq = lr->lrc_seq;

	if (lr->lrc_seq <= zh->zh_replay_seq)	/* already replayed */
		return (0);

	if (lr->lrc_txg < claim_txg)		/* already committed */
		return (0);

	/* Strip case-insensitive bit, still present in log record */
	txtype &= ~TX_CI;

	if (txtype == 0 || txtype >= TX_MAX_TYPE)
		return (zil_replay_error(zilog, lr, EINVAL));

	/*
	 * If this record type can be logged out of order, the object
	 * (lr_foid) may no longer exist.  That's legitimate, not an error.
	 */
	if (TX_OOO(txtype)) {
		error = dmu_object_info(zilog->zl_os,
		    ((lr_ooo_t *)lr)->lr_foid, NULL);
		if (error == ENOENT || error == EEXIST)
			return (0);
	}

	/*
	 * Make a copy of the data so we can revise and extend it.
	 */
	bcopy(lr, zr->zr_lr, reclen);

	/*
	 * If this is a TX_WRITE with a blkptr, suck in the data.
	 */
	if (txtype == TX_WRITE && reclen == sizeof (lr_write_t)) {
		error = zil_read_log_data(zilog, (lr_write_t *)lr,
		    zr->zr_lr + reclen);
		if (error)
			return (zil_replay_error(zilog, lr, error));
	}

	/*
	 * The log block containing this lr may have been byteswapped
	 * so that we can easily examine common fields like lrc_txtype.
	 * However, the log is a mix of different record types, and only the
	 * replay vectors know how to byteswap their records. Therefore, if
	 * the lr was byteswapped, undo it before invoking the replay vector.
	 */
	if (zr->zr_byteswap)
		byteswap_uint64_array(zr->zr_lr, reclen);

	/*
	 * We must now do two things atomically: replay this log record,
	 * and update the log header sequence number to reflect the fact that
	 * we did so. At the end of each replay function the sequence number
	 * is updated if we are in replay mode.
	 */
	error = zr->zr_replay[txtype](zr->zr_arg, zr->zr_lr, zr->zr_byteswap);
	if (error) {
		/*
		 * The DMU's dnode layer doesn't see removes until the txg
		 * commits, so a subsequent claim can spuriously fail with
		 * EEXIST. So if we receive any error we try syncing out
		 * any removes then retry the transaction.  Note that we
		 * specify B_FALSE for byteswap now, so we don't do it twice.
		 */
		txg_wait_synced(spa_get_dsl(zilog->zl_spa), 0);
		error = zr->zr_replay[txtype](zr->zr_arg, zr->zr_lr, B_FALSE);
		if (error)
			return (zil_replay_error(zilog, lr, error));
	}
	return (0);
}

/*
 * zil_parse() block callback: just count the log blocks visited.
 */
/* ARGSUSED */
static int
zil_incr_blks(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg)
{
	zilog->zl_replay_blks++;

	return (0);
}

/*
 * If this dataset has a non-empty intent log, replay it and destroy it.
 */
void
zil_replay(objset_t *os, void *arg, zil_replay_func_t *replay_func[TX_MAX_TYPE])
{
	zilog_t *zilog = dmu_objset_zil(os);
	const zil_header_t *zh = zilog->zl_header;
	zil_replay_arg_t zr;

	/* Nothing to replay: just discard any stale log chain. */
	if ((zh->zh_flags & ZIL_REPLAY_NEEDED) == 0) {
		zil_destroy(zilog, B_TRUE);
		return;
	}

	zr.zr_replay = replay_func;
	zr.zr_arg = arg;
	zr.zr_byteswap = BP_SHOULD_BYTESWAP(&zh->zh_log);
	/* 2x max block size: record copy plus any TX_WRITE data after it */
	zr.zr_lr = kmem_alloc(2 * SPA_MAXBLOCKSIZE, KM_SLEEP);

	/*
	 * Wait for in-progress removes to sync before starting replay.
	 */
	txg_wait_synced(zilog->zl_dmu_pool, 0);

	zilog->zl_replay = B_TRUE;
	zilog->zl_replay_time = ddi_get_lbolt();
	ASSERT(zilog->zl_replay_blks == 0);
	(void) zil_parse(zilog, zil_incr_blks, zil_replay_log_record, &zr,
	    zh->zh_claim_txg);
	kmem_free(zr.zr_lr, 2 * SPA_MAXBLOCKSIZE);

	/* Replay done: destroy the log and wait for the destroy to sync. */
	zil_destroy(zilog, B_FALSE);
	txg_wait_synced(zilog->zl_dmu_pool, zilog->zl_destroy_txg);
	zilog->zl_replay = B_FALSE;
}

/*
 * Return B_TRUE if the caller is running in ZIL-replay (or sync-disabled)
 * context; as a side effect of replay, record the replaying sequence
 * number against the tx's txg and dirty the dataset.
 */
boolean_t
zil_replaying(zilog_t *zilog, dmu_tx_t *tx)
{
	if (zilog->zl_sync == ZFS_SYNC_DISABLED)
		return (B_TRUE);

	if (zilog->zl_replay) {
		dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx);
		zilog->zl_replayed_seq[dmu_tx_get_txg(tx) & TXG_MASK] =
		    zilog->zl_replaying_seq;
		return (B_TRUE);
	}

	return (B_FALSE);
}

/*
 * Briefly suspend and resume the named dataset's ZIL, forcing its
 * contents to the main pool.  Returns EEXIST if the log cannot be
 * suspended, or the error from dmu_objset_hold().
 */
/* ARGSUSED */
int
zil_vdev_offline(const char *osname, void *arg)
{
	objset_t *os;
	zilog_t *zilog;
	int error;

	error = dmu_objset_hold(osname, FTAG, &os);
	if (error)
		return (error);

	zilog = dmu_objset_zil(os);
	if (zil_suspend(zilog) != 0)
		error = EEXIST;
	else
		zil_resume(zilog);
	dmu_objset_rele(os, FTAG);
	return (error);
}