/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2011, 2017 by Delphix. All rights reserved.
 * Copyright (c) 2014 Integros [integros.com]
 */

/* Portions Copyright 2010 Robert Milkowski */

#include <sys/zfs_context.h>
#include <sys/spa.h>
#include <sys/dmu.h>
#include <sys/zap.h>
#include <sys/arc.h>
#include <sys/stat.h>
#include <sys/resource.h>
#include <sys/zil.h>
#include <sys/zil_impl.h>
#include <sys/dsl_dataset.h>
#include <sys/vdev_impl.h>
#include <sys/dmu_tx.h>
#include <sys/dsl_pool.h>
#include <sys/abd.h>

/*
 * The zfs intent log (ZIL) saves transaction records of system calls
 * that change the file system in memory with enough information
 * to be able to replay them. These are stored in memory until
 * either the DMU transaction group (txg) commits them to the stable pool
 * and they can be discarded, or they are flushed to the stable log
 * (also in the pool) due to a fsync, O_DSYNC, or other synchronous
 * requirement. In the event of a panic or power failure, those log
 * records (transactions) are replayed.
 *
 * There is one ZIL per file system. Its on-disk (pool) format consists
 * of 3 parts:
 *
 *	- ZIL header
 *	- ZIL blocks
 *	- ZIL records
 *
 * A log record holds a system call transaction. Log blocks can
 * hold many log records and the blocks are chained together.
 * Each ZIL block contains a block pointer (blkptr_t) to the next
 * ZIL block in the chain. The ZIL header points to the first
 * block in the chain. Note there is not a fixed place in the pool
 * to hold blocks. They are dynamically allocated and freed as
 * needed from the blocks available. Figure X shows the ZIL structure:
 */
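/*
 * An illustrative sketch of the chain just described (block count and
 * contents arbitrary):
 *
 *	ZIL header
 *	(zh_log) --> +-----------+     +-----------+     +-----------+
 *	             | log block | --> | log block | --> | log block | --> ...
 *	             | records   |     | records   |     | records   |
 *	             +-----------+     +-----------+     +-----------+
 *
 * Each arrow after the first is the blkptr_t stored in the previous
 * block's zil_chain_t; the chain ends at the first hole or checksum
 * mismatch (see zil_read_log_block() below).
 */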
/*
 * Disable intent logging replay. This global ZIL switch affects all pools.
 */
int zil_replay_disable = 0;

/*
 * Tunable parameter for debugging or performance analysis. Setting
 * zfs_nocacheflush will cause corruption on power loss if a volatile
 * out-of-order write cache is enabled.
 */
boolean_t zfs_nocacheflush = B_FALSE;

/*
 * Limit SLOG write size per commit executed with synchronous priority.
 * Any writes above that will be executed with lower (asynchronous) priority
 * to limit potential SLOG device abuse by a single active ZIL writer.
 */
uint64_t zil_slog_bulk = 768 * 1024;

static kmem_cache_t *zil_lwb_cache;

static void zil_async_to_sync(zilog_t *zilog, uint64_t foid);

#define	LWB_EMPTY(lwb) ((BP_GET_LSIZE(&lwb->lwb_blk) - \
    sizeof (zil_chain_t)) == (lwb->lwb_sz - lwb->lwb_nused))

static int
zil_bp_compare(const void *x1, const void *x2)
{
	const dva_t *dva1 = &((zil_bp_node_t *)x1)->zn_dva;
	const dva_t *dva2 = &((zil_bp_node_t *)x2)->zn_dva;

	if (DVA_GET_VDEV(dva1) < DVA_GET_VDEV(dva2))
		return (-1);
	if (DVA_GET_VDEV(dva1) > DVA_GET_VDEV(dva2))
		return (1);

	if (DVA_GET_OFFSET(dva1) < DVA_GET_OFFSET(dva2))
		return (-1);
	if (DVA_GET_OFFSET(dva1) > DVA_GET_OFFSET(dva2))
		return (1);

	return (0);
}

static void
zil_bp_tree_init(zilog_t *zilog)
{
	avl_create(&zilog->zl_bp_tree, zil_bp_compare,
	    sizeof (zil_bp_node_t), offsetof(zil_bp_node_t, zn_node));
}

static void
zil_bp_tree_fini(zilog_t *zilog)
{
	avl_tree_t *t = &zilog->zl_bp_tree;
	zil_bp_node_t *zn;
	void *cookie = NULL;

	while ((zn = avl_destroy_nodes(t, &cookie)) != NULL)
		kmem_free(zn, sizeof (zil_bp_node_t));

	avl_destroy(t);
}
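/*
 * The zl_bp_tree is an AVL tree of DVAs keyed by vdev and offset. It is
 * populated while zil_parse() walks the log chain so that the claim and
 * free callbacks act on each block at most once: zil_bp_tree_add() below
 * returns EEXIST for a block pointer that has already been visited.
 */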
int
zil_bp_tree_add(zilog_t *zilog, const blkptr_t *bp)
{
	avl_tree_t *t = &zilog->zl_bp_tree;
	const dva_t *dva;
	zil_bp_node_t *zn;
	avl_index_t where;

	if (BP_IS_EMBEDDED(bp))
		return (0);

	dva = BP_IDENTITY(bp);

	if (avl_find(t, dva, &where) != NULL)
		return (SET_ERROR(EEXIST));

	zn = kmem_alloc(sizeof (zil_bp_node_t), KM_SLEEP);
	zn->zn_dva = *dva;
	avl_insert(t, zn, where);

	return (0);
}

static zil_header_t *
zil_header_in_syncing_context(zilog_t *zilog)
{
	return ((zil_header_t *)zilog->zl_header);
}

static void
zil_init_log_chain(zilog_t *zilog, blkptr_t *bp)
{
	zio_cksum_t *zc = &bp->blk_cksum;

	zc->zc_word[ZIL_ZC_GUID_0] = spa_get_random(-1ULL);
	zc->zc_word[ZIL_ZC_GUID_1] = spa_get_random(-1ULL);
	zc->zc_word[ZIL_ZC_OBJSET] = dmu_objset_id(zilog->zl_os);
	zc->zc_word[ZIL_ZC_SEQ] = 1ULL;
}

/*
 * Read a log block and make sure it's valid.
 */
static int
zil_read_log_block(zilog_t *zilog, const blkptr_t *bp, blkptr_t *nbp, void *dst,
    char **end)
{
	enum zio_flag zio_flags = ZIO_FLAG_CANFAIL;
	arc_flags_t aflags = ARC_FLAG_WAIT;
	arc_buf_t *abuf = NULL;
	zbookmark_phys_t zb;
	int error;

	if (zilog->zl_header->zh_claim_txg == 0)
		zio_flags |= ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SCRUB;

	if (!(zilog->zl_header->zh_flags & ZIL_CLAIM_LR_SEQ_VALID))
		zio_flags |= ZIO_FLAG_SPECULATIVE;

	SET_BOOKMARK(&zb, bp->blk_cksum.zc_word[ZIL_ZC_OBJSET],
	    ZB_ZIL_OBJECT, ZB_ZIL_LEVEL, bp->blk_cksum.zc_word[ZIL_ZC_SEQ]);

	error = arc_read(NULL, zilog->zl_spa, bp, arc_getbuf_func, &abuf,
	    ZIO_PRIORITY_SYNC_READ, zio_flags, &aflags, &zb);

	if (error == 0) {
		zio_cksum_t cksum = bp->blk_cksum;

		/*
		 * Validate the checksummed log block.
		 *
		 * Sequence numbers should be... sequential. The checksum
		 * verifier for the next block should be bp's checksum plus 1.
		 *
		 * Also check the log chain linkage and size used.
		 */
		cksum.zc_word[ZIL_ZC_SEQ]++;

		if (BP_GET_CHECKSUM(bp) == ZIO_CHECKSUM_ZILOG2) {
			zil_chain_t *zilc = abuf->b_data;
			char *lr = (char *)(zilc + 1);
			uint64_t len = zilc->zc_nused - sizeof (zil_chain_t);

			if (bcmp(&cksum, &zilc->zc_next_blk.blk_cksum,
			    sizeof (cksum)) || BP_IS_HOLE(&zilc->zc_next_blk)) {
				error = SET_ERROR(ECKSUM);
			} else {
				ASSERT3U(len, <=, SPA_OLD_MAXBLOCKSIZE);
				bcopy(lr, dst, len);
				*end = (char *)dst + len;
				*nbp = zilc->zc_next_blk;
			}
		} else {
			char *lr = abuf->b_data;
			uint64_t size = BP_GET_LSIZE(bp);
			zil_chain_t *zilc = (zil_chain_t *)(lr + size) - 1;

			if (bcmp(&cksum, &zilc->zc_next_blk.blk_cksum,
			    sizeof (cksum)) || BP_IS_HOLE(&zilc->zc_next_blk) ||
			    (zilc->zc_nused > (size - sizeof (*zilc)))) {
				error = SET_ERROR(ECKSUM);
			} else {
				ASSERT3U(zilc->zc_nused, <=,
				    SPA_OLD_MAXBLOCKSIZE);
				bcopy(lr, dst, zilc->zc_nused);
				*end = (char *)dst + zilc->zc_nused;
				*nbp = zilc->zc_next_blk;
			}
		}

		arc_buf_destroy(abuf, &abuf);
	}

	return (error);
}
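/*
 * As the two branches above show, there are two on-disk log block layouts:
 * blocks checksummed with ZIO_CHECKSUM_ZILOG2 place the zil_chain_t
 * (next-block pointer, checksum verifier, and bytes-used count) at the
 * start of the block, so only the used portion need be written; older
 * blocks place it at the very end of the block.
 */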
/*
 * Read a TX_WRITE log data block.
 */
static int
zil_read_log_data(zilog_t *zilog, const lr_write_t *lr, void *wbuf)
{
	enum zio_flag zio_flags = ZIO_FLAG_CANFAIL;
	const blkptr_t *bp = &lr->lr_blkptr;
	arc_flags_t aflags = ARC_FLAG_WAIT;
	arc_buf_t *abuf = NULL;
	zbookmark_phys_t zb;
	int error;

	if (BP_IS_HOLE(bp)) {
		if (wbuf != NULL)
			bzero(wbuf, MAX(BP_GET_LSIZE(bp), lr->lr_length));
		return (0);
	}

	if (zilog->zl_header->zh_claim_txg == 0)
		zio_flags |= ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SCRUB;

	SET_BOOKMARK(&zb, dmu_objset_id(zilog->zl_os), lr->lr_foid,
	    ZB_ZIL_LEVEL, lr->lr_offset / BP_GET_LSIZE(bp));

	error = arc_read(NULL, zilog->zl_spa, bp, arc_getbuf_func, &abuf,
	    ZIO_PRIORITY_SYNC_READ, zio_flags, &aflags, &zb);

	if (error == 0) {
		if (wbuf != NULL)
			bcopy(abuf->b_data, wbuf, arc_buf_size(abuf));
		arc_buf_destroy(abuf, &abuf);
	}

	return (error);
}
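/*
 * zil_read_log_data() serves two kinds of caller: zil_claim_log_record()
 * below passes a NULL wbuf just to verify that a TX_WRITE's out-of-line
 * (dmu_sync'ed) block is readable, while log replay (e.g. the TX_WRITE
 * replay path) passes a buffer to fetch the actual write payload.
 */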
/*
 * Parse the intent log, and call parse_func for each valid record within.
 */
int
zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func,
    zil_parse_lr_func_t *parse_lr_func, void *arg, uint64_t txg)
{
	const zil_header_t *zh = zilog->zl_header;
	boolean_t claimed = !!zh->zh_claim_txg;
	uint64_t claim_blk_seq = claimed ? zh->zh_claim_blk_seq : UINT64_MAX;
	uint64_t claim_lr_seq = claimed ? zh->zh_claim_lr_seq : UINT64_MAX;
	uint64_t max_blk_seq = 0;
	uint64_t max_lr_seq = 0;
	uint64_t blk_count = 0;
	uint64_t lr_count = 0;
	blkptr_t blk, next_blk;
	char *lrbuf, *lrp;
	int error = 0;

	/*
	 * Old logs didn't record the maximum zh_claim_lr_seq.
	 */
	if (!(zh->zh_flags & ZIL_CLAIM_LR_SEQ_VALID))
		claim_lr_seq = UINT64_MAX;

	/*
	 * Starting at the block pointed to by zh_log we read the log chain.
	 * For each block in the chain we strongly check that block to
	 * ensure its validity. We stop when an invalid block is found.
	 * For each block pointer in the chain we call parse_blk_func().
	 * For each record in each valid block we call parse_lr_func().
	 * If the log has been claimed, stop if we encounter a sequence
	 * number greater than the highest claimed sequence number.
	 */
	lrbuf = zio_buf_alloc(SPA_OLD_MAXBLOCKSIZE);
	zil_bp_tree_init(zilog);

	for (blk = zh->zh_log; !BP_IS_HOLE(&blk); blk = next_blk) {
		uint64_t blk_seq = blk.blk_cksum.zc_word[ZIL_ZC_SEQ];
		int reclen;
		char *end;

		if (blk_seq > claim_blk_seq)
			break;
		if ((error = parse_blk_func(zilog, &blk, arg, txg)) != 0)
			break;
		ASSERT3U(max_blk_seq, <, blk_seq);
		max_blk_seq = blk_seq;
		blk_count++;

		if (max_lr_seq == claim_lr_seq && max_blk_seq == claim_blk_seq)
			break;

		error = zil_read_log_block(zilog, &blk, &next_blk, lrbuf, &end);
		if (error != 0)
			break;

		for (lrp = lrbuf; lrp < end; lrp += reclen) {
			lr_t *lr = (lr_t *)lrp;
			reclen = lr->lrc_reclen;
			ASSERT3U(reclen, >=, sizeof (lr_t));
			if (lr->lrc_seq > claim_lr_seq)
				goto done;
			if ((error = parse_lr_func(zilog, lr, arg, txg)) != 0)
				goto done;
			ASSERT3U(max_lr_seq, <, lr->lrc_seq);
			max_lr_seq = lr->lrc_seq;
			lr_count++;
		}
	}
done:
	zilog->zl_parse_error = error;
	zilog->zl_parse_blk_seq = max_blk_seq;
	zilog->zl_parse_lr_seq = max_lr_seq;
	zilog->zl_parse_blk_count = blk_count;
	zilog->zl_parse_lr_count = lr_count;

	ASSERT(!claimed || !(zh->zh_flags & ZIL_CLAIM_LR_SEQ_VALID) ||
	    (max_blk_seq == claim_blk_seq && max_lr_seq == claim_lr_seq));

	zil_bp_tree_fini(zilog);
	zio_buf_free(lrbuf, SPA_OLD_MAXBLOCKSIZE);

	return (error);
}
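/*
 * zil_parse() is driven by pairs of callbacks defined below: zil_claim()
 * walks the chain with zil_claim_log_block()/zil_claim_log_record() (and,
 * with tx == NULL, zil_check_log_chain() uses the same pair purely as a
 * readability check), while zil_destroy_sync() walks it with
 * zil_free_log_block()/zil_free_log_record().
 */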
static int
zil_claim_log_block(zilog_t *zilog, blkptr_t *bp, void *tx, uint64_t first_txg)
{
	/*
	 * Claim log block if not already committed and not already claimed.
	 * If tx == NULL, just verify that the block is claimable.
	 */
	if (BP_IS_HOLE(bp) || bp->blk_birth < first_txg ||
	    zil_bp_tree_add(zilog, bp) != 0)
		return (0);

	return (zio_wait(zio_claim(NULL, zilog->zl_spa,
	    tx == NULL ? 0 : first_txg, bp, spa_claim_notify, NULL,
	    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SCRUB)));
}

static int
zil_claim_log_record(zilog_t *zilog, lr_t *lrc, void *tx, uint64_t first_txg)
{
	lr_write_t *lr = (lr_write_t *)lrc;
	int error;

	if (lrc->lrc_txtype != TX_WRITE)
		return (0);

	/*
	 * If the block is not readable, don't claim it. This can happen
	 * in normal operation when a log block is written to disk before
	 * some of the dmu_sync() blocks it points to. In this case, the
	 * transaction cannot have been committed to anyone (we would have
	 * waited for all writes to be stable first), so it is semantically
	 * correct to declare this the end of the log.
	 */
	if (lr->lr_blkptr.blk_birth >= first_txg &&
	    (error = zil_read_log_data(zilog, lr, NULL)) != 0)
		return (error);
	return (zil_claim_log_block(zilog, &lr->lr_blkptr, tx, first_txg));
}

/* ARGSUSED */
static int
zil_free_log_block(zilog_t *zilog, blkptr_t *bp, void *tx, uint64_t claim_txg)
{
	zio_free_zil(zilog->zl_spa, dmu_tx_get_txg(tx), bp);

	return (0);
}

static int
zil_free_log_record(zilog_t *zilog, lr_t *lrc, void *tx, uint64_t claim_txg)
{
	lr_write_t *lr = (lr_write_t *)lrc;
	blkptr_t *bp = &lr->lr_blkptr;

	/*
	 * If we previously claimed it, we need to free it.
	 */
	if (claim_txg != 0 && lrc->lrc_txtype == TX_WRITE &&
	    bp->blk_birth >= claim_txg && zil_bp_tree_add(zilog, bp) == 0 &&
	    !BP_IS_HOLE(bp))
		zio_free(zilog->zl_spa, dmu_tx_get_txg(tx), bp);

	return (0);
}

static lwb_t *
zil_alloc_lwb(zilog_t *zilog, blkptr_t *bp, boolean_t slog, uint64_t txg)
{
	lwb_t *lwb;

	lwb = kmem_cache_alloc(zil_lwb_cache, KM_SLEEP);
	lwb->lwb_zilog = zilog;
	lwb->lwb_blk = *bp;
	lwb->lwb_slog = slog;
	lwb->lwb_buf = zio_buf_alloc(BP_GET_LSIZE(bp));
	lwb->lwb_max_txg = txg;
	lwb->lwb_zio = NULL;
	lwb->lwb_tx = NULL;
	if (BP_GET_CHECKSUM(bp) == ZIO_CHECKSUM_ZILOG2) {
		lwb->lwb_nused = sizeof (zil_chain_t);
		lwb->lwb_sz = BP_GET_LSIZE(bp);
	} else {
		lwb->lwb_nused = 0;
		lwb->lwb_sz = BP_GET_LSIZE(bp) - sizeof (zil_chain_t);
	}

	mutex_enter(&zilog->zl_lock);
	list_insert_tail(&zilog->zl_lwb_list, lwb);
	mutex_exit(&zilog->zl_lock);

	return (lwb);
}

/*
 * Called when we create in-memory log transactions so that we know
 * to clean up the itxs at the end of spa_sync().
 */
void
zilog_dirty(zilog_t *zilog, uint64_t txg)
{
	dsl_pool_t *dp = zilog->zl_dmu_pool;
	dsl_dataset_t *ds = dmu_objset_ds(zilog->zl_os);

	if (ds->ds_is_snapshot)
		panic("dirtying snapshot!");

	if (txg_list_add(&dp->dp_dirty_zilogs, zilog, txg)) {
		/* up the hold count until we can be written out */
		dmu_buf_add_ref(ds->ds_dbuf, zilog);
	}
}
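/*
 * The hold taken above is the counterpart of zil_clean(): once the txg
 * that dirtied this zilog has synced, the entry is removed from
 * dp_dirty_zilogs, the itxs for that txg are freed, and the dataset
 * hold is released (see dsl_pool_sync()).
 */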
/*
 * Determine if the zil is dirty in the specified txg. Callers wanting to
 * ensure that the dirty state does not change must hold the itxg_lock for
 * the specified txg. Holding the lock will ensure that the zil cannot be
 * dirtied (zil_itx_assign) or cleaned (zil_clean) while we check its current
 * state.
 */
boolean_t
zilog_is_dirty_in_txg(zilog_t *zilog, uint64_t txg)
{
	dsl_pool_t *dp = zilog->zl_dmu_pool;

	if (txg_list_member(&dp->dp_dirty_zilogs, zilog, txg & TXG_MASK))
		return (B_TRUE);
	return (B_FALSE);
}

/*
 * Determine if the zil is dirty. The zil is considered dirty if it has
 * any pending itx records that have not been cleaned by zil_clean().
 */
boolean_t
zilog_is_dirty(zilog_t *zilog)
{
	dsl_pool_t *dp = zilog->zl_dmu_pool;

	for (int t = 0; t < TXG_SIZE; t++) {
		if (txg_list_member(&dp->dp_dirty_zilogs, zilog, t))
			return (B_TRUE);
	}
	return (B_FALSE);
}

/*
 * Create an on-disk intent log.
 */
static lwb_t *
zil_create(zilog_t *zilog)
{
	const zil_header_t *zh = zilog->zl_header;
	lwb_t *lwb = NULL;
	uint64_t txg = 0;
	dmu_tx_t *tx = NULL;
	blkptr_t blk;
	int error = 0;
	boolean_t slog = FALSE;

	/*
	 * Wait for any previous destroy to complete.
	 */
	txg_wait_synced(zilog->zl_dmu_pool, zilog->zl_destroy_txg);

	ASSERT(zh->zh_claim_txg == 0);
	ASSERT(zh->zh_replay_seq == 0);

	blk = zh->zh_log;

	/*
	 * Allocate an initial log block if:
	 * - there isn't one already
	 * - the existing block is the wrong endianness
	 */
	if (BP_IS_HOLE(&blk) || BP_SHOULD_BYTESWAP(&blk)) {
		tx = dmu_tx_create(zilog->zl_os);
		VERIFY(dmu_tx_assign(tx, TXG_WAIT) == 0);
		dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx);
		txg = dmu_tx_get_txg(tx);

		if (!BP_IS_HOLE(&blk)) {
			zio_free_zil(zilog->zl_spa, txg, &blk);
			BP_ZERO(&blk);
		}

		error = zio_alloc_zil(zilog->zl_spa, txg, &blk, NULL,
		    ZIL_MIN_BLKSZ, &slog);

		if (error == 0)
			zil_init_log_chain(zilog, &blk);
	}

	/*
	 * Allocate a log write buffer (lwb) for the first log block.
	 */
	if (error == 0)
		lwb = zil_alloc_lwb(zilog, &blk, slog, txg);

	/*
	 * If we just allocated the first log block, commit our transaction
	 * and wait for zil_sync() to stuff the block pointer into zh_log.
	 * (zh is part of the MOS, so we cannot modify it in open context.)
	 */
	if (tx != NULL) {
		dmu_tx_commit(tx);
		txg_wait_synced(zilog->zl_dmu_pool, txg);
	}

	ASSERT(bcmp(&blk, &zh->zh_log, sizeof (blk)) == 0);

	return (lwb);
}

/*
 * In one tx, free all log blocks and clear the log header.
 * If keep_first is set, then we're replaying a log with no content.
 * We want to keep the first block, however, so that the first
 * synchronous transaction doesn't require a txg_wait_synced()
 * in zil_create(). We don't need to txg_wait_synced() here either
 * when keep_first is set, because both zil_create() and zil_destroy()
 * will wait for any in-progress destroys to complete.
 */
void
zil_destroy(zilog_t *zilog, boolean_t keep_first)
{
	const zil_header_t *zh = zilog->zl_header;
	lwb_t *lwb;
	dmu_tx_t *tx;
	uint64_t txg;

	/*
	 * Wait for any previous destroy to complete.
	 */
	txg_wait_synced(zilog->zl_dmu_pool, zilog->zl_destroy_txg);

	zilog->zl_old_header = *zh;		/* debugging aid */

	if (BP_IS_HOLE(&zh->zh_log))
		return;

	tx = dmu_tx_create(zilog->zl_os);
	VERIFY(dmu_tx_assign(tx, TXG_WAIT) == 0);
	dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx);
	txg = dmu_tx_get_txg(tx);

	mutex_enter(&zilog->zl_lock);

	ASSERT3U(zilog->zl_destroy_txg, <, txg);
	zilog->zl_destroy_txg = txg;
	zilog->zl_keep_first = keep_first;

	if (!list_is_empty(&zilog->zl_lwb_list)) {
		ASSERT(zh->zh_claim_txg == 0);
		VERIFY(!keep_first);
		while ((lwb = list_head(&zilog->zl_lwb_list)) != NULL) {
			list_remove(&zilog->zl_lwb_list, lwb);
			if (lwb->lwb_buf != NULL)
				zio_buf_free(lwb->lwb_buf, lwb->lwb_sz);
			zio_free_zil(zilog->zl_spa, txg, &lwb->lwb_blk);
			kmem_cache_free(zil_lwb_cache, lwb);
		}
	} else if (!keep_first) {
		zil_destroy_sync(zilog, tx);
	}
	mutex_exit(&zilog->zl_lock);

	dmu_tx_commit(tx);
}

void
zil_destroy_sync(zilog_t *zilog, dmu_tx_t *tx)
{
	ASSERT(list_is_empty(&zilog->zl_lwb_list));
	(void) zil_parse(zilog, zil_free_log_block,
	    zil_free_log_record, tx, zilog->zl_header->zh_claim_txg);
}

int
zil_claim(dsl_pool_t *dp, dsl_dataset_t *ds, void *txarg)
{
	dmu_tx_t *tx = txarg;
	uint64_t first_txg = dmu_tx_get_txg(tx);
	zilog_t *zilog;
	zil_header_t *zh;
	objset_t *os;
	int error;

	error = dmu_objset_own_obj(dp, ds->ds_object,
	    DMU_OST_ANY, B_FALSE, FTAG, &os);
	if (error != 0) {
		/*
		 * EBUSY indicates that the objset is inconsistent, in which
		 * case it cannot have a ZIL.
		 */
		if (error != EBUSY) {
			cmn_err(CE_WARN, "can't open objset for %llu, error %u",
			    (unsigned long long)ds->ds_object, error);
		}
		return (0);
	}

	zilog = dmu_objset_zil(os);
	zh = zil_header_in_syncing_context(zilog);

	if (spa_get_log_state(zilog->zl_spa) == SPA_LOG_CLEAR) {
		if (!BP_IS_HOLE(&zh->zh_log))
			zio_free_zil(zilog->zl_spa, first_txg, &zh->zh_log);
		BP_ZERO(&zh->zh_log);
		dsl_dataset_dirty(dmu_objset_ds(os), tx);
		dmu_objset_disown(os, FTAG);
		return (0);
	}

	/*
	 * Claim all log blocks if we haven't already done so, and remember
	 * the highest claimed sequence number. This ensures that if we can
	 * read only part of the log now (e.g. due to a missing device),
	 * but we can read the entire log later, we will not try to replay
	 * or destroy beyond the last block we successfully claimed.
	 */
	ASSERT3U(zh->zh_claim_txg, <=, first_txg);
	if (zh->zh_claim_txg == 0 && !BP_IS_HOLE(&zh->zh_log)) {
		(void) zil_parse(zilog, zil_claim_log_block,
		    zil_claim_log_record, tx, first_txg);
		zh->zh_claim_txg = first_txg;
		zh->zh_claim_blk_seq = zilog->zl_parse_blk_seq;
		zh->zh_claim_lr_seq = zilog->zl_parse_lr_seq;
		if (zilog->zl_parse_lr_count || zilog->zl_parse_blk_count > 1)
			zh->zh_flags |= ZIL_REPLAY_NEEDED;
		zh->zh_flags |= ZIL_CLAIM_LR_SEQ_VALID;
		dsl_dataset_dirty(dmu_objset_ds(os), tx);
	}

	ASSERT3U(first_txg, ==, (spa_last_synced_txg(zilog->zl_spa) + 1));
	dmu_objset_disown(os, FTAG);
	return (0);
}

/*
 * Check the log by walking the log chain.
 * Checksum errors are ok as they indicate the end of the chain.
 * Any other error (no device or read failure) returns an error.
 */
/* ARGSUSED */
int
zil_check_log_chain(dsl_pool_t *dp, dsl_dataset_t *ds, void *tx)
{
	zilog_t *zilog;
	objset_t *os;
	blkptr_t *bp;
	int error;

	ASSERT(tx == NULL);

	error = dmu_objset_from_ds(ds, &os);
	if (error != 0) {
		cmn_err(CE_WARN, "can't open objset %llu, error %d",
		    (unsigned long long)ds->ds_object, error);
		return (0);
	}

	zilog = dmu_objset_zil(os);
	bp = (blkptr_t *)&zilog->zl_header->zh_log;

	/*
	 * Check the first block and determine if it's on a log device
	 * which may have been removed or faulted prior to loading this
	 * pool. If so, there's no point in checking the rest of the log
	 * as its content should have already been synced to the pool.
	 */
	if (!BP_IS_HOLE(bp)) {
		vdev_t *vd;
		boolean_t valid = B_TRUE;

		spa_config_enter(os->os_spa, SCL_STATE, FTAG, RW_READER);
		vd = vdev_lookup_top(os->os_spa, DVA_GET_VDEV(&bp->blk_dva[0]));
		if (vd->vdev_islog && vdev_is_dead(vd))
			valid = vdev_log_state_valid(vd);
		spa_config_exit(os->os_spa, SCL_STATE, FTAG);

		if (!valid)
			return (0);
	}

	/*
	 * Because tx == NULL, zil_claim_log_block() will not actually claim
	 * any blocks, but just determine whether it is possible to do so.
	 * In addition to checking the log chain, zil_claim_log_block()
	 * will invoke zio_claim() with a done func of spa_claim_notify(),
	 * which will update spa_max_claim_txg. See spa_load() for details.
	 */
	error = zil_parse(zilog, zil_claim_log_block, zil_claim_log_record, tx,
	    zilog->zl_header->zh_claim_txg ? -1ULL : spa_first_txg(os->os_spa));

	return ((error == ECKSUM || error == ENOENT) ? 0 : error);
}
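/*
 * The zl_vdev_tree below collects the set of top-level vdevs (keyed and
 * deduplicated by vdev id) that recent log writes have touched, so that
 * zil_flush_vdevs() can later issue one write-cache flush per device
 * rather than one per block.
 */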
static int
zil_vdev_compare(const void *x1, const void *x2)
{
	const uint64_t v1 = ((zil_vdev_node_t *)x1)->zv_vdev;
	const uint64_t v2 = ((zil_vdev_node_t *)x2)->zv_vdev;

	if (v1 < v2)
		return (-1);
	if (v1 > v2)
		return (1);

	return (0);
}

void
zil_add_block(zilog_t *zilog, const blkptr_t *bp)
{
	avl_tree_t *t = &zilog->zl_vdev_tree;
	avl_index_t where;
	zil_vdev_node_t *zv, zvsearch;
	int ndvas = BP_GET_NDVAS(bp);
	int i;

	if (zfs_nocacheflush)
		return;

	ASSERT(zilog->zl_writer);

	/*
	 * Even though we're zl_writer, we still need a lock because the
	 * zl_get_data() callbacks may have dmu_sync() done callbacks
	 * that will run concurrently.
	 */
	mutex_enter(&zilog->zl_vdev_lock);
	for (i = 0; i < ndvas; i++) {
		zvsearch.zv_vdev = DVA_GET_VDEV(&bp->blk_dva[i]);
		if (avl_find(t, &zvsearch, &where) == NULL) {
			zv = kmem_alloc(sizeof (*zv), KM_SLEEP);
			zv->zv_vdev = zvsearch.zv_vdev;
			avl_insert(t, zv, where);
		}
	}
	mutex_exit(&zilog->zl_vdev_lock);
}

static void
zil_flush_vdevs(zilog_t *zilog)
{
	spa_t *spa = zilog->zl_spa;
	avl_tree_t *t = &zilog->zl_vdev_tree;
	void *cookie = NULL;
	zil_vdev_node_t *zv;
	zio_t *zio;

	ASSERT(zilog->zl_writer);

	/*
	 * We don't need zl_vdev_lock here because we're the zl_writer,
	 * and all zl_get_data() callbacks are done.
	 */
	if (avl_numnodes(t) == 0)
		return;

	spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);

	zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);

	while ((zv = avl_destroy_nodes(t, &cookie)) != NULL) {
		vdev_t *vd = vdev_lookup_top(spa, zv->zv_vdev);
		if (vd != NULL)
			zio_flush(zio, vd);
		kmem_free(zv, sizeof (*zv));
	}

	/*
	 * Wait for all the flushes to complete. Not all devices actually
	 * support the DKIOCFLUSHWRITECACHE ioctl, so it's OK if it fails.
	 */
	(void) zio_wait(zio);

	spa_config_exit(spa, SCL_STATE, FTAG);
}
/*
 * Function called when a log block write completes.
 */
static void
zil_lwb_write_done(zio_t *zio)
{
	lwb_t *lwb = zio->io_private;
	zilog_t *zilog = lwb->lwb_zilog;
	dmu_tx_t *tx = lwb->lwb_tx;

	ASSERT(BP_GET_COMPRESS(zio->io_bp) == ZIO_COMPRESS_OFF);
	ASSERT(BP_GET_TYPE(zio->io_bp) == DMU_OT_INTENT_LOG);
	ASSERT(BP_GET_LEVEL(zio->io_bp) == 0);
	ASSERT(BP_GET_BYTEORDER(zio->io_bp) == ZFS_HOST_BYTEORDER);
	ASSERT(!BP_IS_GANG(zio->io_bp));
	ASSERT(!BP_IS_HOLE(zio->io_bp));
	ASSERT(BP_GET_FILL(zio->io_bp) == 0);

	/*
	 * Ensure the lwb buffer pointer is cleared before releasing
	 * the txg. If we have had an allocation failure and
	 * the txg is waiting to sync then we want zil_sync()
	 * to remove the lwb so that it's not picked up as the next new
	 * one in zil_commit_writer(). zil_sync() will only remove
	 * the lwb if lwb_buf is null.
	 */
	abd_put(zio->io_abd);
	zio_buf_free(lwb->lwb_buf, lwb->lwb_sz);
	mutex_enter(&zilog->zl_lock);
	lwb->lwb_buf = NULL;
	lwb->lwb_tx = NULL;
	mutex_exit(&zilog->zl_lock);

	/*
	 * Now that we've written this log block, we have a stable pointer
	 * to the next block in the chain, so it's OK to let the txg in
	 * which we allocated the next block sync.
	 */
	dmu_tx_commit(tx);
}
/*
 * Initialize the I/O for a log block.
 */
static void
zil_lwb_write_init(zilog_t *zilog, lwb_t *lwb)
{
	zbookmark_phys_t zb;
	zio_priority_t prio;

	SET_BOOKMARK(&zb, lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_OBJSET],
	    ZB_ZIL_OBJECT, ZB_ZIL_LEVEL,
	    lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_SEQ]);

	if (zilog->zl_root_zio == NULL) {
		zilog->zl_root_zio = zio_root(zilog->zl_spa, NULL, NULL,
		    ZIO_FLAG_CANFAIL);
	}
	if (lwb->lwb_zio == NULL) {
		abd_t *lwb_abd = abd_get_from_buf(lwb->lwb_buf,
		    BP_GET_LSIZE(&lwb->lwb_blk));
		if (!lwb->lwb_slog || zilog->zl_cur_used <= zil_slog_bulk)
			prio = ZIO_PRIORITY_SYNC_WRITE;
		else
			prio = ZIO_PRIORITY_ASYNC_WRITE;
		lwb->lwb_zio = zio_rewrite(zilog->zl_root_zio, zilog->zl_spa,
		    0, &lwb->lwb_blk, lwb_abd, BP_GET_LSIZE(&lwb->lwb_blk),
		    zil_lwb_write_done, lwb, prio,
		    ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE, &zb);
	}
}

/*
 * Define a limited set of intent log block sizes.
 *
 * These must be a multiple of 4KB. Note only the amount used (again
 * aligned to 4KB) actually gets written. However, we can't always just
 * allocate SPA_OLD_MAXBLOCKSIZE as the slog space could be exhausted.
 */
uint64_t zil_block_buckets[] = {
    4096,		/* non TX_WRITE */
    8192+4096,		/* data base */
    32*1024 + 4096,	/* NFS writes */
    UINT64_MAX
};
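/*
 * A worked example of the bucket selection done in zil_lwb_write_start()
 * below: a commit batch with zl_cur_used of roughly 20KB needs
 * 20KB + sizeof (zil_chain_t), so the smallest bucket that fits is
 * 32*1024 + 4096; anything larger than that falls through to UINT64_MAX
 * and is capped at SPA_OLD_MAXBLOCKSIZE (128KB).
 */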
/*
 * Start a log block write and advance to the next log block.
 * Calls are serialized.
 */
static lwb_t *
zil_lwb_write_start(zilog_t *zilog, lwb_t *lwb)
{
	lwb_t *nlwb = NULL;
	zil_chain_t *zilc;
	spa_t *spa = zilog->zl_spa;
	blkptr_t *bp;
	dmu_tx_t *tx;
	uint64_t txg;
	uint64_t zil_blksz, wsz;
	int i, error;
	boolean_t slog;

	if (BP_GET_CHECKSUM(&lwb->lwb_blk) == ZIO_CHECKSUM_ZILOG2) {
		zilc = (zil_chain_t *)lwb->lwb_buf;
		bp = &zilc->zc_next_blk;
	} else {
		zilc = (zil_chain_t *)(lwb->lwb_buf + lwb->lwb_sz);
		bp = &zilc->zc_next_blk;
	}

	ASSERT(lwb->lwb_nused <= lwb->lwb_sz);

	/*
	 * Allocate the next block and save its address in this block
	 * before writing it in order to establish the log chain.
	 * Note that if the allocation of nlwb synced before we wrote
	 * the block that points at it (lwb), we'd leak it if we crashed.
	 * Therefore, we don't do dmu_tx_commit() until zil_lwb_write_done().
	 * We dirty the dataset to ensure that zil_sync() will be called
	 * to clean up in the event of allocation failure or I/O failure.
	 */
	tx = dmu_tx_create(zilog->zl_os);
	VERIFY(dmu_tx_assign(tx, TXG_WAIT) == 0);
	dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx);
	txg = dmu_tx_get_txg(tx);

	lwb->lwb_tx = tx;

	/*
	 * Log blocks are pre-allocated. Here we select the size of the next
	 * block, based on size used in the last block.
	 * - first find the smallest bucket that will fit the block from a
	 *   limited set of block sizes. This is because it's faster to write
	 *   blocks allocated from the same metaslab as they are adjacent or
	 *   close.
	 * - next find the maximum from the new suggested size and an array of
	 *   previous sizes. This lessens a picket fence effect of wrongly
	 *   guessing the size if we have a stream of say 2k, 64k, 2k, 64k
	 *   requests.
	 *
	 * Note we only write what is used, but we can't just allocate
	 * the maximum block size because we can exhaust the available
	 * pool log space.
	 */
	zil_blksz = zilog->zl_cur_used + sizeof (zil_chain_t);
	for (i = 0; zil_blksz > zil_block_buckets[i]; i++)
		continue;
	zil_blksz = zil_block_buckets[i];
	if (zil_blksz == UINT64_MAX)
		zil_blksz = SPA_OLD_MAXBLOCKSIZE;
	zilog->zl_prev_blks[zilog->zl_prev_rotor] = zil_blksz;
	for (i = 0; i < ZIL_PREV_BLKS; i++)
		zil_blksz = MAX(zil_blksz, zilog->zl_prev_blks[i]);
	zilog->zl_prev_rotor = (zilog->zl_prev_rotor + 1) & (ZIL_PREV_BLKS - 1);

	BP_ZERO(bp);
	/* pass the old blkptr in order to spread log blocks across devs */
	error = zio_alloc_zil(spa, txg, bp, &lwb->lwb_blk, zil_blksz, &slog);
	if (error == 0) {
		ASSERT3U(bp->blk_birth, ==, txg);
		bp->blk_cksum = lwb->lwb_blk.blk_cksum;
		bp->blk_cksum.zc_word[ZIL_ZC_SEQ]++;

		/*
		 * Allocate a new log write buffer (lwb).
		 */
		nlwb = zil_alloc_lwb(zilog, bp, slog, txg);

		/* Record the block for later vdev flushing */
		zil_add_block(zilog, &lwb->lwb_blk);
	}

	if (BP_GET_CHECKSUM(&lwb->lwb_blk) == ZIO_CHECKSUM_ZILOG2) {
		/* For Slim ZIL only write what is used. */
		wsz = P2ROUNDUP_TYPED(lwb->lwb_nused, ZIL_MIN_BLKSZ, uint64_t);
		ASSERT3U(wsz, <=, lwb->lwb_sz);
		zio_shrink(lwb->lwb_zio, wsz);

	} else {
		wsz = lwb->lwb_sz;
	}

	zilc->zc_pad = 0;
	zilc->zc_nused = lwb->lwb_nused;
	zilc->zc_eck.zec_cksum = lwb->lwb_blk.blk_cksum;

	/*
	 * clear unused data for security
	 */
	bzero(lwb->lwb_buf + lwb->lwb_nused, wsz - lwb->lwb_nused);

	zio_nowait(lwb->lwb_zio); /* Kick off the write for the old log block */

	/*
	 * If there was an allocation failure then nlwb will be null which
	 * forces a txg_wait_synced().
	 */
	return (nlwb);
}

static lwb_t *
zil_lwb_commit(zilog_t *zilog, itx_t *itx, lwb_t *lwb)
{
	lr_t *lrcb, *lrc;
	lr_write_t *lrwb, *lrw;
	char *lr_buf;
	uint64_t dlen, dnow, lwb_sp, reclen, txg;

	if (lwb == NULL)
		return (NULL);

	ASSERT(lwb->lwb_buf != NULL);

	lrc = &itx->itx_lr;		/* Common log record inside itx. */
	lrw = (lr_write_t *)lrc;	/* Write log record inside itx. */
	if (lrc->lrc_txtype == TX_WRITE && itx->itx_wr_state == WR_NEED_COPY) {
		dlen = P2ROUNDUP_TYPED(
		    lrw->lr_length, sizeof (uint64_t), uint64_t);
	} else {
		dlen = 0;
	}
	reclen = lrc->lrc_reclen;
	zilog->zl_cur_used += (reclen + dlen);
	txg = lrc->lrc_txg;

	zil_lwb_write_init(zilog, lwb);
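/*
 * From here the function may loop: a WR_NEED_COPY record whose data does
 * not fit in the current lwb is split into chunks, each pass copying
 * dnow bytes into the current block and jumping back to "cont" with the
 * remainder until dlen is exhausted.
 */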
1086fa9e4066Sahrens */ 1087*c5ee4681SAlexander Motin lwb_sp = lwb->lwb_sz - lwb->lwb_nused; 1088*c5ee4681SAlexander Motin if (reclen > lwb_sp || (reclen + dlen > lwb_sp && 1089*c5ee4681SAlexander Motin lwb_sp < ZIL_MAX_WASTE_SPACE && (dlen % ZIL_MAX_LOG_DATA == 0 || 1090*c5ee4681SAlexander Motin lwb_sp < reclen + dlen % ZIL_MAX_LOG_DATA))) { 1091fa9e4066Sahrens lwb = zil_lwb_write_start(zilog, lwb); 1092c5c6ffa0Smaybee if (lwb == NULL) 1093fa9e4066Sahrens return (NULL); 109467bd71c6Sperrin zil_lwb_write_init(zilog, lwb); 10956e1f5caaSNeil Perrin ASSERT(LWB_EMPTY(lwb)); 1096*c5ee4681SAlexander Motin lwb_sp = lwb->lwb_sz - lwb->lwb_nused; 1097*c5ee4681SAlexander Motin ASSERT3U(reclen + MIN(dlen, sizeof (uint64_t)), <=, lwb_sp); 1098fa9e4066Sahrens } 1099fa9e4066Sahrens 1100*c5ee4681SAlexander Motin dnow = MIN(dlen, lwb_sp - reclen); 1101b24ab676SJeff Bonwick lr_buf = lwb->lwb_buf + lwb->lwb_nused; 1102b24ab676SJeff Bonwick bcopy(lrc, lr_buf, reclen); 1103*c5ee4681SAlexander Motin lrcb = (lr_t *)lr_buf; /* Like lrc, but inside lwb. */ 1104*c5ee4681SAlexander Motin lrwb = (lr_write_t *)lrcb; /* Like lrw, but inside lwb. */ 1105c5c6ffa0Smaybee 1106c5c6ffa0Smaybee /* 1107c5c6ffa0Smaybee * If it's a write, fetch the data or get its blkptr as appropriate. 1108c5c6ffa0Smaybee */ 1109c5c6ffa0Smaybee if (lrc->lrc_txtype == TX_WRITE) { 1110c5c6ffa0Smaybee if (txg > spa_freeze_txg(zilog->zl_spa)) 1111c5c6ffa0Smaybee txg_wait_synced(zilog->zl_dmu_pool, txg); 1112c5c6ffa0Smaybee if (itx->itx_wr_state != WR_COPIED) { 1113c5c6ffa0Smaybee char *dbuf; 1114c5c6ffa0Smaybee int error; 1115c5c6ffa0Smaybee 1116*c5ee4681SAlexander Motin if (itx->itx_wr_state == WR_NEED_COPY) { 1117b24ab676SJeff Bonwick dbuf = lr_buf + reclen; 1118*c5ee4681SAlexander Motin lrcb->lrc_reclen += dnow; 1119*c5ee4681SAlexander Motin if (lrwb->lr_length > dnow) 1120*c5ee4681SAlexander Motin lrwb->lr_length = dnow; 1121*c5ee4681SAlexander Motin lrw->lr_offset += dnow; 1122*c5ee4681SAlexander Motin lrw->lr_length -= dnow; 1123c5c6ffa0Smaybee } else { 1124c5c6ffa0Smaybee ASSERT(itx->itx_wr_state == WR_INDIRECT); 1125c5c6ffa0Smaybee dbuf = NULL; 1126c5c6ffa0Smaybee } 1127c5c6ffa0Smaybee error = zilog->zl_get_data( 1128*c5ee4681SAlexander Motin itx->itx_private, lrwb, dbuf, lwb->lwb_zio); 1129c87b8fc5SMark J Musante if (error == EIO) { 1130c87b8fc5SMark J Musante txg_wait_synced(zilog->zl_dmu_pool, txg); 1131c87b8fc5SMark J Musante return (lwb); 1132c87b8fc5SMark J Musante } 11333b2aab18SMatthew Ahrens if (error != 0) { 1134c5c6ffa0Smaybee ASSERT(error == ENOENT || error == EEXIST || 1135c5c6ffa0Smaybee error == EALREADY); 1136c5c6ffa0Smaybee return (lwb); 1137c5c6ffa0Smaybee } 1138c5c6ffa0Smaybee } 1139104e2ed7Sperrin } 1140c5c6ffa0Smaybee 1141b24ab676SJeff Bonwick /* 1142b24ab676SJeff Bonwick * We're actually making an entry, so update lrc_seq to be the 1143b24ab676SJeff Bonwick * log record sequence number. Note that this is generally not 1144b24ab676SJeff Bonwick * equal to the itx sequence number because not all transactions 1145b24ab676SJeff Bonwick * are synchronous, and sometimes spa_sync() gets there first. 
	 */
	lrcb->lrc_seq = ++zilog->zl_lr_seq; /* we are single threaded */
	lwb->lwb_nused += reclen + dnow;
	lwb->lwb_max_txg = MAX(lwb->lwb_max_txg, txg);
	ASSERT3U(lwb->lwb_nused, <=, lwb->lwb_sz);
	ASSERT0(P2PHASE(lwb->lwb_nused, sizeof (uint64_t)));

	dlen -= dnow;
	if (dlen > 0) {
		zilog->zl_cur_used += reclen;
		goto cont;
	}

	return (lwb);
}

itx_t *
zil_itx_create(uint64_t txtype, size_t lrsize)
{
	itx_t *itx;

	lrsize = P2ROUNDUP_TYPED(lrsize, sizeof (uint64_t), size_t);

	itx = kmem_alloc(offsetof(itx_t, itx_lr) + lrsize, KM_SLEEP);
	itx->itx_lr.lrc_txtype = txtype;
	itx->itx_lr.lrc_reclen = lrsize;
	itx->itx_lr.lrc_seq = 0;	/* defensive */
	itx->itx_sync = B_TRUE;		/* default is synchronous */

	return (itx);
}

void
zil_itx_destroy(itx_t *itx)
{
	kmem_free(itx, offsetof(itx_t, itx_lr) + itx->itx_lr.lrc_reclen);
}
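/*
 * A minimal sketch of the caller pattern for the two functions above,
 * loosely modeled on the zfs_log.c logging routines (the TX_SETATTR
 * record shown is just an example; field assignments are illustrative):
 *
 *	itx_t *itx = zil_itx_create(TX_SETATTR, sizeof (lr_setattr_t));
 *	lr_setattr_t *lr = (lr_setattr_t *)&itx->itx_lr;
 *	lr->lr_foid = object;			// fill in the record body
 *	...
 *	zil_itx_assign(zilog, itx, tx);		// hand off to the ZIL
 *
 * The itx is then freed either by zil_itxg_clean() once its txg syncs or
 * by zil_itx_destroy() on an error path.
 */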
11875002558fSNeil Perrin */ 11885002558fSNeil Perrin static void 11895002558fSNeil Perrin zil_itxg_clean(itxs_t *itxs) 1190fa9e4066Sahrens { 11915002558fSNeil Perrin itx_t *itx; 11925002558fSNeil Perrin list_t *list; 11935002558fSNeil Perrin avl_tree_t *t; 11945002558fSNeil Perrin void *cookie; 11955002558fSNeil Perrin itx_async_node_t *ian; 11965002558fSNeil Perrin 11975002558fSNeil Perrin list = &itxs->i_sync_list; 11985002558fSNeil Perrin while ((itx = list_head(list)) != NULL) { 11995002558fSNeil Perrin list_remove(list, itx); 12005002558fSNeil Perrin kmem_free(itx, offsetof(itx_t, itx_lr) + 12015002558fSNeil Perrin itx->itx_lr.lrc_reclen); 12025002558fSNeil Perrin } 1203fa9e4066Sahrens 12045002558fSNeil Perrin cookie = NULL; 12055002558fSNeil Perrin t = &itxs->i_async_tree; 12065002558fSNeil Perrin while ((ian = avl_destroy_nodes(t, &cookie)) != NULL) { 12075002558fSNeil Perrin list = &ian->ia_list; 12085002558fSNeil Perrin while ((itx = list_head(list)) != NULL) { 12095002558fSNeil Perrin list_remove(list, itx); 12105002558fSNeil Perrin kmem_free(itx, offsetof(itx_t, itx_lr) + 12115002558fSNeil Perrin itx->itx_lr.lrc_reclen); 12125002558fSNeil Perrin } 12135002558fSNeil Perrin list_destroy(list); 12145002558fSNeil Perrin kmem_free(ian, sizeof (itx_async_node_t)); 12155002558fSNeil Perrin } 12165002558fSNeil Perrin avl_destroy(t); 1217fa9e4066Sahrens 12185002558fSNeil Perrin kmem_free(itxs, sizeof (itxs_t)); 12195002558fSNeil Perrin } 12205002558fSNeil Perrin 12215002558fSNeil Perrin static int 12225002558fSNeil Perrin zil_aitx_compare(const void *x1, const void *x2) 12235002558fSNeil Perrin { 12245002558fSNeil Perrin const uint64_t o1 = ((itx_async_node_t *)x1)->ia_foid; 12255002558fSNeil Perrin const uint64_t o2 = ((itx_async_node_t *)x2)->ia_foid; 1226fa9e4066Sahrens 12275002558fSNeil Perrin if (o1 < o2) 12285002558fSNeil Perrin return (-1); 12295002558fSNeil Perrin if (o1 > o2) 12305002558fSNeil Perrin return (1); 12315002558fSNeil Perrin 12325002558fSNeil Perrin return (0); 1233fa9e4066Sahrens } 1234fa9e4066Sahrens 1235fa9e4066Sahrens /* 12365002558fSNeil Perrin * Remove all async itx with the given oid. 1237fa9e4066Sahrens */ 123891de656bSNeil Perrin static void 12395002558fSNeil Perrin zil_remove_async(zilog_t *zilog, uint64_t oid) 1240fa9e4066Sahrens { 12415002558fSNeil Perrin uint64_t otxg, txg; 12425002558fSNeil Perrin itx_async_node_t *ian; 12435002558fSNeil Perrin avl_tree_t *t; 12445002558fSNeil Perrin avl_index_t where; 1245a584ef65Sjohansen list_t clean_list; 1246fa9e4066Sahrens itx_t *itx; 1247fa9e4066Sahrens 12485002558fSNeil Perrin ASSERT(oid != 0); 1249a584ef65Sjohansen list_create(&clean_list, sizeof (itx_t), offsetof(itx_t, itx_node)); 1250a584ef65Sjohansen 12515002558fSNeil Perrin if (spa_freeze_txg(zilog->zl_spa) != UINT64_MAX) /* ziltest support */ 12525002558fSNeil Perrin otxg = ZILTEST_TXG; 12535002558fSNeil Perrin else 12545002558fSNeil Perrin otxg = spa_last_synced_txg(zilog->zl_spa) + 1; 1255a584ef65Sjohansen 12565002558fSNeil Perrin for (txg = otxg; txg < (otxg + TXG_CONCURRENT_STATES); txg++) { 12575002558fSNeil Perrin itxg_t *itxg = &zilog->zl_itxg[txg & TXG_MASK]; 12585002558fSNeil Perrin 12595002558fSNeil Perrin mutex_enter(&itxg->itxg_lock); 12605002558fSNeil Perrin if (itxg->itxg_txg != txg) { 12615002558fSNeil Perrin mutex_exit(&itxg->itxg_lock); 12625002558fSNeil Perrin continue; 12635002558fSNeil Perrin } 1264a584ef65Sjohansen 12655002558fSNeil Perrin /* 12665002558fSNeil Perrin * Locate the object node and append its list. 
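 * For example, when zil_itx_assign() processes a TX_REMOVE for object 123,
 * any async itxs still queued under foid 123 are moved onto clean_list here
 * and freed below, after the itxg locks have been dropped.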
12675002558fSNeil Perrin */ 12685002558fSNeil Perrin t = &itxg->itxg_itxs->i_async_tree; 12695002558fSNeil Perrin ian = avl_find(t, &oid, &where); 12705002558fSNeil Perrin if (ian != NULL) 12715002558fSNeil Perrin list_move_tail(&clean_list, &ian->ia_list); 12725002558fSNeil Perrin mutex_exit(&itxg->itxg_lock); 12735002558fSNeil Perrin } 1274a584ef65Sjohansen while ((itx = list_head(&clean_list)) != NULL) { 1275a584ef65Sjohansen list_remove(&clean_list, itx); 12765002558fSNeil Perrin kmem_free(itx, offsetof(itx_t, itx_lr) + 12775002558fSNeil Perrin itx->itx_lr.lrc_reclen); 1278a584ef65Sjohansen } 1279a584ef65Sjohansen list_destroy(&clean_list); 1280fa9e4066Sahrens } 1281fa9e4066Sahrens 12825002558fSNeil Perrin void 12835002558fSNeil Perrin zil_itx_assign(zilog_t *zilog, itx_t *itx, dmu_tx_t *tx) 12845002558fSNeil Perrin { 12855002558fSNeil Perrin uint64_t txg; 12865002558fSNeil Perrin itxg_t *itxg; 12875002558fSNeil Perrin itxs_t *itxs, *clean = NULL; 12885002558fSNeil Perrin 12895002558fSNeil Perrin /* 129091de656bSNeil Perrin * Object ids can be re-instantiated in the next txg so 12915002558fSNeil Perrin * remove any async transactions to avoid future leaks. 12925002558fSNeil Perrin * This can happen if a fsync occurs on the re-instantiated 12935002558fSNeil Perrin * object for a WR_INDIRECT or WR_NEED_COPY write, which gets 12945002558fSNeil Perrin * the new file data and flushes a write record for the old object. 12955002558fSNeil Perrin */ 12965002558fSNeil Perrin if ((itx->itx_lr.lrc_txtype & ~TX_CI) == TX_REMOVE) 129751bd2f97SNeil Perrin zil_remove_async(zilog, itx->itx_oid); 12985002558fSNeil Perrin 129991de656bSNeil Perrin /* 130091de656bSNeil Perrin * Ensure the data of a renamed file is committed before the rename. 130191de656bSNeil Perrin */ 130291de656bSNeil Perrin if ((itx->itx_lr.lrc_txtype & ~TX_CI) == TX_RENAME) 130391de656bSNeil Perrin zil_async_to_sync(zilog, itx->itx_oid); 130491de656bSNeil Perrin 1305ce636f8bSMatthew Ahrens if (spa_freeze_txg(zilog->zl_spa) != UINT64_MAX) 13065002558fSNeil Perrin txg = ZILTEST_TXG; 13075002558fSNeil Perrin else 13085002558fSNeil Perrin txg = dmu_tx_get_txg(tx); 13095002558fSNeil Perrin 13105002558fSNeil Perrin itxg = &zilog->zl_itxg[txg & TXG_MASK]; 13115002558fSNeil Perrin mutex_enter(&itxg->itxg_lock); 13125002558fSNeil Perrin itxs = itxg->itxg_itxs; 13135002558fSNeil Perrin if (itxg->itxg_txg != txg) { 13145002558fSNeil Perrin if (itxs != NULL) { 13155002558fSNeil Perrin /* 13165002558fSNeil Perrin * The zil_clean callback hasn't got around to cleaning 13175002558fSNeil Perrin * this itxg. Save the itxs for release below. 13185002558fSNeil Perrin * This should be rare. 
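 * (The itxs saved in 'clean' are freed by the zil_itxg_clean() call at the
 * bottom of this function, after itxg_lock has been dropped.)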
13195002558fSNeil Perrin */ 132043297f97SGeorge Wilson zfs_dbgmsg("zil_itx_assign: missed itx cleanup for " 132143297f97SGeorge Wilson "txg %llu", itxg->itxg_txg); 13225002558fSNeil Perrin clean = itxg->itxg_itxs; 13235002558fSNeil Perrin } 13245002558fSNeil Perrin itxg->itxg_txg = txg; 13255002558fSNeil Perrin itxs = itxg->itxg_itxs = kmem_zalloc(sizeof (itxs_t), KM_SLEEP); 13265002558fSNeil Perrin 13275002558fSNeil Perrin list_create(&itxs->i_sync_list, sizeof (itx_t), 13285002558fSNeil Perrin offsetof(itx_t, itx_node)); 13295002558fSNeil Perrin avl_create(&itxs->i_async_tree, zil_aitx_compare, 13305002558fSNeil Perrin sizeof (itx_async_node_t), 13315002558fSNeil Perrin offsetof(itx_async_node_t, ia_node)); 13325002558fSNeil Perrin } 13335002558fSNeil Perrin if (itx->itx_sync) { 13345002558fSNeil Perrin list_insert_tail(&itxs->i_sync_list, itx); 13355002558fSNeil Perrin } else { 13365002558fSNeil Perrin avl_tree_t *t = &itxs->i_async_tree; 13375002558fSNeil Perrin uint64_t foid = ((lr_ooo_t *)&itx->itx_lr)->lr_foid; 13385002558fSNeil Perrin itx_async_node_t *ian; 13395002558fSNeil Perrin avl_index_t where; 13405002558fSNeil Perrin 13415002558fSNeil Perrin ian = avl_find(t, &foid, &where); 13425002558fSNeil Perrin if (ian == NULL) { 13435002558fSNeil Perrin ian = kmem_alloc(sizeof (itx_async_node_t), KM_SLEEP); 13445002558fSNeil Perrin list_create(&ian->ia_list, sizeof (itx_t), 13455002558fSNeil Perrin offsetof(itx_t, itx_node)); 13465002558fSNeil Perrin ian->ia_foid = foid; 13475002558fSNeil Perrin avl_insert(t, ian, where); 13485002558fSNeil Perrin } 13495002558fSNeil Perrin list_insert_tail(&ian->ia_list, itx); 13505002558fSNeil Perrin } 13515002558fSNeil Perrin 13525002558fSNeil Perrin itx->itx_lr.lrc_txg = dmu_tx_get_txg(tx); 1353ce636f8bSMatthew Ahrens zilog_dirty(zilog, txg); 13545002558fSNeil Perrin mutex_exit(&itxg->itxg_lock); 13555002558fSNeil Perrin 13565002558fSNeil Perrin /* Release the old itxs now that we've dropped the lock */ 13575002558fSNeil Perrin if (clean != NULL) 13585002558fSNeil Perrin zil_itxg_clean(clean); 13595002558fSNeil Perrin } 13605002558fSNeil Perrin 1361b19a79ecSperrin /* 136267bd71c6Sperrin * If there are any in-memory intent log transactions which have now been 1363ce636f8bSMatthew Ahrens * synced then start up a taskq to free them. We should only do this after we 1364ce636f8bSMatthew Ahrens * have written out the uberblocks (i.e. the txg has been committed) so that we 1365ce636f8bSMatthew Ahrens * don't inadvertently clean out in-memory log records that would be required 1366ce636f8bSMatthew Ahrens * by zil_commit().
1367b19a79ecSperrin */ 1368fa9e4066Sahrens void 13695002558fSNeil Perrin zil_clean(zilog_t *zilog, uint64_t synced_txg) 1370fa9e4066Sahrens { 13715002558fSNeil Perrin itxg_t *itxg = &zilog->zl_itxg[synced_txg & TXG_MASK]; 13725002558fSNeil Perrin itxs_t *clean_me; 137367bd71c6Sperrin 13745002558fSNeil Perrin mutex_enter(&itxg->itxg_lock); 13755002558fSNeil Perrin if (itxg->itxg_itxs == NULL || itxg->itxg_txg == ZILTEST_TXG) { 13765002558fSNeil Perrin mutex_exit(&itxg->itxg_lock); 13775002558fSNeil Perrin return; 13785002558fSNeil Perrin } 13795002558fSNeil Perrin ASSERT3U(itxg->itxg_txg, <=, synced_txg); 13805002558fSNeil Perrin ASSERT(itxg->itxg_txg != 0); 13815002558fSNeil Perrin ASSERT(zilog->zl_clean_taskq != NULL); 13825002558fSNeil Perrin clean_me = itxg->itxg_itxs; 13835002558fSNeil Perrin itxg->itxg_itxs = NULL; 13845002558fSNeil Perrin itxg->itxg_txg = 0; 13855002558fSNeil Perrin mutex_exit(&itxg->itxg_lock); 13865002558fSNeil Perrin /* 13875002558fSNeil Perrin * Preferably start a task queue to free up the old itxs but 13885002558fSNeil Perrin * if taskq_dispatch can't allocate resources to do that then 13895002558fSNeil Perrin * free them in-line. This should be rare. Note: using TQ_SLEEP 13905002558fSNeil Perrin * here created a bad performance problem. 13915002558fSNeil Perrin */ 13925002558fSNeil Perrin if (taskq_dispatch(zilog->zl_clean_taskq, 13935002558fSNeil Perrin (void (*)(void *))zil_itxg_clean, clean_me, TQ_NOSLEEP) == NULL) 13945002558fSNeil Perrin zil_itxg_clean(clean_me); 13955002558fSNeil Perrin } 13965002558fSNeil Perrin 13975002558fSNeil Perrin /* 13985002558fSNeil Perrin * Get the list of itxs to commit into zl_itx_commit_list. 13995002558fSNeil Perrin */ 140091de656bSNeil Perrin static void 14015002558fSNeil Perrin zil_get_commit_list(zilog_t *zilog) 14025002558fSNeil Perrin { 14035002558fSNeil Perrin uint64_t otxg, txg; 14045002558fSNeil Perrin list_t *commit_list = &zilog->zl_itx_commit_list; 14055002558fSNeil Perrin 14065002558fSNeil Perrin if (spa_freeze_txg(zilog->zl_spa) != UINT64_MAX) /* ziltest support */ 14075002558fSNeil Perrin otxg = ZILTEST_TXG; 14085002558fSNeil Perrin else 14095002558fSNeil Perrin otxg = spa_last_synced_txg(zilog->zl_spa) + 1; 14105002558fSNeil Perrin 141143297f97SGeorge Wilson /* 141243297f97SGeorge Wilson * This is inherently racy, since there is nothing to prevent 141343297f97SGeorge Wilson * the last synced txg from changing. That's okay since we'll 141443297f97SGeorge Wilson * only commit things in the future. 141543297f97SGeorge Wilson */ 14165002558fSNeil Perrin for (txg = otxg; txg < (otxg + TXG_CONCURRENT_STATES); txg++) { 14175002558fSNeil Perrin itxg_t *itxg = &zilog->zl_itxg[txg & TXG_MASK]; 14185002558fSNeil Perrin 14195002558fSNeil Perrin mutex_enter(&itxg->itxg_lock); 14205002558fSNeil Perrin if (itxg->itxg_txg != txg) { 14215002558fSNeil Perrin mutex_exit(&itxg->itxg_lock); 14225002558fSNeil Perrin continue; 14235002558fSNeil Perrin } 14245002558fSNeil Perrin 142543297f97SGeorge Wilson /* 142643297f97SGeorge Wilson * If we're adding itx records to the zl_itx_commit_list, 142743297f97SGeorge Wilson * then the zil better be dirty in this "txg". We can assert 142843297f97SGeorge Wilson * that here since we're holding the itxg_lock which will 142943297f97SGeorge Wilson * prevent spa_sync from cleaning it. Once we add the itxs 143043297f97SGeorge Wilson * to the zl_itx_commit_list we must commit them to disk even 143143297f97SGeorge Wilson * if it's unnecessary (i.e. the txg was synced).
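 * (The spa_freeze_txg() clause in the ASSERT below presumably exists for
 * frozen pools, i.e. ziltest, where itxs are staged in the fake ZILTEST_TXG
 * rather than in a txg the zil could have dirtied.)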
143243297f97SGeorge Wilson */ 143343297f97SGeorge Wilson ASSERT(zilog_is_dirty_in_txg(zilog, txg) || 143443297f97SGeorge Wilson spa_freeze_txg(zilog->zl_spa) != UINT64_MAX); 14355002558fSNeil Perrin list_move_tail(commit_list, &itxg->itxg_itxs->i_sync_list); 14365002558fSNeil Perrin 14375002558fSNeil Perrin mutex_exit(&itxg->itxg_lock); 14385002558fSNeil Perrin } 14395002558fSNeil Perrin } 14405002558fSNeil Perrin 14415002558fSNeil Perrin /* 14425002558fSNeil Perrin * Move the async itxs for a specified object to commit into sync lists. 14435002558fSNeil Perrin */ 144491de656bSNeil Perrin static void 14455002558fSNeil Perrin zil_async_to_sync(zilog_t *zilog, uint64_t foid) 14465002558fSNeil Perrin { 14475002558fSNeil Perrin uint64_t otxg, txg; 14485002558fSNeil Perrin itx_async_node_t *ian; 14495002558fSNeil Perrin avl_tree_t *t; 14505002558fSNeil Perrin avl_index_t where; 14515002558fSNeil Perrin 14525002558fSNeil Perrin if (spa_freeze_txg(zilog->zl_spa) != UINT64_MAX) /* ziltest support */ 14535002558fSNeil Perrin otxg = ZILTEST_TXG; 14545002558fSNeil Perrin else 14555002558fSNeil Perrin otxg = spa_last_synced_txg(zilog->zl_spa) + 1; 14565002558fSNeil Perrin 145743297f97SGeorge Wilson /* 145843297f97SGeorge Wilson * This is inherently racy, since there is nothing to prevent 145943297f97SGeorge Wilson * the last synced txg from changing. 146043297f97SGeorge Wilson */ 14615002558fSNeil Perrin for (txg = otxg; txg < (otxg + TXG_CONCURRENT_STATES); txg++) { 14625002558fSNeil Perrin itxg_t *itxg = &zilog->zl_itxg[txg & TXG_MASK]; 14635002558fSNeil Perrin 14645002558fSNeil Perrin mutex_enter(&itxg->itxg_lock); 14655002558fSNeil Perrin if (itxg->itxg_txg != txg) { 14665002558fSNeil Perrin mutex_exit(&itxg->itxg_lock); 14675002558fSNeil Perrin continue; 14685002558fSNeil Perrin } 14695002558fSNeil Perrin 14705002558fSNeil Perrin /* 14715002558fSNeil Perrin * If a foid is specified then find that node and append its 14725002558fSNeil Perrin * list. Otherwise walk the tree appending all the lists 14735002558fSNeil Perrin * to the sync list. We add to the end rather than the 14745002558fSNeil Perrin * beginning to ensure the create has happened. 
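 * For example, if an object is created and then written with itx_sync ==
 * B_FALSE, the TX_CREATE itx is already on i_sync_list; appending the async
 * writes after it preserves create-before-write ordering.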
14755002558fSNeil Perrin */ 14765002558fSNeil Perrin t = &itxg->itxg_itxs->i_async_tree; 14775002558fSNeil Perrin if (foid != 0) { 14785002558fSNeil Perrin ian = avl_find(t, &foid, &where); 14795002558fSNeil Perrin if (ian != NULL) { 14805002558fSNeil Perrin list_move_tail(&itxg->itxg_itxs->i_sync_list, 14815002558fSNeil Perrin &ian->ia_list); 14825002558fSNeil Perrin } 14835002558fSNeil Perrin } else { 14845002558fSNeil Perrin void *cookie = NULL; 14855002558fSNeil Perrin 14865002558fSNeil Perrin while ((ian = avl_destroy_nodes(t, &cookie)) != NULL) { 14875002558fSNeil Perrin list_move_tail(&itxg->itxg_itxs->i_sync_list, 14885002558fSNeil Perrin &ian->ia_list); 14895002558fSNeil Perrin list_destroy(&ian->ia_list); 14905002558fSNeil Perrin kmem_free(ian, sizeof (itx_async_node_t)); 14915002558fSNeil Perrin } 14925002558fSNeil Perrin } 14935002558fSNeil Perrin mutex_exit(&itxg->itxg_lock); 149467bd71c6Sperrin } 1495fa9e4066Sahrens } 1496fa9e4066Sahrens 1497e14bb325SJeff Bonwick static void 14985002558fSNeil Perrin zil_commit_writer(zilog_t *zilog) 1499fa9e4066Sahrens { 1500fa9e4066Sahrens uint64_t txg; 15015002558fSNeil Perrin itx_t *itx; 1502fa9e4066Sahrens lwb_t *lwb; 15035002558fSNeil Perrin spa_t *spa = zilog->zl_spa; 1504b24ab676SJeff Bonwick int error = 0; 1505fa9e4066Sahrens 1506e14bb325SJeff Bonwick ASSERT(zilog->zl_root_zio == NULL); 15075002558fSNeil Perrin 15085002558fSNeil Perrin mutex_exit(&zilog->zl_lock); 15095002558fSNeil Perrin 15105002558fSNeil Perrin zil_get_commit_list(zilog); 15115002558fSNeil Perrin 15125002558fSNeil Perrin /* 15135002558fSNeil Perrin * Return if there's nothing to commit before we dirty the fs by 15145002558fSNeil Perrin * calling zil_create(). 15155002558fSNeil Perrin */ 15165002558fSNeil Perrin if (list_head(&zilog->zl_itx_commit_list) == NULL) { 15175002558fSNeil Perrin mutex_enter(&zilog->zl_lock); 15185002558fSNeil Perrin return; 15195002558fSNeil Perrin } 1520fa9e4066Sahrens 1521fa9e4066Sahrens if (zilog->zl_suspend) { 1522fa9e4066Sahrens lwb = NULL; 1523fa9e4066Sahrens } else { 1524fa9e4066Sahrens lwb = list_tail(&zilog->zl_lwb_list); 15255002558fSNeil Perrin if (lwb == NULL) 15266e1f5caaSNeil Perrin lwb = zil_create(zilog); 1527fa9e4066Sahrens } 1528fa9e4066Sahrens 1529b19a79ecSperrin DTRACE_PROBE1(zil__cw1, zilog_t *, zilog); 15305002558fSNeil Perrin while (itx = list_head(&zilog->zl_itx_commit_list)) { 1531fa9e4066Sahrens txg = itx->itx_lr.lrc_txg; 153243297f97SGeorge Wilson ASSERT3U(txg, !=, 0); 1533fa9e4066Sahrens 153443297f97SGeorge Wilson /* 153543297f97SGeorge Wilson * This is inherently racy and may result in us writing 153643297f97SGeorge Wilson * out a log block for a txg that was just synced. This is 153743297f97SGeorge Wilson * ok since we'll end up cleaning that log block the next 153843297f97SGeorge Wilson * time we call zil_sync().
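 * (zil_sync() only frees an lwb once its buffer has been written out and
 * its lwb_max_txg has synced, so a redundantly written log block is simply
 * reclaimed on a later sync.)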
153943297f97SGeorge Wilson */ 15405002558fSNeil Perrin if (txg > spa_last_synced_txg(spa) || txg > spa_freeze_txg(spa)) 1541fa9e4066Sahrens lwb = zil_lwb_commit(zilog, itx, lwb); 15425002558fSNeil Perrin list_remove(&zilog->zl_itx_commit_list, itx); 15435002558fSNeil Perrin kmem_free(itx, offsetof(itx_t, itx_lr) 15445002558fSNeil Perrin + itx->itx_lr.lrc_reclen); 1545fa9e4066Sahrens } 1546b19a79ecSperrin DTRACE_PROBE1(zil__cw2, zilog_t *, zilog); 1547fa9e4066Sahrens 1548fa9e4066Sahrens /* write the last block out */ 154967bd71c6Sperrin if (lwb != NULL && lwb->lwb_zio != NULL) 1550fa9e4066Sahrens lwb = zil_lwb_write_start(zilog, lwb); 1551fa9e4066Sahrens 155222ac5be4Sperrin zilog->zl_cur_used = 0; 1553fa9e4066Sahrens 1554fa9e4066Sahrens /* 1555b19a79ecSperrin * Wait if necessary for the log blocks to be on stable storage. 1556fa9e4066Sahrens */ 1557b19a79ecSperrin if (zilog->zl_root_zio) { 1558b24ab676SJeff Bonwick error = zio_wait(zilog->zl_root_zio); 1559e14bb325SJeff Bonwick zilog->zl_root_zio = NULL; 156017f17c2dSbonwick zil_flush_vdevs(zilog); 1561fa9e4066Sahrens } 156222ac5be4Sperrin 1563b24ab676SJeff Bonwick if (error || lwb == NULL) 1564fa9e4066Sahrens txg_wait_synced(zilog->zl_dmu_pool, 0); 156567bd71c6Sperrin 156667bd71c6Sperrin mutex_enter(&zilog->zl_lock); 1567b24ab676SJeff Bonwick 1568b24ab676SJeff Bonwick /* 1569b24ab676SJeff Bonwick * Remember the highest committed log sequence number for ztest. 1570b24ab676SJeff Bonwick * We only update this value when all the log writes succeeded, 1571b24ab676SJeff Bonwick * because ztest wants to ASSERT that it got the whole log chain. 1572b24ab676SJeff Bonwick */ 1573b24ab676SJeff Bonwick if (error == 0 && lwb != NULL) 1574b24ab676SJeff Bonwick zilog->zl_commit_lr_seq = zilog->zl_lr_seq; 1575b19a79ecSperrin } 1576b19a79ecSperrin 1577b19a79ecSperrin /* 15785002558fSNeil Perrin * Commit zfs transactions to stable storage. 1579b19a79ecSperrin * If foid is 0 push out all transactions, otherwise push only those 15805002558fSNeil Perrin * for that object or that might reference that object. 15815002558fSNeil Perrin * 15825002558fSNeil Perrin * itxs are committed in batches. In a heavily stressed zil there will be 15835002558fSNeil Perrin * a commit writer thread that is writing out a bunch of itxs to the log 15845002558fSNeil Perrin * for a set of committing threads (cthreads) in the same batch as the writer. 15855002558fSNeil Perrin * Those cthreads are all waiting on the same cv for that batch. 15865002558fSNeil Perrin * 15875002558fSNeil Perrin * There will also be a different and growing batch of threads that are 15885002558fSNeil Perrin * waiting to commit (qthreads). When the committing batch completes, 15895002558fSNeil Perrin * a transition occurs such that the cthreads exit and the qthreads become 15905002558fSNeil Perrin * cthreads. One of the new cthreads becomes the writer thread for the 15915002558fSNeil Perrin * batch. Any new threads arriving become new qthreads. 15925002558fSNeil Perrin * 15935002558fSNeil Perrin * Only 2 condition variables are needed, and no transition 15945002558fSNeil Perrin * between the two cvs is required. They just flip-flop between qthreads 15955002558fSNeil Perrin * and cthreads. 15965002558fSNeil Perrin * 15975002558fSNeil Perrin * Using this scheme we can efficiently wake up only those threads 15985002558fSNeil Perrin * whose itxs have been committed.
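 * For example, while batch N is committing, its cthreads wait on
 * zl_cv_batch[N & 1] and newly arriving threads wait on
 * zl_cv_batch[(N + 1) & 1] as qthreads. When batch N completes,
 * zil_commit() cv_signal()s one qthread to become batch N+1's writer and
 * cv_broadcast()s zl_cv_batch[N & 1] to release batch N's cthreads.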
1599b19a79ecSperrin */ 1600b19a79ecSperrin void 16015002558fSNeil Perrin zil_commit(zilog_t *zilog, uint64_t foid) 1602b19a79ecSperrin { 16035002558fSNeil Perrin uint64_t mybatch; 1604b19a79ecSperrin 16055002558fSNeil Perrin if (zilog->zl_sync == ZFS_SYNC_DISABLED) 16065002558fSNeil Perrin return; 1607b19a79ecSperrin 16085002558fSNeil Perrin /* move the async itxs for the foid to the sync queues */ 16095002558fSNeil Perrin zil_async_to_sync(zilog, foid); 1610b19a79ecSperrin 16115002558fSNeil Perrin mutex_enter(&zilog->zl_lock); 16125002558fSNeil Perrin mybatch = zilog->zl_next_batch; 161367bd71c6Sperrin while (zilog->zl_writer) { 16145002558fSNeil Perrin cv_wait(&zilog->zl_cv_batch[mybatch & 1], &zilog->zl_lock); 16155002558fSNeil Perrin if (mybatch <= zilog->zl_com_batch) { 161667bd71c6Sperrin mutex_exit(&zilog->zl_lock); 161767bd71c6Sperrin return; 161867bd71c6Sperrin } 161967bd71c6Sperrin } 1620b24ab676SJeff Bonwick 16215002558fSNeil Perrin zilog->zl_next_batch++; 16225002558fSNeil Perrin zilog->zl_writer = B_TRUE; 16235002558fSNeil Perrin zil_commit_writer(zilog); 16245002558fSNeil Perrin zilog->zl_com_batch = mybatch; 16255002558fSNeil Perrin zilog->zl_writer = B_FALSE; 16265002558fSNeil Perrin mutex_exit(&zilog->zl_lock); 1627b24ab676SJeff Bonwick 16285002558fSNeil Perrin /* wake up one thread to become the next writer */ 16295002558fSNeil Perrin cv_signal(&zilog->zl_cv_batch[(mybatch+1) & 1]); 1630b24ab676SJeff Bonwick 16315002558fSNeil Perrin /* wake up all threads waiting for this batch to be committed */ 16325002558fSNeil Perrin cv_broadcast(&zilog->zl_cv_batch[mybatch & 1]); 1633b24ab676SJeff Bonwick } 1634b24ab676SJeff Bonwick 1635fa9e4066Sahrens /* 1636fa9e4066Sahrens * Called in syncing context to free committed log blocks and update log header. 1637fa9e4066Sahrens */ 1638fa9e4066Sahrens void 1639fa9e4066Sahrens zil_sync(zilog_t *zilog, dmu_tx_t *tx) 1640fa9e4066Sahrens { 1641d80c45e0Sbonwick zil_header_t *zh = zil_header_in_syncing_context(zilog); 1642fa9e4066Sahrens uint64_t txg = dmu_tx_get_txg(tx); 1643fa9e4066Sahrens spa_t *spa = zilog->zl_spa; 1644b24ab676SJeff Bonwick uint64_t *replayed_seq = &zilog->zl_replayed_seq[txg & TXG_MASK]; 1645fa9e4066Sahrens lwb_t *lwb; 1646fa9e4066Sahrens 164714843421SMatthew Ahrens /* 164814843421SMatthew Ahrens * We don't zero out zl_destroy_txg, so make sure we don't try 164914843421SMatthew Ahrens * to destroy it twice. 
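 * (spa_sync() can make several passes over a txg; the spa_sync_pass()
 * check below ensures this function does its work only on pass 1.)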
165014843421SMatthew Ahrens */ 165114843421SMatthew Ahrens if (spa_sync_pass(spa) != 1) 165214843421SMatthew Ahrens return; 165314843421SMatthew Ahrens 1654d80c45e0Sbonwick mutex_enter(&zilog->zl_lock); 1655d80c45e0Sbonwick 1656fa9e4066Sahrens ASSERT(zilog->zl_stop_sync == 0); 1657fa9e4066Sahrens 1658b24ab676SJeff Bonwick if (*replayed_seq != 0) { 1659b24ab676SJeff Bonwick ASSERT(zh->zh_replay_seq < *replayed_seq); 1660b24ab676SJeff Bonwick zh->zh_replay_seq = *replayed_seq; 1661b24ab676SJeff Bonwick *replayed_seq = 0; 1662b24ab676SJeff Bonwick } 1663fa9e4066Sahrens 1664fa9e4066Sahrens if (zilog->zl_destroy_txg == txg) { 1665d80c45e0Sbonwick blkptr_t blk = zh->zh_log; 1666d80c45e0Sbonwick 1667d80c45e0Sbonwick ASSERT(list_head(&zilog->zl_lwb_list) == NULL); 1668d80c45e0Sbonwick 1669d80c45e0Sbonwick bzero(zh, sizeof (zil_header_t)); 16701209a471SNeil Perrin bzero(zilog->zl_replayed_seq, sizeof (zilog->zl_replayed_seq)); 1671d80c45e0Sbonwick 1672d80c45e0Sbonwick if (zilog->zl_keep_first) { 1673d80c45e0Sbonwick /* 1674d80c45e0Sbonwick * If this block was part of log chain that couldn't 1675d80c45e0Sbonwick * be claimed because a device was missing during 1676d80c45e0Sbonwick * zil_claim(), but that device later returns, 1677d80c45e0Sbonwick * then this block could erroneously appear valid. 1678d80c45e0Sbonwick * To guard against this, assign a new GUID to the new 1679d80c45e0Sbonwick * log chain so it doesn't matter what blk points to. 1680d80c45e0Sbonwick */ 1681d80c45e0Sbonwick zil_init_log_chain(zilog, &blk); 1682d80c45e0Sbonwick zh->zh_log = blk; 1683d80c45e0Sbonwick } 1684fa9e4066Sahrens } 1685fa9e4066Sahrens 1686e6ca193dSGeorge Wilson while ((lwb = list_head(&zilog->zl_lwb_list)) != NULL) { 1687b19a79ecSperrin zh->zh_log = lwb->lwb_blk; 1688fa9e4066Sahrens if (lwb->lwb_buf != NULL || lwb->lwb_max_txg > txg) 1689fa9e4066Sahrens break; 1690fa9e4066Sahrens list_remove(&zilog->zl_lwb_list, lwb); 1691b24ab676SJeff Bonwick zio_free_zil(spa, txg, &lwb->lwb_blk); 1692fa9e4066Sahrens kmem_cache_free(zil_lwb_cache, lwb); 1693d63d470bSgw 1694d63d470bSgw /* 1695d63d470bSgw * If we don't have anything left in the lwb list then 1696d63d470bSgw * we've had an allocation failure and we need to zero 1697d63d470bSgw * out the zil_header blkptr so that we don't end 1698d63d470bSgw * up freeing the same block twice. 
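 * (A zeroed zh_log is a hole, which the rest of the ZIL code, e.g. the
 * BP_IS_HOLE() check in zil_suspend(), treats as "no log".)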
1699d63d470bSgw */ 1700d63d470bSgw if (list_head(&zilog->zl_lwb_list) == NULL) 1701d63d470bSgw BP_ZERO(&zh->zh_log); 1702fa9e4066Sahrens } 1703fa9e4066Sahrens mutex_exit(&zilog->zl_lock); 1704fa9e4066Sahrens } 1705fa9e4066Sahrens 1706fa9e4066Sahrens void 1707fa9e4066Sahrens zil_init(void) 1708fa9e4066Sahrens { 1709fa9e4066Sahrens zil_lwb_cache = kmem_cache_create("zil_lwb_cache", 17105ad82045Snd sizeof (struct lwb), 0, NULL, NULL, NULL, NULL, NULL, 0); 1711fa9e4066Sahrens } 1712fa9e4066Sahrens 1713fa9e4066Sahrens void 1714fa9e4066Sahrens zil_fini(void) 1715fa9e4066Sahrens { 1716fa9e4066Sahrens kmem_cache_destroy(zil_lwb_cache); 1717fa9e4066Sahrens } 1718fa9e4066Sahrens 171955da60b9SMark J Musante void 172055da60b9SMark J Musante zil_set_sync(zilog_t *zilog, uint64_t sync) 172155da60b9SMark J Musante { 172255da60b9SMark J Musante zilog->zl_sync = sync; 172355da60b9SMark J Musante } 172455da60b9SMark J Musante 1725e09fa4daSNeil Perrin void 1726e09fa4daSNeil Perrin zil_set_logbias(zilog_t *zilog, uint64_t logbias) 1727e09fa4daSNeil Perrin { 1728e09fa4daSNeil Perrin zilog->zl_logbias = logbias; 1729e09fa4daSNeil Perrin } 1730e09fa4daSNeil Perrin 1731fa9e4066Sahrens zilog_t * 1732fa9e4066Sahrens zil_alloc(objset_t *os, zil_header_t *zh_phys) 1733fa9e4066Sahrens { 1734fa9e4066Sahrens zilog_t *zilog; 1735fa9e4066Sahrens 1736fa9e4066Sahrens zilog = kmem_zalloc(sizeof (zilog_t), KM_SLEEP); 1737fa9e4066Sahrens 1738fa9e4066Sahrens zilog->zl_header = zh_phys; 1739fa9e4066Sahrens zilog->zl_os = os; 1740fa9e4066Sahrens zilog->zl_spa = dmu_objset_spa(os); 1741fa9e4066Sahrens zilog->zl_dmu_pool = dmu_objset_pool(os); 1742d80c45e0Sbonwick zilog->zl_destroy_txg = TXG_INITIAL - 1; 1743e09fa4daSNeil Perrin zilog->zl_logbias = dmu_objset_logbias(os); 174455da60b9SMark J Musante zilog->zl_sync = dmu_objset_syncprop(os); 17455002558fSNeil Perrin zilog->zl_next_batch = 1; 1746fa9e4066Sahrens 17475ad82045Snd mutex_init(&zilog->zl_lock, NULL, MUTEX_DEFAULT, NULL); 17485ad82045Snd 17495002558fSNeil Perrin for (int i = 0; i < TXG_SIZE; i++) { 17505002558fSNeil Perrin mutex_init(&zilog->zl_itxg[i].itxg_lock, NULL, 17515002558fSNeil Perrin MUTEX_DEFAULT, NULL); 17525002558fSNeil Perrin } 1753fa9e4066Sahrens 1754fa9e4066Sahrens list_create(&zilog->zl_lwb_list, sizeof (lwb_t), 1755fa9e4066Sahrens offsetof(lwb_t, lwb_node)); 1756fa9e4066Sahrens 17575002558fSNeil Perrin list_create(&zilog->zl_itx_commit_list, sizeof (itx_t), 17585002558fSNeil Perrin offsetof(itx_t, itx_node)); 17595002558fSNeil Perrin 176017f17c2dSbonwick mutex_init(&zilog->zl_vdev_lock, NULL, MUTEX_DEFAULT, NULL); 176117f17c2dSbonwick 176217f17c2dSbonwick avl_create(&zilog->zl_vdev_tree, zil_vdev_compare, 176317f17c2dSbonwick sizeof (zil_vdev_node_t), offsetof(zil_vdev_node_t, zv_node)); 1764fa9e4066Sahrens 1765b7b97454Sperrin cv_init(&zilog->zl_cv_writer, NULL, CV_DEFAULT, NULL); 1766b7b97454Sperrin cv_init(&zilog->zl_cv_suspend, NULL, CV_DEFAULT, NULL); 17675002558fSNeil Perrin cv_init(&zilog->zl_cv_batch[0], NULL, CV_DEFAULT, NULL); 17685002558fSNeil Perrin cv_init(&zilog->zl_cv_batch[1], NULL, CV_DEFAULT, NULL); 1769b7b97454Sperrin 1770fa9e4066Sahrens return (zilog); 1771fa9e4066Sahrens } 1772fa9e4066Sahrens 1773fa9e4066Sahrens void 1774fa9e4066Sahrens zil_free(zilog_t *zilog) 1775fa9e4066Sahrens { 1776fa9e4066Sahrens zilog->zl_stop_sync = 1; 1777fa9e4066Sahrens 17783b2aab18SMatthew Ahrens ASSERT0(zilog->zl_suspend); 17793b2aab18SMatthew Ahrens ASSERT0(zilog->zl_suspending); 17803b2aab18SMatthew Ahrens 1781c9ba2a43SEric Schrock 
ASSERT(list_is_empty(&zilog->zl_lwb_list)); 1782fa9e4066Sahrens list_destroy(&zilog->zl_lwb_list); 1783fa9e4066Sahrens 178417f17c2dSbonwick avl_destroy(&zilog->zl_vdev_tree); 178517f17c2dSbonwick mutex_destroy(&zilog->zl_vdev_lock); 1786fa9e4066Sahrens 17875002558fSNeil Perrin ASSERT(list_is_empty(&zilog->zl_itx_commit_list)); 17885002558fSNeil Perrin list_destroy(&zilog->zl_itx_commit_list); 17895002558fSNeil Perrin 17905002558fSNeil Perrin for (int i = 0; i < TXG_SIZE; i++) { 17915002558fSNeil Perrin /* 17925002558fSNeil Perrin * It's possible for an itx to be generated that doesn't dirty 17935002558fSNeil Perrin * a txg (e.g. ztest TX_TRUNCATE). So there's no zil_clean() 17945002558fSNeil Perrin * callback to remove the entry. We remove those here. 17955002558fSNeil Perrin * 17965002558fSNeil Perrin * Also free up the ziltest itxs. 17975002558fSNeil Perrin */ 17985002558fSNeil Perrin if (zilog->zl_itxg[i].itxg_itxs) 17995002558fSNeil Perrin zil_itxg_clean(zilog->zl_itxg[i].itxg_itxs); 18005002558fSNeil Perrin mutex_destroy(&zilog->zl_itxg[i].itxg_lock); 18015002558fSNeil Perrin } 18025002558fSNeil Perrin 18035ad82045Snd mutex_destroy(&zilog->zl_lock); 1804fa9e4066Sahrens 1805b7b97454Sperrin cv_destroy(&zilog->zl_cv_writer); 1806b7b97454Sperrin cv_destroy(&zilog->zl_cv_suspend); 18075002558fSNeil Perrin cv_destroy(&zilog->zl_cv_batch[0]); 18085002558fSNeil Perrin cv_destroy(&zilog->zl_cv_batch[1]); 1809b7b97454Sperrin 1810fa9e4066Sahrens kmem_free(zilog, sizeof (zilog_t)); 1811fa9e4066Sahrens } 1812fa9e4066Sahrens 1813fa9e4066Sahrens /* 1814fa9e4066Sahrens * Open an intent log. 1815fa9e4066Sahrens */ 1816fa9e4066Sahrens zilog_t * 1817fa9e4066Sahrens zil_open(objset_t *os, zil_get_data_t *get_data) 1818fa9e4066Sahrens { 1819fa9e4066Sahrens zilog_t *zilog = dmu_objset_zil(os); 1820fa9e4066Sahrens 1821c9ba2a43SEric Schrock ASSERT(zilog->zl_clean_taskq == NULL); 1822c9ba2a43SEric Schrock ASSERT(zilog->zl_get_data == NULL); 1823c9ba2a43SEric Schrock ASSERT(list_is_empty(&zilog->zl_lwb_list)); 1824c9ba2a43SEric Schrock 1825fa9e4066Sahrens zilog->zl_get_data = get_data; 1826fa9e4066Sahrens zilog->zl_clean_taskq = taskq_create("zil_clean", 1, minclsyspri, 1827fa9e4066Sahrens 2, 2, TASKQ_PREPOPULATE); 1828fa9e4066Sahrens 1829fa9e4066Sahrens return (zilog); 1830fa9e4066Sahrens } 1831fa9e4066Sahrens 1832fa9e4066Sahrens /* 1833fa9e4066Sahrens * Close an intent log. 1834fa9e4066Sahrens */ 1835fa9e4066Sahrens void 1836fa9e4066Sahrens zil_close(zilog_t *zilog) 1837fa9e4066Sahrens { 1838c9ba2a43SEric Schrock lwb_t *lwb; 18395002558fSNeil Perrin uint64_t txg = 0; 18405002558fSNeil Perrin 18415002558fSNeil Perrin zil_commit(zilog, 0); /* commit all itx */ 18425002558fSNeil Perrin 1843d80c45e0Sbonwick /* 18445002558fSNeil Perrin * The lwb_max_txg for the stubby lwb will reflect the last activity 18455002558fSNeil Perrin * for the zil. After a txg_wait_synced() on the txg we know all the 18465002558fSNeil Perrin * callbacks have occurred that may clean the zil. Only then can we 18475002558fSNeil Perrin * destroy the zl_clean_taskq. 
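 * (A txg of 0 below means no lwb activity was recorded, in which case the
 * txg_wait_synced() call is skipped.)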
1848d80c45e0Sbonwick */ 18495002558fSNeil Perrin mutex_enter(&zilog->zl_lock); 1850c9ba2a43SEric Schrock lwb = list_tail(&zilog->zl_lwb_list); 1851c9ba2a43SEric Schrock if (lwb != NULL) 1852c9ba2a43SEric Schrock txg = lwb->lwb_max_txg; 18535002558fSNeil Perrin mutex_exit(&zilog->zl_lock); 18545002558fSNeil Perrin if (txg) 1855d80c45e0Sbonwick txg_wait_synced(zilog->zl_dmu_pool, txg); 185643297f97SGeorge Wilson 185743297f97SGeorge Wilson if (zilog_is_dirty(zilog)) 185843297f97SGeorge Wilson zfs_dbgmsg("zil (%p) is dirty, txg %llu", zilog, txg); 185943297f97SGeorge Wilson VERIFY(!zilog_is_dirty(zilog)); 1860d80c45e0Sbonwick 1861fa9e4066Sahrens taskq_destroy(zilog->zl_clean_taskq); 1862fa9e4066Sahrens zilog->zl_clean_taskq = NULL; 1863fa9e4066Sahrens zilog->zl_get_data = NULL; 1864c9ba2a43SEric Schrock 1865c9ba2a43SEric Schrock /* 1866c9ba2a43SEric Schrock * We should have only one LWB left on the list; remove it now. 1867c9ba2a43SEric Schrock */ 1868c9ba2a43SEric Schrock mutex_enter(&zilog->zl_lock); 1869c9ba2a43SEric Schrock lwb = list_head(&zilog->zl_lwb_list); 1870c9ba2a43SEric Schrock if (lwb != NULL) { 1871c9ba2a43SEric Schrock ASSERT(lwb == list_tail(&zilog->zl_lwb_list)); 1872c9ba2a43SEric Schrock list_remove(&zilog->zl_lwb_list, lwb); 1873c9ba2a43SEric Schrock zio_buf_free(lwb->lwb_buf, lwb->lwb_sz); 1874c9ba2a43SEric Schrock kmem_cache_free(zil_lwb_cache, lwb); 1875c9ba2a43SEric Schrock } 1876c9ba2a43SEric Schrock mutex_exit(&zilog->zl_lock); 1877fa9e4066Sahrens } 1878fa9e4066Sahrens 18793b2aab18SMatthew Ahrens static char *suspend_tag = "zil suspending"; 18803b2aab18SMatthew Ahrens 1881fa9e4066Sahrens /* 1882fa9e4066Sahrens * Suspend an intent log. While in suspended mode, we still honor 1883fa9e4066Sahrens * synchronous semantics, but we rely on txg_wait_synced() to do it. 18843b2aab18SMatthew Ahrens * On old version pools, we suspend the log briefly when taking a 18853b2aab18SMatthew Ahrens * snapshot so that it will have an empty intent log. 18863b2aab18SMatthew Ahrens * 18873b2aab18SMatthew Ahrens * Long holds are not really intended to be used the way we do here -- 18883b2aab18SMatthew Ahrens * held for such a short time. A concurrent caller of dsl_dataset_long_held() 18893b2aab18SMatthew Ahrens * could fail. Therefore we take pains to only put a long hold if it is 18903b2aab18SMatthew Ahrens * actually necessary. Fortunately, it will only be necessary if the 18913b2aab18SMatthew Ahrens * objset is currently mounted (or the ZVOL equivalent). In that case it 18923b2aab18SMatthew Ahrens * will already have a long hold, so we are not really making things any worse. 18933b2aab18SMatthew Ahrens * 18943b2aab18SMatthew Ahrens * Ideally, we would locate the existing long-holder (i.e. the zfsvfs_t or 18953b2aab18SMatthew Ahrens * zvol_state_t), and use their mechanism to prevent their hold from being 18963b2aab18SMatthew Ahrens * dropped (e.g. VFS_HOLD()). However, that would be even more pain for 18973b2aab18SMatthew Ahrens * very little gain. 18983b2aab18SMatthew Ahrens * 18993b2aab18SMatthew Ahrens * if cookiep == NULL, this does both the suspend & resume. 19003b2aab18SMatthew Ahrens * Otherwise, it returns with the dataset "long held", and the cookie 19013b2aab18SMatthew Ahrens * should be passed into zil_resume(). 
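 * A minimal usage sketch (illustrative only; error handling elided):
 *
 *	void *cookie;
 *	if (zil_suspend(osname, &cookie) == 0) {
 *		...	log is now empty and the dataset is long held
 *		zil_resume(cookie);
 *	}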
1902fa9e4066Sahrens */ 1903fa9e4066Sahrens int 19043b2aab18SMatthew Ahrens zil_suspend(const char *osname, void **cookiep) 1905fa9e4066Sahrens { 19063b2aab18SMatthew Ahrens objset_t *os; 19073b2aab18SMatthew Ahrens zilog_t *zilog; 19083b2aab18SMatthew Ahrens const zil_header_t *zh; 19093b2aab18SMatthew Ahrens int error; 19103b2aab18SMatthew Ahrens 19113b2aab18SMatthew Ahrens error = dmu_objset_hold(osname, suspend_tag, &os); 19123b2aab18SMatthew Ahrens if (error != 0) 19133b2aab18SMatthew Ahrens return (error); 19143b2aab18SMatthew Ahrens zilog = dmu_objset_zil(os); 1915fa9e4066Sahrens 1916fa9e4066Sahrens mutex_enter(&zilog->zl_lock); 19173b2aab18SMatthew Ahrens zh = zilog->zl_header; 19183b2aab18SMatthew Ahrens 19193589c4f0SNeil Perrin if (zh->zh_flags & ZIL_REPLAY_NEEDED) { /* unplayed log */ 1920fa9e4066Sahrens mutex_exit(&zilog->zl_lock); 19213b2aab18SMatthew Ahrens dmu_objset_rele(os, suspend_tag); 1922be6fd75aSMatthew Ahrens return (SET_ERROR(EBUSY)); 1923fa9e4066Sahrens } 19243b2aab18SMatthew Ahrens 19253b2aab18SMatthew Ahrens /* 19263b2aab18SMatthew Ahrens * Don't put a long hold in the cases where we can avoid it. This 19273b2aab18SMatthew Ahrens * is when there is no cookie so we are doing a suspend & resume 19283b2aab18SMatthew Ahrens * (i.e. called from zil_vdev_offline()), and there's nothing to do 19293b2aab18SMatthew Ahrens * for the suspend because it's already suspended, or there's no ZIL. 19303b2aab18SMatthew Ahrens */ 19313b2aab18SMatthew Ahrens if (cookiep == NULL && !zilog->zl_suspending && 19323b2aab18SMatthew Ahrens (zilog->zl_suspend > 0 || BP_IS_HOLE(&zh->zh_log))) { 19333b2aab18SMatthew Ahrens mutex_exit(&zilog->zl_lock); 19343b2aab18SMatthew Ahrens dmu_objset_rele(os, suspend_tag); 19353b2aab18SMatthew Ahrens return (0); 19363b2aab18SMatthew Ahrens } 19373b2aab18SMatthew Ahrens 19383b2aab18SMatthew Ahrens dsl_dataset_long_hold(dmu_objset_ds(os), suspend_tag); 19393b2aab18SMatthew Ahrens dsl_pool_rele(dmu_objset_pool(os), suspend_tag); 19403b2aab18SMatthew Ahrens 19413b2aab18SMatthew Ahrens zilog->zl_suspend++; 19423b2aab18SMatthew Ahrens 19433b2aab18SMatthew Ahrens if (zilog->zl_suspend > 1) { 1944d80c45e0Sbonwick /* 19453b2aab18SMatthew Ahrens * Someone else is already suspending it. 1946d80c45e0Sbonwick * Just wait for them to finish. 1947d80c45e0Sbonwick */ 19483b2aab18SMatthew Ahrens 1949d80c45e0Sbonwick while (zilog->zl_suspending) 1950d80c45e0Sbonwick cv_wait(&zilog->zl_cv_suspend, &zilog->zl_lock); 1951d80c45e0Sbonwick mutex_exit(&zilog->zl_lock); 19523b2aab18SMatthew Ahrens 19533b2aab18SMatthew Ahrens if (cookiep == NULL) 19543b2aab18SMatthew Ahrens zil_resume(os); 19553b2aab18SMatthew Ahrens else 19563b2aab18SMatthew Ahrens *cookiep = os; 19573b2aab18SMatthew Ahrens return (0); 19583b2aab18SMatthew Ahrens } 19593b2aab18SMatthew Ahrens 19603b2aab18SMatthew Ahrens /* 19613b2aab18SMatthew Ahrens * If there is no pointer to an on-disk block, this ZIL must not 19623b2aab18SMatthew Ahrens * be active (e.g. filesystem not mounted), so there's nothing 19633b2aab18SMatthew Ahrens * to clean up. 
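 * (This is why the BP_IS_HOLE() case below can hand back the cookie and
 * return immediately: there is no log chain to commit or destroy.)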
19643b2aab18SMatthew Ahrens */ 19653b2aab18SMatthew Ahrens if (BP_IS_HOLE(&zh->zh_log)) { 19663b2aab18SMatthew Ahrens ASSERT(cookiep != NULL); /* fast path already handled */ 19673b2aab18SMatthew Ahrens 19683b2aab18SMatthew Ahrens *cookiep = os; 19693b2aab18SMatthew Ahrens mutex_exit(&zilog->zl_lock); 1970d80c45e0Sbonwick return (0); 1971d80c45e0Sbonwick } 19723b2aab18SMatthew Ahrens 1973d80c45e0Sbonwick zilog->zl_suspending = B_TRUE; 1974fa9e4066Sahrens mutex_exit(&zilog->zl_lock); 1975fa9e4066Sahrens 19765002558fSNeil Perrin zil_commit(zilog, 0); 1977fa9e4066Sahrens 1978d80c45e0Sbonwick zil_destroy(zilog, B_FALSE); 1979d80c45e0Sbonwick 1980d80c45e0Sbonwick mutex_enter(&zilog->zl_lock); 1981d80c45e0Sbonwick zilog->zl_suspending = B_FALSE; 1982d80c45e0Sbonwick cv_broadcast(&zilog->zl_cv_suspend); 1983d80c45e0Sbonwick mutex_exit(&zilog->zl_lock); 1984fa9e4066Sahrens 19853b2aab18SMatthew Ahrens if (cookiep == NULL) 19863b2aab18SMatthew Ahrens zil_resume(os); 19873b2aab18SMatthew Ahrens else 19883b2aab18SMatthew Ahrens *cookiep = os; 1989fa9e4066Sahrens return (0); 1990fa9e4066Sahrens } 1991fa9e4066Sahrens 1992fa9e4066Sahrens void 19933b2aab18SMatthew Ahrens zil_resume(void *cookie) 1994fa9e4066Sahrens { 19953b2aab18SMatthew Ahrens objset_t *os = cookie; 19963b2aab18SMatthew Ahrens zilog_t *zilog = dmu_objset_zil(os); 19973b2aab18SMatthew Ahrens 1998fa9e4066Sahrens mutex_enter(&zilog->zl_lock); 1999fa9e4066Sahrens ASSERT(zilog->zl_suspend != 0); 2000fa9e4066Sahrens zilog->zl_suspend--; 2001fa9e4066Sahrens mutex_exit(&zilog->zl_lock); 20023b2aab18SMatthew Ahrens dsl_dataset_long_rele(dmu_objset_ds(os), suspend_tag); 20033b2aab18SMatthew Ahrens dsl_dataset_rele(dmu_objset_ds(os), suspend_tag); 2004fa9e4066Sahrens } 2005fa9e4066Sahrens 2006fa9e4066Sahrens typedef struct zil_replay_arg { 2007fa9e4066Sahrens zil_replay_func_t **zr_replay; 2008fa9e4066Sahrens void *zr_arg; 2009fa9e4066Sahrens boolean_t zr_byteswap; 2010b24ab676SJeff Bonwick char *zr_lr; 2011fa9e4066Sahrens } zil_replay_arg_t; 2012fa9e4066Sahrens 2013b24ab676SJeff Bonwick static int 2014b24ab676SJeff Bonwick zil_replay_error(zilog_t *zilog, lr_t *lr, int error) 2015b24ab676SJeff Bonwick { 20169adfa60dSMatthew Ahrens char name[ZFS_MAX_DATASET_NAME_LEN]; 2017b24ab676SJeff Bonwick 2018b24ab676SJeff Bonwick zilog->zl_replaying_seq--; /* didn't actually replay this one */ 2019b24ab676SJeff Bonwick 2020b24ab676SJeff Bonwick dmu_objset_name(zilog->zl_os, name); 2021b24ab676SJeff Bonwick 2022b24ab676SJeff Bonwick cmn_err(CE_WARN, "ZFS replay transaction error %d, " 2023b24ab676SJeff Bonwick "dataset %s, seq 0x%llx, txtype %llu %s\n", error, name, 2024b24ab676SJeff Bonwick (u_longlong_t)lr->lrc_seq, 2025b24ab676SJeff Bonwick (u_longlong_t)(lr->lrc_txtype & ~TX_CI), 2026b24ab676SJeff Bonwick (lr->lrc_txtype & TX_CI) ? 
"CI" : ""); 2027b24ab676SJeff Bonwick 2028b24ab676SJeff Bonwick return (error); 2029b24ab676SJeff Bonwick } 2030b24ab676SJeff Bonwick 2031b24ab676SJeff Bonwick static int 2032fa9e4066Sahrens zil_replay_log_record(zilog_t *zilog, lr_t *lr, void *zra, uint64_t claim_txg) 2033fa9e4066Sahrens { 2034fa9e4066Sahrens zil_replay_arg_t *zr = zra; 2035d80c45e0Sbonwick const zil_header_t *zh = zilog->zl_header; 2036fa9e4066Sahrens uint64_t reclen = lr->lrc_reclen; 2037fa9e4066Sahrens uint64_t txtype = lr->lrc_txtype; 2038b24ab676SJeff Bonwick int error = 0; 2039fa9e4066Sahrens 2040b24ab676SJeff Bonwick zilog->zl_replaying_seq = lr->lrc_seq; 2041fa9e4066Sahrens 2042fa9e4066Sahrens if (lr->lrc_seq <= zh->zh_replay_seq) /* already replayed */ 2043b24ab676SJeff Bonwick return (0); 2044b24ab676SJeff Bonwick 2045b24ab676SJeff Bonwick if (lr->lrc_txg < claim_txg) /* already committed */ 2046b24ab676SJeff Bonwick return (0); 2047fa9e4066Sahrens 2048da6c28aaSamw /* Strip case-insensitive bit, still present in log record */ 2049da6c28aaSamw txtype &= ~TX_CI; 2050da6c28aaSamw 2051b24ab676SJeff Bonwick if (txtype == 0 || txtype >= TX_MAX_TYPE) 2052b24ab676SJeff Bonwick return (zil_replay_error(zilog, lr, EINVAL)); 2053b24ab676SJeff Bonwick 2054b24ab676SJeff Bonwick /* 2055b24ab676SJeff Bonwick * If this record type can be logged out of order, the object 2056b24ab676SJeff Bonwick * (lr_foid) may no longer exist. That's legitimate, not an error. 2057b24ab676SJeff Bonwick */ 2058b24ab676SJeff Bonwick if (TX_OOO(txtype)) { 2059b24ab676SJeff Bonwick error = dmu_object_info(zilog->zl_os, 2060b24ab676SJeff Bonwick ((lr_ooo_t *)lr)->lr_foid, NULL); 2061b24ab676SJeff Bonwick if (error == ENOENT || error == EEXIST) 2062b24ab676SJeff Bonwick return (0); 20631209a471SNeil Perrin } 20641209a471SNeil Perrin 2065fa9e4066Sahrens /* 2066fa9e4066Sahrens * Make a copy of the data so we can revise and extend it. 2067fa9e4066Sahrens */ 2068b24ab676SJeff Bonwick bcopy(lr, zr->zr_lr, reclen); 2069b24ab676SJeff Bonwick 2070b24ab676SJeff Bonwick /* 2071b24ab676SJeff Bonwick * If this is a TX_WRITE with a blkptr, suck in the data. 2072b24ab676SJeff Bonwick */ 2073b24ab676SJeff Bonwick if (txtype == TX_WRITE && reclen == sizeof (lr_write_t)) { 2074b24ab676SJeff Bonwick error = zil_read_log_data(zilog, (lr_write_t *)lr, 2075b24ab676SJeff Bonwick zr->zr_lr + reclen); 20763b2aab18SMatthew Ahrens if (error != 0) 2077b24ab676SJeff Bonwick return (zil_replay_error(zilog, lr, error)); 2078b24ab676SJeff Bonwick } 2079fa9e4066Sahrens 2080fa9e4066Sahrens /* 2081fa9e4066Sahrens * The log block containing this lr may have been byteswapped 2082fa9e4066Sahrens * so that we can easily examine common fields like lrc_txtype. 2083b24ab676SJeff Bonwick * However, the log is a mix of different record types, and only the 2084fa9e4066Sahrens * replay vectors know how to byteswap their records. Therefore, if 2085fa9e4066Sahrens * the lr was byteswapped, undo it before invoking the replay vector. 2086fa9e4066Sahrens */ 2087fa9e4066Sahrens if (zr->zr_byteswap) 2088b24ab676SJeff Bonwick byteswap_uint64_array(zr->zr_lr, reclen); 2089fa9e4066Sahrens 2090fa9e4066Sahrens /* 2091fa9e4066Sahrens * We must now do two things atomically: replay this log record, 20921209a471SNeil Perrin * and update the log header sequence number to reflect the fact that 20931209a471SNeil Perrin * we did so. At the end of each replay function the sequence number 20941209a471SNeil Perrin * is updated if we are in replay mode. 
2095fa9e4066Sahrens */ 2096b24ab676SJeff Bonwick error = zr->zr_replay[txtype](zr->zr_arg, zr->zr_lr, zr->zr_byteswap); 20973b2aab18SMatthew Ahrens if (error != 0) { 209867bd71c6Sperrin /* 209967bd71c6Sperrin * The DMU's dnode layer doesn't see removes until the txg 210067bd71c6Sperrin * commits, so a subsequent claim can spuriously fail with 21011209a471SNeil Perrin * EEXIST. So if we receive any error we try syncing out 2102b24ab676SJeff Bonwick * any removes then retry the transaction. Note that we 2103b24ab676SJeff Bonwick * specify B_FALSE for byteswap now, so we don't do it twice. 210467bd71c6Sperrin */ 2105b24ab676SJeff Bonwick txg_wait_synced(spa_get_dsl(zilog->zl_spa), 0); 2106b24ab676SJeff Bonwick error = zr->zr_replay[txtype](zr->zr_arg, zr->zr_lr, B_FALSE); 21073b2aab18SMatthew Ahrens if (error != 0) 2108b24ab676SJeff Bonwick return (zil_replay_error(zilog, lr, error)); 2109fa9e4066Sahrens } 2110b24ab676SJeff Bonwick return (0); 211167bd71c6Sperrin } 2112fa9e4066Sahrens 211367bd71c6Sperrin /* ARGSUSED */ 2114b24ab676SJeff Bonwick static int 211567bd71c6Sperrin zil_incr_blks(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg) 211667bd71c6Sperrin { 211767bd71c6Sperrin zilog->zl_replay_blks++; 2118b24ab676SJeff Bonwick 2119b24ab676SJeff Bonwick return (0); 2120fa9e4066Sahrens } 2121fa9e4066Sahrens 2122fa9e4066Sahrens /* 212313f5297eSperrin * If this dataset has a non-empty intent log, replay it and destroy it. 2124fa9e4066Sahrens */ 2125fa9e4066Sahrens void 21261209a471SNeil Perrin zil_replay(objset_t *os, void *arg, zil_replay_func_t *replay_func[TX_MAX_TYPE]) 2127fa9e4066Sahrens { 2128fa9e4066Sahrens zilog_t *zilog = dmu_objset_zil(os); 2129d80c45e0Sbonwick const zil_header_t *zh = zilog->zl_header; 2130d80c45e0Sbonwick zil_replay_arg_t zr; 213113f5297eSperrin 21323589c4f0SNeil Perrin if ((zh->zh_flags & ZIL_REPLAY_NEEDED) == 0) { 2133d80c45e0Sbonwick zil_destroy(zilog, B_TRUE); 213413f5297eSperrin return; 213513f5297eSperrin } 2136fa9e4066Sahrens 2137fa9e4066Sahrens zr.zr_replay = replay_func; 2138fa9e4066Sahrens zr.zr_arg = arg; 2139d80c45e0Sbonwick zr.zr_byteswap = BP_SHOULD_BYTESWAP(&zh->zh_log); 2140b24ab676SJeff Bonwick zr.zr_lr = kmem_alloc(2 * SPA_MAXBLOCKSIZE, KM_SLEEP); 2141fa9e4066Sahrens 2142fa9e4066Sahrens /* 2143fa9e4066Sahrens * Wait for in-progress removes to sync before starting replay. 
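 * (This is the same race described for EEXIST in zil_replay_log_record():
 * the DMU's dnode layer doesn't see removes until the txg commits.)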
2144fa9e4066Sahrens */ 2145fa9e4066Sahrens txg_wait_synced(zilog->zl_dmu_pool, 0); 2146fa9e4066Sahrens 21471209a471SNeil Perrin zilog->zl_replay = B_TRUE; 2148d3d50737SRafael Vanoni zilog->zl_replay_time = ddi_get_lbolt(); 214967bd71c6Sperrin ASSERT(zilog->zl_replay_blks == 0); 215067bd71c6Sperrin (void) zil_parse(zilog, zil_incr_blks, zil_replay_log_record, &zr, 2151d80c45e0Sbonwick zh->zh_claim_txg); 2152b24ab676SJeff Bonwick kmem_free(zr.zr_lr, 2 * SPA_MAXBLOCKSIZE); 2153fa9e4066Sahrens 2154d80c45e0Sbonwick zil_destroy(zilog, B_FALSE); 2155a4611edeSahrens txg_wait_synced(zilog->zl_dmu_pool, zilog->zl_destroy_txg); 21561209a471SNeil Perrin zilog->zl_replay = B_FALSE; 2157fa9e4066Sahrens } 2158436b2950Sperrin 2159b24ab676SJeff Bonwick boolean_t 2160b24ab676SJeff Bonwick zil_replaying(zilog_t *zilog, dmu_tx_t *tx) 2161436b2950Sperrin { 216255da60b9SMark J Musante if (zilog->zl_sync == ZFS_SYNC_DISABLED) 2163b24ab676SJeff Bonwick return (B_TRUE); 2164436b2950Sperrin 2165b24ab676SJeff Bonwick if (zilog->zl_replay) { 2166b24ab676SJeff Bonwick dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx); 2167b24ab676SJeff Bonwick zilog->zl_replayed_seq[dmu_tx_get_txg(tx) & TXG_MASK] = 2168b24ab676SJeff Bonwick zilog->zl_replaying_seq; 2169b24ab676SJeff Bonwick return (B_TRUE); 2170b19a79ecSperrin } 2171b19a79ecSperrin 2172b24ab676SJeff Bonwick return (B_FALSE); 2173436b2950Sperrin } 2174e6ca193dSGeorge Wilson 2175e6ca193dSGeorge Wilson /* ARGSUSED */ 2176e6ca193dSGeorge Wilson int 2177fd136879SMatthew Ahrens zil_vdev_offline(const char *osname, void *arg) 2178e6ca193dSGeorge Wilson { 2179e6ca193dSGeorge Wilson int error; 2180e6ca193dSGeorge Wilson 21813b2aab18SMatthew Ahrens error = zil_suspend(osname, NULL); 21823b2aab18SMatthew Ahrens if (error != 0) 2183be6fd75aSMatthew Ahrens return (SET_ERROR(EEXIST)); 21843b2aab18SMatthew Ahrens return (0); 2185e6ca193dSGeorge Wilson }