/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#include <sys/zfs_context.h>
#include <sys/spa.h>
#include <sys/dmu.h>
#include <sys/zap.h>
#include <sys/arc.h>
#include <sys/stat.h>
#include <sys/resource.h>
#include <sys/zil.h>
#include <sys/zil_impl.h>
#include <sys/dsl_dataset.h>
#include <sys/vdev.h>
#include <sys/dmu_tx.h>

/*
 * The zfs intent log (ZIL) saves transaction records of system calls
 * that change the file system in memory with enough information
 * to be able to replay them.  These are stored in memory until
 * either the DMU transaction group (txg) commits them to the stable pool
 * and they can be discarded, or they are flushed to the stable log
 * (also in the pool) due to an fsync, O_DSYNC or other synchronous
 * requirement.  In the event of a panic or power failure, those log
 * records (transactions) are replayed.
 *
 * There is one ZIL per file system.  Its on-disk (pool) format consists
 * of 3 parts:
 *
 *	- ZIL header
 *	- ZIL blocks
 *	- ZIL records
 *
 * A log record holds a system call transaction.  Log blocks can
 * hold many log records and the blocks are chained together.
 * Each ZIL block contains a block pointer (blkptr_t) to the next
 * ZIL block in the chain.  The ZIL header points to the first
 * block in the chain.  Note there is not a fixed place in the pool
 * to hold blocks.  They are dynamically allocated and freed as
 * needed from the blocks available.
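 * The chain can be sketched informally as follows (a rough diagram
 * inferred from the description above, standing in for the missing
 * figure):
 *
 *	+------------+      +-------------+      +-------------+
 *	| ZIL header |      |  ZIL block  |      |  ZIL block  |
 *	|   zh_log --+----->| log records |  +-->| log records |
 *	+------------+      | next blkptr-+--+   | next blkptr-+--> ...
 *	                    +-------------+      +-------------+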
 */

/*
 * This global ZIL switch affects all pools
 */
int zil_disable = 0;	/* disable intent logging */

/*
 * Tunable parameter for debugging or performance analysis.  Setting
 * zfs_nocacheflush will cause corruption on power loss if a volatile
 * out-of-order write cache is enabled.
 */
boolean_t zfs_nocacheflush = B_FALSE;

static kmem_cache_t *zil_lwb_cache;

static boolean_t zil_empty(zilog_t *zilog);

#define	LWB_EMPTY(lwb) ((BP_GET_LSIZE(&lwb->lwb_blk) - \
    sizeof (zil_chain_t)) == (lwb->lwb_sz - lwb->lwb_nused))

static int
zil_bp_compare(const void *x1, const void *x2)
{
	const dva_t *dva1 = &((zil_bp_node_t *)x1)->zn_dva;
	const dva_t *dva2 = &((zil_bp_node_t *)x2)->zn_dva;

	if (DVA_GET_VDEV(dva1) < DVA_GET_VDEV(dva2))
		return (-1);
	if (DVA_GET_VDEV(dva1) > DVA_GET_VDEV(dva2))
		return (1);

	if (DVA_GET_OFFSET(dva1) < DVA_GET_OFFSET(dva2))
		return (-1);
	if (DVA_GET_OFFSET(dva1) > DVA_GET_OFFSET(dva2))
		return (1);

	return (0);
}

static void
zil_bp_tree_init(zilog_t *zilog)
{
	avl_create(&zilog->zl_bp_tree, zil_bp_compare,
	    sizeof (zil_bp_node_t), offsetof(zil_bp_node_t, zn_node));
}

static void
zil_bp_tree_fini(zilog_t *zilog)
{
	avl_tree_t *t = &zilog->zl_bp_tree;
	zil_bp_node_t *zn;
	void *cookie = NULL;

	while ((zn = avl_destroy_nodes(t, &cookie)) != NULL)
		kmem_free(zn, sizeof (zil_bp_node_t));

	avl_destroy(t);
}

int
zil_bp_tree_add(zilog_t *zilog, const blkptr_t *bp)
{
	avl_tree_t *t = &zilog->zl_bp_tree;
	const dva_t *dva = BP_IDENTITY(bp);
	zil_bp_node_t *zn;
	avl_index_t where;

	if (avl_find(t, dva, &where) != NULL)
		return (EEXIST);

	zn = kmem_alloc(sizeof (zil_bp_node_t), KM_SLEEP);
	zn->zn_dva = *dva;
	avl_insert(t, zn, where);

	return (0);
}

static zil_header_t *
zil_header_in_syncing_context(zilog_t *zilog)
{
	return ((zil_header_t *)zilog->zl_header);
}

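/*
 * Seed the checksum words of a new chain's first block pointer: a random
 * GUID pair identifying the chain, the objset id, and a starting block
 * sequence number of 1.  zil_read_log_block() checks these embedded
 * checksum words when it walks the chain.
 */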
static void
zil_init_log_chain(zilog_t *zilog, blkptr_t *bp)
{
	zio_cksum_t *zc = &bp->blk_cksum;

	zc->zc_word[ZIL_ZC_GUID_0] = spa_get_random(-1ULL);
	zc->zc_word[ZIL_ZC_GUID_1] = spa_get_random(-1ULL);
	zc->zc_word[ZIL_ZC_OBJSET] = dmu_objset_id(zilog->zl_os);
	zc->zc_word[ZIL_ZC_SEQ] = 1ULL;
}

/*
 * Read a log block and make sure it's valid.
 */
static int
zil_read_log_block(zilog_t *zilog, const blkptr_t *bp, blkptr_t *nbp, void *dst,
    char **end)
{
	enum zio_flag zio_flags = ZIO_FLAG_CANFAIL;
	uint32_t aflags = ARC_WAIT;
	arc_buf_t *abuf = NULL;
	zbookmark_t zb;
	int error;

	if (zilog->zl_header->zh_claim_txg == 0)
		zio_flags |= ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SCRUB;

	if (!(zilog->zl_header->zh_flags & ZIL_CLAIM_LR_SEQ_VALID))
		zio_flags |= ZIO_FLAG_SPECULATIVE;

	SET_BOOKMARK(&zb, bp->blk_cksum.zc_word[ZIL_ZC_OBJSET],
	    ZB_ZIL_OBJECT, ZB_ZIL_LEVEL, bp->blk_cksum.zc_word[ZIL_ZC_SEQ]);

	error = arc_read_nolock(NULL, zilog->zl_spa, bp, arc_getbuf_func, &abuf,
	    ZIO_PRIORITY_SYNC_READ, zio_flags, &aflags, &zb);

	if (error == 0) {
		zio_cksum_t cksum = bp->blk_cksum;

		/*
		 * Validate the checksummed log block.
		 *
		 * Sequence numbers should be... sequential.  The checksum
		 * verifier for the next block should be bp's checksum plus 1.
		 *
		 * Also check the log chain linkage and size used.
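		 *
		 * Two on-disk layouts exist: ZILOG2 blocks keep the
		 * zil_chain_t header at the front of the block, while older
		 * ZILOG blocks keep it at the tail (see the two branches
		 * below).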
		 */
		cksum.zc_word[ZIL_ZC_SEQ]++;

		if (BP_GET_CHECKSUM(bp) == ZIO_CHECKSUM_ZILOG2) {
			zil_chain_t *zilc = abuf->b_data;
			char *lr = (char *)(zilc + 1);
			uint64_t len = zilc->zc_nused - sizeof (zil_chain_t);

			if (bcmp(&cksum, &zilc->zc_next_blk.blk_cksum,
			    sizeof (cksum)) || BP_IS_HOLE(&zilc->zc_next_blk)) {
				error = ECKSUM;
			} else {
				bcopy(lr, dst, len);
				*end = (char *)dst + len;
				*nbp = zilc->zc_next_blk;
			}
		} else {
			char *lr = abuf->b_data;
			uint64_t size = BP_GET_LSIZE(bp);
			zil_chain_t *zilc = (zil_chain_t *)(lr + size) - 1;

			if (bcmp(&cksum, &zilc->zc_next_blk.blk_cksum,
			    sizeof (cksum)) || BP_IS_HOLE(&zilc->zc_next_blk) ||
			    (zilc->zc_nused > (size - sizeof (*zilc)))) {
				error = ECKSUM;
			} else {
				bcopy(lr, dst, zilc->zc_nused);
				*end = (char *)dst + zilc->zc_nused;
				*nbp = zilc->zc_next_blk;
			}
		}

		VERIFY(arc_buf_remove_ref(abuf, &abuf) == 1);
	}

	return (error);
}

/*
 * Read a TX_WRITE log data block.
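 * A hole block pointer means the data was never written, so the
 * caller's buffer is simply zeroed instead.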
 */
static int
zil_read_log_data(zilog_t *zilog, const lr_write_t *lr, void *wbuf)
{
	enum zio_flag zio_flags = ZIO_FLAG_CANFAIL;
	const blkptr_t *bp = &lr->lr_blkptr;
	uint32_t aflags = ARC_WAIT;
	arc_buf_t *abuf = NULL;
	zbookmark_t zb;
	int error;

	if (BP_IS_HOLE(bp)) {
		if (wbuf != NULL)
			bzero(wbuf, MAX(BP_GET_LSIZE(bp), lr->lr_length));
		return (0);
	}

	if (zilog->zl_header->zh_claim_txg == 0)
		zio_flags |= ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SCRUB;

	SET_BOOKMARK(&zb, dmu_objset_id(zilog->zl_os), lr->lr_foid,
	    ZB_ZIL_LEVEL, lr->lr_offset / BP_GET_LSIZE(bp));

	error = arc_read_nolock(NULL, zilog->zl_spa, bp, arc_getbuf_func, &abuf,
	    ZIO_PRIORITY_SYNC_READ, zio_flags, &aflags, &zb);

	if (error == 0) {
		if (wbuf != NULL)
			bcopy(abuf->b_data, wbuf, arc_buf_size(abuf));
		(void) arc_buf_remove_ref(abuf, &abuf);
	}

	return (error);
}

/*
 * Parse the intent log, and call parse_func for each valid record within.
 */
int
zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func,
    zil_parse_lr_func_t *parse_lr_func, void *arg, uint64_t txg)
{
	const zil_header_t *zh = zilog->zl_header;
	boolean_t claimed = !!zh->zh_claim_txg;
	uint64_t claim_blk_seq = claimed ? zh->zh_claim_blk_seq : UINT64_MAX;
	uint64_t claim_lr_seq = claimed ? zh->zh_claim_lr_seq : UINT64_MAX;
	uint64_t max_blk_seq = 0;
	uint64_t max_lr_seq = 0;
	uint64_t blk_count = 0;
	uint64_t lr_count = 0;
	blkptr_t blk, next_blk;
	char *lrbuf, *lrp;
	int error = 0;

	/*
	 * Old logs didn't record the maximum zh_claim_lr_seq.
	 */
	if (!(zh->zh_flags & ZIL_CLAIM_LR_SEQ_VALID))
		claim_lr_seq = UINT64_MAX;

	/*
	 * Starting at the block pointed to by zh_log we read the log chain.
	 * For each block in the chain we strongly check that block to
	 * ensure its validity.  We stop when an invalid block is found.
	 * For each block pointer in the chain we call parse_blk_func().
	 * For each record in each valid block we call parse_lr_func().
	 * If the log has been claimed, stop if we encounter a sequence
	 * number greater than the highest claimed sequence number.
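	 *
	 * On return the parse results (error, maximum block and record
	 * sequence numbers, and block and record counts) are saved in the
	 * zilog for callers such as zil_claim() and zil_check_log_chain().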
	 */
	lrbuf = zio_buf_alloc(SPA_MAXBLOCKSIZE);
	zil_bp_tree_init(zilog);

	for (blk = zh->zh_log; !BP_IS_HOLE(&blk); blk = next_blk) {
		uint64_t blk_seq = blk.blk_cksum.zc_word[ZIL_ZC_SEQ];
		int reclen;
		char *end;

		if (blk_seq > claim_blk_seq)
			break;
		if ((error = parse_blk_func(zilog, &blk, arg, txg)) != 0)
			break;
		ASSERT3U(max_blk_seq, <, blk_seq);
		max_blk_seq = blk_seq;
		blk_count++;

		if (max_lr_seq == claim_lr_seq && max_blk_seq == claim_blk_seq)
			break;

		error = zil_read_log_block(zilog, &blk, &next_blk, lrbuf, &end);
		if (error)
			break;

		for (lrp = lrbuf; lrp < end; lrp += reclen) {
			lr_t *lr = (lr_t *)lrp;
			reclen = lr->lrc_reclen;
			ASSERT3U(reclen, >=, sizeof (lr_t));
			if (lr->lrc_seq > claim_lr_seq)
				goto done;
			if ((error = parse_lr_func(zilog, lr, arg, txg)) != 0)
				goto done;
			ASSERT3U(max_lr_seq, <, lr->lrc_seq);
			max_lr_seq = lr->lrc_seq;
			lr_count++;
		}
	}
done:
	zilog->zl_parse_error = error;
	zilog->zl_parse_blk_seq = max_blk_seq;
	zilog->zl_parse_lr_seq = max_lr_seq;
	zilog->zl_parse_blk_count = blk_count;
	zilog->zl_parse_lr_count = lr_count;

	ASSERT(!claimed || !(zh->zh_flags & ZIL_CLAIM_LR_SEQ_VALID) ||
	    (max_blk_seq == claim_blk_seq && max_lr_seq == claim_lr_seq));

	zil_bp_tree_fini(zilog);
	zio_buf_free(lrbuf, SPA_MAXBLOCKSIZE);

	return (error);
}

static int
zil_claim_log_block(zilog_t *zilog, blkptr_t *bp, void *tx, uint64_t first_txg)
{
	/*
	 * Claim log block if not already committed and not already claimed.
	 * If tx == NULL, just verify that the block is claimable.
	 */
	if (bp->blk_birth < first_txg || zil_bp_tree_add(zilog, bp) != 0)
		return (0);

	return (zio_wait(zio_claim(NULL, zilog->zl_spa,
	    tx == NULL ? 0 : first_txg, bp, spa_claim_notify, NULL,
	    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SCRUB)));
}

static int
zil_claim_log_record(zilog_t *zilog, lr_t *lrc, void *tx, uint64_t first_txg)
{
	lr_write_t *lr = (lr_write_t *)lrc;
	int error;

	if (lrc->lrc_txtype != TX_WRITE)
		return (0);

	/*
	 * If the block is not readable, don't claim it.  This can happen
	 * in normal operation when a log block is written to disk before
	 * some of the dmu_sync() blocks it points to.  In this case, the
	 * transaction cannot have been committed to anyone (we would have
	 * waited for all writes to be stable first), so it is semantically
	 * correct to declare this the end of the log.
	 */
	if (lr->lr_blkptr.blk_birth >= first_txg &&
	    (error = zil_read_log_data(zilog, lr, NULL)) != 0)
		return (error);
	return (zil_claim_log_block(zilog, &lr->lr_blkptr, tx, first_txg));
}

/* ARGSUSED */
static int
zil_free_log_block(zilog_t *zilog, blkptr_t *bp, void *tx, uint64_t claim_txg)
{
	zio_free_zil(zilog->zl_spa, dmu_tx_get_txg(tx), bp);

	return (0);
}

static int
zil_free_log_record(zilog_t *zilog, lr_t *lrc, void *tx, uint64_t claim_txg)
{
	lr_write_t *lr = (lr_write_t *)lrc;
	blkptr_t *bp = &lr->lr_blkptr;

	/*
	 * If we previously claimed it, we need to free it.
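	 * "Previously claimed" means a TX_WRITE record whose data block
	 * was born at or after the claim txg; the bp tree ensures each
	 * block is freed at most once.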
	 */
	if (claim_txg != 0 && lrc->lrc_txtype == TX_WRITE &&
	    bp->blk_birth >= claim_txg && zil_bp_tree_add(zilog, bp) == 0)
		zio_free(zilog->zl_spa, dmu_tx_get_txg(tx), bp);

	return (0);
}

static lwb_t *
zil_alloc_lwb(zilog_t *zilog, blkptr_t *bp, uint64_t txg)
{
	lwb_t *lwb;

	lwb = kmem_cache_alloc(zil_lwb_cache, KM_SLEEP);
	lwb->lwb_zilog = zilog;
	lwb->lwb_blk = *bp;
	lwb->lwb_buf = zio_buf_alloc(BP_GET_LSIZE(bp));
	lwb->lwb_max_txg = txg;
	lwb->lwb_zio = NULL;
	lwb->lwb_tx = NULL;
	if (BP_GET_CHECKSUM(bp) == ZIO_CHECKSUM_ZILOG2) {
		lwb->lwb_nused = sizeof (zil_chain_t);
		lwb->lwb_sz = BP_GET_LSIZE(bp);
	} else {
		lwb->lwb_nused = 0;
		lwb->lwb_sz = BP_GET_LSIZE(bp) - sizeof (zil_chain_t);
	}

	mutex_enter(&zilog->zl_lock);
	list_insert_tail(&zilog->zl_lwb_list, lwb);
	mutex_exit(&zilog->zl_lock);

	return (lwb);
}

/*
 * Create an on-disk intent log.
 */
static lwb_t *
zil_create(zilog_t *zilog)
{
	const zil_header_t *zh = zilog->zl_header;
	lwb_t *lwb = NULL;
	uint64_t txg = 0;
	dmu_tx_t *tx = NULL;
	blkptr_t blk;
	int error = 0;

	/*
	 * Wait for any previous destroy to complete.
	 */
	txg_wait_synced(zilog->zl_dmu_pool, zilog->zl_destroy_txg);

	ASSERT(zh->zh_claim_txg == 0);
	ASSERT(zh->zh_replay_seq == 0);

	blk = zh->zh_log;

	/*
	 * Allocate an initial log block if:
	 *    - there isn't one already
	 *    - the existing block is the wrong endianness
	 */
	if (BP_IS_HOLE(&blk) || BP_SHOULD_BYTESWAP(&blk)) {
		tx = dmu_tx_create(zilog->zl_os);
		VERIFY(dmu_tx_assign(tx, TXG_WAIT) == 0);
		dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx);
		txg = dmu_tx_get_txg(tx);

		if (!BP_IS_HOLE(&blk)) {
			zio_free_zil(zilog->zl_spa, txg, &blk);
			BP_ZERO(&blk);
		}

		error = zio_alloc_zil(zilog->zl_spa, txg, &blk, NULL,
		    ZIL_MIN_BLKSZ, zilog->zl_logbias == ZFS_LOGBIAS_LATENCY);

		if (error == 0)
			zil_init_log_chain(zilog, &blk);
	}

	/*
	 * Allocate a log write buffer (lwb) for the first log block.
	 */
	if (error == 0)
		lwb = zil_alloc_lwb(zilog, &blk, txg);

	/*
	 * If we just allocated the first log block, commit our transaction
	 * and wait for zil_sync() to stuff the block pointer into zh_log.
	 * (zh is part of the MOS, so we cannot modify it in open context.)
	 */
	if (tx != NULL) {
		dmu_tx_commit(tx);
		txg_wait_synced(zilog->zl_dmu_pool, txg);
	}

	ASSERT(bcmp(&blk, &zh->zh_log, sizeof (blk)) == 0);

	return (lwb);
}

/*
 * In one tx, free all log blocks and clear the log header.
 * If keep_first is set, then we're replaying a log with no content.
 * We want to keep the first block, however, so that the first
 * synchronous transaction doesn't require a txg_wait_synced()
 * in zil_create().  We don't need to txg_wait_synced() here either
 * when keep_first is set, because both zil_create() and zil_destroy()
 * will wait for any in-progress destroys to complete.
 */
void
zil_destroy(zilog_t *zilog, boolean_t keep_first)
{
	const zil_header_t *zh = zilog->zl_header;
	lwb_t *lwb;
	dmu_tx_t *tx;
	uint64_t txg;

	/*
	 * Wait for any previous destroy to complete.
	 */
	txg_wait_synced(zilog->zl_dmu_pool, zilog->zl_destroy_txg);

	zilog->zl_old_header = *zh;		/* debugging aid */

	if (BP_IS_HOLE(&zh->zh_log))
		return;

	tx = dmu_tx_create(zilog->zl_os);
	VERIFY(dmu_tx_assign(tx, TXG_WAIT) == 0);
	dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx);
	txg = dmu_tx_get_txg(tx);

	mutex_enter(&zilog->zl_lock);

	ASSERT3U(zilog->zl_destroy_txg, <, txg);
	zilog->zl_destroy_txg = txg;
	zilog->zl_keep_first = keep_first;

	if (!list_is_empty(&zilog->zl_lwb_list)) {
		ASSERT(zh->zh_claim_txg == 0);
		ASSERT(!keep_first);
		while ((lwb = list_head(&zilog->zl_lwb_list)) != NULL) {
			list_remove(&zilog->zl_lwb_list, lwb);
			if (lwb->lwb_buf != NULL)
				zio_buf_free(lwb->lwb_buf, lwb->lwb_sz);
			zio_free_zil(zilog->zl_spa, txg, &lwb->lwb_blk);
			kmem_cache_free(zil_lwb_cache, lwb);
		}
	} else if (!keep_first) {
		(void) zil_parse(zilog, zil_free_log_block,
		    zil_free_log_record, tx, zh->zh_claim_txg);
	}
	mutex_exit(&zilog->zl_lock);

	dmu_tx_commit(tx);
}

int
zil_claim(const char *osname, void *txarg)
{
	dmu_tx_t *tx = txarg;
	uint64_t first_txg = dmu_tx_get_txg(tx);
	zilog_t *zilog;
	zil_header_t *zh;
	objset_t *os;
	int error;

	error = dmu_objset_hold(osname, FTAG, &os);
	if (error) {
		cmn_err(CE_WARN, "can't open objset for %s", osname);
		return (0);
	}

	zilog = dmu_objset_zil(os);
	zh = zil_header_in_syncing_context(zilog);

	if (spa_get_log_state(zilog->zl_spa) == SPA_LOG_CLEAR) {
		if (!BP_IS_HOLE(&zh->zh_log))
			zio_free_zil(zilog->zl_spa, first_txg, &zh->zh_log);
		BP_ZERO(&zh->zh_log);
		dsl_dataset_dirty(dmu_objset_ds(os), tx);
		dmu_objset_rele(os, FTAG);
		return (0);
	}

	/*
	 * Claim all log blocks if we haven't already done so, and remember
	 * the highest claimed sequence number.  This ensures that if we can
	 * read only part of the log now (e.g. due to a missing device),
	 * but we can read the entire log later, we will not try to replay
	 * or destroy beyond the last block we successfully claimed.
	 */
	ASSERT3U(zh->zh_claim_txg, <=, first_txg);
	if (zh->zh_claim_txg == 0 && !BP_IS_HOLE(&zh->zh_log)) {
		(void) zil_parse(zilog, zil_claim_log_block,
		    zil_claim_log_record, tx, first_txg);
		zh->zh_claim_txg = first_txg;
		zh->zh_claim_blk_seq = zilog->zl_parse_blk_seq;
		zh->zh_claim_lr_seq = zilog->zl_parse_lr_seq;
		if (zilog->zl_parse_lr_count || zilog->zl_parse_blk_count > 1)
			zh->zh_flags |= ZIL_REPLAY_NEEDED;
		zh->zh_flags |= ZIL_CLAIM_LR_SEQ_VALID;
		dsl_dataset_dirty(dmu_objset_ds(os), tx);
	}

	ASSERT3U(first_txg, ==, (spa_last_synced_txg(zilog->zl_spa) + 1));
	dmu_objset_rele(os, FTAG);
	return (0);
}

/*
 * Check the log by walking the log chain.
 * Checksum errors are ok as they indicate the end of the chain.
 * Any other error (no device or read failure) returns an error.
 */
int
zil_check_log_chain(const char *osname, void *tx)
{
	zilog_t *zilog;
	objset_t *os;
	int error;

	ASSERT(tx == NULL);

	error = dmu_objset_hold(osname, FTAG, &os);
	if (error) {
		cmn_err(CE_WARN, "can't open objset for %s", osname);
		return (0);
	}

	zilog = dmu_objset_zil(os);

	/*
	 * Because tx == NULL, zil_claim_log_block() will not actually claim
	 * any blocks, but just determine whether it is possible to do so.
	 * In addition to checking the log chain, zil_claim_log_block()
	 * will invoke zio_claim() with a done func of spa_claim_notify(),
	 * which will update spa_max_claim_txg.  See spa_load() for details.
	 */
	error = zil_parse(zilog, zil_claim_log_block, zil_claim_log_record, tx,
	    zilog->zl_header->zh_claim_txg ? -1ULL : spa_first_txg(os->os_spa));

	dmu_objset_rele(os, FTAG);

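	/*
	 * ECKSUM and ENOENT just mark the end of the chain, so report
	 * success for those rather than an error.
	 */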
	return ((error == ECKSUM || error == ENOENT) ? 0 : error);
}

static int
zil_vdev_compare(const void *x1, const void *x2)
{
	uint64_t v1 = ((zil_vdev_node_t *)x1)->zv_vdev;
	uint64_t v2 = ((zil_vdev_node_t *)x2)->zv_vdev;

	if (v1 < v2)
		return (-1);
	if (v1 > v2)
		return (1);

	return (0);
}

void
zil_add_block(zilog_t *zilog, const blkptr_t *bp)
{
	avl_tree_t *t = &zilog->zl_vdev_tree;
	avl_index_t where;
	zil_vdev_node_t *zv, zvsearch;
	int ndvas = BP_GET_NDVAS(bp);
	int i;

	if (zfs_nocacheflush)
		return;

	ASSERT(zilog->zl_writer);

	/*
	 * Even though we're zl_writer, we still need a lock because the
	 * zl_get_data() callbacks may have dmu_sync() done callbacks
	 * that will run concurrently.
	 */
	mutex_enter(&zilog->zl_vdev_lock);
	for (i = 0; i < ndvas; i++) {
		zvsearch.zv_vdev = DVA_GET_VDEV(&bp->blk_dva[i]);
		if (avl_find(t, &zvsearch, &where) == NULL) {
			zv = kmem_alloc(sizeof (*zv), KM_SLEEP);
			zv->zv_vdev = zvsearch.zv_vdev;
			avl_insert(t, zv, where);
		}
	}
	mutex_exit(&zilog->zl_vdev_lock);
}

void
zil_flush_vdevs(zilog_t *zilog)
{
	spa_t *spa = zilog->zl_spa;
	avl_tree_t *t = &zilog->zl_vdev_tree;
	void *cookie = NULL;
	zil_vdev_node_t *zv;
	zio_t *zio;

	ASSERT(zilog->zl_writer);

	/*
	 * We don't need zl_vdev_lock here because we're the zl_writer,
	 * and all zl_get_data() callbacks are done.
	 */
	if (avl_numnodes(t) == 0)
		return;

	spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);

	zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);

	while ((zv = avl_destroy_nodes(t, &cookie)) != NULL) {
		vdev_t *vd = vdev_lookup_top(spa, zv->zv_vdev);
		if (vd != NULL)
			zio_flush(zio, vd);
		kmem_free(zv, sizeof (*zv));
	}

	/*
	 * Wait for all the flushes to complete.  Not all devices actually
	 * support the DKIOCFLUSHWRITECACHE ioctl, so it's OK if it fails.
	 */
	(void) zio_wait(zio);

	spa_config_exit(spa, SCL_STATE, FTAG);
}

/*
 * Function called when a log block write completes
 */
static void
zil_lwb_write_done(zio_t *zio)
{
	lwb_t *lwb = zio->io_private;
	zilog_t *zilog = lwb->lwb_zilog;
	dmu_tx_t *tx = lwb->lwb_tx;

	ASSERT(BP_GET_COMPRESS(zio->io_bp) == ZIO_COMPRESS_OFF);
	ASSERT(BP_GET_TYPE(zio->io_bp) == DMU_OT_INTENT_LOG);
	ASSERT(BP_GET_LEVEL(zio->io_bp) == 0);
	ASSERT(BP_GET_BYTEORDER(zio->io_bp) == ZFS_HOST_BYTEORDER);
	ASSERT(!BP_IS_GANG(zio->io_bp));
	ASSERT(!BP_IS_HOLE(zio->io_bp));
	ASSERT(zio->io_bp->blk_fill == 0);

	/*
	 * Ensure the lwb buffer pointer is cleared before releasing
	 * the txg.  If we have had an allocation failure and
	 * the txg is waiting to sync then we want zil_sync()
	 * to remove the lwb so that it's not picked up as the next new
	 * one in zil_commit_writer().  zil_sync() will only remove
	 * the lwb if lwb_buf is null.
	 */
	zio_buf_free(lwb->lwb_buf, lwb->lwb_sz);
	mutex_enter(&zilog->zl_lock);
	lwb->lwb_buf = NULL;
	lwb->lwb_tx = NULL;
	mutex_exit(&zilog->zl_lock);

	/*
	 * Now that we've written this log block, we have a stable pointer
	 * to the next block in the chain, so it's OK to let the txg in
	 * which we allocated the next block sync.
	 */
	dmu_tx_commit(tx);
}

/*
 * Initialize the I/O for a log block.
 */
static void
zil_lwb_write_init(zilog_t *zilog, lwb_t *lwb)
{
	zbookmark_t zb;

	SET_BOOKMARK(&zb, lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_OBJSET],
	    ZB_ZIL_OBJECT, ZB_ZIL_LEVEL,
	    lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_SEQ]);

	if (zilog->zl_root_zio == NULL) {
		zilog->zl_root_zio = zio_root(zilog->zl_spa, NULL, NULL,
		    ZIO_FLAG_CANFAIL);
	}
	if (lwb->lwb_zio == NULL) {
		lwb->lwb_zio = zio_rewrite(zilog->zl_root_zio, zilog->zl_spa,
		    0, &lwb->lwb_blk, lwb->lwb_buf, BP_GET_LSIZE(&lwb->lwb_blk),
		    zil_lwb_write_done, lwb, ZIO_PRIORITY_LOG_WRITE,
		    ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE, &zb);
	}
}

/*
 * Define a limited set of intent log block sizes.
 * These must be a multiple of 4KB.  Note only the amount used (again
 * aligned to 4KB) actually gets written.  However, we can't always just
 * allocate SPA_MAXBLOCKSIZE as the slog space could be exhausted.
 */
uint64_t zil_block_buckets[] = {
	4096,		/* non TX_WRITE */
	8192+4096,	/* data base */
	32*1024 + 4096,	/* NFS writes */
	UINT64_MAX
};

/*
 * Use the slog as long as the logbias is 'latency' and the current commit size
 * is less than the limit or the total list size is less than 2X the limit.
 * Limit checking is disabled by setting zil_slog_limit to UINT64_MAX.
 */
uint64_t zil_slog_limit = 1024 * 1024;
#define	USE_SLOG(zilog) (((zilog)->zl_logbias == ZFS_LOGBIAS_LATENCY) && \
	(((zilog)->zl_cur_used < zil_slog_limit) || \
	((zilog)->zl_itx_list_sz < (zil_slog_limit << 1))))

/*
 * Start a log block write and advance to the next log block.
 * Calls are serialized.
 */
static lwb_t *
zil_lwb_write_start(zilog_t *zilog, lwb_t *lwb)
{
	lwb_t *nlwb = NULL;
	zil_chain_t *zilc;
	spa_t *spa = zilog->zl_spa;
	blkptr_t *bp;
	dmu_tx_t *tx;
	uint64_t txg;
	uint64_t zil_blksz;
	int i, error;

	if (BP_GET_CHECKSUM(&lwb->lwb_blk) == ZIO_CHECKSUM_ZILOG2) {
		zilc = (zil_chain_t *)lwb->lwb_buf;
		bp = &zilc->zc_next_blk;
	} else {
		zilc = (zil_chain_t *)(lwb->lwb_buf + lwb->lwb_sz);
		bp = &zilc->zc_next_blk;
	}

	ASSERT(lwb->lwb_nused <= lwb->lwb_sz);

	/*
	 * Allocate the next block and save its address in this block
	 * before writing it in order to establish the log chain.
	 * Note that if the allocation of nlwb synced before we wrote
	 * the block that points at it (lwb), we'd leak it if we crashed.
	 * Therefore, we don't do dmu_tx_commit() until zil_lwb_write_done().
	 * We dirty the dataset to ensure that zil_sync() will be called
	 * to clean up in the event of allocation failure or I/O failure.
	 */
	tx = dmu_tx_create(zilog->zl_os);
	VERIFY(dmu_tx_assign(tx, TXG_WAIT) == 0);
	dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx);
	txg = dmu_tx_get_txg(tx);

	lwb->lwb_tx = tx;

	/*
	 * Log blocks are pre-allocated.  Here we select the size of the next
	 * block, based on size used in the last block.
	 * - first find the smallest bucket that will fit the block from a
	 *   limited set of block sizes.  This is because it's faster to write
	 *   blocks allocated from the same metaslab as they are adjacent or
	 *   close.
	 * - next find the maximum from the new suggested size and an array of
	 *   previous sizes.  This lessens a picket fence effect of wrongly
	 *   guessing the size if we have a stream of say 2k, 64k, 2k, 64k
	 *   requests.
	 *
	 * Note we only write what is used, but we can't just allocate
	 * the maximum block size because we can exhaust the available
	 * pool log space.  (An illustrative example follows.)
	 */
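	/*
	 * Illustrative example (not from the original source): a commit
	 * with zl_cur_used of 20K needs 20K plus the zil_chain_t header,
	 * which overflows the 4K and 12K buckets and selects the
	 * 36K (32K + 4K) bucket; if any of the last ZIL_PREV_BLKS sizes
	 * was larger still, that larger size wins instead.
	 */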
	zil_blksz = zilog->zl_cur_used + sizeof (zil_chain_t);
	for (i = 0; zil_blksz > zil_block_buckets[i]; i++)
		continue;
	zil_blksz = zil_block_buckets[i];
	if (zil_blksz == UINT64_MAX)
		zil_blksz = SPA_MAXBLOCKSIZE;
	zilog->zl_prev_blks[zilog->zl_prev_rotor] = zil_blksz;
	for (i = 0; i < ZIL_PREV_BLKS; i++)
		zil_blksz = MAX(zil_blksz, zilog->zl_prev_blks[i]);
	zilog->zl_prev_rotor = (zilog->zl_prev_rotor + 1) & (ZIL_PREV_BLKS - 1);

	BP_ZERO(bp);
	/* pass the old blkptr in order to spread log blocks across devs */
	error = zio_alloc_zil(spa, txg, bp, &lwb->lwb_blk, zil_blksz,
	    USE_SLOG(zilog));
	if (!error) {
		ASSERT3U(bp->blk_birth, ==, txg);
		bp->blk_cksum = lwb->lwb_blk.blk_cksum;
		bp->blk_cksum.zc_word[ZIL_ZC_SEQ]++;

		/*
		 * Allocate a new log write buffer (lwb).
		 */
		nlwb = zil_alloc_lwb(zilog, bp, txg);

		/* Record the block for later vdev flushing */
		zil_add_block(zilog, &lwb->lwb_blk);
	}

	if (BP_GET_CHECKSUM(&lwb->lwb_blk) == ZIO_CHECKSUM_ZILOG2) {
		uint64_t len;

		/* For Slim ZIL only write what is used. */
		len = P2ROUNDUP_TYPED(lwb->lwb_nused, ZIL_MIN_BLKSZ, uint64_t);
		ASSERT3U(len, <=, lwb->lwb_sz);
		zio_shrink(lwb->lwb_zio, len);
	}
	zilc->zc_pad = 0;
	zilc->zc_nused = lwb->lwb_nused;
	zilc->zc_eck.zec_cksum = lwb->lwb_blk.blk_cksum;

	zio_nowait(lwb->lwb_zio); /* Kick off the write for the old log block */

	/*
	 * If there was an allocation failure then nlwb will be null which
	 * forces a txg_wait_synced().
	 */
	return (nlwb);
}

static lwb_t *
zil_lwb_commit(zilog_t *zilog, itx_t *itx, lwb_t *lwb)
{
	lr_t *lrc = &itx->itx_lr; /* common log record */
	lr_write_t *lrw = (lr_write_t *)lrc;
	char *lr_buf;
	uint64_t txg = lrc->lrc_txg;
	uint64_t reclen = lrc->lrc_reclen;
	uint64_t dlen = 0;

	if (lwb == NULL)
		return (NULL);

	ASSERT(lwb->lwb_buf != NULL);

	if (lrc->lrc_txtype == TX_WRITE && itx->itx_wr_state == WR_NEED_COPY)
		dlen = P2ROUNDUP_TYPED(
		    lrw->lr_length, sizeof (uint64_t), uint64_t);

	zilog->zl_cur_used += (reclen + dlen);

	zil_lwb_write_init(zilog, lwb);

	/*
	 * If this record won't fit in the current log block, start a new one.
	 */
	if (lwb->lwb_nused + reclen + dlen > lwb->lwb_sz) {
		lwb = zil_lwb_write_start(zilog, lwb);
		if (lwb == NULL)
			return (NULL);
		zil_lwb_write_init(zilog, lwb);
		ASSERT(LWB_EMPTY(lwb));
		if (lwb->lwb_nused + reclen + dlen > lwb->lwb_sz) {
			txg_wait_synced(zilog->zl_dmu_pool, txg);
			return (lwb);
		}
	}

	lr_buf = lwb->lwb_buf + lwb->lwb_nused;
	bcopy(lrc, lr_buf, reclen);
	lrc = (lr_t *)lr_buf;
	lrw = (lr_write_t *)lrc;

	/*
	 * If it's a write, fetch the data or get its blkptr as appropriate.
	 */
	if (lrc->lrc_txtype == TX_WRITE) {
		if (txg > spa_freeze_txg(zilog->zl_spa))
			txg_wait_synced(zilog->zl_dmu_pool, txg);
		if (itx->itx_wr_state != WR_COPIED) {
			char *dbuf;
			int error;

			if (dlen) {
				ASSERT(itx->itx_wr_state == WR_NEED_COPY);
				dbuf = lr_buf + reclen;
				lrw->lr_common.lrc_reclen += dlen;
			} else {
				ASSERT(itx->itx_wr_state == WR_INDIRECT);
				dbuf = NULL;
			}
			error = zilog->zl_get_data(
			    itx->itx_private, lrw, dbuf, lwb->lwb_zio);
			if (error == EIO) {
				txg_wait_synced(zilog->zl_dmu_pool, txg);
				return (lwb);
			}
			if (error) {
				ASSERT(error == ENOENT || error == EEXIST ||
				    error == EALREADY);
				return (lwb);
			}
		}
	}

	/*
	 * We're actually making an entry, so update lrc_seq to be the
	 * log record sequence number.  Note that this is generally not
	 * equal to the itx sequence number because not all transactions
	 * are synchronous, and sometimes spa_sync() gets there first.
	 */
	lrc->lrc_seq = ++zilog->zl_lr_seq; /* we are single threaded */
	lwb->lwb_nused += reclen + dlen;
	lwb->lwb_max_txg = MAX(lwb->lwb_max_txg, txg);
	ASSERT3U(lwb->lwb_nused, <=, lwb->lwb_sz);
	ASSERT3U(P2PHASE(lwb->lwb_nused, sizeof (uint64_t)), ==, 0);

	return (lwb);
}

itx_t *
zil_itx_create(uint64_t txtype, size_t lrsize)
{
	itx_t *itx;

	lrsize = P2ROUNDUP_TYPED(lrsize, sizeof (uint64_t), size_t);

	itx = kmem_alloc(offsetof(itx_t, itx_lr) + lrsize, KM_SLEEP);
	itx->itx_lr.lrc_txtype = txtype;
	itx->itx_lr.lrc_reclen = lrsize;
	itx->itx_sod = lrsize; /* if write & WR_NEED_COPY will be increased */
	itx->itx_lr.lrc_seq = 0;	/* defensive */

	return (itx);
}

void
zil_itx_destroy(itx_t *itx)
{
	kmem_free(itx, offsetof(itx_t, itx_lr) + itx->itx_lr.lrc_reclen);
}

uint64_t
zil_itx_assign(zilog_t *zilog, itx_t *itx, dmu_tx_t *tx)
{
	uint64_t seq;

	ASSERT(itx->itx_lr.lrc_seq == 0);
	ASSERT(!zilog->zl_replay);

	mutex_enter(&zilog->zl_lock);
	list_insert_tail(&zilog->zl_itx_list, itx);
	zilog->zl_itx_list_sz += itx->itx_sod;
	itx->itx_lr.lrc_txg = dmu_tx_get_txg(tx);
	itx->itx_lr.lrc_seq = seq = ++zilog->zl_itx_seq;
	mutex_exit(&zilog->zl_lock);

	return (seq);
}

/*
 * Free up all in-memory intent log transactions that have now been synced.
 */
static void
zil_itx_clean(zilog_t *zilog)
{
	uint64_t synced_txg = spa_last_synced_txg(zilog->zl_spa);
	uint64_t freeze_txg = spa_freeze_txg(zilog->zl_spa);
	list_t clean_list;
	itx_t *itx;

	list_create(&clean_list, sizeof (itx_t), offsetof(itx_t, itx_node));

	mutex_enter(&zilog->zl_lock);
	/* wait for a log writer to finish walking list */
	while (zilog->zl_writer) {
		cv_wait(&zilog->zl_cv_writer, &zilog->zl_lock);
	}

	/*
	 * Move the sync'd log transactions to a separate list so we can call
	 * kmem_free without holding the zl_lock.
	 *
	 * There is no need to set zl_writer as we don't drop zl_lock here
	 */
	while ((itx = list_head(&zilog->zl_itx_list)) != NULL &&
	    itx->itx_lr.lrc_txg <= MIN(synced_txg, freeze_txg)) {
		list_remove(&zilog->zl_itx_list, itx);
		zilog->zl_itx_list_sz -= itx->itx_sod;
		list_insert_tail(&clean_list, itx);
	}
	cv_broadcast(&zilog->zl_cv_writer);
	mutex_exit(&zilog->zl_lock);

	/* destroy sync'd log transactions */
	while ((itx = list_head(&clean_list)) != NULL) {
		list_remove(&clean_list, itx);
		zil_itx_destroy(itx);
	}
	list_destroy(&clean_list);
}

/*
 * If there are any in-memory intent log transactions which have now been
 * synced then start up a taskq to free them.
 */
void
zil_clean(zilog_t *zilog)
{
	itx_t *itx;

	mutex_enter(&zilog->zl_lock);
	itx = list_head(&zilog->zl_itx_list);
	if ((itx != NULL) &&
	    (itx->itx_lr.lrc_txg <= spa_last_synced_txg(zilog->zl_spa))) {
		(void) taskq_dispatch(zilog->zl_clean_taskq,
		    (task_func_t *)zil_itx_clean, zilog, TQ_NOSLEEP);
	}
	mutex_exit(&zilog->zl_lock);
}

static void
zil_commit_writer(zilog_t *zilog, uint64_t seq, uint64_t foid)
{
	uint64_t txg;
	uint64_t commit_seq = 0;
	itx_t *itx, *itx_next;
	lwb_t *lwb;
	spa_t *spa;
	int error = 0;

	zilog->zl_writer = B_TRUE;
	ASSERT(zilog->zl_root_zio == NULL);
	spa = zilog->zl_spa;

	if (zilog->zl_suspend) {
		lwb = NULL;
	} else {
		lwb = list_tail(&zilog->zl_lwb_list);
		if (lwb == NULL) {
			/*
			 * Return if there's nothing to flush before we
			 * dirty the fs by calling zil_create()
			 */
			if (list_is_empty(&zilog->zl_itx_list)) {
				zilog->zl_writer = B_FALSE;
				return;
			}
			mutex_exit(&zilog->zl_lock);
			lwb = zil_create(zilog);
			mutex_enter(&zilog->zl_lock);
		}
	}
	ASSERT(lwb == NULL || lwb->lwb_zio == NULL);

	/* Loop through in-memory log transactions filling log blocks. */
	DTRACE_PROBE1(zil__cw1, zilog_t *, zilog);

	for (itx = list_head(&zilog->zl_itx_list); itx; itx = itx_next) {
		/*
		 * Save the next pointer.  Even though we drop zl_lock below,
		 * all threads that can remove itx list entries (other writers
		 * and zil_itx_clean()) can't do so until they have zl_writer.
		 */
		itx_next = list_next(&zilog->zl_itx_list, itx);

		/*
		 * Determine whether to push this itx.
		 * Push all transactions related to specified foid and
		 * all other transactions except those that can be logged
		 * out of order (TX_WRITE, TX_TRUNCATE, TX_SETATTR, TX_ACL)
		 * for all other files.
		 *
		 * If foid == 0 (meaning "push all foids") or
		 * itx->itx_sync is set (meaning O_[D]SYNC), push regardless.
		 */
		if (foid != 0 && !itx->itx_sync &&
		    TX_OOO(itx->itx_lr.lrc_txtype) &&
		    ((lr_ooo_t *)&itx->itx_lr)->lr_foid != foid)
			continue; /* skip this record */

		if ((itx->itx_lr.lrc_seq > seq) &&
		    ((lwb == NULL) || (LWB_EMPTY(lwb)) ||
		    (lwb->lwb_nused + itx->itx_sod > lwb->lwb_sz)))
			break;

		list_remove(&zilog->zl_itx_list, itx);
		zilog->zl_itx_list_sz -= itx->itx_sod;

		mutex_exit(&zilog->zl_lock);

		txg = itx->itx_lr.lrc_txg;
		ASSERT(txg);

		if (txg > spa_last_synced_txg(spa) ||
		    txg > spa_freeze_txg(spa))
			lwb = zil_lwb_commit(zilog, itx, lwb);

		zil_itx_destroy(itx);

		mutex_enter(&zilog->zl_lock);
	}
	DTRACE_PROBE1(zil__cw2, zilog_t *, zilog);
	/* determine commit sequence number */
	itx = list_head(&zilog->zl_itx_list);
	if (itx)
		commit_seq = itx->itx_lr.lrc_seq - 1;
	else
		commit_seq = zilog->zl_itx_seq;
	mutex_exit(&zilog->zl_lock);

	/* write the last block out */
	if (lwb != NULL && lwb->lwb_zio != NULL)
		lwb = zil_lwb_write_start(zilog, lwb);

	zilog->zl_prev_used = zilog->zl_cur_used;
	zilog->zl_cur_used = 0;

	/*
	 * Wait if necessary for the log blocks to be on stable storage.
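	 * zl_root_zio parents all the lwb writes issued above, so waiting
	 * on it and then flushing the affected vdevs' write caches makes
	 * the committed records durable.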
1228fa9e4066Sahrens 	 */ 1229b19a79ecSperrin 	if (zilog->zl_root_zio) { 1230b19a79ecSperrin 		DTRACE_PROBE1(zil__cw3, zilog_t *, zilog); 1231b24ab676SJeff Bonwick 		error = zio_wait(zilog->zl_root_zio); 1232e14bb325SJeff Bonwick 		zilog->zl_root_zio = NULL; 1233b19a79ecSperrin 		DTRACE_PROBE1(zil__cw4, zilog_t *, zilog); 123417f17c2dSbonwick 		zil_flush_vdevs(zilog); 1235fa9e4066Sahrens 	} 123622ac5be4Sperrin 1237b24ab676SJeff Bonwick 	if (error || lwb == NULL) 1238fa9e4066Sahrens 		txg_wait_synced(zilog->zl_dmu_pool, 0); 123967bd71c6Sperrin 124067bd71c6Sperrin 	mutex_enter(&zilog->zl_lock); 124122ac5be4Sperrin 	zilog->zl_writer = B_FALSE; 124267bd71c6Sperrin 124367bd71c6Sperrin 	ASSERT3U(commit_seq, >=, zilog->zl_commit_seq); 124467bd71c6Sperrin 	zilog->zl_commit_seq = commit_seq; 1245b24ab676SJeff Bonwick 1246b24ab676SJeff Bonwick 	/* 1247b24ab676SJeff Bonwick 	 * Remember the highest committed log sequence number for ztest. 1248b24ab676SJeff Bonwick 	 * We only update this value when all the log writes succeeded, 1249b24ab676SJeff Bonwick 	 * because ztest wants to ASSERT that it got the whole log chain. 1250b24ab676SJeff Bonwick 	 */ 1251b24ab676SJeff Bonwick 	if (error == 0 && lwb != NULL) 1252b24ab676SJeff Bonwick 		zilog->zl_commit_lr_seq = zilog->zl_lr_seq; 1253b19a79ecSperrin } 1254b19a79ecSperrin 1255b19a79ecSperrin /* 1256b19a79ecSperrin  * Push zfs transactions to stable storage up to the supplied sequence number. 1257b19a79ecSperrin  * If foid is 0 push out all transactions, otherwise push only those 1258b19a79ecSperrin  * for that file or those that might have been used to create that file. 1259b19a79ecSperrin  */ 1260b19a79ecSperrin void 1261b19a79ecSperrin zil_commit(zilog_t *zilog, uint64_t seq, uint64_t foid) 1262b19a79ecSperrin { 1263b19a79ecSperrin 	if (zilog == NULL || seq == 0) 1264b19a79ecSperrin 		return; 1265b19a79ecSperrin 1266b19a79ecSperrin 	mutex_enter(&zilog->zl_lock); 1267b19a79ecSperrin 1268b19a79ecSperrin 	seq = MIN(seq, zilog->zl_itx_seq);	/* cap seq at largest itx seq */ 1269b19a79ecSperrin 127067bd71c6Sperrin 	while (zilog->zl_writer) { 1271b19a79ecSperrin 		cv_wait(&zilog->zl_cv_writer, &zilog->zl_lock); 1272b24ab676SJeff Bonwick 		if (seq <= zilog->zl_commit_seq) { 127367bd71c6Sperrin 			mutex_exit(&zilog->zl_lock); 127467bd71c6Sperrin 			return; 127567bd71c6Sperrin 		} 127667bd71c6Sperrin 	} 1277b19a79ecSperrin 	zil_commit_writer(zilog, seq, foid); /* drops zl_lock */ 127867bd71c6Sperrin 	/* wake up others waiting on the commit */ 127967bd71c6Sperrin 	cv_broadcast(&zilog->zl_cv_writer); 128067bd71c6Sperrin 	mutex_exit(&zilog->zl_lock); 1281fa9e4066Sahrens } 1282fa9e4066Sahrens 1283b24ab676SJeff Bonwick /* 1284b24ab676SJeff Bonwick  * Report whether all transactions are committed.
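 * An editorial summary of the checks below:
 *
 *	itx list non-empty		-> B_FALSE (unpushed transactions)
 *	lwb list empty			-> B_TRUE  (intent log never used)
 *	more than one lwb on the list	-> B_FALSE (zil_sync() not done yet)
 *	exactly one lwb on the list	-> B_TRUE  (everything synced)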
1285b24ab676SJeff Bonwick  */ 1286b24ab676SJeff Bonwick static boolean_t 1287b24ab676SJeff Bonwick zil_is_committed(zilog_t *zilog) 1288b24ab676SJeff Bonwick { 1289b24ab676SJeff Bonwick 	lwb_t *lwb; 1290b24ab676SJeff Bonwick 	boolean_t committed; 1291b24ab676SJeff Bonwick 1292b24ab676SJeff Bonwick 	mutex_enter(&zilog->zl_lock); 1293b24ab676SJeff Bonwick 1294b24ab676SJeff Bonwick 	while (zilog->zl_writer) 1295b24ab676SJeff Bonwick 		cv_wait(&zilog->zl_cv_writer, &zilog->zl_lock); 1296b24ab676SJeff Bonwick 1297b24ab676SJeff Bonwick 	if (!list_is_empty(&zilog->zl_itx_list)) 1298b24ab676SJeff Bonwick 		committed = B_FALSE;		/* unpushed transactions */ 1299b24ab676SJeff Bonwick 	else if ((lwb = list_head(&zilog->zl_lwb_list)) == NULL) 1300b24ab676SJeff Bonwick 		committed = B_TRUE;		/* intent log never used */ 1301b24ab676SJeff Bonwick 	else if (list_next(&zilog->zl_lwb_list, lwb) != NULL) 1302b24ab676SJeff Bonwick 		committed = B_FALSE;		/* zil_sync() not done yet */ 1303b24ab676SJeff Bonwick 	else 1304b24ab676SJeff Bonwick 		committed = B_TRUE;		/* everything synced */ 1305b24ab676SJeff Bonwick 1306b24ab676SJeff Bonwick 	mutex_exit(&zilog->zl_lock); 1307b24ab676SJeff Bonwick 	return (committed); 1308b24ab676SJeff Bonwick } 1309b24ab676SJeff Bonwick 1310fa9e4066Sahrens /* 1311fa9e4066Sahrens  * Called in syncing context to free committed log blocks and update log header. 1312fa9e4066Sahrens  */ 1313fa9e4066Sahrens void 1314fa9e4066Sahrens zil_sync(zilog_t *zilog, dmu_tx_t *tx) 1315fa9e4066Sahrens { 1316d80c45e0Sbonwick 	zil_header_t *zh = zil_header_in_syncing_context(zilog); 1317fa9e4066Sahrens 	uint64_t txg = dmu_tx_get_txg(tx); 1318fa9e4066Sahrens 	spa_t *spa = zilog->zl_spa; 1319b24ab676SJeff Bonwick 	uint64_t *replayed_seq = &zilog->zl_replayed_seq[txg & TXG_MASK]; 1320fa9e4066Sahrens 	lwb_t *lwb; 1321fa9e4066Sahrens 132214843421SMatthew Ahrens 	/* 132314843421SMatthew Ahrens 	 * We don't zero out zl_destroy_txg, so make sure we don't try 132414843421SMatthew Ahrens 	 * to destroy it twice. 132514843421SMatthew Ahrens 	 */ 132614843421SMatthew Ahrens 	if (spa_sync_pass(spa) != 1) 132714843421SMatthew Ahrens 		return; 132814843421SMatthew Ahrens 1329d80c45e0Sbonwick 	mutex_enter(&zilog->zl_lock); 1330d80c45e0Sbonwick 1331fa9e4066Sahrens 	ASSERT(zilog->zl_stop_sync == 0); 1332fa9e4066Sahrens 1333b24ab676SJeff Bonwick 	if (*replayed_seq != 0) { 1334b24ab676SJeff Bonwick 		ASSERT(zh->zh_replay_seq < *replayed_seq); 1335b24ab676SJeff Bonwick 		zh->zh_replay_seq = *replayed_seq; 1336b24ab676SJeff Bonwick 		*replayed_seq = 0; 1337b24ab676SJeff Bonwick 	} 1338fa9e4066Sahrens 1339fa9e4066Sahrens 	if (zilog->zl_destroy_txg == txg) { 1340d80c45e0Sbonwick 		blkptr_t blk = zh->zh_log; 1341d80c45e0Sbonwick 1342d80c45e0Sbonwick 		ASSERT(list_head(&zilog->zl_lwb_list) == NULL); 1343d80c45e0Sbonwick 1344d80c45e0Sbonwick 		bzero(zh, sizeof (zil_header_t)); 13451209a471SNeil Perrin 		bzero(zilog->zl_replayed_seq, sizeof (zilog->zl_replayed_seq)); 1346d80c45e0Sbonwick 1347d80c45e0Sbonwick 		if (zilog->zl_keep_first) { 1348d80c45e0Sbonwick 			/* 1349d80c45e0Sbonwick 			 * If this block was part of a log chain that couldn't 1350d80c45e0Sbonwick 			 * be claimed because a device was missing during 1351d80c45e0Sbonwick 			 * zil_claim(), but that device later returns, 1352d80c45e0Sbonwick 			 * then this block could erroneously appear valid. 1353d80c45e0Sbonwick 			 * To guard against this, assign a new GUID to the new 1354d80c45e0Sbonwick 			 * log chain so it doesn't matter what blk points to.
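 * (A hypothetical instance of the hazard: the chain sits on a
 * separate log device that is missing during zil_claim(), the log
 * is later destroyed with zl_keep_first set, and the device is then
 * reattached.  Without the fresh GUID assigned here, the stale chain
 * headed by blk could still pass the chain check and be replayed.)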
1355d80c45e0Sbonwick */ 1356d80c45e0Sbonwick zil_init_log_chain(zilog, &blk); 1357d80c45e0Sbonwick zh->zh_log = blk; 1358d80c45e0Sbonwick } 1359fa9e4066Sahrens } 1360fa9e4066Sahrens 1361e6ca193dSGeorge Wilson while ((lwb = list_head(&zilog->zl_lwb_list)) != NULL) { 1362b19a79ecSperrin zh->zh_log = lwb->lwb_blk; 1363fa9e4066Sahrens if (lwb->lwb_buf != NULL || lwb->lwb_max_txg > txg) 1364fa9e4066Sahrens break; 1365fa9e4066Sahrens list_remove(&zilog->zl_lwb_list, lwb); 1366b24ab676SJeff Bonwick zio_free_zil(spa, txg, &lwb->lwb_blk); 1367fa9e4066Sahrens kmem_cache_free(zil_lwb_cache, lwb); 1368d63d470bSgw 1369d63d470bSgw /* 1370d63d470bSgw * If we don't have anything left in the lwb list then 1371d63d470bSgw * we've had an allocation failure and we need to zero 1372d63d470bSgw * out the zil_header blkptr so that we don't end 1373d63d470bSgw * up freeing the same block twice. 1374d63d470bSgw */ 1375d63d470bSgw if (list_head(&zilog->zl_lwb_list) == NULL) 1376d63d470bSgw BP_ZERO(&zh->zh_log); 1377fa9e4066Sahrens } 1378fa9e4066Sahrens mutex_exit(&zilog->zl_lock); 1379fa9e4066Sahrens } 1380fa9e4066Sahrens 1381fa9e4066Sahrens void 1382fa9e4066Sahrens zil_init(void) 1383fa9e4066Sahrens { 1384fa9e4066Sahrens zil_lwb_cache = kmem_cache_create("zil_lwb_cache", 13855ad82045Snd sizeof (struct lwb), 0, NULL, NULL, NULL, NULL, NULL, 0); 1386fa9e4066Sahrens } 1387fa9e4066Sahrens 1388fa9e4066Sahrens void 1389fa9e4066Sahrens zil_fini(void) 1390fa9e4066Sahrens { 1391fa9e4066Sahrens kmem_cache_destroy(zil_lwb_cache); 1392fa9e4066Sahrens } 1393fa9e4066Sahrens 1394e09fa4daSNeil Perrin void 1395e09fa4daSNeil Perrin zil_set_logbias(zilog_t *zilog, uint64_t logbias) 1396e09fa4daSNeil Perrin { 1397e09fa4daSNeil Perrin zilog->zl_logbias = logbias; 1398e09fa4daSNeil Perrin } 1399e09fa4daSNeil Perrin 1400fa9e4066Sahrens zilog_t * 1401fa9e4066Sahrens zil_alloc(objset_t *os, zil_header_t *zh_phys) 1402fa9e4066Sahrens { 1403fa9e4066Sahrens zilog_t *zilog; 1404fa9e4066Sahrens 1405fa9e4066Sahrens zilog = kmem_zalloc(sizeof (zilog_t), KM_SLEEP); 1406fa9e4066Sahrens 1407fa9e4066Sahrens zilog->zl_header = zh_phys; 1408fa9e4066Sahrens zilog->zl_os = os; 1409fa9e4066Sahrens zilog->zl_spa = dmu_objset_spa(os); 1410fa9e4066Sahrens zilog->zl_dmu_pool = dmu_objset_pool(os); 1411d80c45e0Sbonwick zilog->zl_destroy_txg = TXG_INITIAL - 1; 1412e09fa4daSNeil Perrin zilog->zl_logbias = dmu_objset_logbias(os); 1413fa9e4066Sahrens 14145ad82045Snd mutex_init(&zilog->zl_lock, NULL, MUTEX_DEFAULT, NULL); 14155ad82045Snd 1416fa9e4066Sahrens list_create(&zilog->zl_itx_list, sizeof (itx_t), 1417fa9e4066Sahrens offsetof(itx_t, itx_node)); 1418fa9e4066Sahrens 1419fa9e4066Sahrens list_create(&zilog->zl_lwb_list, sizeof (lwb_t), 1420fa9e4066Sahrens offsetof(lwb_t, lwb_node)); 1421fa9e4066Sahrens 142217f17c2dSbonwick mutex_init(&zilog->zl_vdev_lock, NULL, MUTEX_DEFAULT, NULL); 142317f17c2dSbonwick 142417f17c2dSbonwick avl_create(&zilog->zl_vdev_tree, zil_vdev_compare, 142517f17c2dSbonwick sizeof (zil_vdev_node_t), offsetof(zil_vdev_node_t, zv_node)); 1426fa9e4066Sahrens 1427b7b97454Sperrin cv_init(&zilog->zl_cv_writer, NULL, CV_DEFAULT, NULL); 1428b7b97454Sperrin cv_init(&zilog->zl_cv_suspend, NULL, CV_DEFAULT, NULL); 1429b7b97454Sperrin 1430fa9e4066Sahrens return (zilog); 1431fa9e4066Sahrens } 1432fa9e4066Sahrens 1433fa9e4066Sahrens void 1434fa9e4066Sahrens zil_free(zilog_t *zilog) 1435fa9e4066Sahrens { 1436fa9e4066Sahrens lwb_t *lwb; 1437fa9e4066Sahrens 1438fa9e4066Sahrens zilog->zl_stop_sync = 1; 1439fa9e4066Sahrens 1440fa9e4066Sahrens while ((lwb = 
list_head(&zilog->zl_lwb_list)) != NULL) { 1441fa9e4066Sahrens list_remove(&zilog->zl_lwb_list, lwb); 1442fa9e4066Sahrens if (lwb->lwb_buf != NULL) 1443fa9e4066Sahrens zio_buf_free(lwb->lwb_buf, lwb->lwb_sz); 1444fa9e4066Sahrens kmem_cache_free(zil_lwb_cache, lwb); 1445fa9e4066Sahrens } 1446fa9e4066Sahrens list_destroy(&zilog->zl_lwb_list); 1447fa9e4066Sahrens 144817f17c2dSbonwick avl_destroy(&zilog->zl_vdev_tree); 144917f17c2dSbonwick mutex_destroy(&zilog->zl_vdev_lock); 1450fa9e4066Sahrens 1451fa9e4066Sahrens ASSERT(list_head(&zilog->zl_itx_list) == NULL); 1452fa9e4066Sahrens list_destroy(&zilog->zl_itx_list); 14535ad82045Snd mutex_destroy(&zilog->zl_lock); 1454fa9e4066Sahrens 1455b7b97454Sperrin cv_destroy(&zilog->zl_cv_writer); 1456b7b97454Sperrin cv_destroy(&zilog->zl_cv_suspend); 1457b7b97454Sperrin 1458fa9e4066Sahrens kmem_free(zilog, sizeof (zilog_t)); 1459fa9e4066Sahrens } 1460fa9e4066Sahrens 1461fa9e4066Sahrens /* 1462fa9e4066Sahrens * Open an intent log. 1463fa9e4066Sahrens */ 1464fa9e4066Sahrens zilog_t * 1465fa9e4066Sahrens zil_open(objset_t *os, zil_get_data_t *get_data) 1466fa9e4066Sahrens { 1467fa9e4066Sahrens zilog_t *zilog = dmu_objset_zil(os); 1468fa9e4066Sahrens 1469fa9e4066Sahrens zilog->zl_get_data = get_data; 1470fa9e4066Sahrens zilog->zl_clean_taskq = taskq_create("zil_clean", 1, minclsyspri, 1471fa9e4066Sahrens 2, 2, TASKQ_PREPOPULATE); 1472fa9e4066Sahrens 1473fa9e4066Sahrens return (zilog); 1474fa9e4066Sahrens } 1475fa9e4066Sahrens 1476fa9e4066Sahrens /* 1477fa9e4066Sahrens * Close an intent log. 1478fa9e4066Sahrens */ 1479fa9e4066Sahrens void 1480fa9e4066Sahrens zil_close(zilog_t *zilog) 1481fa9e4066Sahrens { 1482d80c45e0Sbonwick /* 1483d80c45e0Sbonwick * If the log isn't already committed, mark the objset dirty 1484d80c45e0Sbonwick * (so zil_sync() will be called) and wait for that txg to sync. 1485d80c45e0Sbonwick */ 1486d80c45e0Sbonwick if (!zil_is_committed(zilog)) { 1487d80c45e0Sbonwick uint64_t txg; 1488d80c45e0Sbonwick dmu_tx_t *tx = dmu_tx_create(zilog->zl_os); 1489b24ab676SJeff Bonwick VERIFY(dmu_tx_assign(tx, TXG_WAIT) == 0); 1490d80c45e0Sbonwick dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx); 1491d80c45e0Sbonwick txg = dmu_tx_get_txg(tx); 1492d80c45e0Sbonwick dmu_tx_commit(tx); 1493d80c45e0Sbonwick txg_wait_synced(zilog->zl_dmu_pool, txg); 1494d80c45e0Sbonwick } 1495d80c45e0Sbonwick 1496fa9e4066Sahrens taskq_destroy(zilog->zl_clean_taskq); 1497fa9e4066Sahrens zilog->zl_clean_taskq = NULL; 1498fa9e4066Sahrens zilog->zl_get_data = NULL; 1499fa9e4066Sahrens 1500fa9e4066Sahrens zil_itx_clean(zilog); 1501fa9e4066Sahrens ASSERT(list_head(&zilog->zl_itx_list) == NULL); 1502fa9e4066Sahrens } 1503fa9e4066Sahrens 1504fa9e4066Sahrens /* 1505fa9e4066Sahrens * Suspend an intent log. While in suspended mode, we still honor 1506fa9e4066Sahrens * synchronous semantics, but we rely on txg_wait_synced() to do it. 1507fa9e4066Sahrens * We suspend the log briefly when taking a snapshot so that the snapshot 1508fa9e4066Sahrens * contains all the data it's supposed to, and has an empty intent log. 
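 * A typical caller pairs the two entry points like this (a sketch
 * only; error handling is elided and the snapshot step is a stand-in):
 *
 *	if (zil_suspend(zilog) != 0)
 *		return (EBUSY);
 *	... take the snapshot ...
 *	zil_resume(zilog);
 *
 * zil_vdev_offline() at the bottom of this file follows the same
 * suspend/resume pattern.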
1509fa9e4066Sahrens */ 1510fa9e4066Sahrens int 1511fa9e4066Sahrens zil_suspend(zilog_t *zilog) 1512fa9e4066Sahrens { 1513d80c45e0Sbonwick const zil_header_t *zh = zilog->zl_header; 1514fa9e4066Sahrens 1515fa9e4066Sahrens mutex_enter(&zilog->zl_lock); 15163589c4f0SNeil Perrin if (zh->zh_flags & ZIL_REPLAY_NEEDED) { /* unplayed log */ 1517fa9e4066Sahrens mutex_exit(&zilog->zl_lock); 1518fa9e4066Sahrens return (EBUSY); 1519fa9e4066Sahrens } 1520d80c45e0Sbonwick if (zilog->zl_suspend++ != 0) { 1521d80c45e0Sbonwick /* 1522d80c45e0Sbonwick * Someone else already began a suspend. 1523d80c45e0Sbonwick * Just wait for them to finish. 1524d80c45e0Sbonwick */ 1525d80c45e0Sbonwick while (zilog->zl_suspending) 1526d80c45e0Sbonwick cv_wait(&zilog->zl_cv_suspend, &zilog->zl_lock); 1527d80c45e0Sbonwick mutex_exit(&zilog->zl_lock); 1528d80c45e0Sbonwick return (0); 1529d80c45e0Sbonwick } 1530d80c45e0Sbonwick zilog->zl_suspending = B_TRUE; 1531fa9e4066Sahrens mutex_exit(&zilog->zl_lock); 1532fa9e4066Sahrens 1533b19a79ecSperrin zil_commit(zilog, UINT64_MAX, 0); 1534fa9e4066Sahrens 1535b19a79ecSperrin /* 1536b19a79ecSperrin * Wait for any in-flight log writes to complete. 1537b19a79ecSperrin */ 1538fa9e4066Sahrens mutex_enter(&zilog->zl_lock); 1539b19a79ecSperrin while (zilog->zl_writer) 1540b19a79ecSperrin cv_wait(&zilog->zl_cv_writer, &zilog->zl_lock); 1541fa9e4066Sahrens mutex_exit(&zilog->zl_lock); 1542fa9e4066Sahrens 1543d80c45e0Sbonwick zil_destroy(zilog, B_FALSE); 1544d80c45e0Sbonwick 1545d80c45e0Sbonwick mutex_enter(&zilog->zl_lock); 1546d80c45e0Sbonwick zilog->zl_suspending = B_FALSE; 1547d80c45e0Sbonwick cv_broadcast(&zilog->zl_cv_suspend); 1548d80c45e0Sbonwick mutex_exit(&zilog->zl_lock); 1549fa9e4066Sahrens 1550fa9e4066Sahrens return (0); 1551fa9e4066Sahrens } 1552fa9e4066Sahrens 1553fa9e4066Sahrens void 1554fa9e4066Sahrens zil_resume(zilog_t *zilog) 1555fa9e4066Sahrens { 1556fa9e4066Sahrens mutex_enter(&zilog->zl_lock); 1557fa9e4066Sahrens ASSERT(zilog->zl_suspend != 0); 1558fa9e4066Sahrens zilog->zl_suspend--; 1559fa9e4066Sahrens mutex_exit(&zilog->zl_lock); 1560fa9e4066Sahrens } 1561fa9e4066Sahrens 1562fa9e4066Sahrens typedef struct zil_replay_arg { 1563fa9e4066Sahrens zil_replay_func_t **zr_replay; 1564fa9e4066Sahrens void *zr_arg; 1565fa9e4066Sahrens boolean_t zr_byteswap; 1566b24ab676SJeff Bonwick char *zr_lr; 1567fa9e4066Sahrens } zil_replay_arg_t; 1568fa9e4066Sahrens 1569b24ab676SJeff Bonwick static int 1570b24ab676SJeff Bonwick zil_replay_error(zilog_t *zilog, lr_t *lr, int error) 1571b24ab676SJeff Bonwick { 1572b24ab676SJeff Bonwick char name[MAXNAMELEN]; 1573b24ab676SJeff Bonwick 1574b24ab676SJeff Bonwick zilog->zl_replaying_seq--; /* didn't actually replay this one */ 1575b24ab676SJeff Bonwick 1576b24ab676SJeff Bonwick dmu_objset_name(zilog->zl_os, name); 1577b24ab676SJeff Bonwick 1578b24ab676SJeff Bonwick cmn_err(CE_WARN, "ZFS replay transaction error %d, " 1579b24ab676SJeff Bonwick "dataset %s, seq 0x%llx, txtype %llu %s\n", error, name, 1580b24ab676SJeff Bonwick (u_longlong_t)lr->lrc_seq, 1581b24ab676SJeff Bonwick (u_longlong_t)(lr->lrc_txtype & ~TX_CI), 1582b24ab676SJeff Bonwick (lr->lrc_txtype & TX_CI) ? 
"CI" : ""); 1583b24ab676SJeff Bonwick 1584b24ab676SJeff Bonwick return (error); 1585b24ab676SJeff Bonwick } 1586b24ab676SJeff Bonwick 1587b24ab676SJeff Bonwick static int 1588fa9e4066Sahrens zil_replay_log_record(zilog_t *zilog, lr_t *lr, void *zra, uint64_t claim_txg) 1589fa9e4066Sahrens { 1590fa9e4066Sahrens zil_replay_arg_t *zr = zra; 1591d80c45e0Sbonwick const zil_header_t *zh = zilog->zl_header; 1592fa9e4066Sahrens uint64_t reclen = lr->lrc_reclen; 1593fa9e4066Sahrens uint64_t txtype = lr->lrc_txtype; 1594b24ab676SJeff Bonwick int error = 0; 1595fa9e4066Sahrens 1596b24ab676SJeff Bonwick zilog->zl_replaying_seq = lr->lrc_seq; 1597fa9e4066Sahrens 1598fa9e4066Sahrens if (lr->lrc_seq <= zh->zh_replay_seq) /* already replayed */ 1599b24ab676SJeff Bonwick return (0); 1600b24ab676SJeff Bonwick 1601b24ab676SJeff Bonwick if (lr->lrc_txg < claim_txg) /* already committed */ 1602b24ab676SJeff Bonwick return (0); 1603fa9e4066Sahrens 1604da6c28aaSamw /* Strip case-insensitive bit, still present in log record */ 1605da6c28aaSamw txtype &= ~TX_CI; 1606da6c28aaSamw 1607b24ab676SJeff Bonwick if (txtype == 0 || txtype >= TX_MAX_TYPE) 1608b24ab676SJeff Bonwick return (zil_replay_error(zilog, lr, EINVAL)); 1609b24ab676SJeff Bonwick 1610b24ab676SJeff Bonwick /* 1611b24ab676SJeff Bonwick * If this record type can be logged out of order, the object 1612b24ab676SJeff Bonwick * (lr_foid) may no longer exist. That's legitimate, not an error. 1613b24ab676SJeff Bonwick */ 1614b24ab676SJeff Bonwick if (TX_OOO(txtype)) { 1615b24ab676SJeff Bonwick error = dmu_object_info(zilog->zl_os, 1616b24ab676SJeff Bonwick ((lr_ooo_t *)lr)->lr_foid, NULL); 1617b24ab676SJeff Bonwick if (error == ENOENT || error == EEXIST) 1618b24ab676SJeff Bonwick return (0); 16191209a471SNeil Perrin } 16201209a471SNeil Perrin 1621fa9e4066Sahrens /* 1622fa9e4066Sahrens * Make a copy of the data so we can revise and extend it. 1623fa9e4066Sahrens */ 1624b24ab676SJeff Bonwick bcopy(lr, zr->zr_lr, reclen); 1625b24ab676SJeff Bonwick 1626b24ab676SJeff Bonwick /* 1627b24ab676SJeff Bonwick * If this is a TX_WRITE with a blkptr, suck in the data. 1628b24ab676SJeff Bonwick */ 1629b24ab676SJeff Bonwick if (txtype == TX_WRITE && reclen == sizeof (lr_write_t)) { 1630b24ab676SJeff Bonwick error = zil_read_log_data(zilog, (lr_write_t *)lr, 1631b24ab676SJeff Bonwick zr->zr_lr + reclen); 1632b24ab676SJeff Bonwick if (error) 1633b24ab676SJeff Bonwick return (zil_replay_error(zilog, lr, error)); 1634b24ab676SJeff Bonwick } 1635fa9e4066Sahrens 1636fa9e4066Sahrens /* 1637fa9e4066Sahrens * The log block containing this lr may have been byteswapped 1638fa9e4066Sahrens * so that we can easily examine common fields like lrc_txtype. 1639b24ab676SJeff Bonwick * However, the log is a mix of different record types, and only the 1640fa9e4066Sahrens * replay vectors know how to byteswap their records. Therefore, if 1641fa9e4066Sahrens * the lr was byteswapped, undo it before invoking the replay vector. 1642fa9e4066Sahrens */ 1643fa9e4066Sahrens if (zr->zr_byteswap) 1644b24ab676SJeff Bonwick byteswap_uint64_array(zr->zr_lr, reclen); 1645fa9e4066Sahrens 1646fa9e4066Sahrens /* 1647fa9e4066Sahrens * We must now do two things atomically: replay this log record, 16481209a471SNeil Perrin * and update the log header sequence number to reflect the fact that 16491209a471SNeil Perrin * we did so. At the end of each replay function the sequence number 16501209a471SNeil Perrin * is updated if we are in replay mode. 
1651fa9e4066Sahrens 	 */ 1652b24ab676SJeff Bonwick 	error = zr->zr_replay[txtype](zr->zr_arg, zr->zr_lr, zr->zr_byteswap); 1653b24ab676SJeff Bonwick 	if (error) { 165467bd71c6Sperrin 		/* 165567bd71c6Sperrin 		 * The DMU's dnode layer doesn't see removes until the txg 165667bd71c6Sperrin 		 * commits, so a subsequent claim can spuriously fail with 16571209a471SNeil Perrin 		 * EEXIST. So if we receive any error, we try syncing out 1658b24ab676SJeff Bonwick 		 * any removes and then retry the transaction. Note that we 1659b24ab676SJeff Bonwick 		 * specify B_FALSE for byteswap now, so we don't do it twice. 166067bd71c6Sperrin 		 */ 1661b24ab676SJeff Bonwick 		txg_wait_synced(spa_get_dsl(zilog->zl_spa), 0); 1662b24ab676SJeff Bonwick 		error = zr->zr_replay[txtype](zr->zr_arg, zr->zr_lr, B_FALSE); 1663b24ab676SJeff Bonwick 		if (error) 1664b24ab676SJeff Bonwick 			return (zil_replay_error(zilog, lr, error)); 1665fa9e4066Sahrens 	} 1666b24ab676SJeff Bonwick 	return (0); 166767bd71c6Sperrin } 1668fa9e4066Sahrens 166967bd71c6Sperrin /* ARGSUSED */ 1670b24ab676SJeff Bonwick static int 167167bd71c6Sperrin zil_incr_blks(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg) 167267bd71c6Sperrin { 167367bd71c6Sperrin 	zilog->zl_replay_blks++; 1674b24ab676SJeff Bonwick 1675b24ab676SJeff Bonwick 	return (0); 1676fa9e4066Sahrens } 1677fa9e4066Sahrens 1678fa9e4066Sahrens /* 167913f5297eSperrin  * If this dataset has a non-empty intent log, replay it and destroy it. 1680fa9e4066Sahrens  */ 1681fa9e4066Sahrens void 16821209a471SNeil Perrin zil_replay(objset_t *os, void *arg, zil_replay_func_t *replay_func[TX_MAX_TYPE]) 1683fa9e4066Sahrens { 1684fa9e4066Sahrens 	zilog_t *zilog = dmu_objset_zil(os); 1685d80c45e0Sbonwick 	const zil_header_t *zh = zilog->zl_header; 1686d80c45e0Sbonwick 	zil_replay_arg_t zr; 168713f5297eSperrin 16883589c4f0SNeil Perrin 	if ((zh->zh_flags & ZIL_REPLAY_NEEDED) == 0) { 1689d80c45e0Sbonwick 		zil_destroy(zilog, B_TRUE); 169013f5297eSperrin 		return; 169113f5297eSperrin 	} 1692fa9e4066Sahrens 1693fa9e4066Sahrens 	zr.zr_replay = replay_func; 1694fa9e4066Sahrens 	zr.zr_arg = arg; 1695d80c45e0Sbonwick 	zr.zr_byteswap = BP_SHOULD_BYTESWAP(&zh->zh_log); 1696b24ab676SJeff Bonwick 	zr.zr_lr = kmem_alloc(2 * SPA_MAXBLOCKSIZE, KM_SLEEP); 1697fa9e4066Sahrens 1698fa9e4066Sahrens 	/* 1699fa9e4066Sahrens 	 * Wait for in-progress removes to sync before starting replay.
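 * (This is the same hazard handled by the retry in
 * zil_replay_log_record() above: the DMU's dnode layer doesn't see
 * removes until their txg commits, so replaying on top of an
 * unsynced remove could spuriously fail.)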
1700fa9e4066Sahrens */ 1701fa9e4066Sahrens txg_wait_synced(zilog->zl_dmu_pool, 0); 1702fa9e4066Sahrens 17031209a471SNeil Perrin zilog->zl_replay = B_TRUE; 1704d3d50737SRafael Vanoni zilog->zl_replay_time = ddi_get_lbolt(); 170567bd71c6Sperrin ASSERT(zilog->zl_replay_blks == 0); 170667bd71c6Sperrin (void) zil_parse(zilog, zil_incr_blks, zil_replay_log_record, &zr, 1707d80c45e0Sbonwick zh->zh_claim_txg); 1708b24ab676SJeff Bonwick kmem_free(zr.zr_lr, 2 * SPA_MAXBLOCKSIZE); 1709fa9e4066Sahrens 1710d80c45e0Sbonwick zil_destroy(zilog, B_FALSE); 1711a4611edeSahrens txg_wait_synced(zilog->zl_dmu_pool, zilog->zl_destroy_txg); 17121209a471SNeil Perrin zilog->zl_replay = B_FALSE; 1713fa9e4066Sahrens } 1714436b2950Sperrin 1715b24ab676SJeff Bonwick boolean_t 1716b24ab676SJeff Bonwick zil_replaying(zilog_t *zilog, dmu_tx_t *tx) 1717436b2950Sperrin { 1718b24ab676SJeff Bonwick if (zilog == NULL) 1719b24ab676SJeff Bonwick return (B_TRUE); 1720436b2950Sperrin 1721b24ab676SJeff Bonwick if (zilog->zl_replay) { 1722b24ab676SJeff Bonwick dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx); 1723b24ab676SJeff Bonwick zilog->zl_replayed_seq[dmu_tx_get_txg(tx) & TXG_MASK] = 1724b24ab676SJeff Bonwick zilog->zl_replaying_seq; 1725b24ab676SJeff Bonwick return (B_TRUE); 1726b19a79ecSperrin } 1727b19a79ecSperrin 1728b24ab676SJeff Bonwick return (B_FALSE); 1729436b2950Sperrin } 1730e6ca193dSGeorge Wilson 1731e6ca193dSGeorge Wilson /* ARGSUSED */ 1732e6ca193dSGeorge Wilson int 1733fd136879SMatthew Ahrens zil_vdev_offline(const char *osname, void *arg) 1734e6ca193dSGeorge Wilson { 1735e6ca193dSGeorge Wilson objset_t *os; 1736e6ca193dSGeorge Wilson zilog_t *zilog; 1737e6ca193dSGeorge Wilson int error; 1738e6ca193dSGeorge Wilson 1739503ad85cSMatthew Ahrens error = dmu_objset_hold(osname, FTAG, &os); 1740e6ca193dSGeorge Wilson if (error) 1741e6ca193dSGeorge Wilson return (error); 1742e6ca193dSGeorge Wilson 1743e6ca193dSGeorge Wilson zilog = dmu_objset_zil(os); 1744e6ca193dSGeorge Wilson if (zil_suspend(zilog) != 0) 1745e6ca193dSGeorge Wilson error = EEXIST; 1746e6ca193dSGeorge Wilson else 1747e6ca193dSGeorge Wilson zil_resume(zilog); 1748503ad85cSMatthew Ahrens dmu_objset_rele(os, FTAG); 1749e6ca193dSGeorge Wilson return (error); 1750e6ca193dSGeorge Wilson } 1751
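/*
 * Usage overview -- an editorial sketch, not part of the original
 * source.  It shows the rough order in which a file system consumer
 * exercises the entry points above; my_get_data and my_replay_vectors
 * are hypothetical caller-supplied callbacks.
 *
 *	zilog = zil_alloc(os, zh_phys);		   ...at objset creation
 *	zilog = zil_open(os, my_get_data);	   ...at mount
 *	zil_replay(os, arg, my_replay_vectors);	   ...replay, then destroy,
 *						      any unreplayed log
 *	...
 *	zil_commit(zilog, seq, foid);		   ...fsync()/O_DSYNC path
 *	...
 *	zil_close(zilog);			   ...at unmount
 *	zil_free(zilog);			   ...at objset teardown
 */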