1fa9e406ahrens/*
2fa9e406ahrens * CDDL HEADER START
3fa9e406ahrens *
4fa9e406ahrens * The contents of this file are subject to the terms of the
5fe9cf88perrin * Common Development and Distribution License (the "License").
6fe9cf88perrin * You may not use this file except in compliance with the License.
7fa9e406ahrens *
8fa9e406ahrens * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9fa9e406ahrens * or http://www.opensolaris.org/os/licensing.
10fa9e406ahrens * See the License for the specific language governing permissions
11fa9e406ahrens * and limitations under the License.
12fa9e406ahrens *
13fa9e406ahrens * When distributing Covered Code, include this CDDL HEADER in each
14fa9e406ahrens * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15fa9e406ahrens * If applicable, add the following below this CDDL HEADER, with the
16fa9e406ahrens * fields enclosed by brackets "[]" replaced with your own identifying
17fa9e406ahrens * information: Portions Copyright [yyyy] [name of copyright owner]
18fa9e406ahrens *
19fa9e406ahrens * CDDL HEADER END
20fa9e406ahrens */
21fa9e406ahrens/*
2255da60bMark J Musante * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23f78cdc3Paul Dagnelie * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
24c3d26abMatthew Ahrens * Copyright (c) 2014 Integros [integros.com]
25fa9e406ahrens */
26fa9e406ahrens
2755da60bMark J Musante/* Portions Copyright 2010 Robert Milkowski */
2855da60bMark J Musante
29fa9e406ahrens#include <sys/zfs_context.h>
30fa9e406ahrens#include <sys/spa.h>
318671400Serapheim Dimitropoulos#include <sys/spa_impl.h>
32fa9e406ahrens#include <sys/dmu.h>
33fa9e406ahrens#include <sys/zap.h>
34fa9e406ahrens#include <sys/arc.h>
35fa9e406ahrens#include <sys/stat.h>
36fa9e406ahrens#include <sys/resource.h>
37fa9e406ahrens#include <sys/zil.h>
38fa9e406ahrens#include <sys/zil_impl.h>
39fa9e406ahrens#include <sys/dsl_dataset.h>
404b964adGeorge Wilson#include <sys/vdev_impl.h>
41d63d470gw#include <sys/dmu_tx.h>
423f9d6adLin Ling#include <sys/dsl_pool.h>
43770499eDan Kimmel#include <sys/abd.h>
44fa9e406ahrens
45fa9e406ahrens/*
461271e4bPrakash Surya * The ZFS Intent Log (ZIL) saves "transaction records" (itxs) of system
471271e4bPrakash Surya * calls that change the file system. Each itx has enough information to
481271e4bPrakash Surya * be able to replay them after a system crash, power loss, or
491271e4bPrakash Surya * equivalent failure mode. These are stored in memory until either:
50fa9e406ahrens *
511271e4bPrakash Surya *   1. they are committed to the pool by the DMU transaction group
521271e4bPrakash Surya *      (txg), at which point they can be discarded; or
531271e4bPrakash Surya *   2. they are committed to the on-disk ZIL for the dataset being
541271e4bPrakash Surya *      modified (e.g. due to an fsync, O_DSYNC, or other synchronous
551271e4bPrakash Surya *      requirement).
56fa9e406ahrens *
 * In the event of a crash or power loss, the itxs contained by each
 * dataset's on-disk ZIL will be replayed when that dataset is first
 * instantiated (e.g. if the dataset is a normal filesystem, when it is
 * first mounted).
61fa9e406ahrens *
621271e4bPrakash Surya * As hinted at above, there is one ZIL per dataset (both the in-memory
631271e4bPrakash Surya * representation, and the on-disk representation). The on-disk format
641271e4bPrakash Surya * consists of 3 parts:
651271e4bPrakash Surya *
6654811daToomas Soome *	- a single, per-dataset, ZIL header; which points to a chain of
6754811daToomas Soome *	- zero or more ZIL blocks; each of which contains
6854811daToomas Soome *	- zero or more ZIL records
691271e4bPrakash Surya *
701271e4bPrakash Surya * A ZIL record holds the information necessary to replay a single
711271e4bPrakash Surya * system call transaction. A ZIL block can hold many ZIL records, and
721271e4bPrakash Surya * the blocks are chained together, similarly to a singly linked list.
731271e4bPrakash Surya *
741271e4bPrakash Surya * Each ZIL block contains a block pointer (blkptr_t) to the next ZIL
751271e4bPrakash Surya * block in the chain, and the ZIL header points to the first block in
761271e4bPrakash Surya * the chain.
771271e4bPrakash Surya *
781271e4bPrakash Surya * Note, there is not a fixed place in the pool to hold these ZIL
791271e4bPrakash Surya * blocks; they are dynamically allocated and freed as needed from the
801271e4bPrakash Surya * blocks available on the pool, though they can be preferentially
811271e4bPrakash Surya * allocated from a dedicated "log" vdev.
821271e4bPrakash Surya */
831271e4bPrakash Surya
/*
 * This controls the amount of time that a ZIL block (lwb) will remain
 * "open" when it isn't "full", and it has a thread waiting for it to be
 * committed to stable storage. Please refer to the zil_commit_waiter()
 * function (and the comments within it) for more details.
 */
int zfs_commit_timeout_pct = 5;

/*
 * Disable intent logging replay.  This global ZIL switch affects all pools.
 */
int zil_replay_disable = 0;

/*
 * Disable the DKIOCFLUSHWRITECACHE commands that are normally sent to
 * the disk(s) by the ZIL after an LWB write has completed. Setting this
 * will cause ZIL corruption on power loss if a volatile out-of-order
 * write cache is enabled.
 */
boolean_t zil_nocacheflush = B_FALSE;

/*
 * Limit SLOG write size per commit executed with synchronous priority.
 * Any writes above that will be executed with lower (asynchronous) priority
 * to limit potential SLOG device abuse by single active ZIL writer.
 */
uint64_t zil_slog_bulk = 768 * 1024;

/*
 * kmem cache that lwb_t structures are allocated from and returned to
 * (see zil_alloc_lwb() and zil_free_lwb()).
 */
static kmem_cache_t *zil_lwb_cache;
/*
 * NOTE(review): presumably the cache backing zil commit waiter objects;
 * it is not used in this part of the file -- confirm against its users.
 */
static kmem_cache_t *zil_zcw_cache;

/* Forward declaration; the function is not defined in this part of the file. */
static void zil_async_to_sync(zilog_t *zilog, uint64_t foid);
1168f18d1fGeorge Wilson
/*
 * Evaluates to true when the lwb's remaining space (lwb_sz - lwb_nused)
 * equals the logical size of its block minus the zil_chain_t header,
 * i.e. no log records have been placed in it yet.
 *
 * The argument is parenthesized so that any pointer expression (not just
 * a plain identifier) may be passed; it is evaluated more than once, so
 * it must be free of side effects.
 */
#define	LWB_EMPTY(lwb) ((BP_GET_LSIZE(&(lwb)->lwb_blk) - \
    sizeof (zil_chain_t)) == ((lwb)->lwb_sz - (lwb)->lwb_nused))
1196e1f5caNeil Perrin
120fa9e406ahrensstatic int
121b24ab67Jeff Bonwickzil_bp_compare(const void *x1, const void *x2)
122fa9e406ahrens{
123b24ab67Jeff Bonwick	const dva_t *dva1 = &((zil_bp_node_t *)x1)->zn_dva;
124b24ab67Jeff Bonwick	const dva_t *dva2 = &((zil_bp_node_t *)x2)->zn_dva;
125fa9e406ahrens
1264d7988dPaul Dagnelie	int cmp = TREE_CMP(DVA_GET_VDEV(dva1), DVA_GET_VDEV(dva2));
127c4ab0d3Gvozden Neskovic	if (likely(cmp))
128c4ab0d3Gvozden Neskovic		return (cmp);
129fa9e406ahrens
1304d7988dPaul Dagnelie	return (TREE_CMP(DVA_GET_OFFSET(dva1), DVA_GET_OFFSET(dva2)));
131fa9e406ahrens}
132fa9e406ahrens
133fa9e406ahrensstatic void
134b24ab67Jeff Bonwickzil_bp_tree_init(zilog_t *zilog)
135fa9e406ahrens{
136b24ab67Jeff Bonwick	avl_create(&zilog->zl_bp_tree, zil_bp_compare,
137b24ab67Jeff Bonwick	    sizeof (zil_bp_node_t), offsetof(zil_bp_node_t, zn_node));
138fa9e406ahrens}
139fa9e406ahrens
140fa9e406ahrensstatic void
141b24ab67Jeff Bonwickzil_bp_tree_fini(zilog_t *zilog)
142fa9e406ahrens{
143b24ab67Jeff Bonwick	avl_tree_t *t = &zilog->zl_bp_tree;
144b24ab67Jeff Bonwick	zil_bp_node_t *zn;
145fa9e406ahrens	void *cookie = NULL;
146fa9e406ahrens
147fa9e406ahrens	while ((zn = avl_destroy_nodes(t, &cookie)) != NULL)
148b24ab67Jeff Bonwick		kmem_free(zn, sizeof (zil_bp_node_t));
149fa9e406ahrens
150fa9e406ahrens	avl_destroy(t);
151fa9e406ahrens}
152fa9e406ahrens
153b24ab67Jeff Bonwickint
154b24ab67Jeff Bonwickzil_bp_tree_add(zilog_t *zilog, const blkptr_t *bp)
155fa9e406ahrens{
156b24ab67Jeff Bonwick	avl_tree_t *t = &zilog->zl_bp_tree;
1575d7b4d4Matthew Ahrens	const dva_t *dva;
158b24ab67Jeff Bonwick	zil_bp_node_t *zn;
159fa9e406ahrens	avl_index_t where;
160fa9e406ahrens
1615d7b4d4Matthew Ahrens	if (BP_IS_EMBEDDED(bp))
1625d7b4d4Matthew Ahrens		return (0);
1635d7b4d4Matthew Ahrens
1645d7b4d4Matthew Ahrens	dva = BP_IDENTITY(bp);
1655d7b4d4Matthew Ahrens
166fa9e406ahrens	if (avl_find(t, dva, &where) != NULL)
167be6fd75Matthew Ahrens		return (SET_ERROR(EEXIST));
168fa9e406ahrens
169b24ab67Jeff Bonwick	zn = kmem_alloc(sizeof (zil_bp_node_t), KM_SLEEP);
170fa9e406ahrens	zn->zn_dva = *dva;
171fa9e406ahrens	avl_insert(t, zn, where);
172fa9e406ahrens
173fa9e406ahrens	return (0);
174fa9e406ahrens}
175fa9e406ahrens
176d80c45ebonwickstatic zil_header_t *
177d80c45ebonwickzil_header_in_syncing_context(zilog_t *zilog)
178d80c45ebonwick{
179d80c45ebonwick	return ((zil_header_t *)zilog->zl_header);
180d80c45ebonwick}
181d80c45ebonwick
182d80c45ebonwickstatic void
183d80c45ebonwickzil_init_log_chain(zilog_t *zilog, blkptr_t *bp)
184d80c45ebonwick{
185d80c45ebonwick	zio_cksum_t *zc = &bp->blk_cksum;
186d80c45ebonwick
187d80c45ebonwick	zc->zc_word[ZIL_ZC_GUID_0] = spa_get_random(-1ULL);
188d80c45ebonwick	zc->zc_word[ZIL_ZC_GUID_1] = spa_get_random(-1ULL);
189d80c45ebonwick	zc->zc_word[ZIL_ZC_OBJSET] = dmu_objset_id(zilog->zl_os);
190d80c45ebonwick	zc->zc_word[ZIL_ZC_SEQ] = 1ULL;
191d80c45ebonwick}
192d80c45ebonwick
193fa9e406ahrens/*
194b24ab67Jeff Bonwick * Read a log block and make sure it's valid.
195fa9e406ahrens */
196fa9e406ahrensstatic int
197eb63303Tom Caputizil_read_log_block(zilog_t *zilog, boolean_t decrypt, const blkptr_t *bp,
198eb63303Tom Caputi    blkptr_t *nbp, void *dst, char **end)
199fa9e406ahrens{
200b24ab67Jeff Bonwick	enum zio_flag zio_flags = ZIO_FLAG_CANFAIL;
2017adb730George Wilson	arc_flags_t aflags = ARC_FLAG_WAIT;
202b24ab67Jeff Bonwick	arc_buf_t *abuf = NULL;
2037802d7bMatthew Ahrens	zbookmark_phys_t zb;
204fa9e406ahrens	int error;
205fa9e406ahrens
206b24ab67Jeff Bonwick	if (zilog->zl_header->zh_claim_txg == 0)
207b24ab67Jeff Bonwick		zio_flags |= ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SCRUB;
208ea8dc4beschrock
209b24ab67Jeff Bonwick	if (!(zilog->zl_header->zh_flags & ZIL_CLAIM_LR_SEQ_VALID))
210b24ab67Jeff Bonwick		zio_flags |= ZIO_FLAG_SPECULATIVE;
211fa9e406ahrens
212eb63303Tom Caputi	if (!decrypt)
213eb63303Tom Caputi		zio_flags |= ZIO_FLAG_RAW;
214eb63303Tom Caputi
215b24ab67Jeff Bonwick	SET_BOOKMARK(&zb, bp->blk_cksum.zc_word[ZIL_ZC_OBJSET],
216b24ab67Jeff Bonwick	    ZB_ZIL_OBJECT, ZB_ZIL_LEVEL, bp->blk_cksum.zc_word[ZIL_ZC_SEQ]);
217b24ab67Jeff Bonwick
218eb63303Tom Caputi	error = arc_read(NULL, zilog->zl_spa, bp, arc_getbuf_func,
219eb63303Tom Caputi	    &abuf, ZIO_PRIORITY_SYNC_READ, zio_flags, &aflags, &zb);
220fa9e406ahrens
221d80c45ebonwick	if (error == 0) {
222d80c45ebonwick		zio_cksum_t cksum = bp->blk_cksum;
223fa9e406ahrens
224d80c45ebonwick		/*
225f5e6e72Neil Perrin		 * Validate the checksummed log block.
226f5e6e72Neil Perrin		 *
227d80c45ebonwick		 * Sequence numbers should be... sequential.  The checksum
228d80c45ebonwick		 * verifier for the next block should be bp's checksum plus 1.
229f5e6e72Neil Perrin		 *
230f5e6e72Neil Perrin		 * Also check the log chain linkage and size used.
231d80c45ebonwick		 */
232d80c45ebonwick		cksum.zc_word[ZIL_ZC_SEQ]++;
233d80c45ebonwick
2346e1f5caNeil Perrin		if (BP_GET_CHECKSUM(bp) == ZIO_CHECKSUM_ZILOG2) {
2356e1f5caNeil Perrin			zil_chain_t *zilc = abuf->b_data;
2366e1f5caNeil Perrin			char *lr = (char *)(zilc + 1);
2376e1f5caNeil Perrin			uint64_t len = zilc->zc_nused - sizeof (zil_chain_t);
2386e1f5caNeil Perrin
2396e1f5caNeil Perrin			if (bcmp(&cksum, &zilc->zc_next_blk.blk_cksum,
2406e1f5caNeil Perrin			    sizeof (cksum)) || BP_IS_HOLE(&zilc->zc_next_blk)) {
241be6fd75Matthew Ahrens				error = SET_ERROR(ECKSUM);
2426e1f5caNeil Perrin			} else {
243b515258Matthew Ahrens				ASSERT3U(len, <=, SPA_OLD_MAXBLOCKSIZE);
2446e1f5caNeil Perrin				bcopy(lr, dst, len);
2456e1f5caNeil Perrin				*end = (char *)dst + len;
2466e1f5caNeil Perrin				*nbp = zilc->zc_next_blk;
2476e1f5caNeil Perrin			}
2486e1f5caNeil Perrin		} else {
2496e1f5caNeil Perrin			char *lr = abuf->b_data;
2506e1f5caNeil Perrin			uint64_t size = BP_GET_LSIZE(bp);
2516e1f5caNeil Perrin			zil_chain_t *zilc = (zil_chain_t *)(lr + size) - 1;
2526e1f5caNeil Perrin
2536e1f5caNeil Perrin			if (bcmp(&cksum, &zilc->zc_next_blk.blk_cksum,
2546e1f5caNeil Perrin			    sizeof (cksum)) || BP_IS_HOLE(&zilc->zc_next_blk) ||
2556e1f5caNeil Perrin			    (zilc->zc_nused > (size - sizeof (*zilc)))) {
256be6fd75Matthew Ahrens				error = SET_ERROR(ECKSUM);
2576e1f5caNeil Perrin			} else {
258b515258Matthew Ahrens				ASSERT3U(zilc->zc_nused, <=,
259b515258Matthew Ahrens				    SPA_OLD_MAXBLOCKSIZE);
2606e1f5caNeil Perrin				bcopy(lr, dst, zilc->zc_nused);
2616e1f5caNeil Perrin				*end = (char *)dst + zilc->zc_nused;
2626e1f5caNeil Perrin				*nbp = zilc->zc_next_blk;
2636e1f5caNeil Perrin			}
2646e1f5caNeil Perrin		}
265fa9e406ahrens
266dcbf3bdGeorge Wilson		arc_buf_destroy(abuf, &abuf);
267fa9e406ahrens	}
268fa9e406ahrens
269b24ab67Jeff Bonwick	return (error);
270b24ab67Jeff Bonwick}
271b24ab67Jeff Bonwick
272b24ab67Jeff Bonwick/*
273b24ab67Jeff Bonwick * Read a TX_WRITE log data block.
274b24ab67Jeff Bonwick */
275b24ab67Jeff Bonwickstatic int
276b24ab67Jeff Bonwickzil_read_log_data(zilog_t *zilog, const lr_write_t *lr, void *wbuf)
277b24ab67Jeff Bonwick{
278b24ab67Jeff Bonwick	enum zio_flag zio_flags = ZIO_FLAG_CANFAIL;
279b24ab67Jeff Bonwick	const blkptr_t *bp = &lr->lr_blkptr;
2807adb730George Wilson	arc_flags_t aflags = ARC_FLAG_WAIT;
281b24ab67Jeff Bonwick	arc_buf_t *abuf = NULL;
2827802d7bMatthew Ahrens	zbookmark_phys_t zb;
283b24ab67Jeff Bonwick	int error;
284b24ab67Jeff Bonwick
285b24ab67Jeff Bonwick	if (BP_IS_HOLE(bp)) {
286b24ab67Jeff Bonwick		if (wbuf != NULL)
287b24ab67Jeff Bonwick			bzero(wbuf, MAX(BP_GET_LSIZE(bp), lr->lr_length));
288b24ab67Jeff Bonwick		return (0);
289b24ab67Jeff Bonwick	}
290b24ab67Jeff Bonwick
291b24ab67Jeff Bonwick	if (zilog->zl_header->zh_claim_txg == 0)
292b24ab67Jeff Bonwick		zio_flags |= ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SCRUB;
293b24ab67Jeff Bonwick
294eb63303Tom Caputi	/*
295eb63303Tom Caputi	 * If we are not using the resulting data, we are just checking that
296eb63303Tom Caputi	 * it hasn't been corrupted so we don't need to waste CPU time
297eb63303Tom Caputi	 * decompressing and decrypting it.
298eb63303Tom Caputi	 */
299eb63303Tom Caputi	if (wbuf == NULL)
300eb63303Tom Caputi		zio_flags |= ZIO_FLAG_RAW;
301eb63303Tom Caputi
302b24ab67Jeff Bonwick	SET_BOOKMARK(&zb, dmu_objset_id(zilog->zl_os), lr->lr_foid,
303b24ab67Jeff Bonwick	    ZB_ZIL_LEVEL, lr->lr_offset / BP_GET_LSIZE(bp));
304b24ab67Jeff Bonwick
3051b912ecGeorge Wilson	error = arc_read(NULL, zilog->zl_spa, bp, arc_getbuf_func, &abuf,
306b24ab67Jeff Bonwick	    ZIO_PRIORITY_SYNC_READ, zio_flags, &aflags, &zb);
307b24ab67Jeff Bonwick
308b24ab67Jeff Bonwick	if (error == 0) {
309b24ab67Jeff Bonwick		if (wbuf != NULL)
310b24ab67Jeff Bonwick			bcopy(abuf->b_data, wbuf, arc_buf_size(abuf));
311dcbf3bdGeorge Wilson		arc_buf_destroy(abuf, &abuf);
312b24ab67Jeff Bonwick	}
313fa9e406ahrens
314d80c45ebonwick	return (error);
315fa9e406ahrens}
316fa9e406ahrens
317fa9e406ahrens/*
318fa9e406ahrens * Parse the intent log, and call parse_func for each valid record within.
319fa9e406ahrens */
320b24ab67Jeff Bonwickint
321fa9e406ahrenszil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func,
322eb63303Tom Caputi    zil_parse_lr_func_t *parse_lr_func, void *arg, uint64_t txg,
323eb63303Tom Caputi    boolean_t decrypt)
324fa9e406ahrens{
325d80c45ebonwick	const zil_header_t *zh = zilog->zl_header;
326b24ab67Jeff Bonwick	boolean_t claimed = !!zh->zh_claim_txg;
327b24ab67Jeff Bonwick	uint64_t claim_blk_seq = claimed ? zh->zh_claim_blk_seq : UINT64_MAX;
328b24ab67Jeff Bonwick	uint64_t claim_lr_seq = claimed ? zh->zh_claim_lr_seq : UINT64_MAX;
329b24ab67Jeff Bonwick	uint64_t max_blk_seq = 0;
330b24ab67Jeff Bonwick	uint64_t max_lr_seq = 0;
331b24ab67Jeff Bonwick	uint64_t blk_count = 0;
332b24ab67Jeff Bonwick	uint64_t lr_count = 0;
333b24ab67Jeff Bonwick	blkptr_t blk, next_blk;
334fa9e406ahrens	char *lrbuf, *lrp;
335b24ab67Jeff Bonwick	int error = 0;
336fa9e406ahrens
337b24ab67Jeff Bonwick	/*
338b24ab67Jeff Bonwick	 * Old logs didn't record the maximum zh_claim_lr_seq.
339b24ab67Jeff Bonwick	 */
340b24ab67Jeff Bonwick	if (!(zh->zh_flags & ZIL_CLAIM_LR_SEQ_VALID))
341b24ab67Jeff Bonwick		claim_lr_seq = UINT64_MAX;
342fa9e406ahrens
343fa9e406ahrens	/*
344fa9e406ahrens	 * Starting at the block pointed to by zh_log we read the log chain.
345fa9e406ahrens	 * For each block in the chain we strongly check that block to
346fa9e406ahrens	 * ensure its validity.  We stop when an invalid block is found.
347fa9e406ahrens	 * For each block pointer in the chain we call parse_blk_func().
348fa9e406ahrens	 * For each record in each valid block we call parse_lr_func().
349d80c45ebonwick	 * If the log has been claimed, stop if we encounter a sequence
350d80c45ebonwick	 * number greater than the highest claimed sequence number.
351fa9e406ahrens	 */
352b515258Matthew Ahrens	lrbuf = zio_buf_alloc(SPA_OLD_MAXBLOCKSIZE);
353b24ab67Jeff Bonwick	zil_bp_tree_init(zilog);
354d80c45ebonwick
355b24ab67Jeff Bonwick	for (blk = zh->zh_log; !BP_IS_HOLE(&blk); blk = next_blk) {
356b24ab67Jeff Bonwick		uint64_t blk_seq = blk.blk_cksum.zc_word[ZIL_ZC_SEQ];
357b24ab67Jeff Bonwick		int reclen;
3586e1f5caNeil Perrin		char *end;
359d80c45ebonwick
360b24ab67Jeff Bonwick		if (blk_seq > claim_blk_seq)
361b24ab67Jeff Bonwick			break;
362eb63303Tom Caputi
363eb63303Tom Caputi		error = parse_blk_func(zilog, &blk, arg, txg);
364eb63303Tom Caputi		if (error != 0)
365b24ab67Jeff Bonwick			break;
3666e1f5caNeil Perrin		ASSERT3U(max_blk_seq, <, blk_seq);
367b24ab67Jeff Bonwick		max_blk_seq = blk_seq;
368b24ab67Jeff Bonwick		blk_count++;
369fa9e406ahrens
370b24ab67Jeff Bonwick		if (max_lr_seq == claim_lr_seq && max_blk_seq == claim_blk_seq)
371b24ab67Jeff Bonwick			break;
372fa9e406ahrens
373eb63303Tom Caputi		error = zil_read_log_block(zilog, decrypt, &blk, &next_blk,
374eb63303Tom Caputi		    lrbuf, &end);
3753b2aab1Matthew Ahrens		if (error != 0)
376fa9e406ahrens			break;
377fa9e406ahrens
3786e1f5caNeil Perrin		for (lrp = lrbuf; lrp < end; lrp += reclen) {
379fa9e406ahrens			lr_t *lr = (lr_t *)lrp;
380fa9e406ahrens			reclen = lr->lrc_reclen;
381fa9e406ahrens			ASSERT3U(reclen, >=, sizeof (lr_t));
382b24ab67Jeff Bonwick			if (lr->lrc_seq > claim_lr_seq)
383b24ab67Jeff Bonwick				goto done;
384eb63303Tom Caputi
385eb63303Tom Caputi			error = parse_lr_func(zilog, lr, arg, txg);
386eb63303Tom Caputi			if (error != 0)
387b24ab67Jeff Bonwick				goto done;
3886e1f5caNeil Perrin			ASSERT3U(max_lr_seq, <, lr->lrc_seq);
389b24ab67Jeff Bonwick			max_lr_seq = lr->lrc_seq;
390b24ab67Jeff Bonwick			lr_count++;
391fa9e406ahrens		}
392fa9e406ahrens	}
393b24ab67Jeff Bonwickdone:
394b24ab67Jeff Bonwick	zilog->zl_parse_error = error;
395b24ab67Jeff Bonwick	zilog->zl_parse_blk_seq = max_blk_seq;
396b24ab67Jeff Bonwick	zilog->zl_parse_lr_seq = max_lr_seq;
397b24ab67Jeff Bonwick	zilog->zl_parse_blk_count = blk_count;
398b24ab67Jeff Bonwick	zilog->zl_parse_lr_count = lr_count;
399b24ab67Jeff Bonwick
400b24ab67Jeff Bonwick	ASSERT(!claimed || !(zh->zh_flags & ZIL_CLAIM_LR_SEQ_VALID) ||
401eb63303Tom Caputi	    (max_blk_seq == claim_blk_seq && max_lr_seq == claim_lr_seq) ||
402eb63303Tom Caputi	    (decrypt && error == EIO));
403d80c45ebonwick
404b24ab67Jeff Bonwick	zil_bp_tree_fini(zilog);
405b515258Matthew Ahrens	zio_buf_free(lrbuf, SPA_OLD_MAXBLOCKSIZE);
406b24ab67Jeff Bonwick
407b24ab67Jeff Bonwick	return (error);
408fa9e406ahrens}
409fa9e406ahrens
4108671400Serapheim Dimitropoulos/* ARGSUSED */
4118671400Serapheim Dimitropoulosstatic int
4128671400Serapheim Dimitropouloszil_clear_log_block(zilog_t *zilog, blkptr_t *bp, void *tx, uint64_t first_txg)
4138671400Serapheim Dimitropoulos{
4148671400Serapheim Dimitropoulos	ASSERT(!BP_IS_HOLE(bp));
4158671400Serapheim Dimitropoulos
4168671400Serapheim Dimitropoulos	/*
4178671400Serapheim Dimitropoulos	 * As we call this function from the context of a rewind to a
4188671400Serapheim Dimitropoulos	 * checkpoint, each ZIL block whose txg is later than the txg
4198671400Serapheim Dimitropoulos	 * that we rewind to is invalid. Thus, we return -1 so
4208671400Serapheim Dimitropoulos	 * zil_parse() doesn't attempt to read it.
4218671400Serapheim Dimitropoulos	 */
4228671400Serapheim Dimitropoulos	if (bp->blk_birth >= first_txg)
4238671400Serapheim Dimitropoulos		return (-1);
4248671400Serapheim Dimitropoulos
4258671400Serapheim Dimitropoulos	if (zil_bp_tree_add(zilog, bp) != 0)
4268671400Serapheim Dimitropoulos		return (0);
4278671400Serapheim Dimitropoulos
4288671400Serapheim Dimitropoulos	zio_free(zilog->zl_spa, first_txg, bp);
4298671400Serapheim Dimitropoulos	return (0);
4308671400Serapheim Dimitropoulos}
4318671400Serapheim Dimitropoulos
4328671400Serapheim Dimitropoulos/* ARGSUSED */
4338671400Serapheim Dimitropoulosstatic int
4348671400Serapheim Dimitropouloszil_noop_log_record(zilog_t *zilog, lr_t *lrc, void *tx, uint64_t first_txg)
4358671400Serapheim Dimitropoulos{
4368671400Serapheim Dimitropoulos	return (0);
4378671400Serapheim Dimitropoulos}
4388671400Serapheim Dimitropoulos
439b24ab67Jeff Bonwickstatic int
440fa9e406ahrenszil_claim_log_block(zilog_t *zilog, blkptr_t *bp, void *tx, uint64_t first_txg)
441fa9e406ahrens{
442fa9e406ahrens	/*
443fa9e406ahrens	 * Claim log block if not already committed and not already claimed.
444b24ab67Jeff Bonwick	 * If tx == NULL, just verify that the block is claimable.
445fa9e406ahrens	 */
44643466aaMax Grossman	if (BP_IS_HOLE(bp) || bp->blk_birth < first_txg ||
44743466aaMax Grossman	    zil_bp_tree_add(zilog, bp) != 0)
448b24ab67Jeff Bonwick		return (0);
449b24ab67Jeff Bonwick
450b24ab67Jeff Bonwick	return (zio_wait(zio_claim(NULL, zilog->zl_spa,
451b24ab67Jeff Bonwick	    tx == NULL ? 0 : first_txg, bp, spa_claim_notify, NULL,
452b24ab67Jeff Bonwick	    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SCRUB)));
453fa9e406ahrens}
454fa9e406ahrens
455b24ab67Jeff Bonwickstatic int
456fa9e406ahrenszil_claim_log_record(zilog_t *zilog, lr_t *lrc, void *tx, uint64_t first_txg)
457fa9e406ahrens{
458b24ab67Jeff Bonwick	lr_write_t *lr = (lr_write_t *)lrc;
459b24ab67Jeff Bonwick	int error;
460b24ab67Jeff Bonwick
461b24ab67Jeff Bonwick	if (lrc->lrc_txtype != TX_WRITE)
462b24ab67Jeff Bonwick		return (0);
463b24ab67Jeff Bonwick
464b24ab67Jeff Bonwick	/*
465b24ab67Jeff Bonwick	 * If the block is not readable, don't claim it.  This can happen
466b24ab67Jeff Bonwick	 * in normal operation when a log block is written to disk before
467b24ab67Jeff Bonwick	 * some of the dmu_sync() blocks it points to.  In this case, the
468b24ab67Jeff Bonwick	 * transaction cannot have been committed to anyone (we would have
469b24ab67Jeff Bonwick	 * waited for all writes to be stable first), so it is semantically
470b24ab67Jeff Bonwick	 * correct to declare this the end of the log.
471b24ab67Jeff Bonwick	 */
472eb63303Tom Caputi	if (lr->lr_blkptr.blk_birth >= first_txg) {
473eb63303Tom Caputi		error = zil_read_log_data(zilog, lr, NULL);
474eb63303Tom Caputi		if (error != 0)
475eb63303Tom Caputi			return (error);
476eb63303Tom Caputi	}
477eb63303Tom Caputi
478b24ab67Jeff Bonwick	return (zil_claim_log_block(zilog, &lr->lr_blkptr, tx, first_txg));
479fa9e406ahrens}
480fa9e406ahrens
481fa9e406ahrens/* ARGSUSED */
482b24ab67Jeff Bonwickstatic int
483fa9e406ahrenszil_free_log_block(zilog_t *zilog, blkptr_t *bp, void *tx, uint64_t claim_txg)
484fa9e406ahrens{
4858671400Serapheim Dimitropoulos	zio_free(zilog->zl_spa, dmu_tx_get_txg(tx), bp);
486b24ab67Jeff Bonwick
487b24ab67Jeff Bonwick	return (0);
488fa9e406ahrens}
489fa9e406ahrens
490b24ab67Jeff Bonwickstatic int
491fa9e406ahrenszil_free_log_record(zilog_t *zilog, lr_t *lrc, void *tx, uint64_t claim_txg)
492fa9e406ahrens{
493b24ab67Jeff Bonwick	lr_write_t *lr = (lr_write_t *)lrc;
494b24ab67Jeff Bonwick	blkptr_t *bp = &lr->lr_blkptr;
495b24ab67Jeff Bonwick
496fa9e406ahrens	/*
497fa9e406ahrens	 * If we previously claimed it, we need to free it.
498fa9e406ahrens	 */
499b24ab67Jeff Bonwick	if (claim_txg != 0 && lrc->lrc_txtype == TX_WRITE &&
50043466aaMax Grossman	    bp->blk_birth >= claim_txg && zil_bp_tree_add(zilog, bp) == 0 &&
50143466aaMax Grossman	    !BP_IS_HOLE(bp))
502b24ab67Jeff Bonwick		zio_free(zilog->zl_spa, dmu_tx_get_txg(tx), bp);
503b24ab67Jeff Bonwick
504b24ab67Jeff Bonwick	return (0);
505fa9e406ahrens}
506fa9e406ahrens
5071271e4bPrakash Suryastatic int
5081271e4bPrakash Suryazil_lwb_vdev_compare(const void *x1, const void *x2)
5091271e4bPrakash Surya{
5101271e4bPrakash Surya	const uint64_t v1 = ((zil_vdev_node_t *)x1)->zv_vdev;
5111271e4bPrakash Surya	const uint64_t v2 = ((zil_vdev_node_t *)x2)->zv_vdev;
5121271e4bPrakash Surya
5134d7988dPaul Dagnelie	return (TREE_CMP(v1, v2));
5141271e4bPrakash Surya}
5151271e4bPrakash Surya
5166e1f5caNeil Perrinstatic lwb_t *
517c5ee468Alexander Motinzil_alloc_lwb(zilog_t *zilog, blkptr_t *bp, boolean_t slog, uint64_t txg)
5186e1f5caNeil Perrin{
5196e1f5caNeil Perrin	lwb_t *lwb;
5206e1f5caNeil Perrin
5216e1f5caNeil Perrin	lwb = kmem_cache_alloc(zil_lwb_cache, KM_SLEEP);
5226e1f5caNeil Perrin	lwb->lwb_zilog = zilog;
5236e1f5caNeil Perrin	lwb->lwb_blk = *bp;
524c5ee468Alexander Motin	lwb->lwb_slog = slog;
5251271e4bPrakash Surya	lwb->lwb_state = LWB_STATE_CLOSED;
5266e1f5caNeil Perrin	lwb->lwb_buf = zio_buf_alloc(BP_GET_LSIZE(bp));
5276e1f5caNeil Perrin	lwb->lwb_max_txg = txg;
5281271e4bPrakash Surya	lwb->lwb_write_zio = NULL;
5291271e4bPrakash Surya	lwb->lwb_root_zio = NULL;
5306e1f5caNeil Perrin	lwb->lwb_tx = NULL;
5311271e4bPrakash Surya	lwb->lwb_issued_timestamp = 0;
5326e1f5caNeil Perrin	if (BP_GET_CHECKSUM(bp) == ZIO_CHECKSUM_ZILOG2) {
5336e1f5caNeil Perrin		lwb->lwb_nused = sizeof (zil_chain_t);
5346e1f5caNeil Perrin		lwb->lwb_sz = BP_GET_LSIZE(bp);
5356e1f5caNeil Perrin	} else {
5366e1f5caNeil Perrin		lwb->lwb_nused = 0;
5376e1f5caNeil Perrin		lwb->lwb_sz = BP_GET_LSIZE(bp) - sizeof (zil_chain_t);
5386e1f5caNeil Perrin	}
5396e1f5caNeil Perrin
5406e1f5caNeil Perrin	mutex_enter(&zilog->zl_lock);
5416e1f5caNeil Perrin	list_insert_tail(&zilog->zl_lwb_list, lwb);
5426e1f5caNeil Perrin	mutex_exit(&zilog->zl_lock);
5436e1f5caNeil Perrin
5441271e4bPrakash Surya	ASSERT(!MUTEX_HELD(&lwb->lwb_vdev_lock));
5451271e4bPrakash Surya	ASSERT(avl_is_empty(&lwb->lwb_vdev_tree));
54694ddd09Prakash Surya	VERIFY(list_is_empty(&lwb->lwb_waiters));
5471271e4bPrakash Surya
5486e1f5caNeil Perrin	return (lwb);
5496e1f5caNeil Perrin}
5506e1f5caNeil Perrin
5511271e4bPrakash Suryastatic void
5521271e4bPrakash Suryazil_free_lwb(zilog_t *zilog, lwb_t *lwb)
5531271e4bPrakash Surya{
5541271e4bPrakash Surya	ASSERT(MUTEX_HELD(&zilog->zl_lock));
5551271e4bPrakash Surya	ASSERT(!MUTEX_HELD(&lwb->lwb_vdev_lock));
55694ddd09Prakash Surya	VERIFY(list_is_empty(&lwb->lwb_waiters));
5571271e4bPrakash Surya	ASSERT(avl_is_empty(&lwb->lwb_vdev_tree));
5581271e4bPrakash Surya	ASSERT3P(lwb->lwb_write_zio, ==, NULL);
5591271e4bPrakash Surya	ASSERT3P(lwb->lwb_root_zio, ==, NULL);
56094ddd09Prakash Surya	ASSERT3U(lwb->lwb_max_txg, <=, spa_syncing_txg(zilog->zl_spa));
56194ddd09Prakash Surya	ASSERT(lwb->lwb_state == LWB_STATE_CLOSED ||
562cab3a55Prakash Surya	    lwb->lwb_state == LWB_STATE_FLUSH_DONE);
5631271e4bPrakash Surya
5641271e4bPrakash Surya	/*
5651271e4bPrakash Surya	 * Clear the zilog's field to indicate this lwb is no longer
5661271e4bPrakash Surya	 * valid, and prevent use-after-free errors.
5671271e4bPrakash Surya	 */
5681271e4bPrakash Surya	if (zilog->zl_last_lwb_opened == lwb)
5691271e4bPrakash Surya		zilog->zl_last_lwb_opened = NULL;
5701271e4bPrakash Surya
5711271e4bPrakash Surya	kmem_cache_free(zil_lwb_cache, lwb);
5721271e4bPrakash Surya}
5731271e4bPrakash Surya
574fa9e406ahrens/*
575ce636f8Matthew Ahrens * Called when we create in-memory log transactions so that we know
576ce636f8Matthew Ahrens * to cleanup the itxs at the end of spa_sync().
577ce636f8Matthew Ahrens */
578ce636f8Matthew Ahrensvoid
579ce636f8Matthew Ahrenszilog_dirty(zilog_t *zilog, uint64_t txg)
580ce636f8Matthew Ahrens{
581ce636f8Matthew Ahrens	dsl_pool_t *dp = zilog->zl_dmu_pool;
582ce636f8Matthew Ahrens	dsl_dataset_t *ds = dmu_objset_ds(zilog->zl_os);
583ce636f8Matthew Ahrens
5841271e4bPrakash Surya	ASSERT(spa_writeable(zilog->zl_spa));
5851271e4bPrakash Surya
586bc9014eJustin Gibbs	if (ds->ds_is_snapshot)
587ce636f8Matthew Ahrens		panic("dirtying snapshot!");
588ce636f8Matthew Ahrens
5893b2aab1Matthew Ahrens	if (txg_list_add(&dp->dp_dirty_zilogs, zilog, txg)) {
590ce636f8Matthew Ahrens		/* up the hold count until we can be written out */
591ce636f8Matthew Ahrens		dmu_buf_add_ref(ds->ds_dbuf, zilog);
5921271e4bPrakash Surya
5931271e4bPrakash Surya		zilog->zl_dirty_max_txg = MAX(txg, zilog->zl_dirty_max_txg);
594ce636f8Matthew Ahrens	}
595ce636f8Matthew Ahrens}
596ce636f8Matthew Ahrens
59743297f9George Wilson/*
59843297f9George Wilson * Determine if the zil is dirty in the specified txg. Callers wanting to
59943297f9George Wilson * ensure that the dirty state does not change must hold the itxg_lock for
60043297f9George Wilson * the specified txg. Holding the lock will ensure that the zil cannot be
60143297f9George Wilson * dirtied (zil_itx_assign) or cleaned (zil_clean) while we check its current
60243297f9George Wilson * state.
60343297f9George Wilson */
60443297f9George Wilsonboolean_t
60543297f9George Wilsonzilog_is_dirty_in_txg(zilog_t *zilog, uint64_t txg)
60643297f9George Wilson{
60743297f9George Wilson	dsl_pool_t *dp = zilog->zl_dmu_pool;
60843297f9George Wilson
60943297f9George Wilson	if (txg_list_member(&dp->dp_dirty_zilogs, zilog, txg & TXG_MASK))
61043297f9George Wilson		return (B_TRUE);
61143297f9George Wilson	return (B_FALSE);
61243297f9George Wilson}
61343297f9George Wilson
61443297f9George Wilson/*
61543297f9George Wilson * Determine if the zil is dirty. The zil is considered dirty if it has
61643297f9George Wilson * any pending itx records that have not been cleaned by zil_clean().
61743297f9George Wilson */
618ce636f8Matthew Ahrensboolean_t
619ce636f8Matthew Ahrenszilog_is_dirty(zilog_t *zilog)
620ce636f8Matthew Ahrens{
621ce636f8Matthew Ahrens	dsl_pool_t *dp = zilog->zl_dmu_pool;
622ce636f8Matthew Ahrens
623ce636f8Matthew Ahrens	for (int t = 0; t < TXG_SIZE; t++) {
624ce636f8Matthew Ahrens		if (txg_list_member(&dp->dp_dirty_zilogs, zilog, t))
625ce636f8Matthew Ahrens			return (B_TRUE);
626ce636f8Matthew Ahrens	}
627ce636f8Matthew Ahrens	return (B_FALSE);
628ce636f8Matthew Ahrens}
629ce636f8Matthew Ahrens
630ce636f8Matthew Ahrens/*
631fa9e406ahrens * Create an on-disk intent log.
632fa9e406ahrens */
6336e1f5caNeil Perrinstatic lwb_t *
634fa9e406ahrenszil_create(zilog_t *zilog)
635fa9e406ahrens{
636d80c45ebonwick	const zil_header_t *zh = zilog->zl_header;
6376e1f5caNeil Perrin	lwb_t *lwb = NULL;
638d80c45ebonwick	uint64_t txg = 0;
639d80c45ebonwick	dmu_tx_t *tx = NULL;
640fa9e406ahrens	blkptr_t blk;
641d80c45ebonwick	int error = 0;
642c5ee468Alexander Motin	boolean_t slog = FALSE;
643fa9e406ahrens
644fa9e406ahrens	/*
645d80c45ebonwick	 * Wait for any previous destroy to complete.
646fa9e406ahrens	 */
647d80c45ebonwick	txg_wait_synced(zilog->zl_dmu_pool, zilog->zl_destroy_txg);
648d80c45ebonwick
649d80c45ebonwick	ASSERT(zh->zh_claim_txg == 0);
650d80c45ebonwick	ASSERT(zh->zh_replay_seq == 0);
651d80c45ebonwick
652d80c45ebonwick	blk = zh->zh_log;
653fa9e406ahrens
654fa9e406ahrens	/*
6556e1f5caNeil Perrin	 * Allocate an initial log block if:
6566e1f5caNeil Perrin	 *    - there isn't one already
6576e1f5caNeil Perrin	 *    - the existing block is the wrong endianess
658fa9e406ahrens	 */
659899217dNeil Perrin	if (BP_IS_HOLE(&blk) || BP_SHOULD_BYTESWAP(&blk)) {
660d80c45ebonwick		tx = dmu_tx_create(zilog->zl_os);
6611271e4bPrakash Surya		VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
662d80c45ebonwick		dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx);
663d80c45ebonwick		txg = dmu_tx_get_txg(tx);
664d80c45ebonwick
665899217dNeil Perrin		if (!BP_IS_HOLE(&blk)) {
6668671400Serapheim Dimitropoulos			zio_free(zilog->zl_spa, txg, &blk);
667899217dNeil Perrin			BP_ZERO(&blk);
668899217dNeil Perrin		}
669899217dNeil Perrin
670eb63303Tom Caputi		error = zio_alloc_zil(zilog->zl_spa, zilog->zl_os, txg, &blk,
671eb63303Tom Caputi		    NULL, ZIL_MIN_BLKSZ, &slog);
672d80c45ebonwick
673d80c45ebonwick		if (error == 0)
674d80c45ebonwick			zil_init_log_chain(zilog, &blk);
67513f5297perrin	}
676fa9e406ahrens
677d80c45ebonwick	/*
6781271e4bPrakash Surya	 * Allocate a log write block (lwb) for the first log block.
679d80c45ebonwick	 */
6806e1f5caNeil Perrin	if (error == 0)
681c5ee468Alexander Motin		lwb = zil_alloc_lwb(zilog, &blk, slog, txg);
682fa9e406ahrens
683d80c45ebonwick	/*
684d80c45ebonwick	 * If we just allocated the first log block, commit our transaction
685d80c45ebonwick	 * and wait for zil_sync() to stuff the block poiner into zh_log.
686d80c45ebonwick	 * (zh is part of the MOS, so we cannot modify it in open context.)
687d80c45ebonwick	 */
688d80c45ebonwick	if (tx != NULL) {
689d80c45ebonwick		dmu_tx_commit(tx);
69013f5297perrin		txg_wait_synced(zilog->zl_dmu_pool, txg);
691d80c45ebonwick	}
692d80c45ebonwick
693d80c45ebonwick	ASSERT(bcmp(&blk, &zh->zh_log, sizeof (blk)) == 0);
6946e1f5caNeil Perrin
6956e1f5caNeil Perrin	return (lwb);
696fa9e406ahrens}
697fa9e406ahrens
698fa9e406ahrens/*
6991271e4bPrakash Surya * In one tx, free all log blocks and clear the log header. If keep_first
7001271e4bPrakash Surya * is set, then we're replaying a log with no content. We want to keep the
7011271e4bPrakash Surya * first block, however, so that the first synchronous transaction doesn't
7021271e4bPrakash Surya * require a txg_wait_synced() in zil_create(). We don't need to
7031271e4bPrakash Surya * txg_wait_synced() here either when keep_first is set, because both
7041271e4bPrakash Surya * zil_create() and zil_destroy() will wait for any in-progress destroys
7051271e4bPrakash Surya * to complete.
706fa9e406ahrens */
707fa9e406ahrensvoid
708d80c45ebonwickzil_destroy(zilog_t *zilog, boolean_t keep_first)
709fa9e406ahrens{
710d80c45ebonwick	const zil_header_t *zh = zilog->zl_header;
711d80c45ebonwick	lwb_t *lwb;
712fa9e406ahrens	dmu_tx_t *tx;
713fa9e406ahrens	uint64_t txg;
714fa9e406ahrens
715d80c45ebonwick	/*
716d80c45ebonwick	 * Wait for any previous destroy to complete.
717d80c45ebonwick	 */
718d80c45ebonwick	txg_wait_synced(zilog->zl_dmu_pool, zilog->zl_destroy_txg);
719fa9e406ahrens
720b24ab67Jeff Bonwick	zilog->zl_old_header = *zh;		/* debugging aid */
721b24ab67Jeff Bonwick
722d80c45ebonwick	if (BP_IS_HOLE(&zh->zh_log))
723fa9e406ahrens		return;
724fa9e406ahrens
725fa9e406ahrens	tx = dmu_tx_create(zilog->zl_os);
7261271e4bPrakash Surya	VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
727fa9e406ahrens	dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx);
728fa9e406ahrens	txg = dmu_tx_get_txg(tx);
729fa9e406ahrens
730d80c45ebonwick	mutex_enter(&zilog->zl_lock);
731d80c45ebonwick
732d80c45ebonwick	ASSERT3U(zilog->zl_destroy_txg, <, txg);
733fa9e406ahrens	zilog->zl_destroy_txg = txg;
734b24ab67Jeff Bonwick	zilog->zl_keep_first = keep_first;
735d80c45ebonwick
736d80c45ebonwick	if (!list_is_empty(&zilog->zl_lwb_list)) {
737d80c45ebonwick		ASSERT(zh->zh_claim_txg == 0);
738c9ba2a4Eric Schrock		VERIFY(!keep_first);
739d80c45ebonwick		while ((lwb = list_head(&zilog->zl_lwb_list)) != NULL) {
740d80c45ebonwick			list_remove(&zilog->zl_lwb_list, lwb);
741d80c45ebonwick			if (lwb->lwb_buf != NULL)
742d80c45ebonwick				zio_buf_free(lwb->lwb_buf, lwb->lwb_sz);
7431271e4bPrakash Surya			zio_free(zilog->zl_spa, txg, &lwb->lwb_blk);
7441271e4bPrakash Surya			zil_free_lwb(zilog, lwb);
745d80c45ebonwick		}
746b24ab67Jeff Bonwick	} else if (!keep_first) {
747ce636f8Matthew Ahrens		zil_destroy_sync(zilog, tx);
748d80c45ebonwick	}
749b19a79eperrin	mutex_exit(&zilog->zl_lock);
750fa9e406ahrens
751fa9e406ahrens	dmu_tx_commit(tx);
752fa9e406ahrens}
753fa9e406ahrens
754ce636f8Matthew Ahrensvoid
755ce636f8Matthew Ahrenszil_destroy_sync(zilog_t *zilog, dmu_tx_t *tx)
756ce636f8Matthew Ahrens{
757ce636f8Matthew Ahrens	ASSERT(list_is_empty(&zilog->zl_lwb_list));
758ce636f8Matthew Ahrens	(void) zil_parse(zilog, zil_free_log_block,
759eb63303Tom Caputi	    zil_free_log_record, tx, zilog->zl_header->zh_claim_txg, B_FALSE);
760ce636f8Matthew Ahrens}
761ce636f8Matthew Ahrens
7621d452cfahrensint
76312380e1Arne Jansenzil_claim(dsl_pool_t *dp, dsl_dataset_t *ds, void *txarg)
764fa9e406ahrens{
765fa9e406ahrens	dmu_tx_t *tx = txarg;
766fa9e406ahrens	zilog_t *zilog;
7678671400Serapheim Dimitropoulos	uint64_t first_txg;
768fa9e406ahrens	zil_header_t *zh;
769fa9e406ahrens	objset_t *os;
770fa9e406ahrens	int error;
771fa9e406ahrens
77212380e1Arne Jansen	error = dmu_objset_own_obj(dp, ds->ds_object,
773eb63303Tom Caputi	    DMU_OST_ANY, B_FALSE, B_FALSE, FTAG, &os);
7743b2aab1Matthew Ahrens	if (error != 0) {
7752243853Matthew Ahrens		/*
7762243853Matthew Ahrens		 * EBUSY indicates that the objset is inconsistent, in which
7772243853Matthew Ahrens		 * case it can not have a ZIL.
7782243853Matthew Ahrens		 */
7792243853Matthew Ahrens		if (error != EBUSY) {
78012380e1Arne Jansen			cmn_err(CE_WARN, "can't open objset for %llu, error %u",
78112380e1Arne Jansen			    (unsigned long long)ds->ds_object, error);
7822243853Matthew Ahrens		}
7831d452cfahrens		return (0);
784fa9e406ahrens	}
785fa9e406ahrens
786fa9e406ahrens	zilog = dmu_objset_zil(os);
787d80c45ebonwick	zh = zil_header_in_syncing_context(zilog);
7888671400Serapheim Dimitropoulos	ASSERT3U(tx->tx_txg, ==, spa_first_txg(zilog->zl_spa));
7898671400Serapheim Dimitropoulos	first_txg = spa_min_claim_txg(zilog->zl_spa);
790fa9e406ahrens
7918671400Serapheim Dimitropoulos	/*
7928671400Serapheim Dimitropoulos	 * If the spa_log_state is not set to be cleared, check whether
7938671400Serapheim Dimitropoulos	 * the current uberblock is a checkpoint one and if the current
7948671400Serapheim Dimitropoulos	 * header has been claimed before moving on.
7958671400Serapheim Dimitropoulos	 *
7968671400Serapheim Dimitropoulos	 * If the current uberblock is a checkpointed uberblock then
7978671400Serapheim Dimitropoulos	 * one of the following scenarios took place:
7988671400Serapheim Dimitropoulos	 *
7998671400Serapheim Dimitropoulos	 * 1] We are currently rewinding to the checkpoint of the pool.
8008671400Serapheim Dimitropoulos	 * 2] We crashed in the middle of a checkpoint rewind but we
8018671400Serapheim Dimitropoulos	 *    did manage to write the checkpointed uberblock to the
8028671400Serapheim Dimitropoulos	 *    vdev labels, so when we tried to import the pool again
8038671400Serapheim Dimitropoulos	 *    the checkpointed uberblock was selected from the import
8048671400Serapheim Dimitropoulos	 *    procedure.
8058671400Serapheim Dimitropoulos	 *
8068671400Serapheim Dimitropoulos	 * In both cases we want to zero out all the ZIL blocks, except
8078671400Serapheim Dimitropoulos	 * the ones that have been claimed at the time of the checkpoint
8088671400Serapheim Dimitropoulos	 * (their zh_claim_txg != 0). The reason is that these blocks
8098671400Serapheim Dimitropoulos	 * may be corrupted since we may have reused their locations on
8108671400Serapheim Dimitropoulos	 * disk after we took the checkpoint.
8118671400Serapheim Dimitropoulos	 *
8128671400Serapheim Dimitropoulos	 * We could try to set spa_log_state to SPA_LOG_CLEAR earlier
8138671400Serapheim Dimitropoulos	 * when we first figure out whether the current uberblock is
8148671400Serapheim Dimitropoulos	 * checkpointed or not. Unfortunately, that would discard all
8158671400Serapheim Dimitropoulos	 * the logs, including the ones that are claimed, and we would
8168671400Serapheim Dimitropoulos	 * leak space.
8178671400Serapheim Dimitropoulos	 */
8188671400Serapheim Dimitropoulos	if (spa_get_log_state(zilog->zl_spa) == SPA_LOG_CLEAR ||
8198671400Serapheim Dimitropoulos	    (zilog->zl_spa->spa_uberblock.ub_checkpoint_txg != 0 &&
8208671400Serapheim Dimitropoulos	    zh->zh_claim_txg == 0)) {
8218671400Serapheim Dimitropoulos		if (!BP_IS_HOLE(&zh->zh_log)) {
8228671400Serapheim Dimitropoulos			(void) zil_parse(zilog, zil_clear_log_block,
823eb63303Tom Caputi			    zil_noop_log_record, tx, first_txg, B_FALSE);
8248671400Serapheim Dimitropoulos		}
825e6ca193George Wilson		BP_ZERO(&zh->zh_log);
826eb63303Tom Caputi		if (os->os_encrypted)
827eb63303Tom Caputi			os->os_next_write_raw[tx->tx_txg & TXG_MASK] = B_TRUE;
828e6ca193George Wilson		dsl_dataset_dirty(dmu_objset_ds(os), tx);
829eb63303Tom Caputi		dmu_objset_disown(os, B_FALSE, FTAG);
830468c413Tim Haley		return (0);
831e6ca193George Wilson	}
832e6ca193George Wilson
833fa9e406ahrens	/*
8348671400Serapheim Dimitropoulos	 * If we are not rewinding and opening the pool normally, then
8358671400Serapheim Dimitropoulos	 * the min_claim_txg should be equal to the first txg of the pool.
8368671400Serapheim Dimitropoulos	 */
8378671400Serapheim Dimitropoulos	ASSERT3U(first_txg, ==, spa_first_txg(zilog->zl_spa));
8388671400Serapheim Dimitropoulos
8398671400Serapheim Dimitropoulos	/*
840d80c45ebonwick	 * Claim all log blocks if we haven't already done so, and remember
841d80c45ebonwick	 * the highest claimed sequence number.  This ensures that if we can
842d80c45ebonwick	 * read only part of the log now (e.g. due to a missing device),
843d80c45ebonwick	 * but we can read the entire log later, we will not try to replay
844d80c45ebonwick	 * or destroy beyond the last block we successfully claimed.
845fa9e406ahrens	 */
846fa9e406ahrens	ASSERT3U(zh->zh_claim_txg, <=, first_txg);
847fa9e406ahrens	if (zh->zh_claim_txg == 0 && !BP_IS_HOLE(&zh->zh_log)) {
848b24ab67Jeff Bonwick		(void) zil_parse(zilog, zil_claim_log_block,
849eb63303Tom Caputi		    zil_claim_log_record, tx, first_txg, B_FALSE);
850b24ab67Jeff Bonwick		zh->zh_claim_txg = first_txg;
851b24ab67Jeff Bonwick		zh->zh_claim_blk_seq = zilog->zl_parse_blk_seq;
852b24ab67Jeff Bonwick		zh->zh_claim_lr_seq = zilog->zl_parse_lr_seq;
853b24ab67Jeff Bonwick		if (zilog->zl_parse_lr_count || zilog->zl_parse_blk_count > 1)
854b24ab67Jeff Bonwick			zh->zh_flags |= ZIL_REPLAY_NEEDED;
855b24ab67Jeff Bonwick		zh->zh_flags |= ZIL_CLAIM_LR_SEQ_VALID;
856eb63303Tom Caputi		if (os->os_encrypted)
857eb63303Tom Caputi			os->os_next_write_raw[tx->tx_txg & TXG_MASK] = B_TRUE;
858fa9e406ahrens		dsl_dataset_dirty(dmu_objset_ds(os), tx);
859fa9e406ahrens	}
860d80c45ebonwick
861fa9e406ahrens	ASSERT3U(first_txg, ==, (spa_last_synced_txg(zilog->zl_spa) + 1));
862eb63303Tom Caputi	dmu_objset_disown(os, B_FALSE, FTAG);
8631d452cfahrens	return (0);
864b87f3afperrin}
865b87f3afperrin
866b87f3afperrin/*
867b87f3afperrin * Check the log by walking the log chain.
868b87f3afperrin * Checksum errors are ok as they indicate the end of the chain.
869b87f3afperrin * Any other error (no device or read failure) returns an error.
870b87f3afperrin */
87112380e1Arne Jansen/* ARGSUSED */
872b87f3afperrinint
87312380e1Arne Jansenzil_check_log_chain(dsl_pool_t *dp, dsl_dataset_t *ds, void *tx)
874b87f3afperrin{
875b87f3afperrin	zilog_t *zilog;
876b87f3afperrin	objset_t *os;
8774b964adGeorge Wilson	blkptr_t *bp;
878b87f3afperrin	int error;
879b87f3afperrin
880b24ab67Jeff Bonwick	ASSERT(tx == NULL);
881b24ab67Jeff Bonwick
88212380e1Arne Jansen	error = dmu_objset_from_ds(ds, &os);
8833b2aab1Matthew Ahrens	if (error != 0) {
88412380e1Arne Jansen		cmn_err(CE_WARN, "can't open objset %llu, error %d",
88512380e1Arne Jansen		    (unsigned long long)ds->ds_object, error);
886b87f3afperrin		return (0);
887b87f3afperrin	}
888b87f3afperrin
889b87f3afperrin	zilog = dmu_objset_zil(os);
8904b964adGeorge Wilson	bp = (blkptr_t *)&zilog->zl_header->zh_log;
8914b964adGeorge Wilson
8924b964adGeorge Wilson	if (!BP_IS_HOLE(bp)) {
8934b964adGeorge Wilson		vdev_t *vd;
8944b964adGeorge Wilson		boolean_t valid = B_TRUE;
8954b964adGeorge Wilson
8968671400Serapheim Dimitropoulos		/*
8978671400Serapheim Dimitropoulos		 * Check the first block and determine if it's on a log device
8988671400Serapheim Dimitropoulos		 * which may have been removed or faulted prior to loading this
8998671400Serapheim Dimitropoulos		 * pool.  If so, there's no point in checking the rest of the
900