1fa9e406ahrens/*
2fa9e406ahrens * CDDL HEADER START
3fa9e406ahrens *
4fa9e406ahrens * The contents of this file are subject to the terms of the
55ad8204nd * Common Development and Distribution License (the "License").
65ad8204nd * You may not use this file except in compliance with the License.
7fa9e406ahrens *
8fa9e406ahrens * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9fa9e406ahrens * or http://www.opensolaris.org/os/licensing.
10fa9e406ahrens * See the License for the specific language governing permissions
11fa9e406ahrens * and limitations under the License.
12fa9e406ahrens *
13fa9e406ahrens * When distributing Covered Code, include this CDDL HEADER in each
14fa9e406ahrens * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15fa9e406ahrens * If applicable, add the following below this CDDL HEADER, with the
16fa9e406ahrens * fields enclosed by brackets "[]" replaced with your own identifying
17fa9e406ahrens * information: Portions Copyright [yyyy] [name of copyright owner]
18fa9e406ahrens *
19fa9e406ahrens * CDDL HEADER END
20fa9e406ahrens */
21fa9e406ahrens/*
223f9d6adLin Ling * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23383e7c7Xin Li * Portions Copyright 2011 Martin Matuska
24814dcd4Serapheim Dimitropoulos * Copyright (c) 2012, 2019 by Delphix. All rights reserved.
25fa9e406ahrens */
26fa9e406ahrens
27fa9e406ahrens#include <sys/zfs_context.h>
28fa9e406ahrens#include <sys/txg_impl.h>
29fa9e406ahrens#include <sys/dmu_impl.h>
30d20e665Ricardo M. Correia#include <sys/dmu_tx.h>
31fa9e406ahrens#include <sys/dsl_pool.h>
323f9d6adLin Ling#include <sys/dsl_scan.h>
33b7b2590Matthew Ahrens#include <sys/zil.h>
34fa9e406ahrens#include <sys/callb.h>
35fa9e406ahrens
36fa9e406ahrens/*
37adbbcffAdam H. Leventhal * ZFS Transaction Groups
38adbbcffAdam H. Leventhal * ----------------------
39adbbcffAdam H. Leventhal *
40adbbcffAdam H. Leventhal * ZFS transaction groups are, as the name implies, groups of transactions
41adbbcffAdam H. Leventhal * that act on persistent state. ZFS asserts consistency at the granularity of
42adbbcffAdam H. Leventhal * these transaction groups. Each successive transaction group (txg) is
43adbbcffAdam H. Leventhal * assigned a 64-bit consecutive identifier. There are three active
44adbbcffAdam H. Leventhal * transaction group states: open, quiescing, or syncing. At any given time,
45adbbcffAdam H. Leventhal * there may be an active txg associated with each state; each active txg may
46adbbcffAdam H. Leventhal * either be processing, or blocked waiting to enter the next state. There may
47adbbcffAdam H. Leventhal * be up to three active txgs, and there is always a txg in the open state
48adbbcffAdam H. Leventhal * (though it may be blocked waiting to enter the quiescing state). In broad
4969962b5Matthew Ahrens * strokes, transactions -- operations that change in-memory structures -- are
50adbbcffAdam H. Leventhal * accepted into the txg in the open state, and are completed while the txg is
51adbbcffAdam H. Leventhal * in the open or quiescing states. The accumulated changes are written to
52adbbcffAdam H. Leventhal * disk in the syncing state.
53adbbcffAdam H. Leventhal *
54adbbcffAdam H. Leventhal * Open
55adbbcffAdam H. Leventhal *
56adbbcffAdam H. Leventhal * When a new txg becomes active, it first enters the open state. New
5769962b5Matthew Ahrens * transactions -- updates to in-memory structures -- are assigned to the
58adbbcffAdam H. Leventhal * currently open txg. There is always a txg in the open state so that ZFS can
59adbbcffAdam H. Leventhal * accept new changes (though the txg may refuse new changes if it has hit
60adbbcffAdam H. Leventhal * some limit). ZFS advances the open txg to the next state for a variety of
61adbbcffAdam H. Leventhal * reasons such as it hitting a time or size threshold, or the execution of an
62adbbcffAdam H. Leventhal * administrative action that must be completed in the syncing state.
63adbbcffAdam H. Leventhal *
64adbbcffAdam H. Leventhal * Quiescing
65adbbcffAdam H. Leventhal *
66adbbcffAdam H. Leventhal * After a txg exits the open state, it enters the quiescing state. The
67adbbcffAdam H. Leventhal * quiescing state is intended to provide a buffer between accepting new
68adbbcffAdam H. Leventhal * transactions in the open state and writing them out to stable storage in
69adbbcffAdam H. Leventhal * the syncing state. While quiescing, transactions can continue their
70adbbcffAdam H. Leventhal * operation without delaying either of the other states. Typically, a txg is
71adbbcffAdam H. Leventhal * in the quiescing state very briefly since the operations are bounded by
72adbbcffAdam H. Leventhal * software latencies rather than, say, slower I/O latencies. After all
73adbbcffAdam H. Leventhal * transactions complete, the txg is ready to enter the next state.
74adbbcffAdam H. Leventhal *
75adbbcffAdam H. Leventhal * Syncing
76adbbcffAdam H. Leventhal *
77adbbcffAdam H. Leventhal * In the syncing state, the in-memory state built up during the open and (to
78adbbcffAdam H. Leventhal * a lesser degree) the quiescing states is written to stable storage. The
 * process of writing out modified data can, in turn, modify more data. For
 * example, when we write new blocks, we need to allocate space for them; those
81adbbcffAdam H. Leventhal * allocations modify metadata (space maps)... which themselves must be
82adbbcffAdam H. Leventhal * written to stable storage. During the sync state, ZFS iterates, writing out
83adbbcffAdam H. Leventhal * data until it converges and all in-memory changes have been written out.
84adbbcffAdam H. Leventhal * The first such pass is the largest as it encompasses all the modified user
85adbbcffAdam H. Leventhal * data (as opposed to filesystem metadata). Subsequent passes typically have
86adbbcffAdam H. Leventhal * far less data to write as they consist exclusively of filesystem metadata.
87adbbcffAdam H. Leventhal *
88adbbcffAdam H. Leventhal * To ensure convergence, after a certain number of passes ZFS begins
89adbbcffAdam H. Leventhal * overwriting locations on stable storage that had been allocated earlier in
90adbbcffAdam H. Leventhal * the syncing state (and subsequently freed). ZFS usually allocates new
91adbbcffAdam H. Leventhal * blocks to optimize for large, continuous, writes. For the syncing state to
92adbbcffAdam H. Leventhal * converge however it must complete a pass where no new blocks are allocated
93adbbcffAdam H. Leventhal * since each allocation requires a modification of persistent metadata.
94adbbcffAdam H. Leventhal * Further, to hasten convergence, after a prescribed number of passes, ZFS
95adbbcffAdam H. Leventhal * also defers frees, and stops compressing.
96adbbcffAdam H. Leventhal *
97adbbcffAdam H. Leventhal * In addition to writing out user data, we must also execute synctasks during
98adbbcffAdam H. Leventhal * the syncing context. A synctask is the mechanism by which some
 * administrative activities work, such as creating and destroying snapshots or
100adbbcffAdam H. Leventhal * datasets. Note that when a synctask is initiated it enters the open txg,
101adbbcffAdam H. Leventhal * and ZFS then pushes that txg as quickly as possible to completion of the
102adbbcffAdam H. Leventhal * syncing state in order to reduce the latency of the administrative
103adbbcffAdam H. Leventhal * activity. To complete the syncing state, ZFS writes out a new uberblock,
104adbbcffAdam H. Leventhal * the root of the tree of blocks that comprise all state stored on the ZFS
105adbbcffAdam H. Leventhal * pool. Finally, if there is a quiesced txg waiting, we signal that it can
106adbbcffAdam H. Leventhal * now transition to the syncing state.
107fa9e406ahrens */
108fa9e406ahrens
/* Worker thread entry points; both are started by txg_sync_start(). */
static void txg_quiesce_thread(void *arg);
static void txg_sync_thread(void *arg);

/* Upper bound, in seconds, on the span of change carried by a single txg. */
int zfs_txg_timeout = 5;
113fa9e406ahrens
114fa9e406ahrens/*
115fa9e406ahrens * Prepare the txg subsystem.
116fa9e406ahrens */
117fa9e406ahrensvoid
118fa9e406ahrenstxg_init(dsl_pool_t *dp, uint64_t txg)
119fa9e406ahrens{
120fa9e406ahrens	tx_state_t *tx = &dp->dp_tx;
1215ad8204nd	int c;
122fa9e406ahrens	bzero(tx, sizeof (tx_state_t));
123fa9e406ahrens
124fa9e406ahrens	tx->tx_cpu = kmem_zalloc(max_ncpus * sizeof (tx_cpu_t), KM_SLEEP);
125fa9e406ahrens
1268f38d41ek	for (c = 0; c < max_ncpus; c++) {
1278f38d41ek		int i;
1288f38d41ek
1295ad8204nd		mutex_init(&tx->tx_cpu[c].tc_lock, NULL, MUTEX_DEFAULT, NULL);
1304a92375George Wilson		mutex_init(&tx->tx_cpu[c].tc_open_lock, NULL, MUTEX_DEFAULT,
1314a92375George Wilson		    NULL);
1328f38d41ek		for (i = 0; i < TXG_SIZE; i++) {
1338f38d41ek			cv_init(&tx->tx_cpu[c].tc_cv[i], NULL, CV_DEFAULT,
1348f38d41ek			    NULL);
135d20e665Ricardo M. Correia			list_create(&tx->tx_cpu[c].tc_callbacks[i],
136d20e665Ricardo M. Correia			    sizeof (dmu_tx_callback_t),
137d20e665Ricardo M. Correia			    offsetof(dmu_tx_callback_t, dcb_node));
1388f38d41ek		}
1398f38d41ek	}
1405ad8204nd
1415ad8204nd	mutex_init(&tx->tx_sync_lock, NULL, MUTEX_DEFAULT, NULL);
142fa9e406ahrens
143b5e70f9Ricardo M. Correia	cv_init(&tx->tx_sync_more_cv, NULL, CV_DEFAULT, NULL);
144b5e70f9Ricardo M. Correia	cv_init(&tx->tx_sync_done_cv, NULL, CV_DEFAULT, NULL);
145b5e70f9Ricardo M. Correia	cv_init(&tx->tx_quiesce_more_cv, NULL, CV_DEFAULT, NULL);
146b5e70f9Ricardo M. Correia	cv_init(&tx->tx_quiesce_done_cv, NULL, CV_DEFAULT, NULL);
147b5e70f9Ricardo M. Correia	cv_init(&tx->tx_exit_cv, NULL, CV_DEFAULT, NULL);
148b5e70f9Ricardo M. Correia
149fa9e406ahrens	tx->tx_open_txg = txg;
150fa9e406ahrens}
151fa9e406ahrens
152fa9e406ahrens/*
153fa9e406ahrens * Close down the txg subsystem.
154fa9e406ahrens */
155fa9e406ahrensvoid
156fa9e406ahrenstxg_fini(dsl_pool_t *dp)
157fa9e406ahrens{
158fa9e406ahrens	tx_state_t *tx = &dp->dp_tx;
1595ad8204nd	int c;
160fa9e406ahrens
1611271e4bPrakash Surya	ASSERT0(tx->tx_threads);
162fa9e406ahrens
1635ad8204nd	mutex_destroy(&tx->tx_sync_lock);
1645ad8204nd
165b5e70f9Ricardo M. Correia	cv_destroy(&tx->tx_sync_more_cv);
166b5e70f9Ricardo M. Correia	cv_destroy(&tx->tx_sync_done_cv);
167b5e70f9Ricardo M. Correia	cv_destroy(&tx->tx_quiesce_more_cv);
168b5e70f9Ricardo M. Correia	cv_destroy(&tx->tx_quiesce_done_cv);
169b5e70f9Ricardo M. Correia	cv_destroy(&tx->tx_exit_cv);
170b5e70f9Ricardo M. Correia
1718f38d41ek	for (c = 0; c < max_ncpus; c++) {
1728f38d41ek		int i;
1738f38d41ek
1744a92375George Wilson		mutex_destroy(&tx->tx_cpu[c].tc_open_lock);
1755ad8204nd		mutex_destroy(&tx->tx_cpu[c].tc_lock);
176d20e665Ricardo M. Correia		for (i = 0; i < TXG_SIZE; i++) {
1778f38d41ek			cv_destroy(&tx->tx_cpu[c].tc_cv[i]);
178d20e665Ricardo M. Correia			list_destroy(&tx->tx_cpu[c].tc_callbacks[i]);
179d20e665Ricardo M. Correia		}
1808f38d41ek	}
181fa9e406ahrens
182d20e665Ricardo M. Correia	if (tx->tx_commit_cb_taskq != NULL)
183d20e665Ricardo M. Correia		taskq_destroy(tx->tx_commit_cb_taskq);
184d20e665Ricardo M. Correia
185fa9e406ahrens	kmem_free(tx->tx_cpu, max_ncpus * sizeof (tx_cpu_t));
186fa9e406ahrens
187fa9e406ahrens	bzero(tx, sizeof (tx_state_t));
188fa9e406ahrens}
189fa9e406ahrens
190fa9e406ahrens/*
191fa9e406ahrens * Start syncing transaction groups.
192fa9e406ahrens */
193fa9e406ahrensvoid
194fa9e406ahrenstxg_sync_start(dsl_pool_t *dp)
195fa9e406ahrens{
196fa9e406ahrens	tx_state_t *tx = &dp->dp_tx;
197fa9e406ahrens
198fa9e406ahrens	mutex_enter(&tx->tx_sync_lock);
199fa9e406ahrens
200fa9e406ahrens	dprintf("pool %p\n", dp);
201fa9e406ahrens
2021271e4bPrakash Surya	ASSERT0(tx->tx_threads);
203fa9e406ahrens
2041ab7f2dmaybee	tx->tx_threads = 2;
205fa9e406ahrens
206fa9e406ahrens	tx->tx_quiesce_thread = thread_create(NULL, 0, txg_quiesce_thread,
207fa9e406ahrens	    dp, 0, &p0, TS_RUN, minclsyspri);
208fa9e406ahrens
209088f389ahrens	/*
210088f389ahrens	 * The sync thread can need a larger-than-default stack size on
211088f389ahrens	 * 32-bit x86.  This is due in part to nested pools and
212088f389ahrens	 * scrub_visitbp() recursion.
213088f389ahrens	 */
2143f9d6adLin Ling	tx->tx_sync_thread = thread_create(NULL, 32<<10, txg_sync_thread,
215fa9e406ahrens	    dp, 0, &p0, TS_RUN, minclsyspri);
216fa9e406ahrens
217fa9e406ahrens	mutex_exit(&tx->tx_sync_lock);
218fa9e406ahrens}
219fa9e406ahrens
220fa9e406ahrensstatic void
221fa9e406ahrenstxg_thread_enter(tx_state_t *tx, callb_cpr_t *cpr)
222fa9e406ahrens{
223fa9e406ahrens	CALLB_CPR_INIT(cpr, &tx->tx_sync_lock, callb_generic_cpr, FTAG);
224fa9e406ahrens	mutex_enter(&tx->tx_sync_lock);
225fa9e406ahrens}
226fa9e406ahrens
227fa9e406ahrensstatic void
228fa9e406ahrenstxg_thread_exit(tx_state_t *tx, callb_cpr_t *cpr, kthread_t **tpp)
229fa9e406ahrens{
230fa9e406ahrens	ASSERT(*tpp != NULL);
231fa9e406ahrens	*tpp = NULL;
232fa9e406ahrens	tx->tx_threads--;
233fa9e406ahrens	cv_broadcast(&tx->tx_exit_cv);
234fa9e406ahrens	CALLB_CPR_EXIT(cpr);		/* drops &tx->tx_sync_lock */
235fa9e406ahrens	thread_exit();
236fa9e406ahrens}
237fa9e406ahrens
238fa9e406ahrensstatic void
2390689f76Adam Leventhaltxg_thread_wait(tx_state_t *tx, callb_cpr_t *cpr, kcondvar_t *cv, clock_t time)
240fa9e406ahrens{
241fa9e406ahrens	CALLB_CPR_SAFE_BEGIN(cpr);
242fa9e406ahrens
2431ab7f2dmaybee	if (time)
244d3d5073Rafael Vanoni		(void) cv_timedwait(cv, &tx->tx_sync_lock,
245d3d5073Rafael Vanoni		    ddi_get_lbolt() + time);
246fa9e406ahrens	else
247fa9e406ahrens		cv_wait(cv, &tx->tx_sync_lock);
248fa9e406ahrens
249fa9e406ahrens	CALLB_CPR_SAFE_END(cpr, &tx->tx_sync_lock);
250fa9e406ahrens}
251fa9e406ahrens
252fa9e406ahrens/*
253fa9e406ahrens * Stop syncing transaction groups.
254fa9e406ahrens */
255fa9e406ahrensvoid
256fa9e406ahrenstxg_sync_stop(dsl_pool_t *dp)
257fa9e406ahrens{
258fa9e406ahrens	tx_state_t *tx = &dp->dp_tx;
259fa9e406ahrens
260fa9e406ahrens	dprintf("pool %p\n", dp);
261fa9e406ahrens	/*
262fa9e406ahrens	 * Finish off any work in progress.
263fa9e406ahrens	 */
2641271e4bPrakash Surya	ASSERT3U(tx->tx_threads, ==, 2);
265468c413Tim Haley
266468c413Tim Haley	/*
267814dcd4Serapheim Dimitropoulos	 * We need to ensure that we've vacated the deferred metaslab trees.
268468c413Tim Haley	 */
269468c413Tim Haley	txg_wait_synced(dp, tx->tx_open_txg + TXG_DEFER_SIZE);
270fa9e406ahrens
271fa9e406ahrens	/*
2721ab7f2dmaybee	 * Wake all sync threads and wait for them to die.
273fa9e406ahrens	 */
274fa9e406ahrens	mutex_enter(&tx->tx_sync_lock);
275fa9e406ahrens
2761271e4bPrakash Surya	ASSERT3U(tx->tx_threads, ==, 2);
277fa9e406ahrens
278fa9e406ahrens	tx->tx_exiting = 1;
279fa9e406ahrens
280fa9e406ahrens	cv_broadcast(&tx->tx_quiesce_more_cv);
281fa9e406ahrens	cv_broadcast(&tx->tx_quiesce_done_cv);
282fa9e406ahrens	cv_broadcast(&tx->tx_sync_more_cv);
283fa9e406ahrens
284fa9e406ahrens	while (tx->tx_threads != 0)
285fa9e406ahrens		cv_wait(&tx->tx_exit_cv, &tx->tx_sync_lock);
286fa9e406ahrens
287fa9e406ahrens	tx->tx_exiting = 0;
288fa9e406ahrens
289fa9e406ahrens	mutex_exit(&tx->tx_sync_lock);
290fa9e406ahrens}
291fa9e406ahrens
292fa9e406ahrensuint64_t
293fa9e406ahrenstxg_hold_open(dsl_pool_t *dp, txg_handle_t *th)
294fa9e406ahrens{
295fa9e406ahrens	tx_state_t *tx = &dp->dp_tx;
296fa9e406ahrens	tx_cpu_t *tc = &tx->tx_cpu[CPU_SEQID];
297fa9e406ahrens	uint64_t txg;
298fa9e406ahrens
2994a92375George Wilson	mutex_enter(&tc->tc_open_lock);
300fa9e406ahrens	txg = tx->tx_open_txg;
3014a92375George Wilson
3024a92375George Wilson	mutex_enter(&tc->tc_lock);
303fa9e406ahrens	tc->tc_count[txg & TXG_MASK]++;
3044a92375George Wilson	mutex_exit(&tc->tc_lock);
305fa9e406ahrens
306fa9e406ahrens	th->th_cpu = tc;
307fa9e406ahrens	th->th_txg = txg;
308fa9e406ahrens
309fa9e406ahrens	return (txg);
310fa9e406ahrens}
311fa9e406ahrens
312fa9e406ahrensvoid
313fa9e406ahrenstxg_rele_to_quiesce(txg_handle_t *th)
314fa9e406ahrens{
315fa9e406ahrens	tx_cpu_t *tc = th->th_cpu;
316fa9e406ahrens
3174a92375George Wilson	ASSERT(!MUTEX_HELD(&tc->tc_lock));
3184a92375George Wilson	mutex_exit(&tc->tc_open_lock);
319fa9e406ahrens}
320fa9e406ahrens
321fa9e406ahrensvoid
322d20e665Ricardo M. Correiatxg_register_callbacks(txg_handle_t *th, list_t *tx_callbacks)
323d20e665Ricardo M. Correia{
324d20e665Ricardo M. Correia	tx_cpu_t *tc = th->th_cpu;
325d20e665Ricardo M. Correia	int g = th->th_txg & TXG_MASK;
326d20e665Ricardo M. Correia
327d20e665Ricardo M. Correia	mutex_enter(&tc->tc_lock);
328d20e665Ricardo M. Correia	list_move_tail(&tc->tc_callbacks[g], tx_callbacks);
329d20e665Ricardo M. Correia	mutex_exit(&tc->tc_lock);
330d20e665Ricardo M. Correia}
331d20e665Ricardo M. Correia
332d20e665Ricardo M. Correiavoid
333fa9e406ahrenstxg_rele_to_sync(txg_handle_t *th)
334fa9e406ahrens{
335fa9e406ahrens	tx_cpu_t *tc = th->th_cpu;
336fa9e406ahrens	int g = th->th_txg & TXG_MASK;
337fa9e406ahrens
338fa9e406ahrens	mutex_enter(&tc->tc_lock);
339fa9e406ahrens	ASSERT(tc->tc_count[g] != 0);
340fa9e406ahrens	if (--tc->tc_count[g] == 0)
341fa9e406ahrens		cv_broadcast(&tc->tc_cv[g]);
342fa9e406ahrens	mutex_exit(&tc->tc_lock);
343fa9e406ahrens
344fa9e406ahrens	th->th_cpu = NULL;	/* defensive */
345fa9e406ahrens}
346fa9e406ahrens
3473e30c24Will Andrews/*
3483e30c24Will Andrews * Blocks until all transactions in the group are committed.
3493e30c24Will Andrews *
3503e30c24Will Andrews * On return, the transaction group has reached a stable state in which it can
3513e30c24Will Andrews * then be passed off to the syncing context.
3523e30c24Will Andrews */
353fa9e406ahrensstatic void
354fa9e406ahrenstxg_quiesce(dsl_pool_t *dp, uint64_t txg)
355fa9e406ahrens{
356fa9e406ahrens	tx_state_t *tx = &dp->dp_tx;
357fa9e406ahrens	int g = txg & TXG_MASK;
358fa9e406ahrens	int c;
359fa9e406ahrens
360fa9e406ahrens	/*
3614a92375George Wilson	 * Grab all tc_open_locks so nobody else can get into this txg.
362fa9e406ahrens	 */
363fa9e406ahrens	for (c = 0; c < max_ncpus; c++)
3644a92375George Wilson		mutex_enter(&tx->tx_cpu[c].tc_open_lock);
365fa9e406ahrens
366fa9e406ahrens	ASSERT(txg == tx->tx_open_txg);
367fa9e406ahrens	tx->tx_open_txg++;
36869962b5Matthew Ahrens	tx->tx_open_time = gethrtime();
369fa9e406ahrens
3700689f76Adam Leventhal	DTRACE_PROBE2(txg__quiescing, dsl_pool_t *, dp, uint64_t, txg);
3710689f76Adam Leventhal	DTRACE_PROBE2(txg__opened, dsl_pool_t *, dp, uint64_t, tx->tx_open_txg);
3720689f76Adam Leventhal
373fa9e406ahrens	/*
374fa9e406ahrens	 * Now that we've incremented tx_open_txg, we can let threads
375fa9e406ahrens	 * enter the next transaction group.
376fa9e406ahrens	 */
377fa9e406ahrens	for (c = 0; c < max_ncpus; c++)
3784a92375George Wilson		mutex_exit(&tx->tx_cpu[c].tc_open_lock);
379fa9e406ahrens
380fa9e406ahrens	/*
381fa9e406ahrens	 * Quiesce the transaction group by waiting for everyone to txg_exit().
382fa9e406ahrens	 */
383fa9e406ahrens	for (c = 0; c < max_ncpus; c++) {
384fa9e406ahrens		tx_cpu_t *tc = &tx->tx_cpu[c];
385fa9e406ahrens		mutex_enter(&tc->tc_lock);
386fa9e406ahrens		while (tc->tc_count[g] != 0)
387fa9e406ahrens			cv_wait(&tc->tc_cv[g], &tc->tc_lock);
388fa9e406ahrens		mutex_exit(&tc->tc_lock);
389fa9e406ahrens	}
390fa9e406ahrens}
391fa9e406ahrens
392fa9e406ahrensstatic void
393d20e665Ricardo M. Correiatxg_do_callbacks(list_t *cb_list)
394d20e665Ricardo M. Correia{
395d20e665Ricardo M. Correia	dmu_tx_do_callbacks(cb_list, 0);
396d20e665Ricardo M. Correia
397d20e665Ricardo M. Correia	list_destroy(cb_list);
398d20e665Ricardo M. Correia
399d20e665Ricardo M. Correia	kmem_free(cb_list, sizeof (list_t));
400d20e665Ricardo M. Correia}
401d20e665Ricardo M. Correia
402d20e665Ricardo M. Correia/*
403d20e665Ricardo M. Correia * Dispatch the commit callbacks registered on this txg to worker threads.
4043e30c24Will Andrews *
4053e30c24Will Andrews * If no callbacks are registered for a given TXG, nothing happens.
4063e30c24Will Andrews * This function creates a taskq for the associated pool, if needed.
407d20e665Ricardo M. Correia */
408d20e665Ricardo M. Correiastatic void
409d20e665Ricardo M. Correiatxg_dispatch_callbacks(dsl_pool_t *dp, uint64_t txg)
410d20e665Ricardo M. Correia{
411d20e665Ricardo M. Correia	int c;
412d20e665Ricardo M. Correia	tx_state_t *tx = &dp->dp_tx;
413d20e665Ricardo M. Correia	list_t *cb_list;
414d20e665Ricardo M. Correia
415d20e665Ricardo M. Correia	for (c = 0; c < max_ncpus; c++) {
416d20e665Ricardo M. Correia		tx_cpu_t *tc = &tx->tx_cpu[c];
4173e30c24Will Andrews		/*
4183e30c24Will Andrews		 * No need to lock tx_cpu_t at this point, since this can
4193e30c24Will Andrews		 * only be called once a txg has been synced.
4203e30c24Will Andrews		 */
421d20e665Ricardo M. Correia
422d20e665Ricardo M. Correia		int g = txg & TXG_MASK;
423d20e665Ricardo M. Correia
424d20e665Ricardo M. Correia		if (list_is_empty(&tc->tc_callbacks[g]))
425d20e665Ricardo M. Correia			continue;
426d20e665Ricardo M. Correia
427d20e665Ricardo M. Correia		if (tx->tx_commit_cb_taskq == NULL) {
428d20e665Ricardo M. Correia			/*
429d20e665Ricardo M. Correia			 * Commit callback taskq hasn't been created yet.
430d20e665Ricardo M. Correia			 */
431d20e665Ricardo M. Correia			tx->tx_commit_cb_taskq = taskq_create("tx_commit_cb",
432d20e665Ricardo M. Correia			    max_ncpus, minclsyspri, max_ncpus, max_ncpus * 2,
433d20e665Ricardo M. Correia			    TASKQ_PREPOPULATE);
434d20e665Ricardo M. Correia		}
435d20e665Ricardo M. Correia
436d20e665Ricardo M. Correia		cb_list = kmem_alloc(sizeof (list_t), KM_SLEEP);
437d20e665Ricardo M. Correia		list_create(cb_list, sizeof (dmu_tx_callback_t),
438d20e665Ricardo M. Correia		    offsetof(dmu_tx_callback_t, dcb_node));
439d20e665Ricardo M. Correia
440b3d9f2eWill Andrews		list_move_tail(cb_list, &tc->tc_callbacks[g]);
441d20e665Ricardo M. Correia
442d20e665Ricardo M. Correia		(void) taskq_dispatch(tx->tx_commit_cb_taskq, (task_func_t *)
443d20e665Ricardo M. Correia		    txg_do_callbacks, cb_list, TQ_SLEEP);
444d20e665Ricardo M. Correia	}
445d20e665Ricardo M. Correia}
446d20e665Ricardo M. Correia
447fa41d87Serapheim Dimitropoulosstatic boolean_t
448fa41d87Serapheim Dimitropoulostxg_is_syncing(dsl_pool_t *dp)
449fa41d87Serapheim Dimitropoulos{
450fa41d87Serapheim Dimitropoulos	tx_state_t *tx = &dp->dp_tx;
451fa41d87Serapheim Dimitropoulos	ASSERT(MUTEX_HELD(&tx->tx_sync_lock));
452fa41d87Serapheim Dimitropoulos	return (tx->tx_syncing_txg != 0);
453fa41d87Serapheim Dimitropoulos}
454fa41d87Serapheim Dimitropoulos
455fa41d87Serapheim Dimitropoulosstatic boolean_t
456fa41d87Serapheim Dimitropoulostxg_is_quiescing(dsl_pool_t *dp)
457fa41d87Serapheim Dimitropoulos{
458fa41d87Serapheim Dimitropoulos	tx_state_t *tx = &dp->dp_tx;
459fa41d87Serapheim Dimitropoulos	ASSERT(MUTEX_HELD(&tx->tx_sync_lock));
460fa41d87Serapheim Dimitropoulos	return (tx->tx_quiescing_txg != 0);
461fa41d87Serapheim Dimitropoulos}
462fa41d87Serapheim Dimitropoulos
463fa41d87Serapheim Dimitropoulosstatic boolean_t
464fa41d87Serapheim Dimitropoulostxg_has_quiesced_to_sync(dsl_pool_t *dp)
465fa41d87Serapheim Dimitropoulos{
466fa41d87Serapheim Dimitropoulos	tx_state_t *tx = &dp->dp_tx;
467fa41d87Serapheim Dimitropoulos	ASSERT(MUTEX_HELD(&tx->tx_sync_lock));
468fa41d87Serapheim Dimitropoulos	return (tx->tx_quiesced_txg != 0);
469fa41d87Serapheim Dimitropoulos}
470fa41d87Serapheim Dimitropoulos
471d20e665Ricardo M. Correiastatic void
4723f7978dAlan Somerstxg_sync_thread(void *arg)
473fa9e406ahrens{
4743f7978dAlan Somers	dsl_pool_t *dp = arg;
475b16da2eGeorge Wilson	spa_t *spa = dp->dp_spa;
476fa9e406ahrens	tx_state_t *tx = &dp->dp_tx;
477fa9e406ahrens	callb_cpr_t cpr;
47805715f9Mark Maybee	uint64_t start, delta;
479fa9e406ahrens
480fa9e406ahrens	txg_thread_enter(tx, &cpr);
481fa9e406ahrens
4821ab7f2dmaybee	start = delta = 0;
483fa9e406ahrens	for (;;) {
48469962b5Matthew Ahrens		uint64_t timeout = zfs_txg_timeout * hz;
48569962b5Matthew Ahrens		uint64_t timer;
48605715f9Mark Maybee		uint64_t txg;
4877928f4bMatthew Ahrens		uint64_t dirty_min_bytes =
4887928f4bMatthew Ahrens		    zfs_dirty_data_max * zfs_dirty_data_sync_pct / 100;
489fa9e406ahrens
490fa9e406ahrens		/*
4913f9d6adLin Ling		 * We sync when we're scanning, there's someone waiting
49288b7b0fMatthew Ahrens		 * on us, or the quiesce thread has handed off a txg to
49388b7b0fMatthew Ahrens		 * us, or we have reached our timeout.
494fa9e406ahrens		 */
4951ab7f2dmaybee		timer = (delta >= timeout ? 0 : timeout - delta);
496cde58dbMatthew Ahrens		while (!dsl_scan_active(dp->dp_scan) &&
49788b7b0fMatthew Ahrens		    !tx->tx_exiting && timer > 0 &&
498fa9e406ahrens		    tx->tx_synced_txg >= tx->tx_sync_txg_waiting &&
499fa41d87Serapheim Dimitropoulos		    !txg_has_quiesced_to_sync(dp) &&
5007928f4bMatthew Ahrens		    dp->dp_dirty_total < dirty_min_bytes) {
501fa9e406ahrens			dprintf("waiting; tx_synced=%llu waiting=%llu dp=%p\n",
502fa9e406ahrens			    tx->tx_synced_txg, tx->tx_sync_txg_waiting, dp);
5031ab7f2dmaybee			txg_thread_wait(tx, &cpr, &tx->tx_sync_more_cv, timer);
504d3d5073Rafael Vanoni			delta = ddi_get_lbolt() - start;
5051ab7f2dmaybee			timer = (delta > timeout ? 0 : timeout - delta);
506fa9e406ahrens		}
507fa9e406ahrens
508fa9e406ahrens		/*
509fa9e406ahrens		 * Wait until the quiesce thread hands off a txg to us,
510fa9e406ahrens		 * prompting it to do so if necessary.
511fa9e406ahrens		 */
512fa41d87Serapheim Dimitropoulos		while (!tx->tx_exiting && !txg_has_quiesced_to_sync(dp)) {
513fa9e406ahrens			if (tx->tx_quiesce_txg_waiting < tx->tx_open_txg+1)
514fa9e406ahrens				tx->tx_quiesce_txg_waiting = tx->tx_open_txg+1;
515fa9e406ahrens			cv_broadcast(&tx->tx_quiesce_more_cv);
516fa9e406ahrens			txg_thread_wait(tx, &cpr, &tx->tx_quiesce_done_cv, 0);
517fa9e406ahrens		}
518fa9e406ahrens
519fa9e406ahrens		if (tx->tx_exiting)
520fa9e406ahrens			txg_thread_exit(tx, &cpr, &tx->tx_sync_thread);
521fa9e406ahrens
522fa9e406ahrens		/*
523fa9e406ahrens		 * Consume the quiesced txg which has been handed off to
524fa9e406ahrens		 * us.  This may cause the quiescing thread to now be
525fa9e406ahrens		 * able to quiesce another txg, so we must signal it.
526fa9e406ahrens		 */
527fa41d87Serapheim Dimitropoulos		ASSERT(tx->tx_quiesced_txg != 0);
528fa9e406ahrens		txg = tx->tx_quiesced_txg;
529fa9e406ahrens		tx->tx_quiesced_txg = 0;
530fa9e406ahrens		tx->tx_syncing_txg = txg;
5310689f76Adam Leventhal		DTRACE_PROBE2(txg__syncing, dsl_pool_t *, dp, uint64_t, txg);
532fa9e406ahrens		cv_broadcast(&tx->tx_quiesce_more_cv);
533fa9e406ahrens
534fa9e406ahrens		dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n",
5358f38d41ek		    txg, tx->tx_quiesce_txg_waiting, tx->tx_sync_txg_waiting);
536fa9e406ahrens		mutex_exit(&tx->tx_sync_lock);
53705715f9Mark Maybee
538d3d5073Rafael Vanoni		start = ddi_get_lbolt();
539b16da2eGeorge Wilson		spa_sync(spa, txg);
540d3d5073Rafael Vanoni		delta = ddi_get_lbolt() - start;
5411ab7f2dmaybee
542fa9e406ahrens		mutex_enter(&tx->tx_sync_lock);
543fa9e406ahrens		tx->tx_synced_txg = txg;
544fa9e406ahrens		tx->tx_syncing_txg = 0;
5450689f76Adam Leventhal		DTRACE_PROBE2(txg__synced, dsl_pool_t *, dp, uint64_t, txg);
546fa9e406ahrens		cv_broadcast(&tx->tx_sync_done_cv);
547d20e665Ricardo M. Correia
548d20e665Ricardo M. Correia		/*
549d20e665Ricardo M. Correia		 * Dispatch commit callbacks to worker threads.
550d20e665Ricardo M. Correia		 */
551d20e665Ricardo M. Correia		txg_dispatch_callbacks(dp, txg);
552fa9e406ahrens	}
553fa9e406ahrens}
554fa9e406ahrens
555fa9e406ahrensstatic void
5563f7978dAlan Somerstxg_quiesce_thread(void *arg)
557fa9e406ahrens{
5583f7978dAlan Somers	dsl_pool_t *dp = arg;
559fa9e406ahrens	tx_state_t *tx = &dp->dp_tx;
560fa9e406ahrens	callb_cpr_t cpr;
561fa9e406ahrens
562fa9e406ahrens	txg_thread_enter(tx, &cpr);
563fa9e406ahrens
564fa9e406ahrens	for (;;) {
565fa9e406ahrens		uint64_t txg;
566fa9e406ahrens
567fa9e406ahrens		/*
568fa9e406ahrens		 * We quiesce when there's someone waiting on us.
569fa9e406ahrens		 * However, we can only have one txg in "quiescing" or
570fa9e406ahrens		 * "quiesced, waiting to sync" state.  So we wait until
571fa9e406ahrens		 * the "quiesced, waiting to sync" txg has been consumed
572fa9e406ahrens		 * by the sync thread.
573fa9e406ahrens		 */
574fa9e406ahrens		while (!tx->tx_exiting &&
575fa9e406ahrens		    (tx->tx_open_txg >= tx->tx_quiesce_txg_waiting ||
576fa41d87Serapheim Dimitropoulos		    txg_has_quiesced_to_sync(dp)))
577fa9e406ahrens			txg_thread_wait(tx, &cpr, &tx->tx_quiesce_more_cv, 0);
578fa9e406ahrens
579fa9e406ahrens		if (tx->tx_exiting)
580fa9e406ahrens			txg_thread_exit(tx, &cpr, &tx->tx_quiesce_thread);
581fa9e406ahrens
582fa9e406ahrens		txg = tx->tx_open_txg;
583fa9e406ahrens		dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n",
584fa9e406ahrens		    txg, tx->tx_quiesce_txg_waiting,
585fa9e406ahrens		    tx->tx_sync_txg_waiting);
586fa41d87Serapheim Dimitropoulos		tx->tx_quiescing_txg = txg;
587fa41d87Serapheim Dimitropoulos
588fa9e406ahrens		mutex_exit(&tx->tx_sync_lock);
589fa9e406ahrens		txg_quiesce(dp, txg);
590fa9e406ahrens		mutex_enter(&tx->tx_sync_lock);
591fa9e406ahrens
592fa9e406ahrens		/*
593fa9e406ahrens		 * Hand this txg off to the sync thread.
594fa9e406ahrens		 */
595fa9e406ahrens		dprintf("quiesce done, handing off txg %llu\n", txg);
596fa41d87Serapheim Dimitropoulos		tx->tx_quiescing_txg = 0;
597fa9e406ahrens		tx->tx_quiesced_txg = txg;
5980689f76Adam Leventhal		DTRACE_PROBE2(txg__quiesced, dsl_pool_t *, dp, uint64_t, txg);
599fa9e406ahrens		cv_broadcast(&tx->tx_sync_more_cv);
600fa9e406ahrens		cv_broadcast(&tx->tx_quiesce_done_cv);
601fa9e406ahrens	}
602fa9e406ahrens}
603fa9e406ahrens
6041ab7f2dmaybee/*
6050689f76Adam Leventhal * Delay this thread by delay nanoseconds if we are still in the open
606f717074Will Andrews * transaction group and there is already a waiting txg quiescing or quiesced.
607f717074Will Andrews * Abort the delay if this txg stalls or enters the quiescing state.
6081ab7f2dmaybee */
6091ab7f2dmaybeevoid
6100689f76Adam Leventhaltxg_delay(dsl_pool_t *dp, uint64_t txg, hrtime_t delay, hrtime_t resolution)
6111ab7f2dmaybee{
6121ab7f2dmaybee	tx_state_t *tx = &dp->dp_tx;
6130689f76Adam Leventhal	hrtime_t start = gethrtime();
6141ab7f2dmaybee
615f717074Will Andrews	/* don't delay if this txg could transition to quiescing immediately */
6161ab7f2dmaybee	if (tx->tx_open_txg > txg ||
6171ab7f2dmaybee	    tx->tx_syncing_txg == txg-1 || tx->tx_synced_txg == txg-1)
6181ab7f2dmaybee		return;
6191ab7f2dmaybee
6201ab7f2dmaybee	mutex_enter(&tx->tx_sync_lock);
6211ab7f2dmaybee	if (tx->tx_open_txg > txg || tx->tx_synced_txg == txg-1) {
6221ab7f2dmaybee		mutex_exit(&tx->tx_sync_lock);
6231ab7f2dmaybee		return;
6241ab7f2dmaybee	}
6251ab7f2dmaybee
6260689f76Adam Leventhal	while (gethrtime() - start < delay &&
6270689f76Adam Leventhal	    tx->tx_syncing_txg < txg-1 && !txg_stalled(dp)) {
6280689f76Adam Leventhal		(void) cv_timedwait_hires(&tx->tx_quiesce_more_cv,
6290689f76Adam Leventhal		    &tx->tx_sync_lock, delay, resolution, 0);
6300689f76Adam Leventhal	}
6311ab7f2dmaybee
6321ab7f2dmaybee	mutex_exit(&tx->tx_sync_lock);
6331ab7f2dmaybee}
6341ab7f2dmaybee
635d0cb1fbDon Bradystatic boolean_t
636d0cb1fbDon Bradytxg_wait_synced_impl(dsl_pool_t *dp, uint64_t txg, boolean_t wait_sig)
637fa9e406ahrens{
638fa9e406ahrens	tx_state_t *tx = &dp->dp_tx;
639fa9e406ahrens
6403b2aab1Matthew Ahrens	ASSERT(!dsl_pool_config_held(dp));
6413b2aab1Matthew Ahrens
642fa9e406ahrens	mutex_enter(&tx->tx_sync_lock);
6431271e4bPrakash Surya	ASSERT3U(tx->tx_threads, ==, 2);
644fa9e406ahrens	if (txg == 0)
645b24ab67Jeff Bonwick		txg = tx->tx_open_txg + TXG_DEFER_SIZE;
646fa9e406ahrens	if (tx->tx_sync_txg_waiting < txg)
647fa9e406ahrens		tx->tx_sync_txg_waiting = txg;
648fa9e406ahrens	dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n",
649fa9e406ahrens	    txg, tx->tx_quiesce_txg_waiting, tx->tx_sync_txg_waiting);
650fa9e406ahrens	while (tx->tx_synced_txg < txg) {
651fa9e406ahrens		dprintf("broadcasting sync more "
652fa9e406ahrens		    "tx_synced=%llu waiting=%llu dp=%p\n",
653fa9e406ahrens		    tx->tx_synced_txg, tx->tx_sync_txg_waiting, dp);
654fa9e406ahrens		cv_broadcast(&tx->tx_sync_more_cv);
655d0cb1fbDon Brady		if (wait_sig) {
656d0cb1fbDon Brady			/*
657d0cb1fbDon Brady			 * Condition wait here but stop if the thread receives a
658d0cb1fbDon Brady			 * signal. The caller may call txg_wait_synced*() again
659d0cb1fbDon Brady			 * to resume waiting for this txg.
660d0cb1fbDon Brady			 */
661d0cb1fbDon Brady			if (cv_wait_sig(&tx->tx_sync_done_cv,
662d0cb1fbDon Brady			    &tx->tx_sync_lock) == 0) {
663d0cb1fbDon Brady				mutex_exit(&tx->tx_sync_lock);
664d0cb1fbDon Brady				return (B_TRUE);
665d0cb1fbDon Brady			}
666d0cb1fbDon Brady		} else {
667d0cb1fbDon Brady			cv_wait(&tx->tx_sync_done_cv, &tx->tx_sync_lock);
668d0cb1fbDon Brady		}
669fa9e406ahrens	}
670fa9e406ahrens	mutex_exit(&tx->tx_sync_lock);
671d0cb1fbDon Brady	return (B_FALSE);
672d0cb1fbDon Brady}
673d0cb1fbDon Brady
674d0cb1fbDon Bradyvoid
675d0cb1fbDon Bradytxg_wait_synced(dsl_pool_t *dp, uint64_t txg)
676d0cb1fbDon Brady{
677d0cb1fbDon Brady	VERIFY0(txg_wait_synced_impl(dp, txg, B_FALSE));
678d0cb1fbDon Brady}
679d0cb1fbDon Brady
680d0cb1fbDon Brady/*
681d0cb1fbDon Brady * Similar to a txg_wait_synced but it can be interrupted from a signal.
682d0cb1fbDon Brady * Returns B_TRUE if the thread was signaled while waiting.
683d0cb1fbDon Brady */
684d0cb1fbDon Bradyboolean_t
685d0cb1fbDon Bradytxg_wait_synced_sig(dsl_pool_t *dp, uint64_t txg)
686d0cb1fbDon Brady{
687d0cb1fbDon Brady	return (txg_wait_synced_impl(dp, txg, B_TRUE));
688fa9e406ahrens}
689fa9e406ahrens
690084fd14Brian Behlendorf/*
691084fd14Brian Behlendorf * Wait for the specified open transaction group.  Set should_quiesce
692084fd14Brian Behlendorf * when the current open txg should be quiesced immediately.
693084fd14Brian Behlendorf */
694fa9e406ahrensvoid
695084fd14Brian Behlendorftxg_wait_open(dsl_pool_t *dp, uint64_t txg, boolean_t should_quiesce)
696fa9e406ahrens{
697fa9e406ahrens	tx_state_t *tx = &dp->dp_tx;
698fa9e406ahrens
6993b2aab1Matthew Ahrens	ASSERT(!dsl_pool_config_held(dp));
7003b2aab1Matthew Ahrens
701fa9e406ahrens	mutex_enter(&tx->tx_sync_lock);
7021271e4bPrakash Surya	ASSERT3U(tx->tx_threads, ==, 2);
703fa9e406ahrens	if (txg == 0)
704fa9e406ahrens		txg = tx->tx_open_txg + 1;
705084fd14Brian Behlendorf	if (tx->tx_quiesce_txg_waiting < txg && should_quiesce)
706fa9e406ahrens		tx->tx_quiesce_txg_waiting = txg;
707fa9e406ahrens	dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n",
708fa9e406ahrens	    txg, tx->tx_quiesce_txg_waiting, tx->tx_sync_txg_waiting);
709fa9e406ahrens	while (tx->tx_open_txg < txg) {
710fa9e406ahrens		cv_broadcast(&tx->tx_quiesce_more_cv);
711fa9e406ahrens		cv_wait(&tx->tx_quiesce_done_cv, &tx->tx_sync_lock);
712fa9e406ahrens	}
713fa9e406ahrens	mutex_exit(&tx->tx_sync_lock);
714fa9e406ahrens}
715fa9e406ahrens
71669962b5Matthew Ahrens/*
71769962b5Matthew Ahrens * If there isn't a txg syncing or in the pipeline, push another txg through
71869962b5Matthew Ahrens * the pipeline by queiscing the open txg.
71969962b5Matthew Ahrens */
72069962b5Matthew Ahrensvoid
72169962b5Matthew Ahrenstxg_kick(dsl_pool_t *dp)
72269962b5Matthew Ahrens{
72369962b5Matthew Ahrens	tx_state_t *tx = &dp->dp_tx;
72469962b5Matthew Ahrens
72569962b5Matthew Ahrens	ASSERT(!dsl_pool_config_held(dp));
72669962b5Matthew Ahrens
72769962b5Matthew Ahrens	mutex_enter(&tx->tx_sync_lock);
728fa41d87Serapheim Dimitropoulos	if (!txg_is_syncing(dp) &&
729fa41d87Serapheim Dimitropoulos	    !txg_is_quiescing(dp) &&
73069962b5Matthew Ahrens	    tx->tx_quiesce_txg_waiting <= tx->tx_open_txg &&
73169962b5Matthew Ahrens	    tx->tx_sync_txg_waiting <= tx->tx_synced_txg &&
73269962b5Matthew Ahrens	    tx->tx_quiesced_txg <= tx->tx_synced_txg) {
73369962b5Matthew Ahrens		tx->tx_quiesce_txg_waiting = tx->tx_open_txg + 1;
73469962b5Matthew Ahrens		cv_broadcast(&tx->tx_quiesce_more_cv);
73569962b5Matthew Ahrens	}
73669962b5Matthew Ahrens	mutex_exit(&tx->tx_sync_lock);
73769962b5Matthew Ahrens}
73869962b5Matthew Ahrens
739088f389ahrensboolean_t
740fa9e406ahrenstxg_stalled(dsl_pool_t *dp)
741fa9e406ahrens{
742fa9e406ahrens	tx_state_t *tx = &dp->dp_tx;
743fa9e406ahrens	return (tx->tx_quiesce_txg_waiting > tx->tx_open_txg);
744fa9e406ahrens}
745fa9e406ahrens
746088f389ahrensboolean_t
747088f389ahrenstxg_sync_waiting(dsl_pool_t *dp)
748088f389ahrens{
749088f389ahrens	tx_state_t *tx = &dp->dp_tx;
750088f389ahrens
751088f389ahrens	return (tx->tx_syncing_txg <= tx->tx_sync_txg_waiting ||
752088f389ahrens	    tx->tx_quiesced_txg != 0);
753088f389ahrens}
754088f389ahrens
755fa9e406ahrens/*
756b7b2590Matthew Ahrens * Verify that this txg is active (open, quiescing, syncing).  Non-active
757b7b2590Matthew Ahrens * txg's should not be manipulated.
758b7b2590Matthew Ahrens */
759b7b2590Matthew Ahrensvoid
760b7b2590Matthew Ahrenstxg_verify(spa_t *spa, uint64_t txg)
761b7b2590Matthew Ahrens{
762b7b2590Matthew Ahrens	dsl_pool_t *dp = spa_get_dsl(spa);
763b7b2590Matthew Ahrens	if (txg <= TXG_INITIAL || txg == ZILTEST_TXG)
764b7b2590Matthew Ahrens		return;
765b7b2590Matthew Ahrens	ASSERT3U(txg, <=, dp->dp_tx.tx_open_txg);
766b7b2590Matthew Ahrens	ASSERT3U(txg, >=, dp->dp_tx.tx_synced_txg);
767b7b2590Matthew Ahrens	ASSERT3U(txg, >=, dp->dp_tx.tx_open_txg - TXG_CONCURRENT_STATES);
768b7b2590Matthew Ahrens}
769b7b2590Matthew Ahrens
770b7b2590Matthew Ahrens/*
771fa9e406ahrens * Per-txg object lists.
772fa9e406ahrens */
773fa9e406ahrensvoid
774b7b2590Matthew Ahrenstxg_list_create(txg_list_t *tl, spa_t *spa, size_t offset)
775fa9e406ahrens{
776fa9e406ahrens	int t;
777fa9e406ahrens
778fa9e406ahrens	mutex_init(&tl->tl_lock, NULL, MUTEX_DEFAULT, NULL);
779fa9e406ahrens
780fa9e406ahrens	tl->tl_offset = offset;
781b7b2590Matthew Ahrens	tl->tl_spa = spa;
782fa9e406ahrens
783fa9e406ahrens	for (t = 0; t < TXG_SIZE; t++)
784fa9e406ahrens		tl->tl_head[t] = NULL;
785fa9e406ahrens}
786fa9e406ahrens
787fa9e406ahrensvoid
788fa9e406ahrenstxg_list_destroy(txg_list_t *tl)
789fa9e406ahrens{
790fa9e406ahrens	int t;
791fa9e406ahrens
792fa9e406ahrens	for (t = 0; t < TXG_SIZE; t++)
793fa9e406ahrens		ASSERT(txg_list_empty(tl, t));
794fa9e406ahrens
795fa9e406ahrens	mutex_destroy(&tl->tl_lock);
796fa9e406ahrens}
797fa9e406ahrens
798ce636f8Matthew Ahrensboolean_t
799fa9e406ahrenstxg_list_empty(txg_list_t *tl, uint64_t txg)
800fa9e406ahrens{
801b7b2590Matthew Ahrens	txg_verify(tl->tl_spa, txg);
802fa9e406ahrens	return (tl->tl_head[txg & TXG_MASK] == NULL);
803fa9e406ahrens}
804fa9e406ahrens
805fa9e406ahrens/*
80673527f4Alex Reece * Returns true if all txg lists are empty.
80773527f4Alex Reece *
808b7b2590Matthew Ahrens * Warning: this is inherently racy (an item could be added immediately
809b7b2590Matthew Ahrens * after this function returns). We don't bother with the lock because
810b7b2590Matthew Ahrens * it wouldn't change the semantics.
81173527f4Alex Reece */
81273527f4Alex Reeceboolean_t
81373527f4Alex Reecetxg_all_lists_empty(txg_list_t *tl)
81473527f4Alex Reece{
81573527f4Alex Reece	for (int i = 0; i < TXG_SIZE; i++) {
81673527f4Alex Reece		if (!txg_list_empty(tl, i)) {
81773527f4Alex Reece			return (B_FALSE);
81873527f4Alex Reece		}
81973527f4Alex Reece	}
82073527f4Alex Reece	return (B_TRUE);
82173527f4Alex Reece}
82273527f4Alex Reece
82373527f4Alex Reece/*
8243b2aab1Matthew Ahrens * Add an entry to the list (unless it's already on the list).
8253b2aab1Matthew Ahrens * Returns B_TRUE if it was actually added.
826fa9e406ahrens */
8273b2aab1Matthew Ahrensboolean_t
828fa9e406ahrenstxg_list_add(txg_list_t *tl, void *p, uint64_t txg)
829fa9e406ahrens{
830fa9e406ahrens	int t = txg & TXG_MASK;
831fa9e406ahrens	txg_node_t *tn = (txg_node_t *)((char *)p + tl->tl_offset);
8323b2aab1Matthew Ahrens	boolean_t add;
833fa9e406ahrens
834b7b2590Matthew Ahrens	txg_verify(tl->tl_spa, txg);
835fa9e406ahrens	mutex_enter(&tl->tl_lock);
8363b2aab1Matthew Ahrens	add = (tn->tn_member[t] == 0);
8373b2aab1Matthew Ahrens	if (add) {
838fa9e406ahrens		tn->tn_member[t] = 1;
839fa9e406ahrens		tn->tn_next[t] = tl->tl_head[t];
840fa9e406ahrens		tl->tl_head[t] = tn;
841fa9e406ahrens	}
842fa9e406ahrens	mutex_exit(&tl->tl_lock);
843fa9e406ahrens
8443b2aab1Matthew Ahrens	return (add);
845fa9e406ahrens}
846fa9e406ahrens
847fa9e406ahrens/*
8483b2aab1Matthew Ahrens * Add an entry to the end of the list, unless it's already on the list.
8493b2aab1Matthew Ahrens * (walks list to find end)
8503b2aab1Matthew Ahrens * Returns B_TRUE if it was actually added.
851495807dMatthew Ahrens */
8523b2aab1Matthew Ahrensboolean_t
853495807dMatthew Ahrenstxg_list_add_tail(txg_list_t *tl, void *p, uint64_t txg)
854495807dMatthew Ahrens{
855495807dMatthew Ahrens	int t = txg & TXG_MASK;
856495807dMatthew Ahrens	txg_node_t *tn = (txg_node_t *)((char *)p + tl->tl_offset);
8573b2aab1Matthew Ahrens	boolean_t add;
858495807dMatthew Ahrens
859b7b2590Matthew Ahrens	txg_verify(tl->tl_spa, txg);
860495807dMatthew Ahrens	mutex_enter(&tl->tl_lock);
8613b2aab1Matthew Ahrens	add = (tn->tn_member[t] == 0);
8623b2aab1Matthew Ahrens	if (add) {
863495807dMatthew Ahrens		txg_node_t **tp;
864495807dMatthew Ahrens
865495807dMatthew Ahrens		for (tp = &tl->tl_head[t]; *tp != NULL; tp = &(*tp)->tn_next[t])
866495807dMatthew Ahrens			continue;
867495807dMatthew Ahrens
868495807dMatthew Ahrens		tn->tn_member[t] = 1;
869495807dMatthew Ahrens		tn->tn_next[t] = NULL;
870495807dMatthew Ahrens		*tp = tn;
871495807dMatthew Ahrens	}
872495807dMatthew Ahrens	mutex_exit(&tl->tl_lock);
873495807dMatthew Ahrens
8743b2aab1Matthew Ahrens	return (add);
875495807dMatthew Ahrens}
876495807dMatthew Ahrens
877495807dMatthew Ahrens/*
878fa9e406ahrens * Remove the head of the list and return it.
879fa9e406ahrens */
880fa9e406ahrensvoid *
881fa9e406ahrenstxg_list_remove(txg_list_t *tl, uint64_t txg)
882fa9e406ahrens{
883fa9e406ahrens	int t = txg & TXG_MASK;
884fa9e406ahrens	txg_node_t *tn;
885fa9e406ahrens	void *p = NULL;
886fa9e406ahrens
887b7b2590Matthew Ahrens	txg_verify(tl->tl_spa, txg);
888fa9e406ahrens	mutex_enter(&tl->tl_lock);
889fa9e406ahrens	if ((tn = tl->tl_head[t]) != NULL) {
8905cabbc6Prashanth Sreenivasa		ASSERT(tn->tn_member[t]);
8915cabbc6Prashanth Sreenivasa		ASSERT(tn->tn_next[t] == NULL || tn->tn_next[t]->tn_member[t]);
892fa9e406ahrens		p = (char *)tn - tl->tl_offset;
893fa9e406ahrens		tl->tl_head[t] = tn->tn_next[t];
894fa9e406ahrens		tn->tn_next[t] = NULL;
895fa9e406ahrens		tn->tn_member[t] = 0;
896fa9e406ahrens	}
897fa9e406ahrens	mutex_exit(&tl->tl_lock);
898fa9e406ahrens
899fa9e406ahrens	return (p);
900fa9e406ahrens}
901fa9e406ahrens
902fa9e406ahrens/*
903fa9e406ahrens * Remove a specific item from the list and return it.
904fa9e406ahrens */
905fa9e406ahrensvoid *
906fa9e406ahrenstxg_list_remove_this(txg_list_t *tl, void *p, uint64_t txg)
907fa9e406ahrens{
908fa9e406ahrens	int t = txg & TXG_MASK;
909fa9e406ahrens	txg_node_t *tn, **tp;
910fa9e406ahrens
911b7b2590Matthew Ahrens	txg_verify(tl->tl_spa, txg);
912fa9e406ahrens	mutex_enter(&tl->tl_lock);
913fa9e406ahrens
914fa9e406ahrens	for (tp = &tl->tl_head[t]; (tn = *tp) != NULL; tp = &tn->tn_next[t]) {
915fa9e406ahrens		if ((char *)tn - tl->tl_offset == p) {
916fa9e406ahrens			*tp = tn->tn_next[t];
917fa9e406ahrens			tn->tn_next[t] = NULL;
918fa9e406ahrens			tn->tn_member[t] = 0;
919fa9e406ahrens			mutex_exit(&tl->tl_lock);
920fa9e406ahrens			return (p);
921fa9e406ahrens		}
922fa9e406ahrens	}
923fa9e406ahrens
924fa9e406ahrens	mutex_exit(&tl->tl_lock);
925fa9e406ahrens
926fa9e406ahrens	return (NULL);
927fa9e406ahrens}
928fa9e406ahrens
9293b2aab1Matthew Ahrensboolean_t
930fa9e406ahrenstxg_list_member(txg_list_t *tl, void *p, uint64_t txg)
931fa9e406ahrens{
932fa9e406ahrens	int t = txg & TXG_MASK;
933fa9e406ahrens	txg_node_t *tn = (txg_node_t *)((char *)p + tl->tl_offset);
934fa9e406ahrens
935b7b2590Matthew Ahrens	txg_verify(tl->tl_spa, txg);
9363b2aab1Matthew Ahrens	return (tn->tn_member[t] != 0);
937fa9e406ahrens}
938fa9e406ahrens
939fa9e406ahrens/*
940fa9e406ahrens * Walk a txg list -- only safe if you know it's not changing.
941fa9e406ahrens */
942fa9e406ahrensvoid *
943fa9e406ahrenstxg_list_head(txg_list_t *tl, uint64_t txg)
944fa9e406ahrens{
945fa9e406ahrens	int t = txg & TXG_MASK;
946fa9e406ahrens	txg_node_t *tn = tl->tl_head[t];
947fa9e406ahrens
948b7b2590Matthew Ahrens	txg_verify(tl->tl_spa, txg);
949fa9e406ahrens	return (tn == NULL ? NULL : (char *)tn - tl->tl_offset);
950fa9e406ahrens}
951fa9e406ahrens
952fa9e406ahrensvoid *
953fa9e406ahrenstxg_list_next(txg_list_t *tl, void *p, uint64_t txg)
954fa9e406ahrens{
955fa9e406ahrens	int t = txg & TXG_MASK;
956fa9e406ahrens	txg_node_t *tn = (txg_node_t *)((char *)p + tl->tl_offset);
957fa9e406ahrens
958b7b2590Matthew Ahrens	txg_verify(tl->tl_spa, txg);
959fa9e406ahrens	tn = tn->tn_next[t];
960fa9e406ahrens
961fa9e406ahrens	return (tn == NULL ? NULL : (char *)tn - tl->tl_offset);
962fa9e406ahrens}
963