1fa9e4066Sahrens /* 2fa9e4066Sahrens * CDDL HEADER START 3fa9e4066Sahrens * 4fa9e4066Sahrens * The contents of this file are subject to the terms of the 55ad82045Snd * Common Development and Distribution License (the "License"). 65ad82045Snd * You may not use this file except in compliance with the License. 7fa9e4066Sahrens * 8fa9e4066Sahrens * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9fa9e4066Sahrens * or http://www.opensolaris.org/os/licensing. 10fa9e4066Sahrens * See the License for the specific language governing permissions 11fa9e4066Sahrens * and limitations under the License. 12fa9e4066Sahrens * 13fa9e4066Sahrens * When distributing Covered Code, include this CDDL HEADER in each 14fa9e4066Sahrens * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15fa9e4066Sahrens * If applicable, add the following below this CDDL HEADER, with the 16fa9e4066Sahrens * fields enclosed by brackets "[]" replaced with your own identifying 17fa9e4066Sahrens * information: Portions Copyright [yyyy] [name of copyright owner] 18fa9e4066Sahrens * 19fa9e4066Sahrens * CDDL HEADER END 20fa9e4066Sahrens */ 21fa9e4066Sahrens /* 223f9d6ad7SLin Ling * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 23383e7c74SXin Li * Portions Copyright 2011 Martin Matuska 24b7b2590dSMatthew Ahrens * Copyright (c) 2012, 2017 by Delphix. All rights reserved. 25fa9e4066Sahrens */ 26fa9e4066Sahrens 27fa9e4066Sahrens #include <sys/zfs_context.h> 28fa9e4066Sahrens #include <sys/txg_impl.h> 29fa9e4066Sahrens #include <sys/dmu_impl.h> 30d20e665cSRicardo M. Correia #include <sys/dmu_tx.h> 31fa9e4066Sahrens #include <sys/dsl_pool.h> 323f9d6ad7SLin Ling #include <sys/dsl_scan.h> 33b7b2590dSMatthew Ahrens #include <sys/zil.h> 34fa9e4066Sahrens #include <sys/callb.h> 35fa9e4066Sahrens 36fa9e4066Sahrens /* 37adbbcffaSAdam H. Leventhal * ZFS Transaction Groups 38adbbcffaSAdam H. Leventhal * ---------------------- 39adbbcffaSAdam H. 
Leventhal * 40adbbcffaSAdam H. Leventhal * ZFS transaction groups are, as the name implies, groups of transactions 41adbbcffaSAdam H. Leventhal * that act on persistent state. ZFS asserts consistency at the granularity of 42adbbcffaSAdam H. Leventhal * these transaction groups. Each successive transaction group (txg) is 43adbbcffaSAdam H. Leventhal * assigned a 64-bit consecutive identifier. There are three active 44adbbcffaSAdam H. Leventhal * transaction group states: open, quiescing, or syncing. At any given time, 45adbbcffaSAdam H. Leventhal * there may be an active txg associated with each state; each active txg may 46adbbcffaSAdam H. Leventhal * either be processing, or blocked waiting to enter the next state. There may 47adbbcffaSAdam H. Leventhal * be up to three active txgs, and there is always a txg in the open state 48adbbcffaSAdam H. Leventhal * (though it may be blocked waiting to enter the quiescing state). In broad 4969962b56SMatthew Ahrens * strokes, transactions -- operations that change in-memory structures -- are 50adbbcffaSAdam H. Leventhal * accepted into the txg in the open state, and are completed while the txg is 51adbbcffaSAdam H. Leventhal * in the open or quiescing states. The accumulated changes are written to 52adbbcffaSAdam H. Leventhal * disk in the syncing state. 53adbbcffaSAdam H. Leventhal * 54adbbcffaSAdam H. Leventhal * Open 55adbbcffaSAdam H. Leventhal * 56adbbcffaSAdam H. Leventhal * When a new txg becomes active, it first enters the open state. New 5769962b56SMatthew Ahrens * transactions -- updates to in-memory structures -- are assigned to the 58adbbcffaSAdam H. Leventhal * currently open txg. There is always a txg in the open state so that ZFS can 59adbbcffaSAdam H. Leventhal * accept new changes (though the txg may refuse new changes if it has hit 60adbbcffaSAdam H. Leventhal * some limit). ZFS advances the open txg to the next state for a variety of 61adbbcffaSAdam H. 
Leventhal * reasons such as it hitting a time or size threshold, or the execution of an 62adbbcffaSAdam H. Leventhal * administrative action that must be completed in the syncing state. 63adbbcffaSAdam H. Leventhal * 64adbbcffaSAdam H. Leventhal * Quiescing 65adbbcffaSAdam H. Leventhal * 66adbbcffaSAdam H. Leventhal * After a txg exits the open state, it enters the quiescing state. The 67adbbcffaSAdam H. Leventhal * quiescing state is intended to provide a buffer between accepting new 68adbbcffaSAdam H. Leventhal * transactions in the open state and writing them out to stable storage in 69adbbcffaSAdam H. Leventhal * the syncing state. While quiescing, transactions can continue their 70adbbcffaSAdam H. Leventhal * operation without delaying either of the other states. Typically, a txg is 71adbbcffaSAdam H. Leventhal * in the quiescing state very briefly since the operations are bounded by 72adbbcffaSAdam H. Leventhal * software latencies rather than, say, slower I/O latencies. After all 73adbbcffaSAdam H. Leventhal * transactions complete, the txg is ready to enter the next state. 74adbbcffaSAdam H. Leventhal * 75adbbcffaSAdam H. Leventhal * Syncing 76adbbcffaSAdam H. Leventhal * 77adbbcffaSAdam H. Leventhal * In the syncing state, the in-memory state built up during the open and (to 78adbbcffaSAdam H. Leventhal * a lesser degree) the quiescing states is written to stable storage. The 79adbbcffaSAdam H. Leventhal * process of writing out modified data can, in turn modify more data. For 80adbbcffaSAdam H. Leventhal * example when we write new blocks, we need to allocate space for them; those 81adbbcffaSAdam H. Leventhal * allocations modify metadata (space maps)... which themselves must be 82adbbcffaSAdam H. Leventhal * written to stable storage. During the sync state, ZFS iterates, writing out 83adbbcffaSAdam H. Leventhal * data until it converges and all in-memory changes have been written out. 84adbbcffaSAdam H. 
Leventhal * The first such pass is the largest as it encompasses all the modified user 85adbbcffaSAdam H. Leventhal * data (as opposed to filesystem metadata). Subsequent passes typically have 86adbbcffaSAdam H. Leventhal * far less data to write as they consist exclusively of filesystem metadata. 87adbbcffaSAdam H. Leventhal * 88adbbcffaSAdam H. Leventhal * To ensure convergence, after a certain number of passes ZFS begins 89adbbcffaSAdam H. Leventhal * overwriting locations on stable storage that had been allocated earlier in 90adbbcffaSAdam H. Leventhal * the syncing state (and subsequently freed). ZFS usually allocates new 91adbbcffaSAdam H. Leventhal * blocks to optimize for large, continuous, writes. For the syncing state to 92adbbcffaSAdam H. Leventhal * converge however it must complete a pass where no new blocks are allocated 93adbbcffaSAdam H. Leventhal * since each allocation requires a modification of persistent metadata. 94adbbcffaSAdam H. Leventhal * Further, to hasten convergence, after a prescribed number of passes, ZFS 95adbbcffaSAdam H. Leventhal * also defers frees, and stops compressing. 96adbbcffaSAdam H. Leventhal * 97adbbcffaSAdam H. Leventhal * In addition to writing out user data, we must also execute synctasks during 98adbbcffaSAdam H. Leventhal * the syncing context. A synctask is the mechanism by which some 99adbbcffaSAdam H. Leventhal * administrative activities work such as creating and destroying snapshots or 100adbbcffaSAdam H. Leventhal * datasets. Note that when a synctask is initiated it enters the open txg, 101adbbcffaSAdam H. Leventhal * and ZFS then pushes that txg as quickly as possible to completion of the 102adbbcffaSAdam H. Leventhal * syncing state in order to reduce the latency of the administrative 103adbbcffaSAdam H. Leventhal * activity. To complete the syncing state, ZFS writes out a new uberblock, 104adbbcffaSAdam H. 
 * the root of the tree of blocks that comprise all state stored on the ZFS
 * pool. Finally, if there is a quiesced txg waiting, we signal that it can
 * now transition to the syncing state.
 */

static void txg_sync_thread(void *arg);
static void txg_quiesce_thread(void *arg);

int zfs_txg_timeout = 5;	/* max seconds worth of delta per txg */

/*
 * Prepare the txg subsystem.
 *
 * Zeroes the pool's tx_state_t, allocates one tx_cpu_t per CPU, and
 * initializes the per-CPU locks, per-slot condvars and commit-callback
 * lists, plus the global sync lock and state condvars.  'txg' becomes
 * the initially open transaction group.
 */
void
txg_init(dsl_pool_t *dp, uint64_t txg)
{
	tx_state_t *tx = &dp->dp_tx;
	int c;
	bzero(tx, sizeof (tx_state_t));

	tx->tx_cpu = kmem_zalloc(max_ncpus * sizeof (tx_cpu_t), KM_SLEEP);

	for (c = 0; c < max_ncpus; c++) {
		int i;

		mutex_init(&tx->tx_cpu[c].tc_lock, NULL, MUTEX_DEFAULT, NULL);
		mutex_init(&tx->tx_cpu[c].tc_open_lock, NULL, MUTEX_DEFAULT,
		    NULL);
		/* One condvar and callback list per slot of the txg ring. */
		for (i = 0; i < TXG_SIZE; i++) {
			cv_init(&tx->tx_cpu[c].tc_cv[i], NULL, CV_DEFAULT,
			    NULL);
			list_create(&tx->tx_cpu[c].tc_callbacks[i],
			    sizeof (dmu_tx_callback_t),
			    offsetof(dmu_tx_callback_t, dcb_node));
		}
	}

	mutex_init(&tx->tx_sync_lock, NULL, MUTEX_DEFAULT, NULL);

	cv_init(&tx->tx_sync_more_cv, NULL, CV_DEFAULT, NULL);
	cv_init(&tx->tx_sync_done_cv, NULL, CV_DEFAULT, NULL);
	cv_init(&tx->tx_quiesce_more_cv, NULL, CV_DEFAULT, NULL);
	cv_init(&tx->tx_quiesce_done_cv, NULL, CV_DEFAULT, NULL);
	cv_init(&tx->tx_exit_cv, NULL, CV_DEFAULT, NULL);

	tx->tx_open_txg = txg;
}

/*
 * Close down the txg subsystem.
 *
 * Destroys everything txg_init() created.  The sync and quiesce threads
 * must already have been stopped (see txg_sync_stop()).
 */
void
txg_fini(dsl_pool_t *dp)
{
	tx_state_t *tx = &dp->dp_tx;
	int c;

	/* Both worker threads must have exited before teardown. */
	ASSERT0(tx->tx_threads);

	mutex_destroy(&tx->tx_sync_lock);

	cv_destroy(&tx->tx_sync_more_cv);
	cv_destroy(&tx->tx_sync_done_cv);
	cv_destroy(&tx->tx_quiesce_more_cv);
	cv_destroy(&tx->tx_quiesce_done_cv);
	cv_destroy(&tx->tx_exit_cv);

	for (c = 0; c < max_ncpus; c++) {
		int i;

		mutex_destroy(&tx->tx_cpu[c].tc_open_lock);
		mutex_destroy(&tx->tx_cpu[c].tc_lock);
		for (i = 0; i < TXG_SIZE; i++) {
			cv_destroy(&tx->tx_cpu[c].tc_cv[i]);
			list_destroy(&tx->tx_cpu[c].tc_callbacks[i]);
		}
	}

	/* The commit-callback taskq is created lazily; it may not exist. */
	if (tx->tx_commit_cb_taskq != NULL)
		taskq_destroy(tx->tx_commit_cb_taskq);

	kmem_free(tx->tx_cpu, max_ncpus * sizeof (tx_cpu_t));

	bzero(tx, sizeof (tx_state_t));
}

/*
 * Start syncing transaction groups.
 *
 * Spawns the quiesce and sync worker threads for this pool and records
 * them in the pool's tx_state_t.
 */
void
txg_sync_start(dsl_pool_t *dp)
{
	tx_state_t *tx = &dp->dp_tx;

	mutex_enter(&tx->tx_sync_lock);

	dprintf("pool %p\n", dp);

	ASSERT0(tx->tx_threads);

	tx->tx_threads = 2;

	tx->tx_quiesce_thread = thread_create(NULL, 0, txg_quiesce_thread,
	    dp, 0, &p0, TS_RUN, minclsyspri);

	/*
	 * The sync thread can need a larger-than-default stack size on
	 * 32-bit x86.  This is due in part to nested pools and
	 * scrub_visitbp() recursion.
	 */
	tx->tx_sync_thread = thread_create(NULL, 32<<10, txg_sync_thread,
	    dp, 0, &p0, TS_RUN, minclsyspri);

	mutex_exit(&tx->tx_sync_lock);
}

/*
 * Common entry for both worker threads: register with the CPR
 * (suspend/resume) framework and take the sync lock, which the thread
 * then holds whenever it is not sleeping or syncing.
 */
static void
txg_thread_enter(tx_state_t *tx, callb_cpr_t *cpr)
{
	CALLB_CPR_INIT(cpr, &tx->tx_sync_lock, callb_generic_cpr, FTAG);
	mutex_enter(&tx->tx_sync_lock);
}

/*
 * Common exit for both worker threads: clear the thread pointer, wake
 * anyone waiting in txg_sync_stop(), and terminate.  Does not return.
 */
static void
txg_thread_exit(tx_state_t *tx, callb_cpr_t *cpr, kthread_t **tpp)
{
	ASSERT(*tpp != NULL);
	*tpp = NULL;
	tx->tx_threads--;
	cv_broadcast(&tx->tx_exit_cv);
	CALLB_CPR_EXIT(cpr);		/* drops &tx->tx_sync_lock */
	thread_exit();
}

/*
 * Wait on 'cv' (bounded by 'time' ticks if nonzero), marking the thread
 * CPR-safe for the duration so a system suspend can proceed while we sleep.
 * tx_sync_lock is held on entry and re-held on return.
 */
static void
txg_thread_wait(tx_state_t *tx, callb_cpr_t *cpr, kcondvar_t *cv, clock_t time)
{
	CALLB_CPR_SAFE_BEGIN(cpr);

	if (time)
		(void) cv_timedwait(cv, &tx->tx_sync_lock,
		    ddi_get_lbolt() + time);
	else
		cv_wait(cv, &tx->tx_sync_lock);

	CALLB_CPR_SAFE_END(cpr, &tx->tx_sync_lock);
}

/*
 * Stop syncing transaction groups.
 */
void
txg_sync_stop(dsl_pool_t *dp)
{
	tx_state_t *tx = &dp->dp_tx;

	dprintf("pool %p\n", dp);
	/*
	 * Finish off any work in progress.
	 */
	ASSERT3U(tx->tx_threads, ==, 2);

	/*
	 * We need to ensure that we've vacated the deferred space_maps.
	 */
	txg_wait_synced(dp, tx->tx_open_txg + TXG_DEFER_SIZE);

	/*
	 * Wake all sync threads and wait for them to die.
	 */
	mutex_enter(&tx->tx_sync_lock);

	ASSERT3U(tx->tx_threads, ==, 2);

	tx->tx_exiting = 1;

	cv_broadcast(&tx->tx_quiesce_more_cv);
	cv_broadcast(&tx->tx_quiesce_done_cv);
	cv_broadcast(&tx->tx_sync_more_cv);

	while (tx->tx_threads != 0)
		cv_wait(&tx->tx_exit_cv, &tx->tx_sync_lock);

	tx->tx_exiting = 0;

	mutex_exit(&tx->tx_sync_lock);
}

/*
 * Take a hold on the currently open txg so it cannot finish quiescing
 * while the caller's transaction is being assigned.  Returns the open
 * txg number and records the hold in *th.
 *
 * Lock order matters here: tc_open_lock is taken first and stays held
 * (tracked via th) until txg_rele_to_quiesce(); tc_lock only protects
 * the tc_count bump and is dropped immediately.
 */
uint64_t
txg_hold_open(dsl_pool_t *dp, txg_handle_t *th)
{
	tx_state_t *tx = &dp->dp_tx;
	tx_cpu_t *tc = &tx->tx_cpu[CPU_SEQID];
	uint64_t txg;

	mutex_enter(&tc->tc_open_lock);
	txg = tx->tx_open_txg;

	mutex_enter(&tc->tc_lock);
	tc->tc_count[txg & TXG_MASK]++;
	mutex_exit(&tc->tc_lock);

	th->th_cpu = tc;
	th->th_txg = txg;

	return (txg);
}

/*
 * Drop the tc_open_lock held since txg_hold_open(), allowing the open
 * txg to advance.  The tc_count hold remains (released later by
 * txg_rele_to_sync()), so the txg still cannot finish quiescing.
 */
void
txg_rele_to_quiesce(txg_handle_t *th)
{
	tx_cpu_t *tc = th->th_cpu;

	ASSERT(!MUTEX_HELD(&tc->tc_lock));
	mutex_exit(&tc->tc_open_lock);
}

/*
 * Move the caller's commit callbacks onto this txg's per-CPU callback
 * list; they will be dispatched after the txg syncs.
 */
void
txg_register_callbacks(txg_handle_t *th, list_t *tx_callbacks)
{
	tx_cpu_t *tc = th->th_cpu;
	int g = th->th_txg & TXG_MASK;

	mutex_enter(&tc->tc_lock);
	list_move_tail(&tc->tc_callbacks[g], tx_callbacks);
	mutex_exit(&tc->tc_lock);
}

/*
 * Release the count taken in txg_hold_open().  When this CPU's count
 * for the txg's ring slot drops to zero, wake txg_quiesce(), which may
 * be waiting for all holders to exit.
 */
void
txg_rele_to_sync(txg_handle_t *th)
{
	tx_cpu_t *tc = th->th_cpu;
	int g = th->th_txg & TXG_MASK;

	mutex_enter(&tc->tc_lock);
	ASSERT(tc->tc_count[g] != 0);
	if (--tc->tc_count[g] == 0)
		cv_broadcast(&tc->tc_cv[g]);
	mutex_exit(&tc->tc_lock);

	th->th_cpu = NULL;	/* defensive */
}

/*
 * Blocks until all transactions in the group are committed.
 *
 * On return, the transaction group has reached a stable state in which it can
 * then be passed off to the syncing context.
3523e30c24aSWill Andrews */ 353fa9e4066Sahrens static void 354fa9e4066Sahrens txg_quiesce(dsl_pool_t *dp, uint64_t txg) 355fa9e4066Sahrens { 356fa9e4066Sahrens tx_state_t *tx = &dp->dp_tx; 357fa9e4066Sahrens int g = txg & TXG_MASK; 358fa9e4066Sahrens int c; 359fa9e4066Sahrens 360fa9e4066Sahrens /* 3614a923759SGeorge Wilson * Grab all tc_open_locks so nobody else can get into this txg. 362fa9e4066Sahrens */ 363fa9e4066Sahrens for (c = 0; c < max_ncpus; c++) 3644a923759SGeorge Wilson mutex_enter(&tx->tx_cpu[c].tc_open_lock); 365fa9e4066Sahrens 366fa9e4066Sahrens ASSERT(txg == tx->tx_open_txg); 367fa9e4066Sahrens tx->tx_open_txg++; 36869962b56SMatthew Ahrens tx->tx_open_time = gethrtime(); 369fa9e4066Sahrens 3700689f76cSAdam Leventhal DTRACE_PROBE2(txg__quiescing, dsl_pool_t *, dp, uint64_t, txg); 3710689f76cSAdam Leventhal DTRACE_PROBE2(txg__opened, dsl_pool_t *, dp, uint64_t, tx->tx_open_txg); 3720689f76cSAdam Leventhal 373fa9e4066Sahrens /* 374fa9e4066Sahrens * Now that we've incremented tx_open_txg, we can let threads 375fa9e4066Sahrens * enter the next transaction group. 376fa9e4066Sahrens */ 377fa9e4066Sahrens for (c = 0; c < max_ncpus; c++) 3784a923759SGeorge Wilson mutex_exit(&tx->tx_cpu[c].tc_open_lock); 379fa9e4066Sahrens 380fa9e4066Sahrens /* 381fa9e4066Sahrens * Quiesce the transaction group by waiting for everyone to txg_exit(). 382fa9e4066Sahrens */ 383fa9e4066Sahrens for (c = 0; c < max_ncpus; c++) { 384fa9e4066Sahrens tx_cpu_t *tc = &tx->tx_cpu[c]; 385fa9e4066Sahrens mutex_enter(&tc->tc_lock); 386fa9e4066Sahrens while (tc->tc_count[g] != 0) 387fa9e4066Sahrens cv_wait(&tc->tc_cv[g], &tc->tc_lock); 388fa9e4066Sahrens mutex_exit(&tc->tc_lock); 389fa9e4066Sahrens } 390fa9e4066Sahrens } 391fa9e4066Sahrens 392d20e665cSRicardo M. Correia static void 393d20e665cSRicardo M. Correia txg_do_callbacks(list_t *cb_list) 394d20e665cSRicardo M. Correia { 395d20e665cSRicardo M. Correia dmu_tx_do_callbacks(cb_list, 0); 396d20e665cSRicardo M. 
Correia 397d20e665cSRicardo M. Correia list_destroy(cb_list); 398d20e665cSRicardo M. Correia 399d20e665cSRicardo M. Correia kmem_free(cb_list, sizeof (list_t)); 400d20e665cSRicardo M. Correia } 401d20e665cSRicardo M. Correia 402d20e665cSRicardo M. Correia /* 403d20e665cSRicardo M. Correia * Dispatch the commit callbacks registered on this txg to worker threads. 4043e30c24aSWill Andrews * 4053e30c24aSWill Andrews * If no callbacks are registered for a given TXG, nothing happens. 4063e30c24aSWill Andrews * This function creates a taskq for the associated pool, if needed. 407d20e665cSRicardo M. Correia */ 408d20e665cSRicardo M. Correia static void 409d20e665cSRicardo M. Correia txg_dispatch_callbacks(dsl_pool_t *dp, uint64_t txg) 410d20e665cSRicardo M. Correia { 411d20e665cSRicardo M. Correia int c; 412d20e665cSRicardo M. Correia tx_state_t *tx = &dp->dp_tx; 413d20e665cSRicardo M. Correia list_t *cb_list; 414d20e665cSRicardo M. Correia 415d20e665cSRicardo M. Correia for (c = 0; c < max_ncpus; c++) { 416d20e665cSRicardo M. Correia tx_cpu_t *tc = &tx->tx_cpu[c]; 4173e30c24aSWill Andrews /* 4183e30c24aSWill Andrews * No need to lock tx_cpu_t at this point, since this can 4193e30c24aSWill Andrews * only be called once a txg has been synced. 4203e30c24aSWill Andrews */ 421d20e665cSRicardo M. Correia 422d20e665cSRicardo M. Correia int g = txg & TXG_MASK; 423d20e665cSRicardo M. Correia 424d20e665cSRicardo M. Correia if (list_is_empty(&tc->tc_callbacks[g])) 425d20e665cSRicardo M. Correia continue; 426d20e665cSRicardo M. Correia 427d20e665cSRicardo M. Correia if (tx->tx_commit_cb_taskq == NULL) { 428d20e665cSRicardo M. Correia /* 429d20e665cSRicardo M. Correia * Commit callback taskq hasn't been created yet. 430d20e665cSRicardo M. Correia */ 431d20e665cSRicardo M. Correia tx->tx_commit_cb_taskq = taskq_create("tx_commit_cb", 432d20e665cSRicardo M. Correia max_ncpus, minclsyspri, max_ncpus, max_ncpus * 2, 433d20e665cSRicardo M. Correia TASKQ_PREPOPULATE); 434d20e665cSRicardo M. 
Correia } 435d20e665cSRicardo M. Correia 436d20e665cSRicardo M. Correia cb_list = kmem_alloc(sizeof (list_t), KM_SLEEP); 437d20e665cSRicardo M. Correia list_create(cb_list, sizeof (dmu_tx_callback_t), 438d20e665cSRicardo M. Correia offsetof(dmu_tx_callback_t, dcb_node)); 439d20e665cSRicardo M. Correia 440b3d9f2e2SWill Andrews list_move_tail(cb_list, &tc->tc_callbacks[g]); 441d20e665cSRicardo M. Correia 442d20e665cSRicardo M. Correia (void) taskq_dispatch(tx->tx_commit_cb_taskq, (task_func_t *) 443d20e665cSRicardo M. Correia txg_do_callbacks, cb_list, TQ_SLEEP); 444d20e665cSRicardo M. Correia } 445d20e665cSRicardo M. Correia } 446d20e665cSRicardo M. Correia 447fa41d87dSSerapheim Dimitropoulos static boolean_t 448fa41d87dSSerapheim Dimitropoulos txg_is_syncing(dsl_pool_t *dp) 449fa41d87dSSerapheim Dimitropoulos { 450fa41d87dSSerapheim Dimitropoulos tx_state_t *tx = &dp->dp_tx; 451fa41d87dSSerapheim Dimitropoulos ASSERT(MUTEX_HELD(&tx->tx_sync_lock)); 452fa41d87dSSerapheim Dimitropoulos return (tx->tx_syncing_txg != 0); 453fa41d87dSSerapheim Dimitropoulos } 454fa41d87dSSerapheim Dimitropoulos 455fa41d87dSSerapheim Dimitropoulos static boolean_t 456fa41d87dSSerapheim Dimitropoulos txg_is_quiescing(dsl_pool_t *dp) 457fa41d87dSSerapheim Dimitropoulos { 458fa41d87dSSerapheim Dimitropoulos tx_state_t *tx = &dp->dp_tx; 459fa41d87dSSerapheim Dimitropoulos ASSERT(MUTEX_HELD(&tx->tx_sync_lock)); 460fa41d87dSSerapheim Dimitropoulos return (tx->tx_quiescing_txg != 0); 461fa41d87dSSerapheim Dimitropoulos } 462fa41d87dSSerapheim Dimitropoulos 463fa41d87dSSerapheim Dimitropoulos static boolean_t 464fa41d87dSSerapheim Dimitropoulos txg_has_quiesced_to_sync(dsl_pool_t *dp) 465fa41d87dSSerapheim Dimitropoulos { 466fa41d87dSSerapheim Dimitropoulos tx_state_t *tx = &dp->dp_tx; 467fa41d87dSSerapheim Dimitropoulos ASSERT(MUTEX_HELD(&tx->tx_sync_lock)); 468fa41d87dSSerapheim Dimitropoulos return (tx->tx_quiesced_txg != 0); 469fa41d87dSSerapheim Dimitropoulos } 470fa41d87dSSerapheim 
/*
 * Sync-thread main loop: wait for work (a scan, a waiter, a handed-off
 * quiesced txg, or enough dirty data / elapsed time), take the quiesced
 * txg from the quiesce thread, and run spa_sync() on it with
 * tx_sync_lock dropped.
 */
static void
txg_sync_thread(void *arg)
{
	dsl_pool_t *dp = arg;
	spa_t *spa = dp->dp_spa;
	tx_state_t *tx = &dp->dp_tx;
	callb_cpr_t cpr;
	uint64_t start, delta;

	txg_thread_enter(tx, &cpr);

	start = delta = 0;
	for (;;) {
		uint64_t timeout = zfs_txg_timeout * hz;
		uint64_t timer;
		uint64_t txg;
		/* Dirty-data level (pct of max) that forces a sync. */
		uint64_t dirty_min_bytes =
		    zfs_dirty_data_max * zfs_dirty_data_sync_pct / 100;

		/*
		 * We sync when we're scanning, there's someone waiting
		 * on us, or the quiesce thread has handed off a txg to
		 * us, or we have reached our timeout.
		 */
		timer = (delta >= timeout ? 0 : timeout - delta);
		while (!dsl_scan_active(dp->dp_scan) &&
		    !tx->tx_exiting && timer > 0 &&
		    tx->tx_synced_txg >= tx->tx_sync_txg_waiting &&
		    !txg_has_quiesced_to_sync(dp) &&
		    dp->dp_dirty_total < dirty_min_bytes) {
			dprintf("waiting; tx_synced=%llu waiting=%llu dp=%p\n",
			    tx->tx_synced_txg, tx->tx_sync_txg_waiting, dp);
			txg_thread_wait(tx, &cpr, &tx->tx_sync_more_cv, timer);
			delta = ddi_get_lbolt() - start;
			timer = (delta > timeout ? 0 : timeout - delta);
		}

		/*
		 * Wait until the quiesce thread hands off a txg to us,
		 * prompting it to do so if necessary.
		 */
		while (!tx->tx_exiting && !txg_has_quiesced_to_sync(dp)) {
			if (tx->tx_quiesce_txg_waiting < tx->tx_open_txg+1)
				tx->tx_quiesce_txg_waiting = tx->tx_open_txg+1;
			cv_broadcast(&tx->tx_quiesce_more_cv);
			txg_thread_wait(tx, &cpr, &tx->tx_quiesce_done_cv, 0);
		}

		/* txg_thread_exit() does not return. */
		if (tx->tx_exiting)
			txg_thread_exit(tx, &cpr, &tx->tx_sync_thread);

		/*
		 * Consume the quiesced txg which has been handed off to
		 * us.  This may cause the quiescing thread to now be
		 * able to quiesce another txg, so we must signal it.
		 */
		ASSERT(tx->tx_quiesced_txg != 0);
		txg = tx->tx_quiesced_txg;
		tx->tx_quiesced_txg = 0;
		tx->tx_syncing_txg = txg;
		DTRACE_PROBE2(txg__syncing, dsl_pool_t *, dp, uint64_t, txg);
		cv_broadcast(&tx->tx_quiesce_more_cv);

		dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n",
		    txg, tx->tx_quiesce_txg_waiting, tx->tx_sync_txg_waiting);
		/* Drop the lock across the (potentially long) sync itself. */
		mutex_exit(&tx->tx_sync_lock);

		start = ddi_get_lbolt();
		spa_sync(spa, txg);
		delta = ddi_get_lbolt() - start;

		mutex_enter(&tx->tx_sync_lock);
		tx->tx_synced_txg = txg;
		tx->tx_syncing_txg = 0;
		DTRACE_PROBE2(txg__synced, dsl_pool_t *, dp, uint64_t, txg);
		cv_broadcast(&tx->tx_sync_done_cv);

		/*
		 * Dispatch commit callbacks to worker threads.
		 */
		txg_dispatch_callbacks(dp, txg);
	}
}

/*
 * Quiesce-thread main loop: when a quiesce is wanted and no other txg is
 * quiescing or waiting to sync, advance the open txg, wait for all of
 * its holders to drain (txg_quiesce()), then hand it to the sync thread.
 */
static void
txg_quiesce_thread(void *arg)
{
	dsl_pool_t *dp = arg;
	tx_state_t *tx = &dp->dp_tx;
	callb_cpr_t cpr;

	txg_thread_enter(tx, &cpr);

	for (;;) {
		uint64_t txg;

		/*
		 * We quiesce when there's someone waiting on us.
		 * However, we can only have one txg in "quiescing" or
		 * "quiesced, waiting to sync" state.  So we wait until
		 * the "quiesced, waiting to sync" txg has been consumed
		 * by the sync thread.
		 */
		while (!tx->tx_exiting &&
		    (tx->tx_open_txg >= tx->tx_quiesce_txg_waiting ||
		    txg_has_quiesced_to_sync(dp)))
			txg_thread_wait(tx, &cpr, &tx->tx_quiesce_more_cv, 0);

		/* txg_thread_exit() does not return. */
		if (tx->tx_exiting)
			txg_thread_exit(tx, &cpr, &tx->tx_quiesce_thread);

		txg = tx->tx_open_txg;
		dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n",
		    txg, tx->tx_quiesce_txg_waiting,
		    tx->tx_sync_txg_waiting);
		tx->tx_quiescing_txg = txg;

		/* Drop the lock while waiting for txg holders to drain. */
		mutex_exit(&tx->tx_sync_lock);
		txg_quiesce(dp, txg);
		mutex_enter(&tx->tx_sync_lock);

		/*
		 * Hand this txg off to the sync thread.
		 */
		dprintf("quiesce done, handing off txg %llu\n", txg);
		tx->tx_quiescing_txg = 0;
		tx->tx_quiesced_txg = txg;
		DTRACE_PROBE2(txg__quiesced, dsl_pool_t *, dp, uint64_t, txg);
		cv_broadcast(&tx->tx_sync_more_cv);
		cv_broadcast(&tx->tx_quiesce_done_cv);
	}
}

/*
 * Delay this thread by delay nanoseconds if we are still in the open
 * transaction group and there is already a waiting txg quiescing or quiesced.
 * Abort the delay if this txg stalls or enters the quiescing state.
 */
void
txg_delay(dsl_pool_t *dp, uint64_t txg, hrtime_t delay, hrtime_t resolution)
{
	tx_state_t *tx = &dp->dp_tx;
	hrtime_t start = gethrtime();

	/* don't delay if this txg could transition to quiescing immediately */
	if (tx->tx_open_txg > txg ||
	    tx->tx_syncing_txg == txg-1 || tx->tx_synced_txg == txg-1)
		return;

	mutex_enter(&tx->tx_sync_lock);
	/* Re-check under the lock; the unlocked check above may be stale. */
	if (tx->tx_open_txg > txg || tx->tx_synced_txg == txg-1) {
		mutex_exit(&tx->tx_sync_lock);
		return;
	}

	while (gethrtime() - start < delay &&
	    tx->tx_syncing_txg < txg-1 && !txg_stalled(dp)) {
		(void) cv_timedwait_hires(&tx->tx_quiesce_more_cv,
		    &tx->tx_sync_lock, delay, resolution, 0);
	}

	mutex_exit(&tx->tx_sync_lock);
}

/*
 * Block until the given txg has been synced to disk.  If txg == 0, wait
 * for the currently open txg plus TXG_DEFER_SIZE.  Repeatedly nudges
 * the sync thread while waiting.
 */
void
txg_wait_synced(dsl_pool_t *dp, uint64_t txg)
{
	tx_state_t *tx = &dp->dp_tx;

	ASSERT(!dsl_pool_config_held(dp));

	mutex_enter(&tx->tx_sync_lock);
	ASSERT3U(tx->tx_threads, ==, 2);
	if (txg == 0)
		txg = tx->tx_open_txg + TXG_DEFER_SIZE;
	if (tx->tx_sync_txg_waiting < txg)
		tx->tx_sync_txg_waiting = txg;
	dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n",
	    txg, tx->tx_quiesce_txg_waiting, tx->tx_sync_txg_waiting);
	while (tx->tx_synced_txg < txg) {
		dprintf("broadcasting sync more "
		    "tx_synced=%llu waiting=%llu dp=%p\n",
		    tx->tx_synced_txg, tx->tx_sync_txg_waiting, dp);
		cv_broadcast(&tx->tx_sync_more_cv);
		cv_wait(&tx->tx_sync_done_cv, &tx->tx_sync_lock);
	}
	mutex_exit(&tx->tx_sync_lock);
}

/*
 * Block until the given txg is open.  If txg == 0, wait for the next
 * txg after the currently open one.  Nudges the quiesce thread while
 * waiting so the pipeline keeps moving.
 */
void
txg_wait_open(dsl_pool_t *dp, uint64_t txg)
{
	tx_state_t *tx = &dp->dp_tx;

	ASSERT(!dsl_pool_config_held(dp));

	mutex_enter(&tx->tx_sync_lock);
	ASSERT3U(tx->tx_threads, ==, 2);
	if (txg == 0)
		txg = tx->tx_open_txg + 1;
	if (tx->tx_quiesce_txg_waiting < txg)
		tx->tx_quiesce_txg_waiting = txg;
	dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n",
	    txg, tx->tx_quiesce_txg_waiting, tx->tx_sync_txg_waiting);
	while (tx->tx_open_txg < txg) {
		cv_broadcast(&tx->tx_quiesce_more_cv);
		cv_wait(&tx->tx_quiesce_done_cv, &tx->tx_sync_lock);
	}
	mutex_exit(&tx->tx_sync_lock);
}

/*
 * If there isn't a txg syncing or in the pipeline, push another txg through
 * the pipeline by quiescing the open txg.
68569962b56SMatthew Ahrens */ 68669962b56SMatthew Ahrens void 68769962b56SMatthew Ahrens txg_kick(dsl_pool_t *dp) 68869962b56SMatthew Ahrens { 68969962b56SMatthew Ahrens tx_state_t *tx = &dp->dp_tx; 69069962b56SMatthew Ahrens 69169962b56SMatthew Ahrens ASSERT(!dsl_pool_config_held(dp)); 69269962b56SMatthew Ahrens 69369962b56SMatthew Ahrens mutex_enter(&tx->tx_sync_lock); 694fa41d87dSSerapheim Dimitropoulos if (!txg_is_syncing(dp) && 695fa41d87dSSerapheim Dimitropoulos !txg_is_quiescing(dp) && 69669962b56SMatthew Ahrens tx->tx_quiesce_txg_waiting <= tx->tx_open_txg && 69769962b56SMatthew Ahrens tx->tx_sync_txg_waiting <= tx->tx_synced_txg && 69869962b56SMatthew Ahrens tx->tx_quiesced_txg <= tx->tx_synced_txg) { 69969962b56SMatthew Ahrens tx->tx_quiesce_txg_waiting = tx->tx_open_txg + 1; 70069962b56SMatthew Ahrens cv_broadcast(&tx->tx_quiesce_more_cv); 70169962b56SMatthew Ahrens } 70269962b56SMatthew Ahrens mutex_exit(&tx->tx_sync_lock); 70369962b56SMatthew Ahrens } 70469962b56SMatthew Ahrens 705088f3894Sahrens boolean_t 706fa9e4066Sahrens txg_stalled(dsl_pool_t *dp) 707fa9e4066Sahrens { 708fa9e4066Sahrens tx_state_t *tx = &dp->dp_tx; 709fa9e4066Sahrens return (tx->tx_quiesce_txg_waiting > tx->tx_open_txg); 710fa9e4066Sahrens } 711fa9e4066Sahrens 712088f3894Sahrens boolean_t 713088f3894Sahrens txg_sync_waiting(dsl_pool_t *dp) 714088f3894Sahrens { 715088f3894Sahrens tx_state_t *tx = &dp->dp_tx; 716088f3894Sahrens 717088f3894Sahrens return (tx->tx_syncing_txg <= tx->tx_sync_txg_waiting || 718088f3894Sahrens tx->tx_quiesced_txg != 0); 719088f3894Sahrens } 720088f3894Sahrens 721b7b2590dSMatthew Ahrens /* 722b7b2590dSMatthew Ahrens * Verify that this txg is active (open, quiescing, syncing). Non-active 723b7b2590dSMatthew Ahrens * txg's should not be manipulated. 
724b7b2590dSMatthew Ahrens */ 725b7b2590dSMatthew Ahrens void 726b7b2590dSMatthew Ahrens txg_verify(spa_t *spa, uint64_t txg) 727b7b2590dSMatthew Ahrens { 728b7b2590dSMatthew Ahrens dsl_pool_t *dp = spa_get_dsl(spa); 729b7b2590dSMatthew Ahrens if (txg <= TXG_INITIAL || txg == ZILTEST_TXG) 730b7b2590dSMatthew Ahrens return; 731b7b2590dSMatthew Ahrens ASSERT3U(txg, <=, dp->dp_tx.tx_open_txg); 732b7b2590dSMatthew Ahrens ASSERT3U(txg, >=, dp->dp_tx.tx_synced_txg); 733b7b2590dSMatthew Ahrens ASSERT3U(txg, >=, dp->dp_tx.tx_open_txg - TXG_CONCURRENT_STATES); 734b7b2590dSMatthew Ahrens } 735b7b2590dSMatthew Ahrens 736fa9e4066Sahrens /* 737fa9e4066Sahrens * Per-txg object lists. 738fa9e4066Sahrens */ 739fa9e4066Sahrens void 740b7b2590dSMatthew Ahrens txg_list_create(txg_list_t *tl, spa_t *spa, size_t offset) 741fa9e4066Sahrens { 742fa9e4066Sahrens int t; 743fa9e4066Sahrens 744fa9e4066Sahrens mutex_init(&tl->tl_lock, NULL, MUTEX_DEFAULT, NULL); 745fa9e4066Sahrens 746fa9e4066Sahrens tl->tl_offset = offset; 747b7b2590dSMatthew Ahrens tl->tl_spa = spa; 748fa9e4066Sahrens 749fa9e4066Sahrens for (t = 0; t < TXG_SIZE; t++) 750fa9e4066Sahrens tl->tl_head[t] = NULL; 751fa9e4066Sahrens } 752fa9e4066Sahrens 753fa9e4066Sahrens void 754fa9e4066Sahrens txg_list_destroy(txg_list_t *tl) 755fa9e4066Sahrens { 756fa9e4066Sahrens int t; 757fa9e4066Sahrens 758fa9e4066Sahrens for (t = 0; t < TXG_SIZE; t++) 759fa9e4066Sahrens ASSERT(txg_list_empty(tl, t)); 760fa9e4066Sahrens 761fa9e4066Sahrens mutex_destroy(&tl->tl_lock); 762fa9e4066Sahrens } 763fa9e4066Sahrens 764ce636f8bSMatthew Ahrens boolean_t 765fa9e4066Sahrens txg_list_empty(txg_list_t *tl, uint64_t txg) 766fa9e4066Sahrens { 767b7b2590dSMatthew Ahrens txg_verify(tl->tl_spa, txg); 768fa9e4066Sahrens return (tl->tl_head[txg & TXG_MASK] == NULL); 769fa9e4066Sahrens } 770fa9e4066Sahrens 77173527f44SAlex Reece /* 77273527f44SAlex Reece * Returns true if all txg lists are empty. 
77373527f44SAlex Reece * 774b7b2590dSMatthew Ahrens * Warning: this is inherently racy (an item could be added immediately 775b7b2590dSMatthew Ahrens * after this function returns). We don't bother with the lock because 776b7b2590dSMatthew Ahrens * it wouldn't change the semantics. 77773527f44SAlex Reece */ 77873527f44SAlex Reece boolean_t 77973527f44SAlex Reece txg_all_lists_empty(txg_list_t *tl) 78073527f44SAlex Reece { 78173527f44SAlex Reece for (int i = 0; i < TXG_SIZE; i++) { 78273527f44SAlex Reece if (!txg_list_empty(tl, i)) { 78373527f44SAlex Reece return (B_FALSE); 78473527f44SAlex Reece } 78573527f44SAlex Reece } 78673527f44SAlex Reece return (B_TRUE); 78773527f44SAlex Reece } 78873527f44SAlex Reece 789fa9e4066Sahrens /* 7903b2aab18SMatthew Ahrens * Add an entry to the list (unless it's already on the list). 7913b2aab18SMatthew Ahrens * Returns B_TRUE if it was actually added. 792fa9e4066Sahrens */ 7933b2aab18SMatthew Ahrens boolean_t 794fa9e4066Sahrens txg_list_add(txg_list_t *tl, void *p, uint64_t txg) 795fa9e4066Sahrens { 796fa9e4066Sahrens int t = txg & TXG_MASK; 797fa9e4066Sahrens txg_node_t *tn = (txg_node_t *)((char *)p + tl->tl_offset); 7983b2aab18SMatthew Ahrens boolean_t add; 799fa9e4066Sahrens 800b7b2590dSMatthew Ahrens txg_verify(tl->tl_spa, txg); 801fa9e4066Sahrens mutex_enter(&tl->tl_lock); 8023b2aab18SMatthew Ahrens add = (tn->tn_member[t] == 0); 8033b2aab18SMatthew Ahrens if (add) { 804fa9e4066Sahrens tn->tn_member[t] = 1; 805fa9e4066Sahrens tn->tn_next[t] = tl->tl_head[t]; 806fa9e4066Sahrens tl->tl_head[t] = tn; 807fa9e4066Sahrens } 808fa9e4066Sahrens mutex_exit(&tl->tl_lock); 809fa9e4066Sahrens 8103b2aab18SMatthew Ahrens return (add); 811fa9e4066Sahrens } 812fa9e4066Sahrens 813495807d7SMatthew Ahrens /* 8143b2aab18SMatthew Ahrens * Add an entry to the end of the list, unless it's already on the list. 8153b2aab18SMatthew Ahrens * (walks list to find end) 8163b2aab18SMatthew Ahrens * Returns B_TRUE if it was actually added. 
817495807d7SMatthew Ahrens */ 8183b2aab18SMatthew Ahrens boolean_t 819495807d7SMatthew Ahrens txg_list_add_tail(txg_list_t *tl, void *p, uint64_t txg) 820495807d7SMatthew Ahrens { 821495807d7SMatthew Ahrens int t = txg & TXG_MASK; 822495807d7SMatthew Ahrens txg_node_t *tn = (txg_node_t *)((char *)p + tl->tl_offset); 8233b2aab18SMatthew Ahrens boolean_t add; 824495807d7SMatthew Ahrens 825b7b2590dSMatthew Ahrens txg_verify(tl->tl_spa, txg); 826495807d7SMatthew Ahrens mutex_enter(&tl->tl_lock); 8273b2aab18SMatthew Ahrens add = (tn->tn_member[t] == 0); 8283b2aab18SMatthew Ahrens if (add) { 829495807d7SMatthew Ahrens txg_node_t **tp; 830495807d7SMatthew Ahrens 831495807d7SMatthew Ahrens for (tp = &tl->tl_head[t]; *tp != NULL; tp = &(*tp)->tn_next[t]) 832495807d7SMatthew Ahrens continue; 833495807d7SMatthew Ahrens 834495807d7SMatthew Ahrens tn->tn_member[t] = 1; 835495807d7SMatthew Ahrens tn->tn_next[t] = NULL; 836495807d7SMatthew Ahrens *tp = tn; 837495807d7SMatthew Ahrens } 838495807d7SMatthew Ahrens mutex_exit(&tl->tl_lock); 839495807d7SMatthew Ahrens 8403b2aab18SMatthew Ahrens return (add); 841495807d7SMatthew Ahrens } 842495807d7SMatthew Ahrens 843fa9e4066Sahrens /* 844fa9e4066Sahrens * Remove the head of the list and return it. 
845fa9e4066Sahrens */ 846fa9e4066Sahrens void * 847fa9e4066Sahrens txg_list_remove(txg_list_t *tl, uint64_t txg) 848fa9e4066Sahrens { 849fa9e4066Sahrens int t = txg & TXG_MASK; 850fa9e4066Sahrens txg_node_t *tn; 851fa9e4066Sahrens void *p = NULL; 852fa9e4066Sahrens 853b7b2590dSMatthew Ahrens txg_verify(tl->tl_spa, txg); 854fa9e4066Sahrens mutex_enter(&tl->tl_lock); 855fa9e4066Sahrens if ((tn = tl->tl_head[t]) != NULL) { 8565cabbc6bSPrashanth Sreenivasa ASSERT(tn->tn_member[t]); 8575cabbc6bSPrashanth Sreenivasa ASSERT(tn->tn_next[t] == NULL || tn->tn_next[t]->tn_member[t]); 858fa9e4066Sahrens p = (char *)tn - tl->tl_offset; 859fa9e4066Sahrens tl->tl_head[t] = tn->tn_next[t]; 860fa9e4066Sahrens tn->tn_next[t] = NULL; 861fa9e4066Sahrens tn->tn_member[t] = 0; 862fa9e4066Sahrens } 863fa9e4066Sahrens mutex_exit(&tl->tl_lock); 864fa9e4066Sahrens 865fa9e4066Sahrens return (p); 866fa9e4066Sahrens } 867fa9e4066Sahrens 868fa9e4066Sahrens /* 869fa9e4066Sahrens * Remove a specific item from the list and return it. 
870fa9e4066Sahrens */ 871fa9e4066Sahrens void * 872fa9e4066Sahrens txg_list_remove_this(txg_list_t *tl, void *p, uint64_t txg) 873fa9e4066Sahrens { 874fa9e4066Sahrens int t = txg & TXG_MASK; 875fa9e4066Sahrens txg_node_t *tn, **tp; 876fa9e4066Sahrens 877b7b2590dSMatthew Ahrens txg_verify(tl->tl_spa, txg); 878fa9e4066Sahrens mutex_enter(&tl->tl_lock); 879fa9e4066Sahrens 880fa9e4066Sahrens for (tp = &tl->tl_head[t]; (tn = *tp) != NULL; tp = &tn->tn_next[t]) { 881fa9e4066Sahrens if ((char *)tn - tl->tl_offset == p) { 882fa9e4066Sahrens *tp = tn->tn_next[t]; 883fa9e4066Sahrens tn->tn_next[t] = NULL; 884fa9e4066Sahrens tn->tn_member[t] = 0; 885fa9e4066Sahrens mutex_exit(&tl->tl_lock); 886fa9e4066Sahrens return (p); 887fa9e4066Sahrens } 888fa9e4066Sahrens } 889fa9e4066Sahrens 890fa9e4066Sahrens mutex_exit(&tl->tl_lock); 891fa9e4066Sahrens 892fa9e4066Sahrens return (NULL); 893fa9e4066Sahrens } 894fa9e4066Sahrens 8953b2aab18SMatthew Ahrens boolean_t 896fa9e4066Sahrens txg_list_member(txg_list_t *tl, void *p, uint64_t txg) 897fa9e4066Sahrens { 898fa9e4066Sahrens int t = txg & TXG_MASK; 899fa9e4066Sahrens txg_node_t *tn = (txg_node_t *)((char *)p + tl->tl_offset); 900fa9e4066Sahrens 901b7b2590dSMatthew Ahrens txg_verify(tl->tl_spa, txg); 9023b2aab18SMatthew Ahrens return (tn->tn_member[t] != 0); 903fa9e4066Sahrens } 904fa9e4066Sahrens 905fa9e4066Sahrens /* 906fa9e4066Sahrens * Walk a txg list -- only safe if you know it's not changing. 907fa9e4066Sahrens */ 908fa9e4066Sahrens void * 909fa9e4066Sahrens txg_list_head(txg_list_t *tl, uint64_t txg) 910fa9e4066Sahrens { 911fa9e4066Sahrens int t = txg & TXG_MASK; 912fa9e4066Sahrens txg_node_t *tn = tl->tl_head[t]; 913fa9e4066Sahrens 914b7b2590dSMatthew Ahrens txg_verify(tl->tl_spa, txg); 915fa9e4066Sahrens return (tn == NULL ? 
NULL : (char *)tn - tl->tl_offset); 916fa9e4066Sahrens } 917fa9e4066Sahrens 918fa9e4066Sahrens void * 919fa9e4066Sahrens txg_list_next(txg_list_t *tl, void *p, uint64_t txg) 920fa9e4066Sahrens { 921fa9e4066Sahrens int t = txg & TXG_MASK; 922fa9e4066Sahrens txg_node_t *tn = (txg_node_t *)((char *)p + tl->tl_offset); 923fa9e4066Sahrens 924b7b2590dSMatthew Ahrens txg_verify(tl->tl_spa, txg); 925fa9e4066Sahrens tn = tn->tn_next[t]; 926fa9e4066Sahrens 927fa9e4066Sahrens return (tn == NULL ? NULL : (char *)tn - tl->tl_offset); 928fa9e4066Sahrens } 929