1fa9e4066Sahrens /* 2fa9e4066Sahrens * CDDL HEADER START 3fa9e4066Sahrens * 4fa9e4066Sahrens * The contents of this file are subject to the terms of the 55ad82045Snd * Common Development and Distribution License (the "License"). 65ad82045Snd * You may not use this file except in compliance with the License. 7fa9e4066Sahrens * 8fa9e4066Sahrens * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9fa9e4066Sahrens * or http://www.opensolaris.org/os/licensing. 10fa9e4066Sahrens * See the License for the specific language governing permissions 11fa9e4066Sahrens * and limitations under the License. 12fa9e4066Sahrens * 13fa9e4066Sahrens * When distributing Covered Code, include this CDDL HEADER in each 14fa9e4066Sahrens * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15fa9e4066Sahrens * If applicable, add the following below this CDDL HEADER, with the 16fa9e4066Sahrens * fields enclosed by brackets "[]" replaced with your own identifying 17fa9e4066Sahrens * information: Portions Copyright [yyyy] [name of copyright owner] 18fa9e4066Sahrens * 19fa9e4066Sahrens * CDDL HEADER END 20fa9e4066Sahrens */ 21fa9e4066Sahrens /* 223f9d6ad7SLin Ling * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 23*383e7c74SXin Li * Portions Copyright 2011 Martin Matuska 24fa9e4066Sahrens */ 25fa9e4066Sahrens 26fa9e4066Sahrens #include <sys/zfs_context.h> 27fa9e4066Sahrens #include <sys/txg_impl.h> 28fa9e4066Sahrens #include <sys/dmu_impl.h> 29d20e665cSRicardo M. Correia #include <sys/dmu_tx.h> 30fa9e4066Sahrens #include <sys/dsl_pool.h> 313f9d6ad7SLin Ling #include <sys/dsl_scan.h> 32fa9e4066Sahrens #include <sys/callb.h> 33fa9e4066Sahrens 34fa9e4066Sahrens /* 35fa9e4066Sahrens * Pool-wide transaction groups. 36fa9e4066Sahrens */ 37fa9e4066Sahrens 38fa9e4066Sahrens static void txg_sync_thread(dsl_pool_t *dp); 39fa9e4066Sahrens static void txg_quiesce_thread(dsl_pool_t *dp); 40fa9e4066Sahrens 4144ecc532SGeorge Wilson int zfs_txg_timeout = 5; /* max seconds worth of delta per txg */ 42fa9e4066Sahrens 43fa9e4066Sahrens /* 44fa9e4066Sahrens * Prepare the txg subsystem. 45fa9e4066Sahrens */ 46fa9e4066Sahrens void 47fa9e4066Sahrens txg_init(dsl_pool_t *dp, uint64_t txg) 48fa9e4066Sahrens { 49fa9e4066Sahrens tx_state_t *tx = &dp->dp_tx; 505ad82045Snd int c; 51fa9e4066Sahrens bzero(tx, sizeof (tx_state_t)); 52fa9e4066Sahrens 53fa9e4066Sahrens tx->tx_cpu = kmem_zalloc(max_ncpus * sizeof (tx_cpu_t), KM_SLEEP); 54fa9e4066Sahrens 558f38d419Sek for (c = 0; c < max_ncpus; c++) { 568f38d419Sek int i; 578f38d419Sek 585ad82045Snd mutex_init(&tx->tx_cpu[c].tc_lock, NULL, MUTEX_DEFAULT, NULL); 598f38d419Sek for (i = 0; i < TXG_SIZE; i++) { 608f38d419Sek cv_init(&tx->tx_cpu[c].tc_cv[i], NULL, CV_DEFAULT, 618f38d419Sek NULL); 62d20e665cSRicardo M. Correia list_create(&tx->tx_cpu[c].tc_callbacks[i], 63d20e665cSRicardo M. Correia sizeof (dmu_tx_callback_t), 64d20e665cSRicardo M. Correia offsetof(dmu_tx_callback_t, dcb_node)); 658f38d419Sek } 668f38d419Sek } 675ad82045Snd 685ad82045Snd mutex_init(&tx->tx_sync_lock, NULL, MUTEX_DEFAULT, NULL); 69fa9e4066Sahrens 70b5e70f97SRicardo M. Correia cv_init(&tx->tx_sync_more_cv, NULL, CV_DEFAULT, NULL); 71b5e70f97SRicardo M. Correia cv_init(&tx->tx_sync_done_cv, NULL, CV_DEFAULT, NULL); 72b5e70f97SRicardo M. Correia cv_init(&tx->tx_quiesce_more_cv, NULL, CV_DEFAULT, NULL); 73b5e70f97SRicardo M. Correia cv_init(&tx->tx_quiesce_done_cv, NULL, CV_DEFAULT, NULL); 74b5e70f97SRicardo M. Correia cv_init(&tx->tx_exit_cv, NULL, CV_DEFAULT, NULL); 75b5e70f97SRicardo M. Correia 76fa9e4066Sahrens tx->tx_open_txg = txg; 77fa9e4066Sahrens } 78fa9e4066Sahrens 79fa9e4066Sahrens /* 80fa9e4066Sahrens * Close down the txg subsystem. 81fa9e4066Sahrens */ 82fa9e4066Sahrens void 83fa9e4066Sahrens txg_fini(dsl_pool_t *dp) 84fa9e4066Sahrens { 85fa9e4066Sahrens tx_state_t *tx = &dp->dp_tx; 865ad82045Snd int c; 87fa9e4066Sahrens 88fa9e4066Sahrens ASSERT(tx->tx_threads == 0); 89fa9e4066Sahrens 905ad82045Snd mutex_destroy(&tx->tx_sync_lock); 915ad82045Snd 92b5e70f97SRicardo M. Correia cv_destroy(&tx->tx_sync_more_cv); 93b5e70f97SRicardo M. Correia cv_destroy(&tx->tx_sync_done_cv); 94b5e70f97SRicardo M. Correia cv_destroy(&tx->tx_quiesce_more_cv); 95b5e70f97SRicardo M. Correia cv_destroy(&tx->tx_quiesce_done_cv); 96b5e70f97SRicardo M. Correia cv_destroy(&tx->tx_exit_cv); 97b5e70f97SRicardo M. Correia 988f38d419Sek for (c = 0; c < max_ncpus; c++) { 998f38d419Sek int i; 1008f38d419Sek 1015ad82045Snd mutex_destroy(&tx->tx_cpu[c].tc_lock); 102d20e665cSRicardo M. Correia for (i = 0; i < TXG_SIZE; i++) { 1038f38d419Sek cv_destroy(&tx->tx_cpu[c].tc_cv[i]); 104d20e665cSRicardo M. Correia list_destroy(&tx->tx_cpu[c].tc_callbacks[i]); 105d20e665cSRicardo M. Correia } 1068f38d419Sek } 107fa9e4066Sahrens 108d20e665cSRicardo M. Correia if (tx->tx_commit_cb_taskq != NULL) 109d20e665cSRicardo M. Correia taskq_destroy(tx->tx_commit_cb_taskq); 110d20e665cSRicardo M. Correia 111fa9e4066Sahrens kmem_free(tx->tx_cpu, max_ncpus * sizeof (tx_cpu_t)); 112fa9e4066Sahrens 113fa9e4066Sahrens bzero(tx, sizeof (tx_state_t)); 114fa9e4066Sahrens } 115fa9e4066Sahrens 116fa9e4066Sahrens /* 117fa9e4066Sahrens * Start syncing transaction groups. 118fa9e4066Sahrens */ 119fa9e4066Sahrens void 120fa9e4066Sahrens txg_sync_start(dsl_pool_t *dp) 121fa9e4066Sahrens { 122fa9e4066Sahrens tx_state_t *tx = &dp->dp_tx; 123fa9e4066Sahrens 124fa9e4066Sahrens mutex_enter(&tx->tx_sync_lock); 125fa9e4066Sahrens 126fa9e4066Sahrens dprintf("pool %p\n", dp); 127fa9e4066Sahrens 128fa9e4066Sahrens ASSERT(tx->tx_threads == 0); 129fa9e4066Sahrens 1301ab7f2deSmaybee tx->tx_threads = 2; 131fa9e4066Sahrens 132fa9e4066Sahrens tx->tx_quiesce_thread = thread_create(NULL, 0, txg_quiesce_thread, 133fa9e4066Sahrens dp, 0, &p0, TS_RUN, minclsyspri); 134fa9e4066Sahrens 135088f3894Sahrens /* 136088f3894Sahrens * The sync thread can need a larger-than-default stack size on 137088f3894Sahrens * 32-bit x86. This is due in part to nested pools and 138088f3894Sahrens * scrub_visitbp() recursion. 139088f3894Sahrens */ 1403f9d6ad7SLin Ling tx->tx_sync_thread = thread_create(NULL, 32<<10, txg_sync_thread, 141fa9e4066Sahrens dp, 0, &p0, TS_RUN, minclsyspri); 142fa9e4066Sahrens 143fa9e4066Sahrens mutex_exit(&tx->tx_sync_lock); 144fa9e4066Sahrens } 145fa9e4066Sahrens 146fa9e4066Sahrens static void 147fa9e4066Sahrens txg_thread_enter(tx_state_t *tx, callb_cpr_t *cpr) 148fa9e4066Sahrens { 149fa9e4066Sahrens CALLB_CPR_INIT(cpr, &tx->tx_sync_lock, callb_generic_cpr, FTAG); 150fa9e4066Sahrens mutex_enter(&tx->tx_sync_lock); 151fa9e4066Sahrens } 152fa9e4066Sahrens 153fa9e4066Sahrens static void 154fa9e4066Sahrens txg_thread_exit(tx_state_t *tx, callb_cpr_t *cpr, kthread_t **tpp) 155fa9e4066Sahrens { 156fa9e4066Sahrens ASSERT(*tpp != NULL); 157fa9e4066Sahrens *tpp = NULL; 158fa9e4066Sahrens tx->tx_threads--; 159fa9e4066Sahrens cv_broadcast(&tx->tx_exit_cv); 160fa9e4066Sahrens CALLB_CPR_EXIT(cpr); /* drops &tx->tx_sync_lock */ 161fa9e4066Sahrens thread_exit(); 162fa9e4066Sahrens } 163fa9e4066Sahrens 164fa9e4066Sahrens static void 1651ab7f2deSmaybee txg_thread_wait(tx_state_t *tx, callb_cpr_t *cpr, kcondvar_t *cv, uint64_t time) 166fa9e4066Sahrens { 167fa9e4066Sahrens CALLB_CPR_SAFE_BEGIN(cpr); 168fa9e4066Sahrens 1691ab7f2deSmaybee if (time) 170d3d50737SRafael Vanoni (void) cv_timedwait(cv, &tx->tx_sync_lock, 171d3d50737SRafael Vanoni ddi_get_lbolt() + time); 172fa9e4066Sahrens else 173fa9e4066Sahrens cv_wait(cv, &tx->tx_sync_lock); 174fa9e4066Sahrens 175fa9e4066Sahrens CALLB_CPR_SAFE_END(cpr, &tx->tx_sync_lock); 176fa9e4066Sahrens } 177fa9e4066Sahrens 178fa9e4066Sahrens /* 179fa9e4066Sahrens * Stop syncing transaction groups. 180fa9e4066Sahrens */ 181fa9e4066Sahrens void 182fa9e4066Sahrens txg_sync_stop(dsl_pool_t *dp) 183fa9e4066Sahrens { 184fa9e4066Sahrens tx_state_t *tx = &dp->dp_tx; 185fa9e4066Sahrens 186fa9e4066Sahrens dprintf("pool %p\n", dp); 187fa9e4066Sahrens /* 188fa9e4066Sahrens * Finish off any work in progress. 189fa9e4066Sahrens */ 1901ab7f2deSmaybee ASSERT(tx->tx_threads == 2); 191468c413aSTim Haley 192468c413aSTim Haley /* 193468c413aSTim Haley * We need to ensure that we've vacated the deferred space_maps. 194468c413aSTim Haley */ 195468c413aSTim Haley txg_wait_synced(dp, tx->tx_open_txg + TXG_DEFER_SIZE); 196fa9e4066Sahrens 197fa9e4066Sahrens /* 1981ab7f2deSmaybee * Wake all sync threads and wait for them to die. 199fa9e4066Sahrens */ 200fa9e4066Sahrens mutex_enter(&tx->tx_sync_lock); 201fa9e4066Sahrens 2021ab7f2deSmaybee ASSERT(tx->tx_threads == 2); 203fa9e4066Sahrens 204fa9e4066Sahrens tx->tx_exiting = 1; 205fa9e4066Sahrens 206fa9e4066Sahrens cv_broadcast(&tx->tx_quiesce_more_cv); 207fa9e4066Sahrens cv_broadcast(&tx->tx_quiesce_done_cv); 208fa9e4066Sahrens cv_broadcast(&tx->tx_sync_more_cv); 209fa9e4066Sahrens 210fa9e4066Sahrens while (tx->tx_threads != 0) 211fa9e4066Sahrens cv_wait(&tx->tx_exit_cv, &tx->tx_sync_lock); 212fa9e4066Sahrens 213fa9e4066Sahrens tx->tx_exiting = 0; 214fa9e4066Sahrens 215fa9e4066Sahrens mutex_exit(&tx->tx_sync_lock); 216fa9e4066Sahrens } 217fa9e4066Sahrens 218fa9e4066Sahrens uint64_t 219fa9e4066Sahrens txg_hold_open(dsl_pool_t *dp, txg_handle_t *th) 220fa9e4066Sahrens { 221fa9e4066Sahrens tx_state_t *tx = &dp->dp_tx; 222fa9e4066Sahrens tx_cpu_t *tc = &tx->tx_cpu[CPU_SEQID]; 223fa9e4066Sahrens uint64_t txg; 224fa9e4066Sahrens 225fa9e4066Sahrens mutex_enter(&tc->tc_lock); 226fa9e4066Sahrens 227fa9e4066Sahrens txg = tx->tx_open_txg; 228fa9e4066Sahrens tc->tc_count[txg & TXG_MASK]++; 229fa9e4066Sahrens 230fa9e4066Sahrens th->th_cpu = tc; 231fa9e4066Sahrens th->th_txg = txg; 232fa9e4066Sahrens 233fa9e4066Sahrens return (txg); 234fa9e4066Sahrens } 235fa9e4066Sahrens 236fa9e4066Sahrens void 237fa9e4066Sahrens txg_rele_to_quiesce(txg_handle_t *th) 238fa9e4066Sahrens { 239fa9e4066Sahrens tx_cpu_t *tc = th->th_cpu; 240fa9e4066Sahrens 241fa9e4066Sahrens mutex_exit(&tc->tc_lock); 242fa9e4066Sahrens } 243fa9e4066Sahrens 244d20e665cSRicardo M. Correia void 245d20e665cSRicardo M. Correia txg_register_callbacks(txg_handle_t *th, list_t *tx_callbacks) 246d20e665cSRicardo M. Correia { 247d20e665cSRicardo M. Correia tx_cpu_t *tc = th->th_cpu; 248d20e665cSRicardo M. Correia int g = th->th_txg & TXG_MASK; 249d20e665cSRicardo M. Correia 250d20e665cSRicardo M. Correia mutex_enter(&tc->tc_lock); 251d20e665cSRicardo M. Correia list_move_tail(&tc->tc_callbacks[g], tx_callbacks); 252d20e665cSRicardo M. Correia mutex_exit(&tc->tc_lock); 253d20e665cSRicardo M. Correia } 254d20e665cSRicardo M. Correia 255fa9e4066Sahrens void 256fa9e4066Sahrens txg_rele_to_sync(txg_handle_t *th) 257fa9e4066Sahrens { 258fa9e4066Sahrens tx_cpu_t *tc = th->th_cpu; 259fa9e4066Sahrens int g = th->th_txg & TXG_MASK; 260fa9e4066Sahrens 261fa9e4066Sahrens mutex_enter(&tc->tc_lock); 262fa9e4066Sahrens ASSERT(tc->tc_count[g] != 0); 263fa9e4066Sahrens if (--tc->tc_count[g] == 0) 264fa9e4066Sahrens cv_broadcast(&tc->tc_cv[g]); 265fa9e4066Sahrens mutex_exit(&tc->tc_lock); 266fa9e4066Sahrens 267fa9e4066Sahrens th->th_cpu = NULL; /* defensive */ 268fa9e4066Sahrens } 269fa9e4066Sahrens 270fa9e4066Sahrens static void 271fa9e4066Sahrens txg_quiesce(dsl_pool_t *dp, uint64_t txg) 272fa9e4066Sahrens { 273fa9e4066Sahrens tx_state_t *tx = &dp->dp_tx; 274fa9e4066Sahrens int g = txg & TXG_MASK; 275fa9e4066Sahrens int c; 276fa9e4066Sahrens 277fa9e4066Sahrens /* 278fa9e4066Sahrens * Grab all tx_cpu locks so nobody else can get into this txg. 279fa9e4066Sahrens */ 280fa9e4066Sahrens for (c = 0; c < max_ncpus; c++) 281fa9e4066Sahrens mutex_enter(&tx->tx_cpu[c].tc_lock); 282fa9e4066Sahrens 283fa9e4066Sahrens ASSERT(txg == tx->tx_open_txg); 284fa9e4066Sahrens tx->tx_open_txg++; 285fa9e4066Sahrens 286fa9e4066Sahrens /* 287fa9e4066Sahrens * Now that we've incremented tx_open_txg, we can let threads 288fa9e4066Sahrens * enter the next transaction group. 289fa9e4066Sahrens */ 290fa9e4066Sahrens for (c = 0; c < max_ncpus; c++) 291fa9e4066Sahrens mutex_exit(&tx->tx_cpu[c].tc_lock); 292fa9e4066Sahrens 293fa9e4066Sahrens /* 294fa9e4066Sahrens * Quiesce the transaction group by waiting for everyone to txg_exit(). 295fa9e4066Sahrens */ 296fa9e4066Sahrens for (c = 0; c < max_ncpus; c++) { 297fa9e4066Sahrens tx_cpu_t *tc = &tx->tx_cpu[c]; 298fa9e4066Sahrens mutex_enter(&tc->tc_lock); 299fa9e4066Sahrens while (tc->tc_count[g] != 0) 300fa9e4066Sahrens cv_wait(&tc->tc_cv[g], &tc->tc_lock); 301fa9e4066Sahrens mutex_exit(&tc->tc_lock); 302fa9e4066Sahrens } 303fa9e4066Sahrens } 304fa9e4066Sahrens 305d20e665cSRicardo M. Correia static void 306d20e665cSRicardo M. Correia txg_do_callbacks(list_t *cb_list) 307d20e665cSRicardo M. Correia { 308d20e665cSRicardo M. Correia dmu_tx_do_callbacks(cb_list, 0); 309d20e665cSRicardo M. Correia 310d20e665cSRicardo M. Correia list_destroy(cb_list); 311d20e665cSRicardo M. Correia 312d20e665cSRicardo M. Correia kmem_free(cb_list, sizeof (list_t)); 313d20e665cSRicardo M. Correia } 314d20e665cSRicardo M. Correia 315d20e665cSRicardo M. Correia /* 316d20e665cSRicardo M. Correia * Dispatch the commit callbacks registered on this txg to worker threads. 317d20e665cSRicardo M. Correia */ 318d20e665cSRicardo M. Correia static void 319d20e665cSRicardo M. Correia txg_dispatch_callbacks(dsl_pool_t *dp, uint64_t txg) 320d20e665cSRicardo M. Correia { 321d20e665cSRicardo M. Correia int c; 322d20e665cSRicardo M. Correia tx_state_t *tx = &dp->dp_tx; 323d20e665cSRicardo M. Correia list_t *cb_list; 324d20e665cSRicardo M. Correia 325d20e665cSRicardo M. Correia for (c = 0; c < max_ncpus; c++) { 326d20e665cSRicardo M. Correia tx_cpu_t *tc = &tx->tx_cpu[c]; 327d20e665cSRicardo M. Correia /* No need to lock tx_cpu_t at this point */ 328d20e665cSRicardo M. Correia 329d20e665cSRicardo M. Correia int g = txg & TXG_MASK; 330d20e665cSRicardo M. Correia 331d20e665cSRicardo M. Correia if (list_is_empty(&tc->tc_callbacks[g])) 332d20e665cSRicardo M. Correia continue; 333d20e665cSRicardo M. Correia 334d20e665cSRicardo M. Correia if (tx->tx_commit_cb_taskq == NULL) { 335d20e665cSRicardo M. Correia /* 336d20e665cSRicardo M. Correia * Commit callback taskq hasn't been created yet. 337d20e665cSRicardo M. Correia */ 338d20e665cSRicardo M. Correia tx->tx_commit_cb_taskq = taskq_create("tx_commit_cb", 339d20e665cSRicardo M. Correia max_ncpus, minclsyspri, max_ncpus, max_ncpus * 2, 340d20e665cSRicardo M. Correia TASKQ_PREPOPULATE); 341d20e665cSRicardo M. Correia } 342d20e665cSRicardo M. Correia 343d20e665cSRicardo M. Correia cb_list = kmem_alloc(sizeof (list_t), KM_SLEEP); 344d20e665cSRicardo M. Correia list_create(cb_list, sizeof (dmu_tx_callback_t), 345d20e665cSRicardo M. Correia offsetof(dmu_tx_callback_t, dcb_node)); 346d20e665cSRicardo M. Correia 347d20e665cSRicardo M. Correia list_move_tail(&tc->tc_callbacks[g], cb_list); 348d20e665cSRicardo M. Correia 349d20e665cSRicardo M. Correia (void) taskq_dispatch(tx->tx_commit_cb_taskq, (task_func_t *) 350d20e665cSRicardo M. Correia txg_do_callbacks, cb_list, TQ_SLEEP); 351d20e665cSRicardo M. Correia } 352d20e665cSRicardo M. Correia } 353d20e665cSRicardo M. Correia 354fa9e4066Sahrens static void 355fa9e4066Sahrens txg_sync_thread(dsl_pool_t *dp) 356fa9e4066Sahrens { 357b16da2e2SGeorge Wilson spa_t *spa = dp->dp_spa; 358fa9e4066Sahrens tx_state_t *tx = &dp->dp_tx; 359fa9e4066Sahrens callb_cpr_t cpr; 36005715f94SMark Maybee uint64_t start, delta; 361fa9e4066Sahrens 362fa9e4066Sahrens txg_thread_enter(tx, &cpr); 363fa9e4066Sahrens 3641ab7f2deSmaybee start = delta = 0; 365fa9e4066Sahrens for (;;) { 36605715f94SMark Maybee uint64_t timer, timeout = zfs_txg_timeout * hz; 36705715f94SMark Maybee uint64_t txg; 368fa9e4066Sahrens 369fa9e4066Sahrens /* 3703f9d6ad7SLin Ling * We sync when we're scanning, there's someone waiting 37188b7b0f2SMatthew Ahrens * on us, or the quiesce thread has handed off a txg to 37288b7b0f2SMatthew Ahrens * us, or we have reached our timeout. 373fa9e4066Sahrens */ 3741ab7f2deSmaybee timer = (delta >= timeout ? 0 : timeout - delta); 375cde58dbcSMatthew Ahrens while (!dsl_scan_active(dp->dp_scan) && 37688b7b0f2SMatthew Ahrens !tx->tx_exiting && timer > 0 && 377fa9e4066Sahrens tx->tx_synced_txg >= tx->tx_sync_txg_waiting && 378fa9e4066Sahrens tx->tx_quiesced_txg == 0) { 379fa9e4066Sahrens dprintf("waiting; tx_synced=%llu waiting=%llu dp=%p\n", 380fa9e4066Sahrens tx->tx_synced_txg, tx->tx_sync_txg_waiting, dp); 3811ab7f2deSmaybee txg_thread_wait(tx, &cpr, &tx->tx_sync_more_cv, timer); 382d3d50737SRafael Vanoni delta = ddi_get_lbolt() - start; 3831ab7f2deSmaybee timer = (delta > timeout ? 0 : timeout - delta); 384fa9e4066Sahrens } 385fa9e4066Sahrens 386fa9e4066Sahrens /* 387fa9e4066Sahrens * Wait until the quiesce thread hands off a txg to us, 388fa9e4066Sahrens * prompting it to do so if necessary. 389fa9e4066Sahrens */ 390fa9e4066Sahrens while (!tx->tx_exiting && tx->tx_quiesced_txg == 0) { 391fa9e4066Sahrens if (tx->tx_quiesce_txg_waiting < tx->tx_open_txg+1) 392fa9e4066Sahrens tx->tx_quiesce_txg_waiting = tx->tx_open_txg+1; 393fa9e4066Sahrens cv_broadcast(&tx->tx_quiesce_more_cv); 394fa9e4066Sahrens txg_thread_wait(tx, &cpr, &tx->tx_quiesce_done_cv, 0); 395fa9e4066Sahrens } 396fa9e4066Sahrens 397fa9e4066Sahrens if (tx->tx_exiting) 398fa9e4066Sahrens txg_thread_exit(tx, &cpr, &tx->tx_sync_thread); 399fa9e4066Sahrens 400fa9e4066Sahrens /* 401fa9e4066Sahrens * Consume the quiesced txg which has been handed off to 402fa9e4066Sahrens * us. This may cause the quiescing thread to now be 403fa9e4066Sahrens * able to quiesce another txg, so we must signal it. 404fa9e4066Sahrens */ 405fa9e4066Sahrens txg = tx->tx_quiesced_txg; 406fa9e4066Sahrens tx->tx_quiesced_txg = 0; 407fa9e4066Sahrens tx->tx_syncing_txg = txg; 408fa9e4066Sahrens cv_broadcast(&tx->tx_quiesce_more_cv); 409fa9e4066Sahrens 410fa9e4066Sahrens dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n", 4118f38d419Sek txg, tx->tx_quiesce_txg_waiting, tx->tx_sync_txg_waiting); 412fa9e4066Sahrens mutex_exit(&tx->tx_sync_lock); 41305715f94SMark Maybee 414d3d50737SRafael Vanoni start = ddi_get_lbolt(); 415b16da2e2SGeorge Wilson spa_sync(spa, txg); 416d3d50737SRafael Vanoni delta = ddi_get_lbolt() - start; 4171ab7f2deSmaybee 418fa9e4066Sahrens mutex_enter(&tx->tx_sync_lock); 419fa9e4066Sahrens tx->tx_synced_txg = txg; 420fa9e4066Sahrens tx->tx_syncing_txg = 0; 421fa9e4066Sahrens cv_broadcast(&tx->tx_sync_done_cv); 422d20e665cSRicardo M. Correia 423d20e665cSRicardo M. Correia /* 424d20e665cSRicardo M. Correia * Dispatch commit callbacks to worker threads. 425d20e665cSRicardo M. Correia */ 426d20e665cSRicardo M. Correia txg_dispatch_callbacks(dp, txg); 427fa9e4066Sahrens } 428fa9e4066Sahrens } 429fa9e4066Sahrens 430fa9e4066Sahrens static void 431fa9e4066Sahrens txg_quiesce_thread(dsl_pool_t *dp) 432fa9e4066Sahrens { 433fa9e4066Sahrens tx_state_t *tx = &dp->dp_tx; 434fa9e4066Sahrens callb_cpr_t cpr; 435fa9e4066Sahrens 436fa9e4066Sahrens txg_thread_enter(tx, &cpr); 437fa9e4066Sahrens 438fa9e4066Sahrens for (;;) { 439fa9e4066Sahrens uint64_t txg; 440fa9e4066Sahrens 441fa9e4066Sahrens /* 442fa9e4066Sahrens * We quiesce when there's someone waiting on us. 443fa9e4066Sahrens * However, we can only have one txg in "quiescing" or 444fa9e4066Sahrens * "quiesced, waiting to sync" state. So we wait until 445fa9e4066Sahrens * the "quiesced, waiting to sync" txg has been consumed 446fa9e4066Sahrens * by the sync thread. 447fa9e4066Sahrens */ 448fa9e4066Sahrens while (!tx->tx_exiting && 449fa9e4066Sahrens (tx->tx_open_txg >= tx->tx_quiesce_txg_waiting || 450fa9e4066Sahrens tx->tx_quiesced_txg != 0)) 451fa9e4066Sahrens txg_thread_wait(tx, &cpr, &tx->tx_quiesce_more_cv, 0); 452fa9e4066Sahrens 453fa9e4066Sahrens if (tx->tx_exiting) 454fa9e4066Sahrens txg_thread_exit(tx, &cpr, &tx->tx_quiesce_thread); 455fa9e4066Sahrens 456fa9e4066Sahrens txg = tx->tx_open_txg; 457fa9e4066Sahrens dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n", 458fa9e4066Sahrens txg, tx->tx_quiesce_txg_waiting, 459fa9e4066Sahrens tx->tx_sync_txg_waiting); 460fa9e4066Sahrens mutex_exit(&tx->tx_sync_lock); 461fa9e4066Sahrens txg_quiesce(dp, txg); 462fa9e4066Sahrens mutex_enter(&tx->tx_sync_lock); 463fa9e4066Sahrens 464fa9e4066Sahrens /* 465fa9e4066Sahrens * Hand this txg off to the sync thread. 466fa9e4066Sahrens */ 467fa9e4066Sahrens dprintf("quiesce done, handing off txg %llu\n", txg); 468fa9e4066Sahrens tx->tx_quiesced_txg = txg; 469fa9e4066Sahrens cv_broadcast(&tx->tx_sync_more_cv); 470fa9e4066Sahrens cv_broadcast(&tx->tx_quiesce_done_cv); 471fa9e4066Sahrens } 472fa9e4066Sahrens } 473fa9e4066Sahrens 4741ab7f2deSmaybee /* 4751ab7f2deSmaybee * Delay this thread by 'ticks' if we are still in the open transaction 4761ab7f2deSmaybee * group and there is already a waiting txg quiesing or quiesced. Abort 4771ab7f2deSmaybee * the delay if this txg stalls or enters the quiesing state. 4781ab7f2deSmaybee */ 4791ab7f2deSmaybee void 4801ab7f2deSmaybee txg_delay(dsl_pool_t *dp, uint64_t txg, int ticks) 4811ab7f2deSmaybee { 4821ab7f2deSmaybee tx_state_t *tx = &dp->dp_tx; 48361bb40edSMartin Matuska clock_t timeout = ddi_get_lbolt() + ticks; 4841ab7f2deSmaybee 4851ab7f2deSmaybee /* don't delay if this txg could transition to quiesing immediately */ 4861ab7f2deSmaybee if (tx->tx_open_txg > txg || 4871ab7f2deSmaybee tx->tx_syncing_txg == txg-1 || tx->tx_synced_txg == txg-1) 4881ab7f2deSmaybee return; 4891ab7f2deSmaybee 4901ab7f2deSmaybee mutex_enter(&tx->tx_sync_lock); 4911ab7f2deSmaybee if (tx->tx_open_txg > txg || tx->tx_synced_txg == txg-1) { 4921ab7f2deSmaybee mutex_exit(&tx->tx_sync_lock); 4931ab7f2deSmaybee return; 4941ab7f2deSmaybee } 4951ab7f2deSmaybee 496d3d50737SRafael Vanoni while (ddi_get_lbolt() < timeout && 4971ab7f2deSmaybee tx->tx_syncing_txg < txg-1 && !txg_stalled(dp)) 4981ab7f2deSmaybee (void) cv_timedwait(&tx->tx_quiesce_more_cv, &tx->tx_sync_lock, 4991ab7f2deSmaybee timeout); 5001ab7f2deSmaybee 5011ab7f2deSmaybee mutex_exit(&tx->tx_sync_lock); 5021ab7f2deSmaybee } 5031ab7f2deSmaybee 504fa9e4066Sahrens void 505fa9e4066Sahrens txg_wait_synced(dsl_pool_t *dp, uint64_t txg) 506fa9e4066Sahrens { 507fa9e4066Sahrens tx_state_t *tx = &dp->dp_tx; 508fa9e4066Sahrens 509fa9e4066Sahrens mutex_enter(&tx->tx_sync_lock); 5101ab7f2deSmaybee ASSERT(tx->tx_threads == 2); 511fa9e4066Sahrens if (txg == 0) 512b24ab676SJeff Bonwick txg = tx->tx_open_txg + TXG_DEFER_SIZE; 513fa9e4066Sahrens if (tx->tx_sync_txg_waiting < txg) 514fa9e4066Sahrens tx->tx_sync_txg_waiting = txg; 515fa9e4066Sahrens dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n", 516fa9e4066Sahrens txg, tx->tx_quiesce_txg_waiting, tx->tx_sync_txg_waiting); 517fa9e4066Sahrens while (tx->tx_synced_txg < txg) { 518fa9e4066Sahrens dprintf("broadcasting sync more " 519fa9e4066Sahrens "tx_synced=%llu waiting=%llu dp=%p\n", 520fa9e4066Sahrens tx->tx_synced_txg, tx->tx_sync_txg_waiting, dp); 521fa9e4066Sahrens cv_broadcast(&tx->tx_sync_more_cv); 522fa9e4066Sahrens cv_wait(&tx->tx_sync_done_cv, &tx->tx_sync_lock); 523fa9e4066Sahrens } 524fa9e4066Sahrens mutex_exit(&tx->tx_sync_lock); 525fa9e4066Sahrens } 526fa9e4066Sahrens 527fa9e4066Sahrens void 528fa9e4066Sahrens txg_wait_open(dsl_pool_t *dp, uint64_t txg) 529fa9e4066Sahrens { 530fa9e4066Sahrens tx_state_t *tx = &dp->dp_tx; 531fa9e4066Sahrens 532fa9e4066Sahrens mutex_enter(&tx->tx_sync_lock); 5331ab7f2deSmaybee ASSERT(tx->tx_threads == 2); 534fa9e4066Sahrens if (txg == 0) 535fa9e4066Sahrens txg = tx->tx_open_txg + 1; 536fa9e4066Sahrens if (tx->tx_quiesce_txg_waiting < txg) 537fa9e4066Sahrens tx->tx_quiesce_txg_waiting = txg; 538fa9e4066Sahrens dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n", 539fa9e4066Sahrens txg, tx->tx_quiesce_txg_waiting, tx->tx_sync_txg_waiting); 540fa9e4066Sahrens while (tx->tx_open_txg < txg) { 541fa9e4066Sahrens cv_broadcast(&tx->tx_quiesce_more_cv); 542fa9e4066Sahrens cv_wait(&tx->tx_quiesce_done_cv, &tx->tx_sync_lock); 543fa9e4066Sahrens } 544fa9e4066Sahrens mutex_exit(&tx->tx_sync_lock); 545fa9e4066Sahrens } 546fa9e4066Sahrens 547088f3894Sahrens boolean_t 548fa9e4066Sahrens txg_stalled(dsl_pool_t *dp) 549fa9e4066Sahrens { 550fa9e4066Sahrens tx_state_t *tx = &dp->dp_tx; 551fa9e4066Sahrens return (tx->tx_quiesce_txg_waiting > tx->tx_open_txg); 552fa9e4066Sahrens } 553fa9e4066Sahrens 554088f3894Sahrens boolean_t 555088f3894Sahrens txg_sync_waiting(dsl_pool_t *dp) 556088f3894Sahrens { 557088f3894Sahrens tx_state_t *tx = &dp->dp_tx; 558088f3894Sahrens 559088f3894Sahrens return (tx->tx_syncing_txg <= tx->tx_sync_txg_waiting || 560088f3894Sahrens tx->tx_quiesced_txg != 0); 561088f3894Sahrens } 562088f3894Sahrens 563fa9e4066Sahrens /* 564fa9e4066Sahrens * Per-txg object lists. 565fa9e4066Sahrens */ 566fa9e4066Sahrens void 567fa9e4066Sahrens txg_list_create(txg_list_t *tl, size_t offset) 568fa9e4066Sahrens { 569fa9e4066Sahrens int t; 570fa9e4066Sahrens 571fa9e4066Sahrens mutex_init(&tl->tl_lock, NULL, MUTEX_DEFAULT, NULL); 572fa9e4066Sahrens 573fa9e4066Sahrens tl->tl_offset = offset; 574fa9e4066Sahrens 575fa9e4066Sahrens for (t = 0; t < TXG_SIZE; t++) 576fa9e4066Sahrens tl->tl_head[t] = NULL; 577fa9e4066Sahrens } 578fa9e4066Sahrens 579fa9e4066Sahrens void 580fa9e4066Sahrens txg_list_destroy(txg_list_t *tl) 581fa9e4066Sahrens { 582fa9e4066Sahrens int t; 583fa9e4066Sahrens 584fa9e4066Sahrens for (t = 0; t < TXG_SIZE; t++) 585fa9e4066Sahrens ASSERT(txg_list_empty(tl, t)); 586fa9e4066Sahrens 587fa9e4066Sahrens mutex_destroy(&tl->tl_lock); 588fa9e4066Sahrens } 589fa9e4066Sahrens 590fa9e4066Sahrens int 591fa9e4066Sahrens txg_list_empty(txg_list_t *tl, uint64_t txg) 592fa9e4066Sahrens { 593fa9e4066Sahrens return (tl->tl_head[txg & TXG_MASK] == NULL); 594fa9e4066Sahrens } 595fa9e4066Sahrens 596fa9e4066Sahrens /* 597fa9e4066Sahrens * Add an entry to the list. 598fa9e4066Sahrens * Returns 0 if it's a new entry, 1 if it's already there. 599fa9e4066Sahrens */ 600fa9e4066Sahrens int 601fa9e4066Sahrens txg_list_add(txg_list_t *tl, void *p, uint64_t txg) 602fa9e4066Sahrens { 603fa9e4066Sahrens int t = txg & TXG_MASK; 604fa9e4066Sahrens txg_node_t *tn = (txg_node_t *)((char *)p + tl->tl_offset); 605fa9e4066Sahrens int already_on_list; 606fa9e4066Sahrens 607fa9e4066Sahrens mutex_enter(&tl->tl_lock); 608fa9e4066Sahrens already_on_list = tn->tn_member[t]; 609fa9e4066Sahrens if (!already_on_list) { 610fa9e4066Sahrens tn->tn_member[t] = 1; 611fa9e4066Sahrens tn->tn_next[t] = tl->tl_head[t]; 612fa9e4066Sahrens tl->tl_head[t] = tn; 613fa9e4066Sahrens } 614fa9e4066Sahrens mutex_exit(&tl->tl_lock); 615fa9e4066Sahrens 616fa9e4066Sahrens return (already_on_list); 617fa9e4066Sahrens } 618fa9e4066Sahrens 619495807d7SMatthew Ahrens /* 620495807d7SMatthew Ahrens * Add an entry to the end of the list (walks list to find end). 621495807d7SMatthew Ahrens * Returns 0 if it's a new entry, 1 if it's already there. 622495807d7SMatthew Ahrens */ 623495807d7SMatthew Ahrens int 624495807d7SMatthew Ahrens txg_list_add_tail(txg_list_t *tl, void *p, uint64_t txg) 625495807d7SMatthew Ahrens { 626495807d7SMatthew Ahrens int t = txg & TXG_MASK; 627495807d7SMatthew Ahrens txg_node_t *tn = (txg_node_t *)((char *)p + tl->tl_offset); 628495807d7SMatthew Ahrens int already_on_list; 629495807d7SMatthew Ahrens 630495807d7SMatthew Ahrens mutex_enter(&tl->tl_lock); 631495807d7SMatthew Ahrens already_on_list = tn->tn_member[t]; 632495807d7SMatthew Ahrens if (!already_on_list) { 633495807d7SMatthew Ahrens txg_node_t **tp; 634495807d7SMatthew Ahrens 635495807d7SMatthew Ahrens for (tp = &tl->tl_head[t]; *tp != NULL; tp = &(*tp)->tn_next[t]) 636495807d7SMatthew Ahrens continue; 637495807d7SMatthew Ahrens 638495807d7SMatthew Ahrens tn->tn_member[t] = 1; 639495807d7SMatthew Ahrens tn->tn_next[t] = NULL; 640495807d7SMatthew Ahrens *tp = tn; 641495807d7SMatthew Ahrens } 642495807d7SMatthew Ahrens mutex_exit(&tl->tl_lock); 643495807d7SMatthew Ahrens 644495807d7SMatthew Ahrens return (already_on_list); 645495807d7SMatthew Ahrens } 646495807d7SMatthew Ahrens 647fa9e4066Sahrens /* 648fa9e4066Sahrens * Remove the head of the list and return it. 649fa9e4066Sahrens */ 650fa9e4066Sahrens void * 651fa9e4066Sahrens txg_list_remove(txg_list_t *tl, uint64_t txg) 652fa9e4066Sahrens { 653fa9e4066Sahrens int t = txg & TXG_MASK; 654fa9e4066Sahrens txg_node_t *tn; 655fa9e4066Sahrens void *p = NULL; 656fa9e4066Sahrens 657fa9e4066Sahrens mutex_enter(&tl->tl_lock); 658fa9e4066Sahrens if ((tn = tl->tl_head[t]) != NULL) { 659fa9e4066Sahrens p = (char *)tn - tl->tl_offset; 660fa9e4066Sahrens tl->tl_head[t] = tn->tn_next[t]; 661fa9e4066Sahrens tn->tn_next[t] = NULL; 662fa9e4066Sahrens tn->tn_member[t] = 0; 663fa9e4066Sahrens } 664fa9e4066Sahrens mutex_exit(&tl->tl_lock); 665fa9e4066Sahrens 666fa9e4066Sahrens return (p); 667fa9e4066Sahrens } 668fa9e4066Sahrens 669fa9e4066Sahrens /* 670fa9e4066Sahrens * Remove a specific item from the list and return it. 671fa9e4066Sahrens */ 672fa9e4066Sahrens void * 673fa9e4066Sahrens txg_list_remove_this(txg_list_t *tl, void *p, uint64_t txg) 674fa9e4066Sahrens { 675fa9e4066Sahrens int t = txg & TXG_MASK; 676fa9e4066Sahrens txg_node_t *tn, **tp; 677fa9e4066Sahrens 678fa9e4066Sahrens mutex_enter(&tl->tl_lock); 679fa9e4066Sahrens 680fa9e4066Sahrens for (tp = &tl->tl_head[t]; (tn = *tp) != NULL; tp = &tn->tn_next[t]) { 681fa9e4066Sahrens if ((char *)tn - tl->tl_offset == p) { 682fa9e4066Sahrens *tp = tn->tn_next[t]; 683fa9e4066Sahrens tn->tn_next[t] = NULL; 684fa9e4066Sahrens tn->tn_member[t] = 0; 685fa9e4066Sahrens mutex_exit(&tl->tl_lock); 686fa9e4066Sahrens return (p); 687fa9e4066Sahrens } 688fa9e4066Sahrens } 689fa9e4066Sahrens 690fa9e4066Sahrens mutex_exit(&tl->tl_lock); 691fa9e4066Sahrens 692fa9e4066Sahrens return (NULL); 693fa9e4066Sahrens } 694fa9e4066Sahrens 695fa9e4066Sahrens int 696fa9e4066Sahrens txg_list_member(txg_list_t *tl, void *p, uint64_t txg) 697fa9e4066Sahrens { 698fa9e4066Sahrens int t = txg & TXG_MASK; 699fa9e4066Sahrens txg_node_t *tn = (txg_node_t *)((char *)p + tl->tl_offset); 700fa9e4066Sahrens 701fa9e4066Sahrens return (tn->tn_member[t]); 702fa9e4066Sahrens } 703fa9e4066Sahrens 704fa9e4066Sahrens /* 705fa9e4066Sahrens * Walk a txg list -- only safe if you know it's not changing. 706fa9e4066Sahrens */ 707fa9e4066Sahrens void * 708fa9e4066Sahrens txg_list_head(txg_list_t *tl, uint64_t txg) 709fa9e4066Sahrens { 710fa9e4066Sahrens int t = txg & TXG_MASK; 711fa9e4066Sahrens txg_node_t *tn = tl->tl_head[t]; 712fa9e4066Sahrens 713fa9e4066Sahrens return (tn == NULL ? NULL : (char *)tn - tl->tl_offset); 714fa9e4066Sahrens } 715fa9e4066Sahrens 716fa9e4066Sahrens void * 717fa9e4066Sahrens txg_list_next(txg_list_t *tl, void *p, uint64_t txg) 718fa9e4066Sahrens { 719fa9e4066Sahrens int t = txg & TXG_MASK; 720fa9e4066Sahrens txg_node_t *tn = (txg_node_t *)((char *)p + tl->tl_offset); 721fa9e4066Sahrens 722fa9e4066Sahrens tn = tn->tn_next[t]; 723fa9e4066Sahrens 724fa9e4066Sahrens return (tn == NULL ? NULL : (char *)tn - tl->tl_offset); 725fa9e4066Sahrens } 726