1fa9e4066Sahrens /* 2fa9e4066Sahrens * CDDL HEADER START 3fa9e4066Sahrens * 4fa9e4066Sahrens * The contents of this file are subject to the terms of the 55ad82045Snd * Common Development and Distribution License (the "License"). 65ad82045Snd * You may not use this file except in compliance with the License. 7fa9e4066Sahrens * 8fa9e4066Sahrens * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9fa9e4066Sahrens * or http://www.opensolaris.org/os/licensing. 10fa9e4066Sahrens * See the License for the specific language governing permissions 11fa9e4066Sahrens * and limitations under the License. 12fa9e4066Sahrens * 13fa9e4066Sahrens * When distributing Covered Code, include this CDDL HEADER in each 14fa9e4066Sahrens * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15fa9e4066Sahrens * If applicable, add the following below this CDDL HEADER, with the 16fa9e4066Sahrens * fields enclosed by brackets "[]" replaced with your own identifying 17fa9e4066Sahrens * information: Portions Copyright [yyyy] [name of copyright owner] 18fa9e4066Sahrens * 19fa9e4066Sahrens * CDDL HEADER END 20fa9e4066Sahrens */ 21fa9e4066Sahrens /* 223f9d6ad7SLin Ling * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 23fa9e4066Sahrens */ 24fa9e4066Sahrens 25fa9e4066Sahrens #include <sys/zfs_context.h> 26fa9e4066Sahrens #include <sys/txg_impl.h> 27fa9e4066Sahrens #include <sys/dmu_impl.h> 28d20e665cSRicardo M. Correia #include <sys/dmu_tx.h> 29fa9e4066Sahrens #include <sys/dsl_pool.h> 303f9d6ad7SLin Ling #include <sys/dsl_scan.h> 31fa9e4066Sahrens #include <sys/callb.h> 32fa9e4066Sahrens 33fa9e4066Sahrens /* 34fa9e4066Sahrens * Pool-wide transaction groups. 35fa9e4066Sahrens */ 36fa9e4066Sahrens 37fa9e4066Sahrens static void txg_sync_thread(dsl_pool_t *dp); 38fa9e4066Sahrens static void txg_quiesce_thread(dsl_pool_t *dp); 39fa9e4066Sahrens 401ab7f2deSmaybee int zfs_txg_timeout = 30; /* max seconds worth of delta per txg */ 41fa9e4066Sahrens 42fa9e4066Sahrens /* 43fa9e4066Sahrens * Prepare the txg subsystem. 44fa9e4066Sahrens */ 45fa9e4066Sahrens void 46fa9e4066Sahrens txg_init(dsl_pool_t *dp, uint64_t txg) 47fa9e4066Sahrens { 48fa9e4066Sahrens tx_state_t *tx = &dp->dp_tx; 495ad82045Snd int c; 50fa9e4066Sahrens bzero(tx, sizeof (tx_state_t)); 51fa9e4066Sahrens 52fa9e4066Sahrens tx->tx_cpu = kmem_zalloc(max_ncpus * sizeof (tx_cpu_t), KM_SLEEP); 53fa9e4066Sahrens 548f38d419Sek for (c = 0; c < max_ncpus; c++) { 558f38d419Sek int i; 568f38d419Sek 575ad82045Snd mutex_init(&tx->tx_cpu[c].tc_lock, NULL, MUTEX_DEFAULT, NULL); 588f38d419Sek for (i = 0; i < TXG_SIZE; i++) { 598f38d419Sek cv_init(&tx->tx_cpu[c].tc_cv[i], NULL, CV_DEFAULT, 608f38d419Sek NULL); 61d20e665cSRicardo M. Correia list_create(&tx->tx_cpu[c].tc_callbacks[i], 62d20e665cSRicardo M. Correia sizeof (dmu_tx_callback_t), 63d20e665cSRicardo M. Correia offsetof(dmu_tx_callback_t, dcb_node)); 648f38d419Sek } 658f38d419Sek } 665ad82045Snd 675ad82045Snd mutex_init(&tx->tx_sync_lock, NULL, MUTEX_DEFAULT, NULL); 68fa9e4066Sahrens 69b5e70f97SRicardo M. Correia cv_init(&tx->tx_sync_more_cv, NULL, CV_DEFAULT, NULL); 70b5e70f97SRicardo M. Correia cv_init(&tx->tx_sync_done_cv, NULL, CV_DEFAULT, NULL); 71b5e70f97SRicardo M. Correia cv_init(&tx->tx_quiesce_more_cv, NULL, CV_DEFAULT, NULL); 72b5e70f97SRicardo M. Correia cv_init(&tx->tx_quiesce_done_cv, NULL, CV_DEFAULT, NULL); 73b5e70f97SRicardo M. Correia cv_init(&tx->tx_exit_cv, NULL, CV_DEFAULT, NULL); 74b5e70f97SRicardo M. Correia 75fa9e4066Sahrens tx->tx_open_txg = txg; 76fa9e4066Sahrens } 77fa9e4066Sahrens 78fa9e4066Sahrens /* 79fa9e4066Sahrens * Close down the txg subsystem. 80fa9e4066Sahrens */ 81fa9e4066Sahrens void 82fa9e4066Sahrens txg_fini(dsl_pool_t *dp) 83fa9e4066Sahrens { 84fa9e4066Sahrens tx_state_t *tx = &dp->dp_tx; 855ad82045Snd int c; 86fa9e4066Sahrens 87fa9e4066Sahrens ASSERT(tx->tx_threads == 0); 88fa9e4066Sahrens 895ad82045Snd mutex_destroy(&tx->tx_sync_lock); 905ad82045Snd 91b5e70f97SRicardo M. Correia cv_destroy(&tx->tx_sync_more_cv); 92b5e70f97SRicardo M. Correia cv_destroy(&tx->tx_sync_done_cv); 93b5e70f97SRicardo M. Correia cv_destroy(&tx->tx_quiesce_more_cv); 94b5e70f97SRicardo M. Correia cv_destroy(&tx->tx_quiesce_done_cv); 95b5e70f97SRicardo M. Correia cv_destroy(&tx->tx_exit_cv); 96b5e70f97SRicardo M. Correia 978f38d419Sek for (c = 0; c < max_ncpus; c++) { 988f38d419Sek int i; 998f38d419Sek 1005ad82045Snd mutex_destroy(&tx->tx_cpu[c].tc_lock); 101d20e665cSRicardo M. Correia for (i = 0; i < TXG_SIZE; i++) { 1028f38d419Sek cv_destroy(&tx->tx_cpu[c].tc_cv[i]); 103d20e665cSRicardo M. Correia list_destroy(&tx->tx_cpu[c].tc_callbacks[i]); 104d20e665cSRicardo M. Correia } 1058f38d419Sek } 106fa9e4066Sahrens 107d20e665cSRicardo M. Correia if (tx->tx_commit_cb_taskq != NULL) 108d20e665cSRicardo M. Correia taskq_destroy(tx->tx_commit_cb_taskq); 109d20e665cSRicardo M. Correia 110fa9e4066Sahrens kmem_free(tx->tx_cpu, max_ncpus * sizeof (tx_cpu_t)); 111fa9e4066Sahrens 112fa9e4066Sahrens bzero(tx, sizeof (tx_state_t)); 113fa9e4066Sahrens } 114fa9e4066Sahrens 115fa9e4066Sahrens /* 116fa9e4066Sahrens * Start syncing transaction groups. 117fa9e4066Sahrens */ 118fa9e4066Sahrens void 119fa9e4066Sahrens txg_sync_start(dsl_pool_t *dp) 120fa9e4066Sahrens { 121fa9e4066Sahrens tx_state_t *tx = &dp->dp_tx; 122fa9e4066Sahrens 123fa9e4066Sahrens mutex_enter(&tx->tx_sync_lock); 124fa9e4066Sahrens 125fa9e4066Sahrens dprintf("pool %p\n", dp); 126fa9e4066Sahrens 127fa9e4066Sahrens ASSERT(tx->tx_threads == 0); 128fa9e4066Sahrens 1291ab7f2deSmaybee tx->tx_threads = 2; 130fa9e4066Sahrens 131fa9e4066Sahrens tx->tx_quiesce_thread = thread_create(NULL, 0, txg_quiesce_thread, 132fa9e4066Sahrens dp, 0, &p0, TS_RUN, minclsyspri); 133fa9e4066Sahrens 134088f3894Sahrens /* 135088f3894Sahrens * The sync thread can need a larger-than-default stack size on 136088f3894Sahrens * 32-bit x86. This is due in part to nested pools and 137088f3894Sahrens * scrub_visitbp() recursion. 138088f3894Sahrens */ 1393f9d6ad7SLin Ling tx->tx_sync_thread = thread_create(NULL, 32<<10, txg_sync_thread, 140fa9e4066Sahrens dp, 0, &p0, TS_RUN, minclsyspri); 141fa9e4066Sahrens 142fa9e4066Sahrens mutex_exit(&tx->tx_sync_lock); 143fa9e4066Sahrens } 144fa9e4066Sahrens 145fa9e4066Sahrens static void 146fa9e4066Sahrens txg_thread_enter(tx_state_t *tx, callb_cpr_t *cpr) 147fa9e4066Sahrens { 148fa9e4066Sahrens CALLB_CPR_INIT(cpr, &tx->tx_sync_lock, callb_generic_cpr, FTAG); 149fa9e4066Sahrens mutex_enter(&tx->tx_sync_lock); 150fa9e4066Sahrens } 151fa9e4066Sahrens 152fa9e4066Sahrens static void 153fa9e4066Sahrens txg_thread_exit(tx_state_t *tx, callb_cpr_t *cpr, kthread_t **tpp) 154fa9e4066Sahrens { 155fa9e4066Sahrens ASSERT(*tpp != NULL); 156fa9e4066Sahrens *tpp = NULL; 157fa9e4066Sahrens tx->tx_threads--; 158fa9e4066Sahrens cv_broadcast(&tx->tx_exit_cv); 159fa9e4066Sahrens CALLB_CPR_EXIT(cpr); /* drops &tx->tx_sync_lock */ 160fa9e4066Sahrens thread_exit(); 161fa9e4066Sahrens } 162fa9e4066Sahrens 163fa9e4066Sahrens static void 1641ab7f2deSmaybee txg_thread_wait(tx_state_t *tx, callb_cpr_t *cpr, kcondvar_t *cv, uint64_t time) 165fa9e4066Sahrens { 166fa9e4066Sahrens CALLB_CPR_SAFE_BEGIN(cpr); 167fa9e4066Sahrens 1681ab7f2deSmaybee if (time) 169d3d50737SRafael Vanoni (void) cv_timedwait(cv, &tx->tx_sync_lock, 170d3d50737SRafael Vanoni ddi_get_lbolt() + time); 171fa9e4066Sahrens else 172fa9e4066Sahrens cv_wait(cv, &tx->tx_sync_lock); 173fa9e4066Sahrens 174fa9e4066Sahrens CALLB_CPR_SAFE_END(cpr, &tx->tx_sync_lock); 175fa9e4066Sahrens } 176fa9e4066Sahrens 177fa9e4066Sahrens /* 178fa9e4066Sahrens * Stop syncing transaction groups. 179fa9e4066Sahrens */ 180fa9e4066Sahrens void 181fa9e4066Sahrens txg_sync_stop(dsl_pool_t *dp) 182fa9e4066Sahrens { 183fa9e4066Sahrens tx_state_t *tx = &dp->dp_tx; 184fa9e4066Sahrens 185fa9e4066Sahrens dprintf("pool %p\n", dp); 186fa9e4066Sahrens /* 187fa9e4066Sahrens * Finish off any work in progress. 188fa9e4066Sahrens */ 1891ab7f2deSmaybee ASSERT(tx->tx_threads == 2); 190468c413aSTim Haley 191468c413aSTim Haley /* 192468c413aSTim Haley * We need to ensure that we've vacated the deferred space_maps. 193468c413aSTim Haley */ 194468c413aSTim Haley txg_wait_synced(dp, tx->tx_open_txg + TXG_DEFER_SIZE); 195fa9e4066Sahrens 196fa9e4066Sahrens /* 1971ab7f2deSmaybee * Wake all sync threads and wait for them to die. 198fa9e4066Sahrens */ 199fa9e4066Sahrens mutex_enter(&tx->tx_sync_lock); 200fa9e4066Sahrens 2011ab7f2deSmaybee ASSERT(tx->tx_threads == 2); 202fa9e4066Sahrens 203fa9e4066Sahrens tx->tx_exiting = 1; 204fa9e4066Sahrens 205fa9e4066Sahrens cv_broadcast(&tx->tx_quiesce_more_cv); 206fa9e4066Sahrens cv_broadcast(&tx->tx_quiesce_done_cv); 207fa9e4066Sahrens cv_broadcast(&tx->tx_sync_more_cv); 208fa9e4066Sahrens 209fa9e4066Sahrens while (tx->tx_threads != 0) 210fa9e4066Sahrens cv_wait(&tx->tx_exit_cv, &tx->tx_sync_lock); 211fa9e4066Sahrens 212fa9e4066Sahrens tx->tx_exiting = 0; 213fa9e4066Sahrens 214fa9e4066Sahrens mutex_exit(&tx->tx_sync_lock); 215fa9e4066Sahrens } 216fa9e4066Sahrens 217fa9e4066Sahrens uint64_t 218fa9e4066Sahrens txg_hold_open(dsl_pool_t *dp, txg_handle_t *th) 219fa9e4066Sahrens { 220fa9e4066Sahrens tx_state_t *tx = &dp->dp_tx; 221fa9e4066Sahrens tx_cpu_t *tc = &tx->tx_cpu[CPU_SEQID]; 222fa9e4066Sahrens uint64_t txg; 223fa9e4066Sahrens 224fa9e4066Sahrens mutex_enter(&tc->tc_lock); 225fa9e4066Sahrens 226fa9e4066Sahrens txg = tx->tx_open_txg; 227fa9e4066Sahrens tc->tc_count[txg & TXG_MASK]++; 228fa9e4066Sahrens 229fa9e4066Sahrens th->th_cpu = tc; 230fa9e4066Sahrens th->th_txg = txg; 231fa9e4066Sahrens 232fa9e4066Sahrens return (txg); 233fa9e4066Sahrens } 234fa9e4066Sahrens 235fa9e4066Sahrens void 236fa9e4066Sahrens txg_rele_to_quiesce(txg_handle_t *th) 237fa9e4066Sahrens { 238fa9e4066Sahrens tx_cpu_t *tc = th->th_cpu; 239fa9e4066Sahrens 240fa9e4066Sahrens mutex_exit(&tc->tc_lock); 241fa9e4066Sahrens } 242fa9e4066Sahrens 243d20e665cSRicardo M. Correia void 244d20e665cSRicardo M. Correia txg_register_callbacks(txg_handle_t *th, list_t *tx_callbacks) 245d20e665cSRicardo M. Correia { 246d20e665cSRicardo M. Correia tx_cpu_t *tc = th->th_cpu; 247d20e665cSRicardo M. Correia int g = th->th_txg & TXG_MASK; 248d20e665cSRicardo M. Correia 249d20e665cSRicardo M. Correia mutex_enter(&tc->tc_lock); 250d20e665cSRicardo M. Correia list_move_tail(&tc->tc_callbacks[g], tx_callbacks); 251d20e665cSRicardo M. Correia mutex_exit(&tc->tc_lock); 252d20e665cSRicardo M. Correia } 253d20e665cSRicardo M. Correia 254fa9e4066Sahrens void 255fa9e4066Sahrens txg_rele_to_sync(txg_handle_t *th) 256fa9e4066Sahrens { 257fa9e4066Sahrens tx_cpu_t *tc = th->th_cpu; 258fa9e4066Sahrens int g = th->th_txg & TXG_MASK; 259fa9e4066Sahrens 260fa9e4066Sahrens mutex_enter(&tc->tc_lock); 261fa9e4066Sahrens ASSERT(tc->tc_count[g] != 0); 262fa9e4066Sahrens if (--tc->tc_count[g] == 0) 263fa9e4066Sahrens cv_broadcast(&tc->tc_cv[g]); 264fa9e4066Sahrens mutex_exit(&tc->tc_lock); 265fa9e4066Sahrens 266fa9e4066Sahrens th->th_cpu = NULL; /* defensive */ 267fa9e4066Sahrens } 268fa9e4066Sahrens 269fa9e4066Sahrens static void 270fa9e4066Sahrens txg_quiesce(dsl_pool_t *dp, uint64_t txg) 271fa9e4066Sahrens { 272fa9e4066Sahrens tx_state_t *tx = &dp->dp_tx; 273fa9e4066Sahrens int g = txg & TXG_MASK; 274fa9e4066Sahrens int c; 275fa9e4066Sahrens 276fa9e4066Sahrens /* 277fa9e4066Sahrens * Grab all tx_cpu locks so nobody else can get into this txg. 278fa9e4066Sahrens */ 279fa9e4066Sahrens for (c = 0; c < max_ncpus; c++) 280fa9e4066Sahrens mutex_enter(&tx->tx_cpu[c].tc_lock); 281fa9e4066Sahrens 282fa9e4066Sahrens ASSERT(txg == tx->tx_open_txg); 283fa9e4066Sahrens tx->tx_open_txg++; 284fa9e4066Sahrens 285fa9e4066Sahrens /* 286fa9e4066Sahrens * Now that we've incremented tx_open_txg, we can let threads 287fa9e4066Sahrens * enter the next transaction group. 288fa9e4066Sahrens */ 289fa9e4066Sahrens for (c = 0; c < max_ncpus; c++) 290fa9e4066Sahrens mutex_exit(&tx->tx_cpu[c].tc_lock); 291fa9e4066Sahrens 292fa9e4066Sahrens /* 293fa9e4066Sahrens * Quiesce the transaction group by waiting for everyone to txg_exit(). 294fa9e4066Sahrens */ 295fa9e4066Sahrens for (c = 0; c < max_ncpus; c++) { 296fa9e4066Sahrens tx_cpu_t *tc = &tx->tx_cpu[c]; 297fa9e4066Sahrens mutex_enter(&tc->tc_lock); 298fa9e4066Sahrens while (tc->tc_count[g] != 0) 299fa9e4066Sahrens cv_wait(&tc->tc_cv[g], &tc->tc_lock); 300fa9e4066Sahrens mutex_exit(&tc->tc_lock); 301fa9e4066Sahrens } 302fa9e4066Sahrens } 303fa9e4066Sahrens 304d20e665cSRicardo M. Correia static void 305d20e665cSRicardo M. Correia txg_do_callbacks(list_t *cb_list) 306d20e665cSRicardo M. Correia { 307d20e665cSRicardo M. Correia dmu_tx_do_callbacks(cb_list, 0); 308d20e665cSRicardo M. Correia 309d20e665cSRicardo M. Correia list_destroy(cb_list); 310d20e665cSRicardo M. Correia 311d20e665cSRicardo M. Correia kmem_free(cb_list, sizeof (list_t)); 312d20e665cSRicardo M. Correia } 313d20e665cSRicardo M. Correia 314d20e665cSRicardo M. Correia /* 315d20e665cSRicardo M. Correia * Dispatch the commit callbacks registered on this txg to worker threads. 316d20e665cSRicardo M. Correia */ 317d20e665cSRicardo M. Correia static void 318d20e665cSRicardo M. Correia txg_dispatch_callbacks(dsl_pool_t *dp, uint64_t txg) 319d20e665cSRicardo M. Correia { 320d20e665cSRicardo M. Correia int c; 321d20e665cSRicardo M. Correia tx_state_t *tx = &dp->dp_tx; 322d20e665cSRicardo M. Correia list_t *cb_list; 323d20e665cSRicardo M. Correia 324d20e665cSRicardo M. Correia for (c = 0; c < max_ncpus; c++) { 325d20e665cSRicardo M. Correia tx_cpu_t *tc = &tx->tx_cpu[c]; 326d20e665cSRicardo M. Correia /* No need to lock tx_cpu_t at this point */ 327d20e665cSRicardo M. Correia 328d20e665cSRicardo M. Correia int g = txg & TXG_MASK; 329d20e665cSRicardo M. Correia 330d20e665cSRicardo M. Correia if (list_is_empty(&tc->tc_callbacks[g])) 331d20e665cSRicardo M. Correia continue; 332d20e665cSRicardo M. Correia 333d20e665cSRicardo M. Correia if (tx->tx_commit_cb_taskq == NULL) { 334d20e665cSRicardo M. Correia /* 335d20e665cSRicardo M. Correia * Commit callback taskq hasn't been created yet. 336d20e665cSRicardo M. Correia */ 337d20e665cSRicardo M. Correia tx->tx_commit_cb_taskq = taskq_create("tx_commit_cb", 338d20e665cSRicardo M. Correia max_ncpus, minclsyspri, max_ncpus, max_ncpus * 2, 339d20e665cSRicardo M. Correia TASKQ_PREPOPULATE); 340d20e665cSRicardo M. Correia } 341d20e665cSRicardo M. Correia 342d20e665cSRicardo M. Correia cb_list = kmem_alloc(sizeof (list_t), KM_SLEEP); 343d20e665cSRicardo M. Correia list_create(cb_list, sizeof (dmu_tx_callback_t), 344d20e665cSRicardo M. Correia offsetof(dmu_tx_callback_t, dcb_node)); 345d20e665cSRicardo M. Correia 346d20e665cSRicardo M. Correia list_move_tail(&tc->tc_callbacks[g], cb_list); 347d20e665cSRicardo M. Correia 348d20e665cSRicardo M. Correia (void) taskq_dispatch(tx->tx_commit_cb_taskq, (task_func_t *) 349d20e665cSRicardo M. Correia txg_do_callbacks, cb_list, TQ_SLEEP); 350d20e665cSRicardo M. Correia } 351d20e665cSRicardo M. Correia } 352d20e665cSRicardo M. Correia 353fa9e4066Sahrens static void 354fa9e4066Sahrens txg_sync_thread(dsl_pool_t *dp) 355fa9e4066Sahrens { 356b16da2e2SGeorge Wilson spa_t *spa = dp->dp_spa; 357fa9e4066Sahrens tx_state_t *tx = &dp->dp_tx; 358fa9e4066Sahrens callb_cpr_t cpr; 35905715f94SMark Maybee uint64_t start, delta; 360fa9e4066Sahrens 361fa9e4066Sahrens txg_thread_enter(tx, &cpr); 362fa9e4066Sahrens 3631ab7f2deSmaybee start = delta = 0; 364fa9e4066Sahrens for (;;) { 36505715f94SMark Maybee uint64_t timer, timeout = zfs_txg_timeout * hz; 36605715f94SMark Maybee uint64_t txg; 367fa9e4066Sahrens 368fa9e4066Sahrens /* 3693f9d6ad7SLin Ling * We sync when we're scanning, there's someone waiting 37088b7b0f2SMatthew Ahrens * on us, or the quiesce thread has handed off a txg to 37188b7b0f2SMatthew Ahrens * us, or we have reached our timeout. 372fa9e4066Sahrens */ 3731ab7f2deSmaybee timer = (delta >= timeout ? 0 : timeout - delta); 374*cde58dbcSMatthew Ahrens while (!dsl_scan_active(dp->dp_scan) && 37588b7b0f2SMatthew Ahrens !tx->tx_exiting && timer > 0 && 376fa9e4066Sahrens tx->tx_synced_txg >= tx->tx_sync_txg_waiting && 377fa9e4066Sahrens tx->tx_quiesced_txg == 0) { 378fa9e4066Sahrens dprintf("waiting; tx_synced=%llu waiting=%llu dp=%p\n", 379fa9e4066Sahrens tx->tx_synced_txg, tx->tx_sync_txg_waiting, dp); 3801ab7f2deSmaybee txg_thread_wait(tx, &cpr, &tx->tx_sync_more_cv, timer); 381d3d50737SRafael Vanoni delta = ddi_get_lbolt() - start; 3821ab7f2deSmaybee timer = (delta > timeout ? 0 : timeout - delta); 383fa9e4066Sahrens } 384fa9e4066Sahrens 385fa9e4066Sahrens /* 386fa9e4066Sahrens * Wait until the quiesce thread hands off a txg to us, 387fa9e4066Sahrens * prompting it to do so if necessary. 388fa9e4066Sahrens */ 389fa9e4066Sahrens while (!tx->tx_exiting && tx->tx_quiesced_txg == 0) { 390fa9e4066Sahrens if (tx->tx_quiesce_txg_waiting < tx->tx_open_txg+1) 391fa9e4066Sahrens tx->tx_quiesce_txg_waiting = tx->tx_open_txg+1; 392fa9e4066Sahrens cv_broadcast(&tx->tx_quiesce_more_cv); 393fa9e4066Sahrens txg_thread_wait(tx, &cpr, &tx->tx_quiesce_done_cv, 0); 394fa9e4066Sahrens } 395fa9e4066Sahrens 396fa9e4066Sahrens if (tx->tx_exiting) 397fa9e4066Sahrens txg_thread_exit(tx, &cpr, &tx->tx_sync_thread); 398fa9e4066Sahrens 399fa9e4066Sahrens /* 400fa9e4066Sahrens * Consume the quiesced txg which has been handed off to 401fa9e4066Sahrens * us. This may cause the quiescing thread to now be 402fa9e4066Sahrens * able to quiesce another txg, so we must signal it. 403fa9e4066Sahrens */ 404fa9e4066Sahrens txg = tx->tx_quiesced_txg; 405fa9e4066Sahrens tx->tx_quiesced_txg = 0; 406fa9e4066Sahrens tx->tx_syncing_txg = txg; 407fa9e4066Sahrens cv_broadcast(&tx->tx_quiesce_more_cv); 408fa9e4066Sahrens 409fa9e4066Sahrens dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n", 4108f38d419Sek txg, tx->tx_quiesce_txg_waiting, tx->tx_sync_txg_waiting); 411fa9e4066Sahrens mutex_exit(&tx->tx_sync_lock); 41205715f94SMark Maybee 413d3d50737SRafael Vanoni start = ddi_get_lbolt(); 414b16da2e2SGeorge Wilson spa_sync(spa, txg); 415d3d50737SRafael Vanoni delta = ddi_get_lbolt() - start; 4161ab7f2deSmaybee 417fa9e4066Sahrens mutex_enter(&tx->tx_sync_lock); 418fa9e4066Sahrens tx->tx_synced_txg = txg; 419fa9e4066Sahrens tx->tx_syncing_txg = 0; 420fa9e4066Sahrens cv_broadcast(&tx->tx_sync_done_cv); 421d20e665cSRicardo M. Correia 422d20e665cSRicardo M. Correia /* 423d20e665cSRicardo M. Correia * Dispatch commit callbacks to worker threads. 424d20e665cSRicardo M. Correia */ 425d20e665cSRicardo M. Correia txg_dispatch_callbacks(dp, txg); 426fa9e4066Sahrens } 427fa9e4066Sahrens } 428fa9e4066Sahrens 429fa9e4066Sahrens static void 430fa9e4066Sahrens txg_quiesce_thread(dsl_pool_t *dp) 431fa9e4066Sahrens { 432fa9e4066Sahrens tx_state_t *tx = &dp->dp_tx; 433fa9e4066Sahrens callb_cpr_t cpr; 434fa9e4066Sahrens 435fa9e4066Sahrens txg_thread_enter(tx, &cpr); 436fa9e4066Sahrens 437fa9e4066Sahrens for (;;) { 438fa9e4066Sahrens uint64_t txg; 439fa9e4066Sahrens 440fa9e4066Sahrens /* 441fa9e4066Sahrens * We quiesce when there's someone waiting on us. 442fa9e4066Sahrens * However, we can only have one txg in "quiescing" or 443fa9e4066Sahrens * "quiesced, waiting to sync" state. So we wait until 444fa9e4066Sahrens * the "quiesced, waiting to sync" txg has been consumed 445fa9e4066Sahrens * by the sync thread. 446fa9e4066Sahrens */ 447fa9e4066Sahrens while (!tx->tx_exiting && 448fa9e4066Sahrens (tx->tx_open_txg >= tx->tx_quiesce_txg_waiting || 449fa9e4066Sahrens tx->tx_quiesced_txg != 0)) 450fa9e4066Sahrens txg_thread_wait(tx, &cpr, &tx->tx_quiesce_more_cv, 0); 451fa9e4066Sahrens 452fa9e4066Sahrens if (tx->tx_exiting) 453fa9e4066Sahrens txg_thread_exit(tx, &cpr, &tx->tx_quiesce_thread); 454fa9e4066Sahrens 455fa9e4066Sahrens txg = tx->tx_open_txg; 456fa9e4066Sahrens dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n", 457fa9e4066Sahrens txg, tx->tx_quiesce_txg_waiting, 458fa9e4066Sahrens tx->tx_sync_txg_waiting); 459fa9e4066Sahrens mutex_exit(&tx->tx_sync_lock); 460fa9e4066Sahrens txg_quiesce(dp, txg); 461fa9e4066Sahrens mutex_enter(&tx->tx_sync_lock); 462fa9e4066Sahrens 463fa9e4066Sahrens /* 464fa9e4066Sahrens * Hand this txg off to the sync thread. 465fa9e4066Sahrens */ 466fa9e4066Sahrens dprintf("quiesce done, handing off txg %llu\n", txg); 467fa9e4066Sahrens tx->tx_quiesced_txg = txg; 468fa9e4066Sahrens cv_broadcast(&tx->tx_sync_more_cv); 469fa9e4066Sahrens cv_broadcast(&tx->tx_quiesce_done_cv); 470fa9e4066Sahrens } 471fa9e4066Sahrens } 472fa9e4066Sahrens 4731ab7f2deSmaybee /* 4741ab7f2deSmaybee * Delay this thread by 'ticks' if we are still in the open transaction 4751ab7f2deSmaybee * group and there is already a waiting txg quiesing or quiesced. Abort 4761ab7f2deSmaybee * the delay if this txg stalls or enters the quiesing state. 4771ab7f2deSmaybee */ 4781ab7f2deSmaybee void 4791ab7f2deSmaybee txg_delay(dsl_pool_t *dp, uint64_t txg, int ticks) 4801ab7f2deSmaybee { 4811ab7f2deSmaybee tx_state_t *tx = &dp->dp_tx; 482d3d50737SRafael Vanoni int timeout = ddi_get_lbolt() + ticks; 4831ab7f2deSmaybee 4841ab7f2deSmaybee /* don't delay if this txg could transition to quiesing immediately */ 4851ab7f2deSmaybee if (tx->tx_open_txg > txg || 4861ab7f2deSmaybee tx->tx_syncing_txg == txg-1 || tx->tx_synced_txg == txg-1) 4871ab7f2deSmaybee return; 4881ab7f2deSmaybee 4891ab7f2deSmaybee mutex_enter(&tx->tx_sync_lock); 4901ab7f2deSmaybee if (tx->tx_open_txg > txg || tx->tx_synced_txg == txg-1) { 4911ab7f2deSmaybee mutex_exit(&tx->tx_sync_lock); 4921ab7f2deSmaybee return; 4931ab7f2deSmaybee } 4941ab7f2deSmaybee 495d3d50737SRafael Vanoni while (ddi_get_lbolt() < timeout && 4961ab7f2deSmaybee tx->tx_syncing_txg < txg-1 && !txg_stalled(dp)) 4971ab7f2deSmaybee (void) cv_timedwait(&tx->tx_quiesce_more_cv, &tx->tx_sync_lock, 4981ab7f2deSmaybee timeout); 4991ab7f2deSmaybee 5001ab7f2deSmaybee mutex_exit(&tx->tx_sync_lock); 5011ab7f2deSmaybee } 5021ab7f2deSmaybee 503fa9e4066Sahrens void 504fa9e4066Sahrens txg_wait_synced(dsl_pool_t *dp, uint64_t txg) 505fa9e4066Sahrens { 506fa9e4066Sahrens tx_state_t *tx = &dp->dp_tx; 507fa9e4066Sahrens 508fa9e4066Sahrens mutex_enter(&tx->tx_sync_lock); 5091ab7f2deSmaybee ASSERT(tx->tx_threads == 2); 510fa9e4066Sahrens if (txg == 0) 511b24ab676SJeff Bonwick txg = tx->tx_open_txg + TXG_DEFER_SIZE; 512fa9e4066Sahrens if (tx->tx_sync_txg_waiting < txg) 513fa9e4066Sahrens tx->tx_sync_txg_waiting = txg; 514fa9e4066Sahrens dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n", 515fa9e4066Sahrens txg, tx->tx_quiesce_txg_waiting, tx->tx_sync_txg_waiting); 516fa9e4066Sahrens while (tx->tx_synced_txg < txg) { 517fa9e4066Sahrens dprintf("broadcasting sync more " 518fa9e4066Sahrens "tx_synced=%llu waiting=%llu dp=%p\n", 519fa9e4066Sahrens tx->tx_synced_txg, tx->tx_sync_txg_waiting, dp); 520fa9e4066Sahrens cv_broadcast(&tx->tx_sync_more_cv); 521fa9e4066Sahrens cv_wait(&tx->tx_sync_done_cv, &tx->tx_sync_lock); 522fa9e4066Sahrens } 523fa9e4066Sahrens mutex_exit(&tx->tx_sync_lock); 524fa9e4066Sahrens } 525fa9e4066Sahrens 526fa9e4066Sahrens void 527fa9e4066Sahrens txg_wait_open(dsl_pool_t *dp, uint64_t txg) 528fa9e4066Sahrens { 529fa9e4066Sahrens tx_state_t *tx = &dp->dp_tx; 530fa9e4066Sahrens 531fa9e4066Sahrens mutex_enter(&tx->tx_sync_lock); 5321ab7f2deSmaybee ASSERT(tx->tx_threads == 2); 533fa9e4066Sahrens if (txg == 0) 534fa9e4066Sahrens txg = tx->tx_open_txg + 1; 535fa9e4066Sahrens if (tx->tx_quiesce_txg_waiting < txg) 536fa9e4066Sahrens tx->tx_quiesce_txg_waiting = txg; 537fa9e4066Sahrens dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n", 538fa9e4066Sahrens txg, tx->tx_quiesce_txg_waiting, tx->tx_sync_txg_waiting); 539fa9e4066Sahrens while (tx->tx_open_txg < txg) { 540fa9e4066Sahrens cv_broadcast(&tx->tx_quiesce_more_cv); 541fa9e4066Sahrens cv_wait(&tx->tx_quiesce_done_cv, &tx->tx_sync_lock); 542fa9e4066Sahrens } 543fa9e4066Sahrens mutex_exit(&tx->tx_sync_lock); 544fa9e4066Sahrens } 545fa9e4066Sahrens 546088f3894Sahrens boolean_t 547fa9e4066Sahrens txg_stalled(dsl_pool_t *dp) 548fa9e4066Sahrens { 549fa9e4066Sahrens tx_state_t *tx = &dp->dp_tx; 550fa9e4066Sahrens return (tx->tx_quiesce_txg_waiting > tx->tx_open_txg); 551fa9e4066Sahrens } 552fa9e4066Sahrens 553088f3894Sahrens boolean_t 554088f3894Sahrens txg_sync_waiting(dsl_pool_t *dp) 555088f3894Sahrens { 556088f3894Sahrens tx_state_t *tx = &dp->dp_tx; 557088f3894Sahrens 558088f3894Sahrens return (tx->tx_syncing_txg <= tx->tx_sync_txg_waiting || 559088f3894Sahrens tx->tx_quiesced_txg != 0); 560088f3894Sahrens } 561088f3894Sahrens 562fa9e4066Sahrens /* 563fa9e4066Sahrens * Per-txg object lists. 564fa9e4066Sahrens */ 565fa9e4066Sahrens void 566fa9e4066Sahrens txg_list_create(txg_list_t *tl, size_t offset) 567fa9e4066Sahrens { 568fa9e4066Sahrens int t; 569fa9e4066Sahrens 570fa9e4066Sahrens mutex_init(&tl->tl_lock, NULL, MUTEX_DEFAULT, NULL); 571fa9e4066Sahrens 572fa9e4066Sahrens tl->tl_offset = offset; 573fa9e4066Sahrens 574fa9e4066Sahrens for (t = 0; t < TXG_SIZE; t++) 575fa9e4066Sahrens tl->tl_head[t] = NULL; 576fa9e4066Sahrens } 577fa9e4066Sahrens 578fa9e4066Sahrens void 579fa9e4066Sahrens txg_list_destroy(txg_list_t *tl) 580fa9e4066Sahrens { 581fa9e4066Sahrens int t; 582fa9e4066Sahrens 583fa9e4066Sahrens for (t = 0; t < TXG_SIZE; t++) 584fa9e4066Sahrens ASSERT(txg_list_empty(tl, t)); 585fa9e4066Sahrens 586fa9e4066Sahrens mutex_destroy(&tl->tl_lock); 587fa9e4066Sahrens } 588fa9e4066Sahrens 589fa9e4066Sahrens int 590fa9e4066Sahrens txg_list_empty(txg_list_t *tl, uint64_t txg) 591fa9e4066Sahrens { 592fa9e4066Sahrens return (tl->tl_head[txg & TXG_MASK] == NULL); 593fa9e4066Sahrens } 594fa9e4066Sahrens 595fa9e4066Sahrens /* 596fa9e4066Sahrens * Add an entry to the list. 597fa9e4066Sahrens * Returns 0 if it's a new entry, 1 if it's already there. 598fa9e4066Sahrens */ 599fa9e4066Sahrens int 600fa9e4066Sahrens txg_list_add(txg_list_t *tl, void *p, uint64_t txg) 601fa9e4066Sahrens { 602fa9e4066Sahrens int t = txg & TXG_MASK; 603fa9e4066Sahrens txg_node_t *tn = (txg_node_t *)((char *)p + tl->tl_offset); 604fa9e4066Sahrens int already_on_list; 605fa9e4066Sahrens 606fa9e4066Sahrens mutex_enter(&tl->tl_lock); 607fa9e4066Sahrens already_on_list = tn->tn_member[t]; 608fa9e4066Sahrens if (!already_on_list) { 609fa9e4066Sahrens tn->tn_member[t] = 1; 610fa9e4066Sahrens tn->tn_next[t] = tl->tl_head[t]; 611fa9e4066Sahrens tl->tl_head[t] = tn; 612fa9e4066Sahrens } 613fa9e4066Sahrens mutex_exit(&tl->tl_lock); 614fa9e4066Sahrens 615fa9e4066Sahrens return (already_on_list); 616fa9e4066Sahrens } 617fa9e4066Sahrens 618495807d7SMatthew Ahrens /* 619495807d7SMatthew Ahrens * Add an entry to the end of the list (walks list to find end). 620495807d7SMatthew Ahrens * Returns 0 if it's a new entry, 1 if it's already there. 621495807d7SMatthew Ahrens */ 622495807d7SMatthew Ahrens int 623495807d7SMatthew Ahrens txg_list_add_tail(txg_list_t *tl, void *p, uint64_t txg) 624495807d7SMatthew Ahrens { 625495807d7SMatthew Ahrens int t = txg & TXG_MASK; 626495807d7SMatthew Ahrens txg_node_t *tn = (txg_node_t *)((char *)p + tl->tl_offset); 627495807d7SMatthew Ahrens int already_on_list; 628495807d7SMatthew Ahrens 629495807d7SMatthew Ahrens mutex_enter(&tl->tl_lock); 630495807d7SMatthew Ahrens already_on_list = tn->tn_member[t]; 631495807d7SMatthew Ahrens if (!already_on_list) { 632495807d7SMatthew Ahrens txg_node_t **tp; 633495807d7SMatthew Ahrens 634495807d7SMatthew Ahrens for (tp = &tl->tl_head[t]; *tp != NULL; tp = &(*tp)->tn_next[t]) 635495807d7SMatthew Ahrens continue; 636495807d7SMatthew Ahrens 637495807d7SMatthew Ahrens tn->tn_member[t] = 1; 638495807d7SMatthew Ahrens tn->tn_next[t] = NULL; 639495807d7SMatthew Ahrens *tp = tn; 640495807d7SMatthew Ahrens } 641495807d7SMatthew Ahrens mutex_exit(&tl->tl_lock); 642495807d7SMatthew Ahrens 643495807d7SMatthew Ahrens return (already_on_list); 644495807d7SMatthew Ahrens } 645495807d7SMatthew Ahrens 646fa9e4066Sahrens /* 647fa9e4066Sahrens * Remove the head of the list and return it. 648fa9e4066Sahrens */ 649fa9e4066Sahrens void * 650fa9e4066Sahrens txg_list_remove(txg_list_t *tl, uint64_t txg) 651fa9e4066Sahrens { 652fa9e4066Sahrens int t = txg & TXG_MASK; 653fa9e4066Sahrens txg_node_t *tn; 654fa9e4066Sahrens void *p = NULL; 655fa9e4066Sahrens 656fa9e4066Sahrens mutex_enter(&tl->tl_lock); 657fa9e4066Sahrens if ((tn = tl->tl_head[t]) != NULL) { 658fa9e4066Sahrens p = (char *)tn - tl->tl_offset; 659fa9e4066Sahrens tl->tl_head[t] = tn->tn_next[t]; 660fa9e4066Sahrens tn->tn_next[t] = NULL; 661fa9e4066Sahrens tn->tn_member[t] = 0; 662fa9e4066Sahrens } 663fa9e4066Sahrens mutex_exit(&tl->tl_lock); 664fa9e4066Sahrens 665fa9e4066Sahrens return (p); 666fa9e4066Sahrens } 667fa9e4066Sahrens 668fa9e4066Sahrens /* 669fa9e4066Sahrens * Remove a specific item from the list and return it. 670fa9e4066Sahrens */ 671fa9e4066Sahrens void * 672fa9e4066Sahrens txg_list_remove_this(txg_list_t *tl, void *p, uint64_t txg) 673fa9e4066Sahrens { 674fa9e4066Sahrens int t = txg & TXG_MASK; 675fa9e4066Sahrens txg_node_t *tn, **tp; 676fa9e4066Sahrens 677fa9e4066Sahrens mutex_enter(&tl->tl_lock); 678fa9e4066Sahrens 679fa9e4066Sahrens for (tp = &tl->tl_head[t]; (tn = *tp) != NULL; tp = &tn->tn_next[t]) { 680fa9e4066Sahrens if ((char *)tn - tl->tl_offset == p) { 681fa9e4066Sahrens *tp = tn->tn_next[t]; 682fa9e4066Sahrens tn->tn_next[t] = NULL; 683fa9e4066Sahrens tn->tn_member[t] = 0; 684fa9e4066Sahrens mutex_exit(&tl->tl_lock); 685fa9e4066Sahrens return (p); 686fa9e4066Sahrens } 687fa9e4066Sahrens } 688fa9e4066Sahrens 689fa9e4066Sahrens mutex_exit(&tl->tl_lock); 690fa9e4066Sahrens 691fa9e4066Sahrens return (NULL); 692fa9e4066Sahrens } 693fa9e4066Sahrens 694fa9e4066Sahrens int 695fa9e4066Sahrens txg_list_member(txg_list_t *tl, void *p, uint64_t txg) 696fa9e4066Sahrens { 697fa9e4066Sahrens int t = txg & TXG_MASK; 698fa9e4066Sahrens txg_node_t *tn = (txg_node_t *)((char *)p + tl->tl_offset); 699fa9e4066Sahrens 700fa9e4066Sahrens return (tn->tn_member[t]); 701fa9e4066Sahrens } 702fa9e4066Sahrens 703fa9e4066Sahrens /* 704fa9e4066Sahrens * Walk a txg list -- only safe if you know it's not changing. 705fa9e4066Sahrens */ 706fa9e4066Sahrens void * 707fa9e4066Sahrens txg_list_head(txg_list_t *tl, uint64_t txg) 708fa9e4066Sahrens { 709fa9e4066Sahrens int t = txg & TXG_MASK; 710fa9e4066Sahrens txg_node_t *tn = tl->tl_head[t]; 711fa9e4066Sahrens 712fa9e4066Sahrens return (tn == NULL ? NULL : (char *)tn - tl->tl_offset); 713fa9e4066Sahrens } 714fa9e4066Sahrens 715fa9e4066Sahrens void * 716fa9e4066Sahrens txg_list_next(txg_list_t *tl, void *p, uint64_t txg) 717fa9e4066Sahrens { 718fa9e4066Sahrens int t = txg & TXG_MASK; 719fa9e4066Sahrens txg_node_t *tn = (txg_node_t *)((char *)p + tl->tl_offset); 720fa9e4066Sahrens 721fa9e4066Sahrens tn = tn->tn_next[t]; 722fa9e4066Sahrens 723fa9e4066Sahrens return (tn == NULL ? NULL : (char *)tn - tl->tl_offset); 724fa9e4066Sahrens } 725