1fa9e4066Sahrens /* 2fa9e4066Sahrens * CDDL HEADER START 3fa9e4066Sahrens * 4fa9e4066Sahrens * The contents of this file are subject to the terms of the 5f65e61c0Sahrens * Common Development and Distribution License (the "License"). 6f65e61c0Sahrens * You may not use this file except in compliance with the License. 7fa9e4066Sahrens * 8fa9e4066Sahrens * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9fa9e4066Sahrens * or http://www.opensolaris.org/os/licensing. 10fa9e4066Sahrens * See the License for the specific language governing permissions 11fa9e4066Sahrens * and limitations under the License. 12fa9e4066Sahrens * 13fa9e4066Sahrens * When distributing Covered Code, include this CDDL HEADER in each 14fa9e4066Sahrens * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15fa9e4066Sahrens * If applicable, add the following below this CDDL HEADER, with the 16fa9e4066Sahrens * fields enclosed by brackets "[]" replaced with your own identifying 17fa9e4066Sahrens * information: Portions Copyright [yyyy] [name of copyright owner] 18fa9e4066Sahrens * 19fa9e4066Sahrens * CDDL HEADER END 20fa9e4066Sahrens */ 21fa9e4066Sahrens /* 2201025c89SJohn Harres * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 239dccfd2aSAlbert Lee * Copyright 2011 Nexenta Systems, Inc. All rights reserved. 2479d72832SMatthew Ahrens * Copyright (c) 2012, 2016 by Delphix. All rights reserved. 
25c3d26abcSMatthew Ahrens * Copyright (c) 2014 Integros [integros.com] 269dccfd2aSAlbert Lee */ 27fa9e4066Sahrens 28fa9e4066Sahrens #include <sys/dmu.h> 29fa9e4066Sahrens #include <sys/dmu_impl.h> 30fa9e4066Sahrens #include <sys/dbuf.h> 31fa9e4066Sahrens #include <sys/dmu_tx.h> 32fa9e4066Sahrens #include <sys/dmu_objset.h> 33*61e255ceSMatthew Ahrens #include <sys/dsl_dataset.h> 34*61e255ceSMatthew Ahrens #include <sys/dsl_dir.h> 35fa9e4066Sahrens #include <sys/dsl_pool.h> 36*61e255ceSMatthew Ahrens #include <sys/zap_impl.h> 37fa9e4066Sahrens #include <sys/spa.h> 380a586ceaSMark Shellenbaum #include <sys/sa.h> 390a586ceaSMark Shellenbaum #include <sys/sa_impl.h> 40fa9e4066Sahrens #include <sys/zfs_context.h> 410a586ceaSMark Shellenbaum #include <sys/varargs.h> 42fa9e4066Sahrens 43ea8dc4b6Seschrock typedef void (*dmu_tx_hold_func_t)(dmu_tx_t *tx, struct dnode *dn, 44ea8dc4b6Seschrock uint64_t arg1, uint64_t arg2); 45ea8dc4b6Seschrock 46fa9e4066Sahrens 47fa9e4066Sahrens dmu_tx_t * 481d452cf5Sahrens dmu_tx_create_dd(dsl_dir_t *dd) 49fa9e4066Sahrens { 50fa9e4066Sahrens dmu_tx_t *tx = kmem_zalloc(sizeof (dmu_tx_t), KM_SLEEP); 51fa9e4066Sahrens tx->tx_dir = dd; 524445fffbSMatthew Ahrens if (dd != NULL) 53fa9e4066Sahrens tx->tx_pool = dd->dd_pool; 54fa9e4066Sahrens list_create(&tx->tx_holds, sizeof (dmu_tx_hold_t), 558a2f1b91Sahrens offsetof(dmu_tx_hold_t, txh_node)); 56d20e665cSRicardo M. Correia list_create(&tx->tx_callbacks, sizeof (dmu_tx_callback_t), 57d20e665cSRicardo M. 
Correia offsetof(dmu_tx_callback_t, dcb_node)); 5869962b56SMatthew Ahrens tx->tx_start = gethrtime(); 59fa9e4066Sahrens return (tx); 60fa9e4066Sahrens } 61fa9e4066Sahrens 62fa9e4066Sahrens dmu_tx_t * 63fa9e4066Sahrens dmu_tx_create(objset_t *os) 64fa9e4066Sahrens { 65503ad85cSMatthew Ahrens dmu_tx_t *tx = dmu_tx_create_dd(os->os_dsl_dataset->ds_dir); 66fa9e4066Sahrens tx->tx_objset = os; 67fa9e4066Sahrens return (tx); 68fa9e4066Sahrens } 69fa9e4066Sahrens 70fa9e4066Sahrens dmu_tx_t * 71fa9e4066Sahrens dmu_tx_create_assigned(struct dsl_pool *dp, uint64_t txg) 72fa9e4066Sahrens { 731d452cf5Sahrens dmu_tx_t *tx = dmu_tx_create_dd(NULL); 74fa9e4066Sahrens 75fa9e4066Sahrens ASSERT3U(txg, <=, dp->dp_tx.tx_open_txg); 76fa9e4066Sahrens tx->tx_pool = dp; 77fa9e4066Sahrens tx->tx_txg = txg; 78fa9e4066Sahrens tx->tx_anyobj = TRUE; 79fa9e4066Sahrens 80fa9e4066Sahrens return (tx); 81fa9e4066Sahrens } 82fa9e4066Sahrens 83fa9e4066Sahrens int 84fa9e4066Sahrens dmu_tx_is_syncing(dmu_tx_t *tx) 85fa9e4066Sahrens { 86fa9e4066Sahrens return (tx->tx_anyobj); 87fa9e4066Sahrens } 88fa9e4066Sahrens 89fa9e4066Sahrens int 90fa9e4066Sahrens dmu_tx_private_ok(dmu_tx_t *tx) 91fa9e4066Sahrens { 92ea8dc4b6Seschrock return (tx->tx_anyobj); 93fa9e4066Sahrens } 94fa9e4066Sahrens 958a2f1b91Sahrens static dmu_tx_hold_t * 96fa9e4066Sahrens dmu_tx_hold_object_impl(dmu_tx_t *tx, objset_t *os, uint64_t object, 978a2f1b91Sahrens enum dmu_tx_hold_type type, uint64_t arg1, uint64_t arg2) 98fa9e4066Sahrens { 998a2f1b91Sahrens dmu_tx_hold_t *txh; 100fa9e4066Sahrens dnode_t *dn = NULL; 101ea8dc4b6Seschrock int err; 102fa9e4066Sahrens 103fa9e4066Sahrens if (object != DMU_NEW_OBJECT) { 104503ad85cSMatthew Ahrens err = dnode_hold(os, object, tx, &dn); 105ea8dc4b6Seschrock if (err) { 106ea8dc4b6Seschrock tx->tx_err = err; 1078a2f1b91Sahrens return (NULL); 108ea8dc4b6Seschrock } 109fa9e4066Sahrens 110ea8dc4b6Seschrock if (err == 0 && tx->tx_txg != 0) { 111fa9e4066Sahrens mutex_enter(&dn->dn_mtx); 112fa9e4066Sahrens 
/* 113fa9e4066Sahrens * dn->dn_assigned_txg == tx->tx_txg doesn't pose a 114fa9e4066Sahrens * problem, but there's no way for it to happen (for 115fa9e4066Sahrens * now, at least). 116fa9e4066Sahrens */ 117fa9e4066Sahrens ASSERT(dn->dn_assigned_txg == 0); 118fa9e4066Sahrens dn->dn_assigned_txg = tx->tx_txg; 119fa9e4066Sahrens (void) refcount_add(&dn->dn_tx_holds, tx); 120fa9e4066Sahrens mutex_exit(&dn->dn_mtx); 121fa9e4066Sahrens } 122fa9e4066Sahrens } 123fa9e4066Sahrens 1248a2f1b91Sahrens txh = kmem_zalloc(sizeof (dmu_tx_hold_t), KM_SLEEP); 1258a2f1b91Sahrens txh->txh_tx = tx; 1268a2f1b91Sahrens txh->txh_dnode = dn; 1270c779ad4SMatthew Ahrens refcount_create(&txh->txh_space_towrite); 1280c779ad4SMatthew Ahrens refcount_create(&txh->txh_memory_tohold); 1298a2f1b91Sahrens txh->txh_type = type; 1308a2f1b91Sahrens txh->txh_arg1 = arg1; 1318a2f1b91Sahrens txh->txh_arg2 = arg2; 1328a2f1b91Sahrens list_insert_tail(&tx->tx_holds, txh); 133ea8dc4b6Seschrock 1348a2f1b91Sahrens return (txh); 135fa9e4066Sahrens } 136fa9e4066Sahrens 137fa9e4066Sahrens void 138fa9e4066Sahrens dmu_tx_add_new_object(dmu_tx_t *tx, objset_t *os, uint64_t object) 139fa9e4066Sahrens { 140fa9e4066Sahrens /* 141fa9e4066Sahrens * If we're syncing, they can manipulate any object anyhow, and 142fa9e4066Sahrens * the hold on the dnode_t can cause problems. 143fa9e4066Sahrens */ 144fa9e4066Sahrens if (!dmu_tx_is_syncing(tx)) { 1458a2f1b91Sahrens (void) dmu_tx_hold_object_impl(tx, os, 1468a2f1b91Sahrens object, THT_NEWOBJECT, 0, 0); 147fa9e4066Sahrens } 148fa9e4066Sahrens } 149fa9e4066Sahrens 150*61e255ceSMatthew Ahrens /* 151*61e255ceSMatthew Ahrens * This function reads specified data from disk. The specified data will 152*61e255ceSMatthew Ahrens * be needed to perform the transaction -- i.e, it will be read after 153*61e255ceSMatthew Ahrens * we do dmu_tx_assign(). 
 * There are two reasons that we read the data now
 * (before dmu_tx_assign()):
 *
 * 1. Reading it now has potentially better performance.  The transaction
 * has not yet been assigned, so the TXG is not held open, and also the
 * caller typically has less locks held when calling dmu_tx_hold_*() than
 * after the transaction has been assigned.  This reduces the lock (and txg)
 * hold times, thus reducing lock contention.
 *
 * 2. It is easier for callers (primarily the ZPL) to handle i/o errors
 * that are detected before they start making changes to the DMU state
 * (i.e. now).  Once the transaction has been assigned, and some DMU
 * state has been changed, it can be difficult to recover from an i/o
 * error (e.g. to undo the changes already made in memory at the DMU
 * layer).  Typically code to do so does not exist in the caller -- it
 * assumes that the data has already been cached and thus i/o errors are
 * not possible.
 *
 * It has been observed that the i/o initiated here can be a performance
 * problem, and it appears to be optional, because we don't look at the
 * data which is read.  However, removing this read would only serve to
 * move the work elsewhere (after the dmu_tx_assign()), where it may
 * have a greater impact on performance (in addition to the impact on
 * fault tolerance noted above).
 */
static int
dmu_tx_check_ioerr(zio_t *zio, dnode_t *dn, int level, uint64_t blkid)
{
	int err;
	dmu_buf_impl_t *db;

	/* Hold the dbuf at (level, blkid) and issue a CANFAIL read. */
	rw_enter(&dn->dn_struct_rwlock, RW_READER);
	db = dbuf_hold_level(dn, level, blkid, FTAG);
	rw_exit(&dn->dn_struct_rwlock);
	if (db == NULL)
		return (SET_ERROR(EIO));
	err = dbuf_read(db, zio, DB_RF_CANFAIL | DB_RF_NOPREFETCH);
	dbuf_rele(db, FTAG);
	return (err);
}

/*
 * Account the bytes that will be written by this hold and prefetch the
 * blocks needed to perform the write, recording any i/o error in
 * txh->txh_tx->tx_err.
 */
/* ARGSUSED */
static void
dmu_tx_count_write(dmu_tx_hold_t *txh, uint64_t off, uint64_t len)
{
	dnode_t *dn = txh->txh_dnode;
	int err = 0;

	if (len == 0)
		return;

	(void) refcount_add_many(&txh->txh_space_towrite, len, FTAG);

	/* Reject transactions that accumulate an unreasonable write size. */
	if (refcount_count(&txh->txh_space_towrite) > 2 * DMU_MAX_ACCESS)
		err = SET_ERROR(EFBIG);

	if (dn == NULL)
		return;

	/*
	 * For i/o error checking, read the blocks that will be needed
	 * to perform the write: the first and last level-0 blocks (if
	 * they are not aligned, i.e. if they are partial-block writes),
	 * and all the level-1 blocks.
	 */
	if (dn->dn_maxblkid == 0) {
		/* Single-block object: read block 0 only on a partial write. */
		if (off < dn->dn_datablksz &&
		    (off > 0 || len < dn->dn_datablksz)) {
			err = dmu_tx_check_ioerr(NULL, dn, 0, 0);
			if (err != 0) {
				txh->txh_tx->tx_err = err;
			}
		}
	} else {
		/* Parent zio so all the reads can proceed in parallel. */
		zio_t *zio = zio_root(dn->dn_objset->os_spa,
		    NULL, NULL, ZIO_FLAG_CANFAIL);

		/* first level-0 block */
		uint64_t start = off >> dn->dn_datablkshift;
		if (P2PHASE(off, dn->dn_datablksz) || len < dn->dn_datablksz) {
			err = dmu_tx_check_ioerr(zio, dn, 0, start);
			if (err != 0) {
				txh->txh_tx->tx_err = err;
			}
		}

		/* last level-0 block */
		uint64_t end = (off + len - 1) >> dn->dn_datablkshift;
		if (end != start && end <= dn->dn_maxblkid &&
		    P2PHASE(off + len, dn->dn_datablksz)) {
			err = dmu_tx_check_ioerr(zio, dn, 0, end);
			if (err != 0) {
				txh->txh_tx->tx_err = err;
			}
		}

		/* level-1 blocks */
		if (dn->dn_nlevels > 1) {
			int shft = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
			for (uint64_t i = (start >> shft) + 1;
			    i < end >> shft; i++) {
				err = dmu_tx_check_ioerr(zio, dn, 1, i);
				if (err != 0) {
					txh->txh_tx->tx_err = err;
				}
			}
		}

		/* Wait for all the reads issued above to complete. */
		err = zio_wait(zio);
		if (err != 0) {
			txh->txh_tx->tx_err = err;
		}
	}
}

/*
 * Account for the space needed to write out the dnode itself.
 */
static void
dmu_tx_count_dnode(dmu_tx_hold_t *txh)
{
	(void) refcount_add_many(&txh->txh_space_towrite, DNODE_SIZE, FTAG);
}

/*
 * Declare that this tx will write [off, off+len) of the given object.
 * Must be called before dmu_tx_assign() (tx_txg must still be 0).
 */
void
dmu_tx_hold_write(dmu_tx_t *tx, uint64_t object, uint64_t off, int len)
{
	dmu_tx_hold_t *txh;

	ASSERT0(tx->tx_txg);
	ASSERT3U(len, <=, DMU_MAX_ACCESS);
	/* Guard against off + len overflowing uint64_t. */
	ASSERT(len == 0 || UINT64_MAX - off >= len - 1);

	txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
	    object, THT_WRITE, off, len);
	if (txh == NULL)
		return;

	dmu_tx_count_write(txh, off, len);
	dmu_tx_count_dnode(txh);
}

/*
 * This function marks the transaction as being a "net free".  The end
 * result is that refquotas will be disabled for this transaction, and
 * this transaction will be able to use half of the pool space overhead
 * (see dsl_pool_adjustedsize()).
 * Therefore this function should only
 * be called for transactions that we expect will not cause a net increase
 * in the amount of space used (but it's OK if that is occasionally not true).
 */
void
dmu_tx_mark_netfree(dmu_tx_t *tx)
{
	tx->tx_netfree = B_TRUE;
}

/*
 * Declare that this tx will free [off, off+len) of the given object
 * (len may be DMU_OBJECT_END to free to the end of the object).
 * Reads the blocks that the free will dirty so i/o errors surface now.
 */
void
dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off, uint64_t len)
{
	int err;

	ASSERT(tx->tx_txg == 0);

	dmu_tx_hold_t *txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
	    object, THT_FREE, off, len);
	if (txh == NULL)
		return;
	dnode_t *dn = txh->txh_dnode;
	dmu_tx_count_dnode(txh);

	/* Nothing to free beyond the end of the object. */
	if (off >= (dn->dn_maxblkid + 1) * dn->dn_datablksz)
		return;
	if (len == DMU_OBJECT_END)
		len = (dn->dn_maxblkid + 1) * dn->dn_datablksz - off;

	/*
	 * For i/o error checking, we read the first and last level-0
	 * blocks if they are not aligned, and all the level-1 blocks.
	 *
	 * Note:  dbuf_free_range() assumes that we have not instantiated
	 * any level-0 dbufs that will be completely freed.  Therefore we must
	 * exercise care to not read or count the first and last blocks
	 * if they are blocksize-aligned.
	 */
	if (dn->dn_datablkshift == 0) {
		if (off != 0 || len < dn->dn_datablksz)
			dmu_tx_count_write(txh, 0, dn->dn_datablksz);
	} else {
		/* first block will be modified if it is not aligned */
		if (!IS_P2ALIGNED(off, 1 << dn->dn_datablkshift))
			dmu_tx_count_write(txh, off, 1);
		/* last block will be modified if it is not aligned */
		if (!IS_P2ALIGNED(off + len, 1 << dn->dn_datablkshift))
			dmu_tx_count_write(txh, off + len, 1);
	}

	/*
	 * Check level-1 blocks.
	 */
	if (dn->dn_nlevels > 1) {
		int shift = dn->dn_datablkshift + dn->dn_indblkshift -
		    SPA_BLKPTRSHIFT;
		uint64_t start = off >> shift;
		uint64_t end = (off + len) >> shift;

		ASSERT(dn->dn_indblkshift != 0);

		/*
		 * dnode_reallocate() can result in an object with indirect
		 * blocks having an odd data block size.  In this case,
		 * just check the single block.
		 */
		if (dn->dn_datablkshift == 0)
			start = end = 0;

		zio_t *zio = zio_root(tx->tx_pool->dp_spa,
		    NULL, NULL, ZIO_FLAG_CANFAIL);
		for (uint64_t i = start; i <= end; i++) {
			uint64_t ibyte = i << shift;
			/* Skip holes: advance i to the next allocated L1. */
			err = dnode_next_offset(dn, 0, &ibyte, 2, 1, 0);
			i = ibyte >> shift;
			if (err == ESRCH || i > end)
				break;
			if (err != 0) {
				tx->tx_err = err;
				(void) zio_wait(zio);
				return;
			}

			(void) refcount_add_many(&txh->txh_memory_tohold,
			    1 << dn->dn_indblkshift, FTAG);

			err = dmu_tx_check_ioerr(zio, dn, 1, i);
			if (err != 0) {
				tx->tx_err = err;
				(void) zio_wait(zio);
				return;
			}
		}
		err = zio_wait(zio);
		if (err != 0) {
			tx->tx_err = err;
			return;
		}
	}
}

/*
 * Declare that this tx will modify the ZAP object 'object' (possibly
 * adding an entry, if 'add' is set).  'name' may be NULL if unknown.
 */
void
dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, const char *name)
{
	int err;

	ASSERT(tx->tx_txg == 0);

	dmu_tx_hold_t *txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
	    object, THT_ZAP, add, (uintptr_t)name);
	if (txh == NULL)
		return;
	dnode_t *dn = txh->txh_dnode;

	dmu_tx_count_dnode(txh);

	/*
	 * Modifying an almost-full microzap is around the worst case (128KB)
	 *
	 * If it is a fat zap, the worst case would be 7*16KB=112KB:
	 * - 3 blocks overwritten: target leaf, ptrtbl block, header block
	 * - 4 new blocks written if adding:
	 *    - 2 blocks for possibly split leaves,
	 *    - 2 grown ptrtbl blocks
	 */
	(void) refcount_add_many(&txh->txh_space_towrite,
	    MZAP_MAX_BLKSZ, FTAG);

	if (dn == NULL)
		return;

	ASSERT3P(DMU_OT_BYTESWAP(dn->dn_type), ==, DMU_BSWAP_ZAP);

	if (dn->dn_maxblkid == 0 || name == NULL) {
		/*
		 * This is a microzap (only one block), or we don't know
		 * the name.  Check the first block for i/o errors.
		 */
		err = dmu_tx_check_ioerr(NULL, dn, 0, 0);
		if (err != 0) {
			tx->tx_err = err;
		}
	} else {
		/*
		 * Access the name so that we'll check for i/o errors to
		 * the leaf blocks, etc.  We ignore ENOENT, as this name
		 * may not yet exist.
		 */
		err = zap_lookup_by_dnode(dn, name, 8, 0, NULL);
		if (err == EIO || err == ECKSUM || err == ENXIO) {
			tx->tx_err = err;
		}
	}
}

/*
 * Declare that this tx will modify the bonus buffer of the given object.
 */
void
dmu_tx_hold_bonus(dmu_tx_t *tx, uint64_t object)
{
	dmu_tx_hold_t *txh;

	ASSERT(tx->tx_txg == 0);

	txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
	    object, THT_BONUS, 0, 0);
	if (txh)
		dmu_tx_count_dnode(txh);
}

/*
 * Reserve an arbitrary amount of write space, not tied to any object.
 */
void
dmu_tx_hold_space(dmu_tx_t *tx, uint64_t space)
{
	dmu_tx_hold_t *txh;
	ASSERT(tx->tx_txg == 0);

	txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
	    DMU_NEW_OBJECT, THT_SPACE, space, 0);

	(void) refcount_add_many(&txh->txh_space_towrite, space, FTAG);
}

#ifdef ZFS_DEBUG
/*
 * Debug-only check that the dbuf being dirtied is covered by one of
 * this transaction's holds; panics if no matching hold is found.
 */
void
dmu_tx_dirty_buf(dmu_tx_t *tx, dmu_buf_impl_t *db)
{
	boolean_t match_object = B_FALSE;
	boolean_t match_offset = B_FALSE;

	DB_DNODE_ENTER(db);
	dnode_t *dn = DB_DNODE(db);
	ASSERT(tx->tx_txg != 0);
	ASSERT(tx->tx_objset == NULL || dn->dn_objset == tx->tx_objset);
	ASSERT3U(dn->dn_object, ==, db->db.db_object);

	/* Sync-context transactions may dirty anything. */
	if (tx->tx_anyobj) {
		DB_DNODE_EXIT(db);
		return;
	}

	/* XXX No checking on the meta dnode for now */
	if (db->db.db_object == DMU_META_DNODE_OBJECT) {
		DB_DNODE_EXIT(db);
		return;
	}

	for (dmu_tx_hold_t *txh = list_head(&tx->tx_holds); txh != NULL;
	    txh = list_next(&tx->tx_holds, txh)) {
		ASSERT(dn == NULL || dn->dn_assigned_txg == tx->tx_txg);
		if (txh->txh_dnode == dn && txh->txh_type != THT_NEWOBJECT)
			match_object = TRUE;
		if (txh->txh_dnode == NULL || txh->txh_dnode == dn) {
			int datablkshift = dn->dn_datablkshift ?
			    dn->dn_datablkshift : SPA_MAXBLOCKSHIFT;
			int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
			int shift = datablkshift + epbs * db->db_level;
			uint64_t beginblk = shift >= 64 ? 0 :
			    (txh->txh_arg1 >> shift);
			uint64_t endblk = shift >= 64 ? 0 :
			    ((txh->txh_arg1 + txh->txh_arg2 - 1) >> shift);
			uint64_t blkid = db->db_blkid;

			/* XXX txh_arg2 better not be zero... */

			dprintf("found txh type %x beginblk=%llx endblk=%llx\n",
			    txh->txh_type, beginblk, endblk);

			switch (txh->txh_type) {
			case THT_WRITE:
				if (blkid >= beginblk && blkid <= endblk)
					match_offset = TRUE;
				/*
				 * We will let this hold work for the bonus
				 * or spill buffer so that we don't need to
				 * hold it when creating a new object.
				 */
				if (blkid == DMU_BONUS_BLKID ||
				    blkid == DMU_SPILL_BLKID)
					match_offset = TRUE;
				/*
				 * They might have to increase nlevels,
				 * thus dirtying the new TLIBs.  Or they
				 * might have to change the block size,
				 * thus dirtying the new lvl=0 blk=0.
				 */
				if (blkid == 0)
					match_offset = TRUE;
				break;
			case THT_FREE:
				/*
				 * We will dirty all the level 1 blocks in
				 * the free range and perhaps the first and
				 * last level 0 block.
				 */
				if (blkid >= beginblk && (blkid <= endblk ||
				    txh->txh_arg2 == DMU_OBJECT_END))
					match_offset = TRUE;
				break;
			case THT_SPILL:
				if (blkid == DMU_SPILL_BLKID)
					match_offset = TRUE;
				break;
			case THT_BONUS:
				if (blkid == DMU_BONUS_BLKID)
					match_offset = TRUE;
				break;
			case THT_ZAP:
				match_offset = TRUE;
				break;
			case THT_NEWOBJECT:
				match_object = TRUE;
				break;
			default:
				ASSERT(!"bad txh_type");
			}
		}
		if (match_object && match_offset) {
			DB_DNODE_EXIT(db);
			return;
		}
	}
	DB_DNODE_EXIT(db);
	panic("dirtying dbuf obj=%llx lvl=%u blkid=%llx but not tx_held\n",
	    (u_longlong_t)db->db.db_object, db->db_level,
	    (u_longlong_t)db->db_blkid);
}
#endif

/*
 * If we can't do 10 iops, something is wrong.  Let us go ahead
 * and hit zfs_dirty_data_max.
 */
hrtime_t zfs_delay_max_ns = MSEC2NSEC(100);
int zfs_delay_resolution_ns = 100 * 1000; /* 100 microseconds */

/*
 * We delay transactions when we've determined that the backend storage
 * isn't able to accommodate the rate of incoming writes.
 *
 * If there is already a transaction waiting, we delay relative to when
 * that transaction finishes waiting.  This way the calculated min_time
 * is independent of the number of threads concurrently executing
 * transactions.
 *
 * If we are the only waiter, wait relative to when the transaction
 * started, rather than the current time.  This credits the transaction for
 * "time already served", e.g. reading indirect blocks.
 *
 * The minimum time for a transaction to take is calculated as:
 *     min_time = scale * (dirty - min) / (max - dirty)
 *     min_time is then capped at zfs_delay_max_ns.
 *
 * The delay has two degrees of freedom that can be adjusted via tunables.
 * The percentage of dirty data at which we start to delay is defined by
 * zfs_delay_min_dirty_percent.
This should typically be at or above 60969962b56SMatthew Ahrens * zfs_vdev_async_write_active_max_dirty_percent so that we only start to 61069962b56SMatthew Ahrens * delay after writing at full speed has failed to keep up with the incoming 61169962b56SMatthew Ahrens * write rate. The scale of the curve is defined by zfs_delay_scale. Roughly 61269962b56SMatthew Ahrens * speaking, this variable determines the amount of delay at the midpoint of 61369962b56SMatthew Ahrens * the curve. 61469962b56SMatthew Ahrens * 61569962b56SMatthew Ahrens * delay 61669962b56SMatthew Ahrens * 10ms +-------------------------------------------------------------*+ 61769962b56SMatthew Ahrens * | *| 61869962b56SMatthew Ahrens * 9ms + *+ 61969962b56SMatthew Ahrens * | *| 62069962b56SMatthew Ahrens * 8ms + *+ 62169962b56SMatthew Ahrens * | * | 62269962b56SMatthew Ahrens * 7ms + * + 62369962b56SMatthew Ahrens * | * | 62469962b56SMatthew Ahrens * 6ms + * + 62569962b56SMatthew Ahrens * | * | 62669962b56SMatthew Ahrens * 5ms + * + 62769962b56SMatthew Ahrens * | * | 62869962b56SMatthew Ahrens * 4ms + * + 62969962b56SMatthew Ahrens * | * | 63069962b56SMatthew Ahrens * 3ms + * + 63169962b56SMatthew Ahrens * | * | 63269962b56SMatthew Ahrens * 2ms + (midpoint) * + 63369962b56SMatthew Ahrens * | | ** | 63469962b56SMatthew Ahrens * 1ms + v *** + 63569962b56SMatthew Ahrens * | zfs_delay_scale ----------> ******** | 63669962b56SMatthew Ahrens * 0 +-------------------------------------*********----------------+ 63769962b56SMatthew Ahrens * 0% <- zfs_dirty_data_max -> 100% 63869962b56SMatthew Ahrens * 63969962b56SMatthew Ahrens * Note that since the delay is added to the outstanding time remaining on the 64069962b56SMatthew Ahrens * most recent transaction, the delay is effectively the inverse of IOPS. 64169962b56SMatthew Ahrens * Here the midpoint of 500us translates to 2000 IOPS. 
The shape of the curve 64269962b56SMatthew Ahrens * was chosen such that small changes in the amount of accumulated dirty data 64369962b56SMatthew Ahrens * in the first 3/4 of the curve yield relatively small differences in the 64469962b56SMatthew Ahrens * amount of delay. 64569962b56SMatthew Ahrens * 64669962b56SMatthew Ahrens * The effects can be easier to understand when the amount of delay is 64769962b56SMatthew Ahrens * represented on a log scale: 64869962b56SMatthew Ahrens * 64969962b56SMatthew Ahrens * delay 65069962b56SMatthew Ahrens * 100ms +-------------------------------------------------------------++ 65169962b56SMatthew Ahrens * + + 65269962b56SMatthew Ahrens * | | 65369962b56SMatthew Ahrens * + *+ 65469962b56SMatthew Ahrens * 10ms + *+ 65569962b56SMatthew Ahrens * + ** + 65669962b56SMatthew Ahrens * | (midpoint) ** | 65769962b56SMatthew Ahrens * + | ** + 65869962b56SMatthew Ahrens * 1ms + v **** + 65969962b56SMatthew Ahrens * + zfs_delay_scale ----------> ***** + 66069962b56SMatthew Ahrens * | **** | 66169962b56SMatthew Ahrens * + **** + 66269962b56SMatthew Ahrens * 100us + ** + 66369962b56SMatthew Ahrens * + * + 66469962b56SMatthew Ahrens * | * | 66569962b56SMatthew Ahrens * + * + 66669962b56SMatthew Ahrens * 10us + * + 66769962b56SMatthew Ahrens * + + 66869962b56SMatthew Ahrens * | | 66969962b56SMatthew Ahrens * + + 67069962b56SMatthew Ahrens * +--------------------------------------------------------------+ 67169962b56SMatthew Ahrens * 0% <- zfs_dirty_data_max -> 100% 67269962b56SMatthew Ahrens * 67369962b56SMatthew Ahrens * Note here that only as the amount of dirty data approaches its limit does 67469962b56SMatthew Ahrens * the delay start to increase rapidly. 
 * should be to keep the amount of dirty data out of that range by first
 * ensuring that the appropriate limits are set for the I/O scheduler to reach
 * optimal throughput on the backend storage, and then by changing the value
 * of zfs_delay_scale to increase the steepness of the curve.
 */
static void
dmu_tx_delay(dmu_tx_t *tx, uint64_t dirty)
{
	dsl_pool_t *dp = tx->tx_pool;
	uint64_t delay_min_bytes =
	    zfs_dirty_data_max * zfs_delay_min_dirty_percent / 100;
	hrtime_t wakeup, min_tx_time, now;

	/* Below the configured dirty-data threshold: no throttling at all. */
	if (dirty <= delay_min_bytes)
		return;

	/*
	 * The caller has already waited until we are under the max.
	 * We make them pass us the amount of dirty data so we don't
	 * have to handle the case of it being >= the max, which could
	 * cause a divide-by-zero if it's == the max.
	 */
	ASSERT3U(dirty, <, zfs_dirty_data_max);

	/*
	 * Hyperbolic delay curve: grows smoothly from 0 at the threshold
	 * toward infinity as dirty approaches zfs_dirty_data_max (the
	 * divisor goes to zero).  See the diagram in the comment above.
	 */
	now = gethrtime();
	min_tx_time = zfs_delay_scale *
	    (dirty - delay_min_bytes) / (zfs_dirty_data_max - dirty);
	/* If this tx has already been open long enough, no extra delay. */
	if (now > tx->tx_start + min_tx_time)
		return;

	min_tx_time = MIN(min_tx_time, zfs_delay_max_ns);

	DTRACE_PROBE3(delay__mintime, dmu_tx_t *, tx, uint64_t, dirty,
	    uint64_t, min_tx_time);

	/*
	 * Serialize wakeups through dp_last_wakeup so concurrent delayed
	 * threads each add min_tx_time, making the delay behave like an
	 * IOPS cap rather than every thread waking at the same instant.
	 */
	mutex_enter(&dp->dp_lock);
	wakeup = MAX(tx->tx_start + min_tx_time,
	    dp->dp_last_wakeup + min_tx_time);
	dp->dp_last_wakeup = wakeup;
	mutex_exit(&dp->dp_lock);

#ifdef _KERNEL
	/* Sleep until the absolute deadline `wakeup`, tolerating wakeups. */
	mutex_enter(&curthread->t_delay_lock);
	while (cv_timedwait_hires(&curthread->t_delay_cv,
	    &curthread->t_delay_lock, wakeup, zfs_delay_resolution_ns,
	    CALLOUT_FLAG_ABSOLUTE | CALLOUT_FLAG_ROUNDUP) > 0)
		continue;
	mutex_exit(&curthread->t_delay_lock);
#else
	/* Userland (e.g. ztest): approximate the wait with nanosleep(). */
	hrtime_t delta = wakeup - gethrtime();
	struct timespec ts;
	ts.tv_sec = delta / NANOSEC;
	ts.tv_nsec = delta % NANOSEC;
	(void) nanosleep(&ts, NULL);
#endif
}

/*
 * This routine attempts to assign the transaction
 * to a transaction group.
 * To do so, we must determine if there is sufficient free space on disk.
 *
 * If this is a "netfree" transaction (i.e. we called dmu_tx_mark_netfree()
 * on it), then it is assumed that there is sufficient free space,
 * unless there's insufficient slop space in the pool (see the comment
 * above spa_slop_shift in spa_misc.c).
 *
 * If it is not a "netfree" transaction, then if the data already on disk
 * is over the allowed usage (e.g. quota), this will fail with EDQUOT or
 * ENOSPC.  Otherwise, if the current rough estimate of pending changes,
 * plus the rough estimate of this transaction's changes, may exceed the
 * allowed usage, then this will fail with ERESTART, which will cause the
 * caller to wait for the pending changes to be written to disk (by waiting
 * for the next TXG to open), and then check the space usage again.
 *
 * The rough estimate of pending changes is comprised of the sum of:
 *
 * - this transaction's holds' txh_space_towrite
 *
 * - dd_tempreserved[], which is the sum of in-flight transactions'
 *   holds' txh_space_towrite (i.e. those transactions that have called
 *   dmu_tx_assign() but not yet called dmu_tx_commit()).
 *
 * - dd_space_towrite[], which is the amount of dirtied dbufs.
 *
 * Note that all of these values are inflated by spa_get_worst_case_asize(),
 * which means that we may get ERESTART well before we are actually in danger
 * of running out of space, but this also mitigates any small inaccuracies
 * in the rough estimate (e.g. txh_space_towrite doesn't take into account
 * indirect blocks, and dd_space_towrite[] doesn't take into account changes
 * to the MOS).
 *
 * Note that due to this algorithm, it is possible to exceed the allowed
 * usage by one transaction.  Also, as we approach the allowed usage,
 * we will allow a very limited amount of changes into each TXG, thus
 * decreasing performance.
 */
static int
dmu_tx_try_assign(dmu_tx_t *tx, txg_how_t txg_how)
{
	spa_t *spa = tx->tx_pool->dp_spa;

	ASSERT0(tx->tx_txg);

	/* A dmu_tx_hold_*() routine already recorded an error; fail now. */
	if (tx->tx_err)
		return (tx->tx_err);

	if (spa_suspended(spa)) {
		/*
		 * If the user has indicated a blocking failure mode
		 * then return ERESTART which will block in dmu_tx_wait().
		 * Otherwise, return EIO so that an error can get
		 * propagated back to the VOP calls.
		 *
		 * Note that we always honor the txg_how flag regardless
		 * of the failuremode setting.
		 */
		if (spa_get_failmode(spa) == ZIO_FAILURE_MODE_CONTINUE &&
		    txg_how != TXG_WAIT)
			return (SET_ERROR(EIO));

		return (SET_ERROR(ERESTART));
	}

	/*
	 * Push back on the caller via dmu_tx_wait() if too much dirty
	 * data has accumulated, unless the caller has already waited.
	 */
	if (!tx->tx_waited &&
	    dsl_pool_need_dirty_delay(tx->tx_pool)) {
		tx->tx_wait_dirty = B_TRUE;
		return (SET_ERROR(ERESTART));
	}

	tx->tx_txg = txg_hold_open(tx->tx_pool, &tx->tx_txgh);
	tx->tx_needassign_txh = NULL;

	/*
	 * NB: No error returns are allowed after txg_hold_open, but
	 * before processing the dnode holds, due to the
	 * dmu_tx_unassign() logic.
	 */

	uint64_t towrite = 0;
	uint64_t tohold = 0;
	for (dmu_tx_hold_t *txh = list_head(&tx->tx_holds); txh != NULL;
	    txh = list_next(&tx->tx_holds, txh)) {
		dnode_t *dn = txh->txh_dnode;
		if (dn != NULL) {
			mutex_enter(&dn->dn_mtx);
			if (dn->dn_assigned_txg == tx->tx_txg - 1) {
				/*
				 * The dnode is still assigned to the
				 * quiescing txg; remember which hold
				 * blocked us so dmu_tx_unassign() stops
				 * here and dmu_tx_wait() knows what to
				 * wait for.
				 */
				mutex_exit(&dn->dn_mtx);
				tx->tx_needassign_txh = txh;
				return (SET_ERROR(ERESTART));
			}
			if (dn->dn_assigned_txg == 0)
				dn->dn_assigned_txg = tx->tx_txg;
			ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg);
			(void) refcount_add(&dn->dn_tx_holds, tx);
			mutex_exit(&dn->dn_mtx);
		}
		towrite += refcount_count(&txh->txh_space_towrite);
		tohold += refcount_count(&txh->txh_memory_tohold);
	}

	/* needed allocation: worst-case estimate of write space */
	uint64_t asize = spa_get_worst_case_asize(tx->tx_pool->dp_spa, towrite);
	/* calculate memory footprint estimate */
	uint64_t memory = towrite + tohold;

	if (tx->tx_dir != NULL && asize != 0) {
		int err = dsl_dir_tempreserve_space(tx->tx_dir, memory,
		    asize, tx->tx_netfree, &tx->tx_tempreserve_cookie, tx);
		if (err != 0)
			return (err);
	}

	return (0);
}

/*
 * Undo a (possibly partial) dmu_tx_try_assign(): drop the dnode holds
 * taken so far — stopping at tx_needassign_txh, the hold that caused
 * the ERESTART — and release the open txg.  Records tx_lasttried_txg
 * so dmu_tx_wait() knows which txg to wait out.
 */
static void
dmu_tx_unassign(dmu_tx_t *tx)
{
	if (tx->tx_txg == 0)
		return;

	txg_rele_to_quiesce(&tx->tx_txgh);

	/*
	 * Walk the transaction's hold list, removing the hold on the
	 * associated dnode, and notifying waiters if the refcount drops to 0.
	 */
	for (dmu_tx_hold_t *txh = list_head(&tx->tx_holds);
	    txh != tx->tx_needassign_txh;
	    txh = list_next(&tx->tx_holds, txh)) {
		dnode_t *dn = txh->txh_dnode;

		if (dn == NULL)
			continue;
		mutex_enter(&dn->dn_mtx);
		ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg);

		if (refcount_remove(&dn->dn_tx_holds, tx) == 0) {
			dn->dn_assigned_txg = 0;
			cv_broadcast(&dn->dn_notxholds);
		}
		mutex_exit(&dn->dn_mtx);
	}

	txg_rele_to_sync(&tx->tx_txgh);

	tx->tx_lasttried_txg = tx->tx_txg;
	tx->tx_txg = 0;
}

/*
 * Assign tx to a transaction group.  txg_how can be one of:
 *
 * (1)	TXG_WAIT.  If the current open txg is full, waits until there's
 *	a new one.  This should be used when you're not holding locks.
 *	It will only fail if we're truly out of space (or over quota).
 *
 * (2)	TXG_NOWAIT.  If we can't assign into the current open txg without
 *	blocking, returns immediately with ERESTART.  This should be used
 *	whenever you're holding locks.  On an ERESTART error, the caller
 *	should drop locks, do a dmu_tx_wait(tx), and try again.
 *
 * (3)	TXG_WAITED.  Like TXG_NOWAIT, but indicates that dmu_tx_wait()
 *	has already been called on behalf of this operation (though
 *	most likely on a different tx).
 */
int
dmu_tx_assign(dmu_tx_t *tx, txg_how_t txg_how)
{
	int err;

	ASSERT(tx->tx_txg == 0);
	ASSERT(txg_how == TXG_WAIT || txg_how == TXG_NOWAIT ||
	    txg_how == TXG_WAITED);
	ASSERT(!dsl_pool_sync_context(tx->tx_pool));

	/* If we might wait, we must not hold the config lock. */
	ASSERT(txg_how != TXG_WAIT || !dsl_pool_config_held(tx->tx_pool));

	if (txg_how == TXG_WAITED)
		tx->tx_waited = B_TRUE;

	/*
	 * Retry loop: on ERESTART with TXG_WAIT, unwind the partial
	 * assignment, block on whatever caused it, and try again.
	 * Any other error (or non-waiting mode) is returned to the caller.
	 */
	while ((err = dmu_tx_try_assign(tx, txg_how)) != 0) {
		dmu_tx_unassign(tx);

		if (err != ERESTART || txg_how != TXG_WAIT)
			return (err);

		dmu_tx_wait(tx);
	}

	txg_rele_to_quiesce(&tx->tx_txgh);

	return (0);
}

/*
 * Block until the obstacle that made dmu_tx_try_assign() fail has
 * cleared.  The branches below cover, in order: the dirty-data limit,
 * a suspended pool (or no prior attempt), a dnode still assigned to
 * the quiescing txg, and a full open txg.  Must be called with no txg
 * assigned (tx_txg == 0, i.e. after dmu_tx_unassign()).
 */
void
dmu_tx_wait(dmu_tx_t *tx)
{
	spa_t *spa = tx->tx_pool->dp_spa;
	dsl_pool_t *dp = tx->tx_pool;

	ASSERT(tx->tx_txg == 0);
	ASSERT(!dsl_pool_config_held(tx->tx_pool));

	if (tx->tx_wait_dirty) {
		/*
		 * dmu_tx_try_assign() has determined that we need to wait
		 * because we've consumed much or all of the dirty buffer
		 * space.  First wait until we're under the max, then apply
		 * the smooth write-throttle delay based on how dirty the
		 * pool still is.
		 */
		mutex_enter(&dp->dp_lock);
		while (dp->dp_dirty_total >= zfs_dirty_data_max)
			cv_wait(&dp->dp_spaceavail_cv, &dp->dp_lock);
		uint64_t dirty = dp->dp_dirty_total;
		mutex_exit(&dp->dp_lock);

		dmu_tx_delay(tx, dirty);

		tx->tx_wait_dirty = B_FALSE;

		/*
		 * Note: setting tx_waited only has effect if the caller
		 * used TX_WAIT.  Otherwise they are going to destroy
		 * this tx and try again.  The common case, zfs_write(),
		 * uses TX_WAIT.
		 */
		tx->tx_waited = B_TRUE;
	} else if (spa_suspended(spa) || tx->tx_lasttried_txg == 0) {
		/*
		 * If the pool is suspended we need to wait until it
		 * is resumed.  Note that it's possible that the pool
		 * has become active after this thread has tried to
		 * obtain a tx.  If that's the case then tx_lasttried_txg
		 * would not have been set.
		 */
		txg_wait_synced(dp, spa_last_synced_txg(spa) + 1);
	} else if (tx->tx_needassign_txh) {
		/*
		 * A dnode is assigned to the quiescing txg.  Wait for its
		 * transaction to complete.
		 */
		dnode_t *dn = tx->tx_needassign_txh->txh_dnode;

		mutex_enter(&dn->dn_mtx);
		while (dn->dn_assigned_txg == tx->tx_lasttried_txg - 1)
			cv_wait(&dn->dn_notxholds, &dn->dn_mtx);
		mutex_exit(&dn->dn_mtx);
		tx->tx_needassign_txh = NULL;
	} else {
		/* The open txg was full; wait for the next one to open. */
		txg_wait_open(tx->tx_pool, tx->tx_lasttried_txg + 1);
	}
}

/*
 * Free the tx and everything it references: release each hold's dnode
 * reference and its space/memory refcounts, then the hold and callback
 * lists, then the tx itself.  Common tail of dmu_tx_commit() and
 * dmu_tx_abort().
 */
static void
dmu_tx_destroy(dmu_tx_t *tx)
{
	dmu_tx_hold_t *txh;

	while ((txh = list_head(&tx->tx_holds)) != NULL) {
		dnode_t *dn = txh->txh_dnode;

		list_remove(&tx->tx_holds, txh);
		refcount_destroy_many(&txh->txh_space_towrite,
		    refcount_count(&txh->txh_space_towrite));
		refcount_destroy_many(&txh->txh_memory_tohold,
		    refcount_count(&txh->txh_memory_tohold));
		kmem_free(txh, sizeof (dmu_tx_hold_t));
		if (dn != NULL)
			dnode_rele(dn, tx);
	}

	list_destroy(&tx->tx_callbacks);
	list_destroy(&tx->tx_holds);
	kmem_free(tx, sizeof (dmu_tx_t));
}

/*
 * Commit an assigned transaction: the caller's changes are done and the
 * txg may now sync.  Consumes the tx.
 */
void
dmu_tx_commit(dmu_tx_t *tx)
{
	ASSERT(tx->tx_txg != 0);

	/*
	 * Go through the transaction's hold list and remove holds on
	 * associated dnodes, notifying waiters if no holds remain.
	 */
	for (dmu_tx_hold_t *txh = list_head(&tx->tx_holds); txh != NULL;
	    txh = list_next(&tx->tx_holds, txh)) {
		dnode_t *dn = txh->txh_dnode;

		if (dn == NULL)
			continue;

		mutex_enter(&dn->dn_mtx);
		ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg);

		if (refcount_remove(&dn->dn_tx_holds, tx) == 0) {
			dn->dn_assigned_txg = 0;
			cv_broadcast(&dn->dn_notxholds);
		}
		mutex_exit(&dn->dn_mtx);
	}

	/* Return any space temporarily reserved by dmu_tx_try_assign(). */
	if (tx->tx_tempreserve_cookie)
		dsl_dir_tempreserve_clear(tx->tx_tempreserve_cookie, tx);

	if (!list_is_empty(&tx->tx_callbacks))
		txg_register_callbacks(&tx->tx_txgh, &tx->tx_callbacks);

	if (tx->tx_anyobj == FALSE)
		txg_rele_to_sync(&tx->tx_txgh);

	dmu_tx_destroy(tx);
}

/*
 * Abandon a transaction that was never assigned to a txg (tx_txg == 0):
 * run any registered callbacks with ECANCELED and free the tx.
 */
void
dmu_tx_abort(dmu_tx_t *tx)
{
	ASSERT(tx->tx_txg == 0);

	/*
	 * Call any registered callbacks with an error code.
	 */
	if (!list_is_empty(&tx->tx_callbacks))
		dmu_tx_do_callbacks(&tx->tx_callbacks, ECANCELED);

	dmu_tx_destroy(tx);
}

/* Return the txg this tx is assigned to; valid only after dmu_tx_assign(). */
uint64_t
dmu_tx_get_txg(dmu_tx_t *tx)
{
	ASSERT(tx->tx_txg != 0);
	return (tx->tx_txg);
}

/* Return the pool this tx operates on. */
dsl_pool_t *
dmu_tx_pool(dmu_tx_t *tx)
{
	ASSERT(tx->tx_pool != NULL);
	return (tx->tx_pool);
}

/*
 * Register a commit callback on this tx.  The callback fires when the
 * tx's txg is handled (see txg_register_callbacks()), or with ECANCELED
 * if the tx is aborted.
 */
void
dmu_tx_callback_register(dmu_tx_t *tx, dmu_tx_callback_func_t *func, void *data)
{
	dmu_tx_callback_t *dcb;

	dcb = kmem_alloc(sizeof (dmu_tx_callback_t), KM_SLEEP);

	dcb->dcb_func = func;
	dcb->dcb_data = data;

	list_insert_tail(&tx->tx_callbacks, dcb);
}

/*
 * Call all the commit callbacks on a list, with a given error code.
 * Each callback entry is removed and freed as it is invoked.
 */
void
dmu_tx_do_callbacks(list_t *cb_list, int error)
{
	dmu_tx_callback_t *dcb;

	while ((dcb = list_head(cb_list)) != NULL) {
		list_remove(cb_list, dcb);
		dcb->dcb_func(dcb->dcb_data, error);
		kmem_free(dcb, sizeof (dmu_tx_callback_t));
	}
}

/*
 * Interface to hold a bunch of attributes.
 * used for creating new files.
 * attrsize is the total size of all attributes
 * to be added during object creation
 *
 * For updating/adding a single attribute dmu_tx_hold_sa() should be used.
 */

/*
 * hold necessary attribute name for attribute registration.
 * should be a very rare case where this is needed.  If it does
 * happen it would only happen on the first write to the file system.
 */
static void
dmu_tx_sa_registration_hold(sa_os_t *sa, dmu_tx_t *tx)
{
	if (!sa->sa_need_attr_registration)
		return;

	/* Hold the registry ZAP for every attribute not yet registered. */
	for (int i = 0; i != sa->sa_num_attrs; i++) {
		if (!sa->sa_attr_table[i].sa_registered) {
			if (sa->sa_reg_attr_obj)
				dmu_tx_hold_zap(tx, sa->sa_reg_attr_obj,
				    B_TRUE, sa->sa_attr_table[i].sa_name);
			else
				dmu_tx_hold_zap(tx, DMU_NEW_OBJECT,
				    B_TRUE, sa->sa_attr_table[i].sa_name);
		}
	}
}

void
dmu_tx_hold_spill(dmu_tx_t *tx, uint64_t object)
{
	dmu_tx_hold_t *txh = dmu_tx_hold_object_impl(tx,
	    tx->tx_objset, object, THT_SPILL, 0, 0);

	/* Charge the worst case: one old-style maximum-size spill block. */
	(void) refcount_add_many(&txh->txh_space_towrite,
	    SPA_OLD_MAXBLOCKSIZE, FTAG);
}

/*
 * Take the holds needed to create a new object's system attributes:
 * the new object's bonus buffer, the SA layout/registry ZAPs, and — if
 * attrsize won't fit in the bonus buffer — a spill block.
 */
void
dmu_tx_hold_sa_create(dmu_tx_t *tx, int attrsize)
{
	sa_os_t *sa = tx->tx_objset->os_sa;

	dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);

	/* No SA master object: this objset doesn't use system attributes. */
	if (tx->tx_objset->os_sa->sa_master_obj == 0)
		return;

	if (tx->tx_objset->os_sa->sa_layout_attr_obj) {
		dmu_tx_hold_zap(tx, sa->sa_layout_attr_obj, B_TRUE, NULL);
	} else {
		/*
		 * The layout/registry ZAPs don't exist yet; hold the master
		 * object and two to-be-created ZAP objects.
		 */
		dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_LAYOUTS);
		dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_REGISTRY);
		dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL);
		dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL);
	}

	dmu_tx_sa_registration_hold(sa, tx);

	/* Attributes fit in the bonus buffer: no spill block needed. */
	if (attrsize <= DN_MAX_BONUSLEN && !sa->sa_force_spill)
		return;

	(void) dmu_tx_hold_object_impl(tx, tx->tx_objset, DMU_NEW_OBJECT,
	    THT_SPILL, 0, 0);
}

/*
 * Hold SA attribute
 *
 * dmu_tx_hold_sa(dmu_tx_t *tx, sa_handle_t *, attribute, add, size)
 *
 * variable_size is the total size of all variable sized attributes
 * passed to this function.  It is not the total size of all
 * variable size attributes that *may* exist on this object.
 */
void
dmu_tx_hold_sa(dmu_tx_t *tx, sa_handle_t *hdl, boolean_t may_grow)
{
	uint64_t object;
	sa_os_t *sa = tx->tx_objset->os_sa;

	ASSERT(hdl != NULL);

	object = sa_handle_object(hdl);

	dmu_tx_hold_bonus(tx, object);

	/* No SA master object: this objset doesn't use system attributes. */
	if (tx->tx_objset->os_sa->sa_master_obj == 0)
		return;

	if (tx->tx_objset->os_sa->sa_reg_attr_obj == 0 ||
	    tx->tx_objset->os_sa->sa_layout_attr_obj == 0) {
		/* First SA write: the layout/registry ZAPs must be created. */
		dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_LAYOUTS);
		dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_REGISTRY);
		dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL);
		dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL);
	}

	dmu_tx_sa_registration_hold(sa, tx);

	/* Growing the SA may add a new layout entry to the layout ZAP. */
	if (may_grow && tx->tx_objset->os_sa->sa_layout_attr_obj)
		dmu_tx_hold_zap(tx, sa->sa_layout_attr_obj, B_TRUE, NULL);

	if (sa->sa_force_spill || may_grow || hdl->sa_spill) {
		ASSERT(tx->tx_txg == 0);
		dmu_tx_hold_spill(tx, object);
	} else {
		/*
		 * Even if we aren't growing, an existing spill block must
		 * be held, since the SA update may touch it.
		 */
		dmu_buf_impl_t *db = (dmu_buf_impl_t *)hdl->sa_bonus;
		dnode_t *dn;

		DB_DNODE_ENTER(db);
		dn = DB_DNODE(db);
		if (dn->dn_have_spill) {
			ASSERT(tx->tx_txg == 0);
			dmu_tx_hold_spill(tx, object);
		}
		DB_DNODE_EXIT(db);
	}
}