1fa9e4066Sahrens /* 2fa9e4066Sahrens * CDDL HEADER START 3fa9e4066Sahrens * 4fa9e4066Sahrens * The contents of this file are subject to the terms of the 5f65e61c0Sahrens * Common Development and Distribution License (the "License"). 6f65e61c0Sahrens * You may not use this file except in compliance with the License. 7fa9e4066Sahrens * 8fa9e4066Sahrens * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9fa9e4066Sahrens * or http://www.opensolaris.org/os/licensing. 10fa9e4066Sahrens * See the License for the specific language governing permissions 11fa9e4066Sahrens * and limitations under the License. 12fa9e4066Sahrens * 13fa9e4066Sahrens * When distributing Covered Code, include this CDDL HEADER in each 14fa9e4066Sahrens * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15fa9e4066Sahrens * If applicable, add the following below this CDDL HEADER, with the 16fa9e4066Sahrens * fields enclosed by brackets "[]" replaced with your own identifying 17fa9e4066Sahrens * information: Portions Copyright [yyyy] [name of copyright owner] 18fa9e4066Sahrens * 19fa9e4066Sahrens * CDDL HEADER END 20fa9e4066Sahrens */ 21fa9e4066Sahrens /* 228f38d419Sek * Copyright 2008 Sun Microsystems, Inc. All rights reserved. 23fa9e4066Sahrens * Use is subject to license terms. 
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

#include <sys/dmu.h>
#include <sys/dmu_impl.h>
#include <sys/dbuf.h>
#include <sys/dmu_tx.h>
#include <sys/dmu_objset.h>
#include <sys/dsl_dataset.h> /* for dsl_dataset_block_freeable() */
#include <sys/dsl_dir.h> /* for dsl_dir_tempreserve_*() */
#include <sys/dsl_pool.h>
#include <sys/zap_impl.h> /* for fzap_default_block_shift */
#include <sys/spa.h>
#include <sys/zfs_context.h>

typedef void (*dmu_tx_hold_func_t)(dmu_tx_t *tx, struct dnode *dn,
    uint64_t arg1, uint64_t arg2);


/*
 * Allocate an empty transaction against the given dsl_dir.  'dd' may be
 * NULL (see dmu_tx_create_assigned()); tx_pool is only initialized here
 * when a dsl_dir is supplied.  The space_written/space_freed refcounts
 * are debug-only accounting.
 */
dmu_tx_t *
dmu_tx_create_dd(dsl_dir_t *dd)
{
	dmu_tx_t *tx = kmem_zalloc(sizeof (dmu_tx_t), KM_SLEEP);
	tx->tx_dir = dd;
	if (dd)
		tx->tx_pool = dd->dd_pool;
	list_create(&tx->tx_holds, sizeof (dmu_tx_hold_t),
	    offsetof(dmu_tx_hold_t, txh_node));
#ifdef ZFS_DEBUG
	refcount_create(&tx->tx_space_written);
	refcount_create(&tx->tx_space_freed);
#endif
	return (tx);
}

/*
 * Create a transaction against an objset.  Records the txg of the most
 * recent snapshot so that hold-time "freeable" estimates can later be
 * invalidated if a snapshot is taken in the meantime (see the check in
 * dmu_tx_try_assign()).
 */
dmu_tx_t *
dmu_tx_create(objset_t *os)
{
	dmu_tx_t *tx = dmu_tx_create_dd(os->os->os_dsl_dataset->ds_dir);
	tx->tx_objset = os;
	tx->tx_lastsnap_txg = dsl_dataset_prev_snap_txg(os->os->os_dsl_dataset);
	return (tx);
}

/*
 * Create a transaction pre-assigned to the given (already-open) txg,
 * for use by sync-context code.  tx_anyobj marks it as exempt from the
 * normal per-object hold checking.
 */
dmu_tx_t *
dmu_tx_create_assigned(struct dsl_pool *dp, uint64_t txg)
{
	dmu_tx_t *tx = dmu_tx_create_dd(NULL);

	ASSERT3U(txg, <=, dp->dp_tx.tx_open_txg);
	tx->tx_pool = dp;
	tx->tx_txg = txg;
	tx->tx_anyobj = TRUE;

	return (tx);
}

int
dmu_tx_is_syncing(dmu_tx_t *tx)
{
	return (tx->tx_anyobj);
}

int
dmu_tx_private_ok(dmu_tx_t *tx)
{
	return (tx->tx_anyobj);
}

/*
 * Create a hold record for 'object' (or DMU_NEW_OBJECT) on this tx.
 * If the object exists and the tx is already assigned to a txg, also
 * take a dnode tx-hold and bind the dnode to that txg.  On dnode_hold()
 * failure, stashes the error in tx_err and returns NULL.
 */
static dmu_tx_hold_t *
dmu_tx_hold_object_impl(dmu_tx_t *tx, objset_t *os, uint64_t object,
    enum dmu_tx_hold_type type, uint64_t arg1, uint64_t arg2)
{
	dmu_tx_hold_t *txh;
	dnode_t *dn = NULL;
	int err;

	if (object != DMU_NEW_OBJECT) {
		err = dnode_hold(os->os, object, tx, &dn);
		if (err) {
			tx->tx_err = err;
			return (NULL);
		}

		/* NOTE(review): err is necessarily 0 here; the test is redundant */
		if (err == 0 && tx->tx_txg != 0) {
			mutex_enter(&dn->dn_mtx);
			/*
			 * dn->dn_assigned_txg == tx->tx_txg doesn't pose a
			 * problem, but there's no way for it to happen (for
			 * now, at least).
			 */
			ASSERT(dn->dn_assigned_txg == 0);
			dn->dn_assigned_txg = tx->tx_txg;
			(void) refcount_add(&dn->dn_tx_holds, tx);
			mutex_exit(&dn->dn_mtx);
		}
	}

	txh = kmem_zalloc(sizeof (dmu_tx_hold_t), KM_SLEEP);
	txh->txh_tx = tx;
	txh->txh_dnode = dn;
#ifdef ZFS_DEBUG
	txh->txh_type = type;
	txh->txh_arg1 = arg1;
	txh->txh_arg2 = arg2;
#endif
	list_insert_tail(&tx->tx_holds, txh);

	return (txh);
}

void
dmu_tx_add_new_object(dmu_tx_t *tx, objset_t *os, uint64_t object)
{
	/*
	 * If we're syncing, they can manipulate any object anyhow, and
	 * the hold on the dnode_t can cause problems.
	 */
	if (!dmu_tx_is_syncing(tx)) {
		(void) dmu_tx_hold_object_impl(tx, os,
		    object, THT_NEWOBJECT, 0, 0);
	}
}

/*
 * Read one buffer at (level, blkid) to surface any i/o error now,
 * rather than at dirty time.  Returns EIO if the dbuf can't even be
 * held, otherwise the dbuf_read() result.
 */
static int
dmu_tx_check_ioerr(zio_t *zio, dnode_t *dn, int level, uint64_t blkid)
{
	int err;
	dmu_buf_impl_t *db;

	rw_enter(&dn->dn_struct_rwlock, RW_READER);
	db = dbuf_hold_level(dn, level, blkid, FTAG);
	rw_exit(&dn->dn_struct_rwlock);
	if (db == NULL)
		return (EIO);
	err = dbuf_read(db, zio, DB_RF_CANFAIL | DB_RF_NOPREFETCH);
	dbuf_rele(db, FTAG);
	return (err);
}

/*
 * Add a worst-case estimate of the space needed to write [off, off+len)
 * to txh_space_towrite, including all the indirect blocks that could be
 * dirtied.  Also pre-reads the affected blocks to detect i/o errors
 * early; any error is recorded in tx_err.
 */
/* ARGSUSED */
static void
dmu_tx_count_write(dmu_tx_hold_t *txh, uint64_t off, uint64_t len)
{
	dnode_t *dn = txh->txh_dnode;
	uint64_t start, end, i;
	int min_bs, max_bs, min_ibs, max_ibs, epbs, bits;
	int err = 0;

	if (len == 0)
		return;

	min_bs = SPA_MINBLOCKSHIFT;
	max_bs = SPA_MAXBLOCKSHIFT;
	min_ibs = DN_MIN_INDBLKSHIFT;
	max_ibs = DN_MAX_INDBLKSHIFT;


	/*
	 * For i/o error checking, read the first and last level-0
	 * blocks (if they are not aligned), and all the level-1 blocks.
	 */

	if (dn) {
		if (dn->dn_maxblkid == 0) {
			err = dmu_tx_check_ioerr(NULL, dn, 0, 0);
			if (err)
				goto out;
		} else {
			zio_t *zio = zio_root(dn->dn_objset->os_spa,
			    NULL, NULL, ZIO_FLAG_CANFAIL);

			/* first level-0 block */
			start = off >> dn->dn_datablkshift;
			if (P2PHASE(off, dn->dn_datablksz) ||
			    len < dn->dn_datablksz) {
				err = dmu_tx_check_ioerr(zio, dn, 0, start);
				if (err)
					goto out;
			}

			/* last level-0 block */
			end = (off+len-1) >> dn->dn_datablkshift;
			if (end != start &&
			    P2PHASE(off+len, dn->dn_datablksz)) {
				err = dmu_tx_check_ioerr(zio, dn, 0, end);
				if (err)
					goto out;
			}

			/* level-1 blocks */
			if (dn->dn_nlevels > 1) {
				start >>= dn->dn_indblkshift - SPA_BLKPTRSHIFT;
				end >>= dn->dn_indblkshift - SPA_BLKPTRSHIFT;
				for (i = start+1; i < end; i++) {
					err = dmu_tx_check_ioerr(zio, dn, 1, i);
					if (err)
						goto out;
				}
			}

			err = zio_wait(zio);
			if (err)
				goto out;
		}
	}

	/*
	 * If there's more than one block, the blocksize can't change,
	 * so we can make a more precise estimate.  Alternatively,
	 * if the dnode's ibs is larger than max_ibs, always use that.
	 * This ensures that if we reduce DN_MAX_INDBLKSHIFT,
	 * the code will still work correctly on existing pools.
	 */
	if (dn && (dn->dn_maxblkid != 0 || dn->dn_indblkshift > max_ibs)) {
		min_ibs = max_ibs = dn->dn_indblkshift;
		if (dn->dn_datablkshift != 0)
			min_bs = max_bs = dn->dn_datablkshift;
	}

	/*
	 * 'end' is the last thing we will access, not one past.
	 * This way we won't overflow when accessing the last byte.
	 */
	start = P2ALIGN(off, 1ULL << max_bs);
	end = P2ROUNDUP(off + len, 1ULL << max_bs) - 1;
	txh->txh_space_towrite += end - start + 1;

	start >>= min_bs;
	end >>= min_bs;

	epbs = min_ibs - SPA_BLKPTRSHIFT;

	/*
	 * The object contains at most 2^(64 - min_bs) blocks,
	 * and each indirect level maps 2^epbs.
	 */
	for (bits = 64 - min_bs; bits >= 0; bits -= epbs) {
		start >>= epbs;
		end >>= epbs;
		/*
		 * If we increase the number of levels of indirection,
		 * we'll need new blkid=0 indirect blocks.  If start == 0,
		 * we're already accounting for that block; and if end == 0,
		 * we can't increase the number of levels beyond that.
		 */
		if (start != 0 && end != 0)
			txh->txh_space_towrite += 1ULL << max_ibs;
		txh->txh_space_towrite += (end - start + 1) << max_ibs;
	}

	ASSERT(txh->txh_space_towrite < 2 * DMU_MAX_ACCESS);

out:
	if (err)
		txh->txh_tx->tx_err = err;
}

/*
 * Account for dirtying this object's dnode: a meta-dnode data block
 * plus one indirect per meta-dnode level.  Counted as "overwrite" when
 * the dnode's current block is freeable (born since the last snapshot),
 * otherwise as a fresh write (plus unref of the old block, if any).
 */
static void
dmu_tx_count_dnode(dmu_tx_hold_t *txh)
{
	dnode_t *dn = txh->txh_dnode;
	dnode_t *mdn = txh->txh_tx->tx_objset->os->os_meta_dnode;
	uint64_t space = mdn->dn_datablksz +
	    ((mdn->dn_nlevels-1) << mdn->dn_indblkshift);

	if (dn && dn->dn_dbuf->db_blkptr &&
	    dsl_dataset_block_freeable(dn->dn_objset->os_dsl_dataset,
	    dn->dn_dbuf->db_blkptr->blk_birth)) {
		txh->txh_space_tooverwrite += space;
	} else {
		txh->txh_space_towrite += space;
		if (dn && dn->dn_dbuf->db_blkptr)
			txh->txh_space_tounref += space;
	}
}

void
dmu_tx_hold_write(dmu_tx_t *tx, uint64_t object, uint64_t off, int len)
{
	dmu_tx_hold_t *txh;

	ASSERT(tx->tx_txg == 0);
	ASSERT(len < DMU_MAX_ACCESS);
	ASSERT(len == 0 || UINT64_MAX - off >= len - 1);

	txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
	    object, THT_WRITE, off, len);
	if (txh == NULL)
		return;

	dmu_tx_count_write(txh, off, len);

dmu_tx_count_dnode(txh); 318fa9e4066Sahrens } 319fa9e4066Sahrens 320fa9e4066Sahrens static void 3218a2f1b91Sahrens dmu_tx_count_free(dmu_tx_hold_t *txh, uint64_t off, uint64_t len) 322fa9e4066Sahrens { 323*cdb0ab79Smaybee uint64_t blkid, nblks, lastblk; 324*cdb0ab79Smaybee uint64_t space = 0, unref = 0, skipped = 0; 3258a2f1b91Sahrens dnode_t *dn = txh->txh_dnode; 326fa9e4066Sahrens dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset; 3278a2f1b91Sahrens spa_t *spa = txh->txh_tx->tx_pool->dp_spa; 328*cdb0ab79Smaybee int epbs; 329fa9e4066Sahrens 330*cdb0ab79Smaybee if (dn->dn_nlevels == 0) 331fa9e4066Sahrens return; 332c543ec06Sahrens 333fa9e4066Sahrens /* 334*cdb0ab79Smaybee * The struct_rwlock protects us against dn_nlevels 335c543ec06Sahrens * changing, in case (against all odds) we manage to dirty & 336c543ec06Sahrens * sync out the changes after we check for being dirty. 337*cdb0ab79Smaybee * Also, dbuf_hold_level() wants us to have the struct_rwlock. 338fa9e4066Sahrens */ 339fa9e4066Sahrens rw_enter(&dn->dn_struct_rwlock, RW_READER); 340*cdb0ab79Smaybee epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; 341*cdb0ab79Smaybee if (dn->dn_maxblkid == 0) { 342c543ec06Sahrens if (off == 0 && len >= dn->dn_datablksz) { 343c543ec06Sahrens blkid = 0; 344c543ec06Sahrens nblks = 1; 345c543ec06Sahrens } else { 346c543ec06Sahrens rw_exit(&dn->dn_struct_rwlock); 347c543ec06Sahrens return; 348c543ec06Sahrens } 349c543ec06Sahrens } else { 350c543ec06Sahrens blkid = off >> dn->dn_datablkshift; 351*cdb0ab79Smaybee nblks = (len + dn->dn_datablksz - 1) >> dn->dn_datablkshift; 352fa9e4066Sahrens 353*cdb0ab79Smaybee if (blkid >= dn->dn_maxblkid) { 354c543ec06Sahrens rw_exit(&dn->dn_struct_rwlock); 355c543ec06Sahrens return; 356c543ec06Sahrens } 357*cdb0ab79Smaybee if (blkid + nblks > dn->dn_maxblkid) 358*cdb0ab79Smaybee nblks = dn->dn_maxblkid - blkid; 359fa9e4066Sahrens 360c543ec06Sahrens } 361*cdb0ab79Smaybee if (dn->dn_nlevels == 1) { 362fa9e4066Sahrens int i; 363fa9e4066Sahrens for (i = 
0; i < nblks; i++) { 364fa9e4066Sahrens blkptr_t *bp = dn->dn_phys->dn_blkptr; 365*cdb0ab79Smaybee ASSERT3U(blkid + i, <, dn->dn_nblkptr); 366fa9e4066Sahrens bp += blkid + i; 367ea8dc4b6Seschrock if (dsl_dataset_block_freeable(ds, bp->blk_birth)) { 368fa9e4066Sahrens dprintf_bp(bp, "can free old%s", ""); 36999653d4eSeschrock space += bp_get_dasize(spa, bp); 370fa9e4066Sahrens } 371a9799022Sck unref += BP_GET_ASIZE(bp); 372fa9e4066Sahrens } 373ea8dc4b6Seschrock nblks = 0; 374fa9e4066Sahrens } 375fa9e4066Sahrens 376*cdb0ab79Smaybee /* 377*cdb0ab79Smaybee * Add in memory requirements of higher-level indirects 378*cdb0ab79Smaybee */ 379*cdb0ab79Smaybee if (nblks && dn->dn_nlevels > 2) { 380*cdb0ab79Smaybee uint64_t blkcnt = 1 + ((nblks >> epbs) >> epbs); 381*cdb0ab79Smaybee int level = 2; 382*cdb0ab79Smaybee 383*cdb0ab79Smaybee while (level++ < dn->dn_nlevels) { 384*cdb0ab79Smaybee txh->txh_memory_tohold += blkcnt << dn->dn_indblkshift; 385*cdb0ab79Smaybee blkcnt = 1 + (blkcnt >> epbs); 386*cdb0ab79Smaybee } 387*cdb0ab79Smaybee ASSERT(blkcnt <= dn->dn_nblkptr); 388*cdb0ab79Smaybee } 389*cdb0ab79Smaybee 390*cdb0ab79Smaybee lastblk = blkid + nblks - 1; 391fa9e4066Sahrens while (nblks) { 392fa9e4066Sahrens dmu_buf_impl_t *dbuf; 393*cdb0ab79Smaybee uint64_t ibyte, new_blkid; 394*cdb0ab79Smaybee int epb = 1 << epbs; 395*cdb0ab79Smaybee int err, i, blkoff, tochk; 396*cdb0ab79Smaybee blkptr_t *bp; 397*cdb0ab79Smaybee 398*cdb0ab79Smaybee ibyte = blkid << dn->dn_datablkshift; 399*cdb0ab79Smaybee err = dnode_next_offset(dn, 400*cdb0ab79Smaybee DNODE_FIND_HAVELOCK, &ibyte, 2, 1, 0); 401*cdb0ab79Smaybee new_blkid = ibyte >> dn->dn_datablkshift; 402*cdb0ab79Smaybee if (err == ESRCH) 403*cdb0ab79Smaybee break; 404*cdb0ab79Smaybee if (err) { 405*cdb0ab79Smaybee txh->txh_tx->tx_err = err; 406*cdb0ab79Smaybee break; 407*cdb0ab79Smaybee } 408*cdb0ab79Smaybee if (new_blkid > lastblk) 409*cdb0ab79Smaybee break; 410fa9e4066Sahrens 411*cdb0ab79Smaybee if (new_blkid > blkid) { 
412*cdb0ab79Smaybee skipped += new_blkid - blkid - 1; 413*cdb0ab79Smaybee nblks -= new_blkid - blkid; 414*cdb0ab79Smaybee blkid = new_blkid; 415*cdb0ab79Smaybee } 416*cdb0ab79Smaybee blkoff = P2PHASE(blkid, epb); 417*cdb0ab79Smaybee tochk = MIN(epb - blkoff, nblks); 418fa9e4066Sahrens 419*cdb0ab79Smaybee dbuf = dbuf_hold_level(dn, 1, blkid >> epbs, FTAG); 420*cdb0ab79Smaybee 421*cdb0ab79Smaybee txh->txh_memory_tohold += dbuf->db.db_size; 422*cdb0ab79Smaybee if (txh->txh_memory_tohold > DMU_MAX_ACCESS) { 423*cdb0ab79Smaybee txh->txh_tx->tx_err = E2BIG; 424ea8dc4b6Seschrock dbuf_rele(dbuf, FTAG); 425*cdb0ab79Smaybee break; 426c543ec06Sahrens } 427*cdb0ab79Smaybee err = dbuf_read(dbuf, NULL, DB_RF_HAVESTRUCT | DB_RF_CANFAIL); 428*cdb0ab79Smaybee if (err != 0) { 4298a2f1b91Sahrens txh->txh_tx->tx_err = err; 430*cdb0ab79Smaybee dbuf_rele(dbuf, FTAG); 431c543ec06Sahrens break; 432fa9e4066Sahrens } 433fa9e4066Sahrens 434*cdb0ab79Smaybee bp = dbuf->db.db_data; 435*cdb0ab79Smaybee bp += blkoff; 436*cdb0ab79Smaybee 437*cdb0ab79Smaybee for (i = 0; i < tochk; i++) { 438*cdb0ab79Smaybee if (dsl_dataset_block_freeable(ds, bp[i].blk_birth)) { 439*cdb0ab79Smaybee dprintf_bp(&bp[i], "can free old%s", ""); 440*cdb0ab79Smaybee space += bp_get_dasize(spa, &bp[i]); 441*cdb0ab79Smaybee } 442*cdb0ab79Smaybee unref += BP_GET_ASIZE(bp); 443*cdb0ab79Smaybee } 444*cdb0ab79Smaybee dbuf_rele(dbuf, FTAG); 445*cdb0ab79Smaybee 446fa9e4066Sahrens blkid += tochk; 447fa9e4066Sahrens nblks -= tochk; 448fa9e4066Sahrens } 449fa9e4066Sahrens rw_exit(&dn->dn_struct_rwlock); 450fa9e4066Sahrens 451*cdb0ab79Smaybee /* account for new level 1 indirect blocks that might show up */ 452*cdb0ab79Smaybee if (skipped) { 453*cdb0ab79Smaybee skipped = MIN(skipped, DMU_MAX_DELETEBLKCNT >> epbs); 454*cdb0ab79Smaybee txh->txh_memory_tohold += skipped << dn->dn_indblkshift; 455*cdb0ab79Smaybee } 4568a2f1b91Sahrens txh->txh_space_tofree += space; 457a9799022Sck txh->txh_space_tounref += unref; 458fa9e4066Sahrens } 
459fa9e4066Sahrens 4608a2f1b91Sahrens void 4618a2f1b91Sahrens dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off, uint64_t len) 462fa9e4066Sahrens { 4638a2f1b91Sahrens dmu_tx_hold_t *txh; 4648a2f1b91Sahrens dnode_t *dn; 465ea8dc4b6Seschrock uint64_t start, end, i; 466c543ec06Sahrens int err, shift; 467ea8dc4b6Seschrock zio_t *zio; 468fa9e4066Sahrens 4698a2f1b91Sahrens ASSERT(tx->tx_txg == 0); 4708a2f1b91Sahrens 4718a2f1b91Sahrens txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, 4728a2f1b91Sahrens object, THT_FREE, off, len); 4738a2f1b91Sahrens if (txh == NULL) 4748a2f1b91Sahrens return; 4758a2f1b91Sahrens dn = txh->txh_dnode; 4768a2f1b91Sahrens 477fa9e4066Sahrens /* first block */ 47898572ac1Sahrens if (off != 0) 4798a2f1b91Sahrens dmu_tx_count_write(txh, off, 1); 480fa9e4066Sahrens /* last block */ 481fa9e4066Sahrens if (len != DMU_OBJECT_END) 4828a2f1b91Sahrens dmu_tx_count_write(txh, off+len, 1); 483fa9e4066Sahrens 484fa9e4066Sahrens if (off >= (dn->dn_maxblkid+1) * dn->dn_datablksz) 485fa9e4066Sahrens return; 486fa9e4066Sahrens if (len == DMU_OBJECT_END) 487fa9e4066Sahrens len = (dn->dn_maxblkid+1) * dn->dn_datablksz - off; 488fa9e4066Sahrens 489ea8dc4b6Seschrock /* 490ea8dc4b6Seschrock * For i/o error checking, read the first and last level-0 491ea8dc4b6Seschrock * blocks, and all the level-1 blocks. The above count_write's 492*cdb0ab79Smaybee * have already taken care of the level-0 blocks. 493ea8dc4b6Seschrock */ 49498572ac1Sahrens if (dn->dn_nlevels > 1) { 49598572ac1Sahrens shift = dn->dn_datablkshift + dn->dn_indblkshift - 49698572ac1Sahrens SPA_BLKPTRSHIFT; 49798572ac1Sahrens start = off >> shift; 49898572ac1Sahrens end = dn->dn_datablkshift ? 
((off+len) >> shift) : 0; 49998572ac1Sahrens 50098572ac1Sahrens zio = zio_root(tx->tx_pool->dp_spa, 50198572ac1Sahrens NULL, NULL, ZIO_FLAG_CANFAIL); 50298572ac1Sahrens for (i = start; i <= end; i++) { 50398572ac1Sahrens uint64_t ibyte = i << shift; 504*cdb0ab79Smaybee err = dnode_next_offset(dn, 0, &ibyte, 2, 1, 0); 50598572ac1Sahrens i = ibyte >> shift; 50698572ac1Sahrens if (err == ESRCH) 50798572ac1Sahrens break; 50898572ac1Sahrens if (err) { 50998572ac1Sahrens tx->tx_err = err; 51098572ac1Sahrens return; 51198572ac1Sahrens } 512ea8dc4b6Seschrock 51398572ac1Sahrens err = dmu_tx_check_ioerr(zio, dn, 1, i); 51498572ac1Sahrens if (err) { 51598572ac1Sahrens tx->tx_err = err; 51698572ac1Sahrens return; 51798572ac1Sahrens } 51898572ac1Sahrens } 51998572ac1Sahrens err = zio_wait(zio); 520ea8dc4b6Seschrock if (err) { 521ea8dc4b6Seschrock tx->tx_err = err; 522ea8dc4b6Seschrock return; 523ea8dc4b6Seschrock } 524ea8dc4b6Seschrock } 525ea8dc4b6Seschrock 5268a2f1b91Sahrens dmu_tx_count_dnode(txh); 5278a2f1b91Sahrens dmu_tx_count_free(txh, off, len); 528fa9e4066Sahrens } 529fa9e4066Sahrens 530fa9e4066Sahrens void 5318a2f1b91Sahrens dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, char *name) 532fa9e4066Sahrens { 5338a2f1b91Sahrens dmu_tx_hold_t *txh; 5348a2f1b91Sahrens dnode_t *dn; 535fa9e4066Sahrens uint64_t nblocks; 536ea8dc4b6Seschrock int epbs, err; 537fa9e4066Sahrens 5388a2f1b91Sahrens ASSERT(tx->tx_txg == 0); 5398a2f1b91Sahrens 5408a2f1b91Sahrens txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, 5418a2f1b91Sahrens object, THT_ZAP, add, (uintptr_t)name); 5428a2f1b91Sahrens if (txh == NULL) 5438a2f1b91Sahrens return; 5448a2f1b91Sahrens dn = txh->txh_dnode; 5458a2f1b91Sahrens 5468a2f1b91Sahrens dmu_tx_count_dnode(txh); 547fa9e4066Sahrens 548fa9e4066Sahrens if (dn == NULL) { 549fa9e4066Sahrens /* 550ea8dc4b6Seschrock * We will be able to fit a new object's entries into one leaf 551fa9e4066Sahrens * block. 
So there will be at most 2 blocks total, 552fa9e4066Sahrens * including the header block. 553fa9e4066Sahrens */ 5548a2f1b91Sahrens dmu_tx_count_write(txh, 0, 2 << fzap_default_block_shift); 555fa9e4066Sahrens return; 556fa9e4066Sahrens } 557fa9e4066Sahrens 558fa9e4066Sahrens ASSERT3P(dmu_ot[dn->dn_type].ot_byteswap, ==, zap_byteswap); 559fa9e4066Sahrens 560ea8dc4b6Seschrock if (dn->dn_maxblkid == 0 && !add) { 561fa9e4066Sahrens /* 562fa9e4066Sahrens * If there is only one block (i.e. this is a micro-zap) 563ea8dc4b6Seschrock * and we are not adding anything, the accounting is simple. 564fa9e4066Sahrens */ 565ea8dc4b6Seschrock err = dmu_tx_check_ioerr(NULL, dn, 0, 0); 566ea8dc4b6Seschrock if (err) { 567ea8dc4b6Seschrock tx->tx_err = err; 568ea8dc4b6Seschrock return; 569ea8dc4b6Seschrock } 570ea8dc4b6Seschrock 571b6130eadSmaybee /* 572b6130eadSmaybee * Use max block size here, since we don't know how much 573b6130eadSmaybee * the size will change between now and the dbuf dirty call. 574b6130eadSmaybee */ 575fa9e4066Sahrens if (dsl_dataset_block_freeable(dn->dn_objset->os_dsl_dataset, 576a9799022Sck dn->dn_phys->dn_blkptr[0].blk_birth)) { 577b6130eadSmaybee txh->txh_space_tooverwrite += SPA_MAXBLOCKSIZE; 578a9799022Sck } else { 579b6130eadSmaybee txh->txh_space_towrite += SPA_MAXBLOCKSIZE; 580a9799022Sck txh->txh_space_tounref += 581a9799022Sck BP_GET_ASIZE(dn->dn_phys->dn_blkptr); 582a9799022Sck } 583fa9e4066Sahrens return; 584fa9e4066Sahrens } 585fa9e4066Sahrens 586ea8dc4b6Seschrock if (dn->dn_maxblkid > 0 && name) { 587ea8dc4b6Seschrock /* 588ea8dc4b6Seschrock * access the name in this fat-zap so that we'll check 589ea8dc4b6Seschrock * for i/o errors to the leaf blocks, etc. 
590ea8dc4b6Seschrock */ 591ea8dc4b6Seschrock err = zap_lookup(&dn->dn_objset->os, dn->dn_object, name, 592ea8dc4b6Seschrock 8, 0, NULL); 593ea8dc4b6Seschrock if (err == EIO) { 594ea8dc4b6Seschrock tx->tx_err = err; 595ea8dc4b6Seschrock return; 596ea8dc4b6Seschrock } 597ea8dc4b6Seschrock } 598ea8dc4b6Seschrock 599fa9e4066Sahrens /* 600ea8dc4b6Seschrock * 3 blocks overwritten: target leaf, ptrtbl block, header block 601ea8dc4b6Seschrock * 3 new blocks written if adding: new split leaf, 2 grown ptrtbl blocks 602fa9e4066Sahrens */ 6038a2f1b91Sahrens dmu_tx_count_write(txh, dn->dn_maxblkid * dn->dn_datablksz, 604ea8dc4b6Seschrock (3 + add ? 3 : 0) << dn->dn_datablkshift); 605fa9e4066Sahrens 606fa9e4066Sahrens /* 607fa9e4066Sahrens * If the modified blocks are scattered to the four winds, 608fa9e4066Sahrens * we'll have to modify an indirect twig for each. 609fa9e4066Sahrens */ 610fa9e4066Sahrens epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; 611fa9e4066Sahrens for (nblocks = dn->dn_maxblkid >> epbs; nblocks != 0; nblocks >>= epbs) 6128a2f1b91Sahrens txh->txh_space_towrite += 3 << dn->dn_indblkshift; 613fa9e4066Sahrens } 614fa9e4066Sahrens 615fa9e4066Sahrens void 616fa9e4066Sahrens dmu_tx_hold_bonus(dmu_tx_t *tx, uint64_t object) 617fa9e4066Sahrens { 6188a2f1b91Sahrens dmu_tx_hold_t *txh; 619fa9e4066Sahrens 6208a2f1b91Sahrens ASSERT(tx->tx_txg == 0); 621fa9e4066Sahrens 6228a2f1b91Sahrens txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, 6238a2f1b91Sahrens object, THT_BONUS, 0, 0); 6248a2f1b91Sahrens if (txh) 6258a2f1b91Sahrens dmu_tx_count_dnode(txh); 626fa9e4066Sahrens } 627fa9e4066Sahrens 628fa9e4066Sahrens void 629fa9e4066Sahrens dmu_tx_hold_space(dmu_tx_t *tx, uint64_t space) 630fa9e4066Sahrens { 6318a2f1b91Sahrens dmu_tx_hold_t *txh; 632fa9e4066Sahrens ASSERT(tx->tx_txg == 0); 633fa9e4066Sahrens 6348a2f1b91Sahrens txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, 6358a2f1b91Sahrens DMU_NEW_OBJECT, THT_SPACE, space, 0); 6368a2f1b91Sahrens 6378a2f1b91Sahrens 
	txh->txh_space_towrite += space;
}

/*
 * Count how many holds this (assigned) tx has on 'object'.
 */
int
dmu_tx_holds(dmu_tx_t *tx, uint64_t object)
{
	dmu_tx_hold_t *txh;
	int holds = 0;

	/*
	 * By asserting that the tx is assigned, we're counting the
	 * number of dn_tx_holds, which is the same as the number of
	 * dn_holds.  Otherwise, we'd be counting dn_holds, but
	 * dn_tx_holds could be 0.
	 */
	ASSERT(tx->tx_txg != 0);

	/* if (tx->tx_anyobj == TRUE) */
		/* return (0); */

	for (txh = list_head(&tx->tx_holds); txh;
	    txh = list_next(&tx->tx_holds, txh)) {
		if (txh->txh_dnode && txh->txh_dnode->dn_object == object)
			holds++;
	}

	return (holds);
}

#ifdef ZFS_DEBUG
/*
 * Debug verifier: panic if 'db' is being dirtied in this tx without a
 * matching hold (matching both the object and, per hold type, the
 * block range the hold covers).
 */
void
dmu_tx_dirty_buf(dmu_tx_t *tx, dmu_buf_impl_t *db)
{
	dmu_tx_hold_t *txh;
	int match_object = FALSE, match_offset = FALSE;
	dnode_t *dn = db->db_dnode;

	ASSERT(tx->tx_txg != 0);
	ASSERT(tx->tx_objset == NULL || dn->dn_objset == tx->tx_objset->os);
	ASSERT3U(dn->dn_object, ==, db->db.db_object);

	if (tx->tx_anyobj)
		return;

	/* XXX No checking on the meta dnode for now */
	if (db->db.db_object == DMU_META_DNODE_OBJECT)
		return;

	for (txh = list_head(&tx->tx_holds); txh;
	    txh = list_next(&tx->tx_holds, txh)) {
		ASSERT(dn == NULL || dn->dn_assigned_txg == tx->tx_txg);
		if (txh->txh_dnode == dn && txh->txh_type != THT_NEWOBJECT)
			match_object = TRUE;
		if (txh->txh_dnode == NULL || txh->txh_dnode == dn) {
			int datablkshift = dn->dn_datablkshift ?
			    dn->dn_datablkshift : SPA_MAXBLOCKSHIFT;
			int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
			int shift = datablkshift + epbs * db->db_level;
			/* shift >= 64 would be UB; treat as "block 0" */
			uint64_t beginblk = shift >= 64 ? 0 :
			    (txh->txh_arg1 >> shift);
			uint64_t endblk = shift >= 64 ? 0 :
			    ((txh->txh_arg1 + txh->txh_arg2 - 1) >> shift);
			uint64_t blkid = db->db_blkid;

			/* XXX txh_arg2 better not be zero... */

			dprintf("found txh type %x beginblk=%llx endblk=%llx\n",
			    txh->txh_type, beginblk, endblk);

			switch (txh->txh_type) {
			case THT_WRITE:
				if (blkid >= beginblk && blkid <= endblk)
					match_offset = TRUE;
				/*
				 * We will let this hold work for the bonus
				 * buffer so that we don't need to hold it
				 * when creating a new object.
				 */
				if (blkid == DB_BONUS_BLKID)
					match_offset = TRUE;
				/*
				 * They might have to increase nlevels,
				 * thus dirtying the new TLIBs.  Or they
				 * might have to change the block size,
				 * thus dirtying the new lvl=0 blk=0.
				 */
				if (blkid == 0)
					match_offset = TRUE;
				break;
			case THT_FREE:
				/*
				 * We will dirty all the level 1 blocks in
				 * the free range and perhaps the first and
				 * last level 0 block.
				 */
				if (blkid >= beginblk && (blkid <= endblk ||
				    txh->txh_arg2 == DMU_OBJECT_END))
					match_offset = TRUE;
				break;
			case THT_BONUS:
				if (blkid == DB_BONUS_BLKID)
					match_offset = TRUE;
				break;
			case THT_ZAP:
				match_offset = TRUE;
				break;
			case THT_NEWOBJECT:
				match_object = TRUE;
				break;
			default:
				ASSERT(!"bad txh_type");
			}
		}
		if (match_object && match_offset)
			return;
	}
	panic("dirtying dbuf obj=%llx lvl=%u blkid=%llx but not tx_held\n",
	    (u_longlong_t)db->db.db_object, db->db_level,
	    (u_longlong_t)db->db_blkid);
}
#endif

static int
dmu_tx_try_assign(dmu_tx_t *tx, uint64_t txg_how)
{
	dmu_tx_hold_t *txh;
	spa_t *spa = tx->tx_pool->dp_spa;
	uint64_t memory, asize, fsize, usize;
	uint64_t towrite, tofree, tooverwrite, tounref, tohold;

	ASSERT3U(tx->tx_txg, ==, 0);

	if (tx->tx_err)
		return (tx->tx_err);

	if (spa_state(spa) == POOL_STATE_IO_FAILURE) {
		/*
		 * If the user has indicated a blocking failure mode
If the user has indicated a blocking failure mode 7750a4e9518Sgw * then return ERESTART which will block in dmu_tx_wait(). 7760a4e9518Sgw * Otherwise, return EIO so that an error can get 7770a4e9518Sgw * propagated back to the VOP calls. 7780a4e9518Sgw * 7790a4e9518Sgw * Note that we always honor the txg_how flag regardless 7800a4e9518Sgw * of the failuremode setting. 7810a4e9518Sgw */ 7820a4e9518Sgw if (spa_get_failmode(spa) == ZIO_FAILURE_MODE_CONTINUE && 7830a4e9518Sgw txg_how != TXG_WAIT) 7840a4e9518Sgw return (EIO); 7850a4e9518Sgw 7860a4e9518Sgw return (ERESTART); 7870a4e9518Sgw } 7880a4e9518Sgw 789fa9e4066Sahrens tx->tx_txg = txg_hold_open(tx->tx_pool, &tx->tx_txgh); 7908a2f1b91Sahrens tx->tx_needassign_txh = NULL; 791fa9e4066Sahrens 7928a2f1b91Sahrens /* 7938a2f1b91Sahrens * NB: No error returns are allowed after txg_hold_open, but 7948a2f1b91Sahrens * before processing the dnode holds, due to the 7958a2f1b91Sahrens * dmu_tx_unassign() logic. 7968a2f1b91Sahrens */ 797fa9e4066Sahrens 798*cdb0ab79Smaybee towrite = tofree = tooverwrite = tounref = tohold = 0; 7998a2f1b91Sahrens for (txh = list_head(&tx->tx_holds); txh; 8008a2f1b91Sahrens txh = list_next(&tx->tx_holds, txh)) { 8018a2f1b91Sahrens dnode_t *dn = txh->txh_dnode; 802fa9e4066Sahrens if (dn != NULL) { 803fa9e4066Sahrens mutex_enter(&dn->dn_mtx); 8048a2f1b91Sahrens if (dn->dn_assigned_txg == tx->tx_txg - 1) { 8058a2f1b91Sahrens mutex_exit(&dn->dn_mtx); 8068a2f1b91Sahrens tx->tx_needassign_txh = txh; 8078a2f1b91Sahrens return (ERESTART); 808fa9e4066Sahrens } 8098a2f1b91Sahrens if (dn->dn_assigned_txg == 0) 810fa9e4066Sahrens dn->dn_assigned_txg = tx->tx_txg; 8118a2f1b91Sahrens ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg); 812fa9e4066Sahrens (void) refcount_add(&dn->dn_tx_holds, tx); 813fa9e4066Sahrens mutex_exit(&dn->dn_mtx); 814fa9e4066Sahrens } 8158a2f1b91Sahrens towrite += txh->txh_space_towrite; 8168a2f1b91Sahrens tofree += txh->txh_space_tofree; 8178a2f1b91Sahrens tooverwrite += 
txh->txh_space_tooverwrite; 818a9799022Sck tounref += txh->txh_space_tounref; 819*cdb0ab79Smaybee tohold += txh->txh_memory_tohold; 820ea8dc4b6Seschrock } 821ea8dc4b6Seschrock 8228a2f1b91Sahrens /* 8238a2f1b91Sahrens * NB: This check must be after we've held the dnodes, so that 8248a2f1b91Sahrens * the dmu_tx_unassign() logic will work properly 8258a2f1b91Sahrens */ 8268a2f1b91Sahrens if (txg_how >= TXG_INITIAL && txg_how != tx->tx_txg) 8278a2f1b91Sahrens return (ERESTART); 8288a2f1b91Sahrens 829ea8dc4b6Seschrock /* 830ea8dc4b6Seschrock * If a snapshot has been taken since we made our estimates, 831ea8dc4b6Seschrock * assume that we won't be able to free or overwrite anything. 832ea8dc4b6Seschrock */ 833ea8dc4b6Seschrock if (tx->tx_objset && 834ea8dc4b6Seschrock dsl_dataset_prev_snap_txg(tx->tx_objset->os->os_dsl_dataset) > 835ea8dc4b6Seschrock tx->tx_lastsnap_txg) { 8368a2f1b91Sahrens towrite += tooverwrite; 8378a2f1b91Sahrens tooverwrite = tofree = 0; 838fa9e4066Sahrens } 839fa9e4066Sahrens 840*cdb0ab79Smaybee /* needed allocation: worst-case estimate of write space */ 841*cdb0ab79Smaybee asize = spa_get_asize(tx->tx_pool->dp_spa, towrite + tooverwrite); 842*cdb0ab79Smaybee /* freed space estimate: worst-case overwrite + free estimate */ 8438a2f1b91Sahrens fsize = spa_get_asize(tx->tx_pool->dp_spa, tooverwrite) + tofree; 844*cdb0ab79Smaybee /* convert unrefd space to worst-case estimate */ 845a9799022Sck usize = spa_get_asize(tx->tx_pool->dp_spa, tounref); 846*cdb0ab79Smaybee /* calculate memory footprint estimate */ 847*cdb0ab79Smaybee memory = towrite + tooverwrite + tohold; 8488a2f1b91Sahrens 8498a2f1b91Sahrens #ifdef ZFS_DEBUG 850*cdb0ab79Smaybee /* add in 'tohold' to account for our dirty holds on this memory */ 851*cdb0ab79Smaybee tx->tx_space_towrite = asize + 852*cdb0ab79Smaybee spa_get_asize(tx->tx_pool->dp_spa, tohold); 8538a2f1b91Sahrens tx->tx_space_tofree = tofree; 8548a2f1b91Sahrens tx->tx_space_tooverwrite = tooverwrite; 855a9799022Sck 
tx->tx_space_tounref = tounref; 8568a2f1b91Sahrens #endif 857fa9e4066Sahrens 858fa9e4066Sahrens if (tx->tx_dir && asize != 0) { 859*cdb0ab79Smaybee int err = dsl_dir_tempreserve_space(tx->tx_dir, memory, 860*cdb0ab79Smaybee asize, fsize, usize, &tx->tx_tempreserve_cookie, tx); 8618a2f1b91Sahrens if (err) 862fa9e4066Sahrens return (err); 863fa9e4066Sahrens } 864fa9e4066Sahrens 865fa9e4066Sahrens return (0); 866fa9e4066Sahrens } 867fa9e4066Sahrens 8688a2f1b91Sahrens static void 8698a2f1b91Sahrens dmu_tx_unassign(dmu_tx_t *tx) 870fa9e4066Sahrens { 8718a2f1b91Sahrens dmu_tx_hold_t *txh; 872fa9e4066Sahrens 8738a2f1b91Sahrens if (tx->tx_txg == 0) 8748a2f1b91Sahrens return; 875fa9e4066Sahrens 876fa9e4066Sahrens txg_rele_to_quiesce(&tx->tx_txgh); 877fa9e4066Sahrens 8788a2f1b91Sahrens for (txh = list_head(&tx->tx_holds); txh != tx->tx_needassign_txh; 8798a2f1b91Sahrens txh = list_next(&tx->tx_holds, txh)) { 8808a2f1b91Sahrens dnode_t *dn = txh->txh_dnode; 881fa9e4066Sahrens 882fa9e4066Sahrens if (dn == NULL) 883fa9e4066Sahrens continue; 884fa9e4066Sahrens mutex_enter(&dn->dn_mtx); 8858a2f1b91Sahrens ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg); 886fa9e4066Sahrens 887fa9e4066Sahrens if (refcount_remove(&dn->dn_tx_holds, tx) == 0) { 888fa9e4066Sahrens dn->dn_assigned_txg = 0; 889fa9e4066Sahrens cv_broadcast(&dn->dn_notxholds); 890fa9e4066Sahrens } 891fa9e4066Sahrens mutex_exit(&dn->dn_mtx); 892fa9e4066Sahrens } 893fa9e4066Sahrens 894fa9e4066Sahrens txg_rele_to_sync(&tx->tx_txgh); 895fa9e4066Sahrens 8968a2f1b91Sahrens tx->tx_lasttried_txg = tx->tx_txg; 897fa9e4066Sahrens tx->tx_txg = 0; 898fa9e4066Sahrens } 899fa9e4066Sahrens 900fa9e4066Sahrens /* 901fa9e4066Sahrens * Assign tx to a transaction group. txg_how can be one of: 902fa9e4066Sahrens * 903fa9e4066Sahrens * (1) TXG_WAIT. If the current open txg is full, waits until there's 904fa9e4066Sahrens * a new one. This should be used when you're not holding locks. 
905fa9e4066Sahrens * If will only fail if we're truly out of space (or over quota). 906fa9e4066Sahrens * 907fa9e4066Sahrens * (2) TXG_NOWAIT. If we can't assign into the current open txg without 908fa9e4066Sahrens * blocking, returns immediately with ERESTART. This should be used 909fa9e4066Sahrens * whenever you're holding locks. On an ERESTART error, the caller 9108a2f1b91Sahrens * should drop locks, do a dmu_tx_wait(tx), and try again. 911fa9e4066Sahrens * 912fa9e4066Sahrens * (3) A specific txg. Use this if you need to ensure that multiple 913fa9e4066Sahrens * transactions all sync in the same txg. Like TXG_NOWAIT, it 914fa9e4066Sahrens * returns ERESTART if it can't assign you into the requested txg. 915fa9e4066Sahrens */ 916fa9e4066Sahrens int 917fa9e4066Sahrens dmu_tx_assign(dmu_tx_t *tx, uint64_t txg_how) 918fa9e4066Sahrens { 919fa9e4066Sahrens int err; 920fa9e4066Sahrens 921fa9e4066Sahrens ASSERT(tx->tx_txg == 0); 922fa9e4066Sahrens ASSERT(txg_how != 0); 923fa9e4066Sahrens ASSERT(!dsl_pool_sync_context(tx->tx_pool)); 924fa9e4066Sahrens 9258a2f1b91Sahrens while ((err = dmu_tx_try_assign(tx, txg_how)) != 0) { 9268a2f1b91Sahrens dmu_tx_unassign(tx); 927fa9e4066Sahrens 928fa9e4066Sahrens if (err != ERESTART || txg_how != TXG_WAIT) 929fa9e4066Sahrens return (err); 930fa9e4066Sahrens 9318a2f1b91Sahrens dmu_tx_wait(tx); 932fa9e4066Sahrens } 933fa9e4066Sahrens 934fa9e4066Sahrens txg_rele_to_quiesce(&tx->tx_txgh); 935fa9e4066Sahrens 936fa9e4066Sahrens return (0); 937fa9e4066Sahrens } 938fa9e4066Sahrens 9398a2f1b91Sahrens void 9408a2f1b91Sahrens dmu_tx_wait(dmu_tx_t *tx) 9418a2f1b91Sahrens { 9420a4e9518Sgw spa_t *spa = tx->tx_pool->dp_spa; 9430a4e9518Sgw 9448a2f1b91Sahrens ASSERT(tx->tx_txg == 0); 9458a2f1b91Sahrens 9460a4e9518Sgw /* 9470a4e9518Sgw * It's possible that the pool has become active after this thread 9480a4e9518Sgw * has tried to obtain a tx. If that's the case then his 9490a4e9518Sgw * tx_lasttried_txg would not have been assigned. 
9500a4e9518Sgw */ 9510a4e9518Sgw if (spa_state(spa) == POOL_STATE_IO_FAILURE || 9520a4e9518Sgw tx->tx_lasttried_txg == 0) { 9530a4e9518Sgw txg_wait_synced(tx->tx_pool, spa_last_synced_txg(spa) + 1); 9540a4e9518Sgw } else if (tx->tx_needassign_txh) { 9558a2f1b91Sahrens dnode_t *dn = tx->tx_needassign_txh->txh_dnode; 9568a2f1b91Sahrens 9578a2f1b91Sahrens mutex_enter(&dn->dn_mtx); 9588a2f1b91Sahrens while (dn->dn_assigned_txg == tx->tx_lasttried_txg - 1) 9598a2f1b91Sahrens cv_wait(&dn->dn_notxholds, &dn->dn_mtx); 9608a2f1b91Sahrens mutex_exit(&dn->dn_mtx); 9618a2f1b91Sahrens tx->tx_needassign_txh = NULL; 9628a2f1b91Sahrens } else { 9638a2f1b91Sahrens txg_wait_open(tx->tx_pool, tx->tx_lasttried_txg + 1); 9648a2f1b91Sahrens } 9658a2f1b91Sahrens } 9668a2f1b91Sahrens 967fa9e4066Sahrens void 968fa9e4066Sahrens dmu_tx_willuse_space(dmu_tx_t *tx, int64_t delta) 969fa9e4066Sahrens { 9708a2f1b91Sahrens #ifdef ZFS_DEBUG 971fa9e4066Sahrens if (tx->tx_dir == NULL || delta == 0) 972fa9e4066Sahrens return; 973fa9e4066Sahrens 974fa9e4066Sahrens if (delta > 0) { 975fa9e4066Sahrens ASSERT3U(refcount_count(&tx->tx_space_written) + delta, <=, 976fa9e4066Sahrens tx->tx_space_towrite); 977fa9e4066Sahrens (void) refcount_add_many(&tx->tx_space_written, delta, NULL); 978fa9e4066Sahrens } else { 979fa9e4066Sahrens (void) refcount_add_many(&tx->tx_space_freed, -delta, NULL); 980fa9e4066Sahrens } 9818a2f1b91Sahrens #endif 982fa9e4066Sahrens } 983fa9e4066Sahrens 984fa9e4066Sahrens void 985fa9e4066Sahrens dmu_tx_commit(dmu_tx_t *tx) 986fa9e4066Sahrens { 9878a2f1b91Sahrens dmu_tx_hold_t *txh; 988fa9e4066Sahrens 989fa9e4066Sahrens ASSERT(tx->tx_txg != 0); 990fa9e4066Sahrens 9918a2f1b91Sahrens while (txh = list_head(&tx->tx_holds)) { 9928a2f1b91Sahrens dnode_t *dn = txh->txh_dnode; 993fa9e4066Sahrens 9948a2f1b91Sahrens list_remove(&tx->tx_holds, txh); 9958a2f1b91Sahrens kmem_free(txh, sizeof (dmu_tx_hold_t)); 996fa9e4066Sahrens if (dn == NULL) 997fa9e4066Sahrens continue; 998fa9e4066Sahrens 
mutex_enter(&dn->dn_mtx); 999fa9e4066Sahrens ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg); 1000fa9e4066Sahrens 1001fa9e4066Sahrens if (refcount_remove(&dn->dn_tx_holds, tx) == 0) { 1002fa9e4066Sahrens dn->dn_assigned_txg = 0; 1003fa9e4066Sahrens cv_broadcast(&dn->dn_notxholds); 1004fa9e4066Sahrens } 1005fa9e4066Sahrens mutex_exit(&dn->dn_mtx); 1006fa9e4066Sahrens dnode_rele(dn, tx); 1007fa9e4066Sahrens } 1008fa9e4066Sahrens 10098a2f1b91Sahrens if (tx->tx_tempreserve_cookie) 1010fa9e4066Sahrens dsl_dir_tempreserve_clear(tx->tx_tempreserve_cookie, tx); 1011fa9e4066Sahrens 1012fa9e4066Sahrens if (tx->tx_anyobj == FALSE) 1013fa9e4066Sahrens txg_rele_to_sync(&tx->tx_txgh); 10148f38d419Sek list_destroy(&tx->tx_holds); 10158a2f1b91Sahrens #ifdef ZFS_DEBUG 1016fa9e4066Sahrens dprintf("towrite=%llu written=%llu tofree=%llu freed=%llu\n", 1017fa9e4066Sahrens tx->tx_space_towrite, refcount_count(&tx->tx_space_written), 1018fa9e4066Sahrens tx->tx_space_tofree, refcount_count(&tx->tx_space_freed)); 1019fa9e4066Sahrens refcount_destroy_many(&tx->tx_space_written, 1020fa9e4066Sahrens refcount_count(&tx->tx_space_written)); 1021fa9e4066Sahrens refcount_destroy_many(&tx->tx_space_freed, 1022fa9e4066Sahrens refcount_count(&tx->tx_space_freed)); 1023fa9e4066Sahrens #endif 1024fa9e4066Sahrens kmem_free(tx, sizeof (dmu_tx_t)); 1025fa9e4066Sahrens } 1026fa9e4066Sahrens 1027fa9e4066Sahrens void 1028fa9e4066Sahrens dmu_tx_abort(dmu_tx_t *tx) 1029fa9e4066Sahrens { 10308a2f1b91Sahrens dmu_tx_hold_t *txh; 1031fa9e4066Sahrens 1032fa9e4066Sahrens ASSERT(tx->tx_txg == 0); 1033fa9e4066Sahrens 10348a2f1b91Sahrens while (txh = list_head(&tx->tx_holds)) { 10358a2f1b91Sahrens dnode_t *dn = txh->txh_dnode; 1036fa9e4066Sahrens 10378a2f1b91Sahrens list_remove(&tx->tx_holds, txh); 10388a2f1b91Sahrens kmem_free(txh, sizeof (dmu_tx_hold_t)); 1039fa9e4066Sahrens if (dn != NULL) 1040fa9e4066Sahrens dnode_rele(dn, tx); 1041fa9e4066Sahrens } 10428f38d419Sek list_destroy(&tx->tx_holds); 10438a2f1b91Sahrens 
#ifdef ZFS_DEBUG 1044fa9e4066Sahrens refcount_destroy_many(&tx->tx_space_written, 1045fa9e4066Sahrens refcount_count(&tx->tx_space_written)); 1046fa9e4066Sahrens refcount_destroy_many(&tx->tx_space_freed, 1047fa9e4066Sahrens refcount_count(&tx->tx_space_freed)); 1048fa9e4066Sahrens #endif 1049fa9e4066Sahrens kmem_free(tx, sizeof (dmu_tx_t)); 1050fa9e4066Sahrens } 1051fa9e4066Sahrens 1052fa9e4066Sahrens uint64_t 1053fa9e4066Sahrens dmu_tx_get_txg(dmu_tx_t *tx) 1054fa9e4066Sahrens { 1055fa9e4066Sahrens ASSERT(tx->tx_txg != 0); 1056fa9e4066Sahrens return (tx->tx_txg); 1057fa9e4066Sahrens } 1058