1fa9e4066Sahrens /* 2fa9e4066Sahrens * CDDL HEADER START 3fa9e4066Sahrens * 4fa9e4066Sahrens * The contents of this file are subject to the terms of the 5f65e61c0Sahrens * Common Development and Distribution License (the "License"). 6f65e61c0Sahrens * You may not use this file except in compliance with the License. 7fa9e4066Sahrens * 8fa9e4066Sahrens * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9fa9e4066Sahrens * or http://www.opensolaris.org/os/licensing. 10fa9e4066Sahrens * See the License for the specific language governing permissions 11fa9e4066Sahrens * and limitations under the License. 12fa9e4066Sahrens * 13fa9e4066Sahrens * When distributing Covered Code, include this CDDL HEADER in each 14fa9e4066Sahrens * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15fa9e4066Sahrens * If applicable, add the following below this CDDL HEADER, with the 16fa9e4066Sahrens * fields enclosed by brackets "[]" replaced with your own identifying 17fa9e4066Sahrens * information: Portions Copyright [yyyy] [name of copyright owner] 18fa9e4066Sahrens * 19fa9e4066Sahrens * CDDL HEADER END 20fa9e4066Sahrens */ 21fa9e4066Sahrens /* 228f38d419Sek * Copyright 2008 Sun Microsystems, Inc. All rights reserved. 23fa9e4066Sahrens * Use is subject to license terms. 
24fa9e4066Sahrens */ 25fa9e4066Sahrens 26fa9e4066Sahrens #pragma ident "%Z%%M% %I% %E% SMI" 27fa9e4066Sahrens 28fa9e4066Sahrens #include <sys/dmu.h> 29fa9e4066Sahrens #include <sys/dmu_impl.h> 30fa9e4066Sahrens #include <sys/dbuf.h> 31fa9e4066Sahrens #include <sys/dmu_tx.h> 32fa9e4066Sahrens #include <sys/dmu_objset.h> 33fa9e4066Sahrens #include <sys/dsl_dataset.h> /* for dsl_dataset_block_freeable() */ 34fa9e4066Sahrens #include <sys/dsl_dir.h> /* for dsl_dir_tempreserve_*() */ 35fa9e4066Sahrens #include <sys/dsl_pool.h> 368a2f1b91Sahrens #include <sys/zap_impl.h> /* for fzap_default_block_shift */ 37fa9e4066Sahrens #include <sys/spa.h> 38fa9e4066Sahrens #include <sys/zfs_context.h> 39fa9e4066Sahrens 40ea8dc4b6Seschrock typedef void (*dmu_tx_hold_func_t)(dmu_tx_t *tx, struct dnode *dn, 41ea8dc4b6Seschrock uint64_t arg1, uint64_t arg2); 42ea8dc4b6Seschrock 43fa9e4066Sahrens 44fa9e4066Sahrens dmu_tx_t * 451d452cf5Sahrens dmu_tx_create_dd(dsl_dir_t *dd) 46fa9e4066Sahrens { 47fa9e4066Sahrens dmu_tx_t *tx = kmem_zalloc(sizeof (dmu_tx_t), KM_SLEEP); 48fa9e4066Sahrens tx->tx_dir = dd; 49fa9e4066Sahrens if (dd) 50fa9e4066Sahrens tx->tx_pool = dd->dd_pool; 51fa9e4066Sahrens list_create(&tx->tx_holds, sizeof (dmu_tx_hold_t), 528a2f1b91Sahrens offsetof(dmu_tx_hold_t, txh_node)); 538a2f1b91Sahrens #ifdef ZFS_DEBUG 54fa9e4066Sahrens refcount_create(&tx->tx_space_written); 55fa9e4066Sahrens refcount_create(&tx->tx_space_freed); 568a2f1b91Sahrens #endif 57fa9e4066Sahrens return (tx); 58fa9e4066Sahrens } 59fa9e4066Sahrens 60fa9e4066Sahrens dmu_tx_t * 61fa9e4066Sahrens dmu_tx_create(objset_t *os) 62fa9e4066Sahrens { 631d452cf5Sahrens dmu_tx_t *tx = dmu_tx_create_dd(os->os->os_dsl_dataset->ds_dir); 64fa9e4066Sahrens tx->tx_objset = os; 65ea8dc4b6Seschrock tx->tx_lastsnap_txg = dsl_dataset_prev_snap_txg(os->os->os_dsl_dataset); 66fa9e4066Sahrens return (tx); 67fa9e4066Sahrens } 68fa9e4066Sahrens 69fa9e4066Sahrens dmu_tx_t * 70fa9e4066Sahrens dmu_tx_create_assigned(struct dsl_pool 
*dp, uint64_t txg) 71fa9e4066Sahrens { 721d452cf5Sahrens dmu_tx_t *tx = dmu_tx_create_dd(NULL); 73fa9e4066Sahrens 74fa9e4066Sahrens ASSERT3U(txg, <=, dp->dp_tx.tx_open_txg); 75fa9e4066Sahrens tx->tx_pool = dp; 76fa9e4066Sahrens tx->tx_txg = txg; 77fa9e4066Sahrens tx->tx_anyobj = TRUE; 78fa9e4066Sahrens 79fa9e4066Sahrens return (tx); 80fa9e4066Sahrens } 81fa9e4066Sahrens 82fa9e4066Sahrens int 83fa9e4066Sahrens dmu_tx_is_syncing(dmu_tx_t *tx) 84fa9e4066Sahrens { 85fa9e4066Sahrens return (tx->tx_anyobj); 86fa9e4066Sahrens } 87fa9e4066Sahrens 88fa9e4066Sahrens int 89fa9e4066Sahrens dmu_tx_private_ok(dmu_tx_t *tx) 90fa9e4066Sahrens { 91ea8dc4b6Seschrock return (tx->tx_anyobj); 92fa9e4066Sahrens } 93fa9e4066Sahrens 948a2f1b91Sahrens static dmu_tx_hold_t * 95fa9e4066Sahrens dmu_tx_hold_object_impl(dmu_tx_t *tx, objset_t *os, uint64_t object, 968a2f1b91Sahrens enum dmu_tx_hold_type type, uint64_t arg1, uint64_t arg2) 97fa9e4066Sahrens { 988a2f1b91Sahrens dmu_tx_hold_t *txh; 99fa9e4066Sahrens dnode_t *dn = NULL; 100ea8dc4b6Seschrock int err; 101fa9e4066Sahrens 102fa9e4066Sahrens if (object != DMU_NEW_OBJECT) { 103ea8dc4b6Seschrock err = dnode_hold(os->os, object, tx, &dn); 104ea8dc4b6Seschrock if (err) { 105ea8dc4b6Seschrock tx->tx_err = err; 1068a2f1b91Sahrens return (NULL); 107ea8dc4b6Seschrock } 108fa9e4066Sahrens 109ea8dc4b6Seschrock if (err == 0 && tx->tx_txg != 0) { 110fa9e4066Sahrens mutex_enter(&dn->dn_mtx); 111fa9e4066Sahrens /* 112fa9e4066Sahrens * dn->dn_assigned_txg == tx->tx_txg doesn't pose a 113fa9e4066Sahrens * problem, but there's no way for it to happen (for 114fa9e4066Sahrens * now, at least). 
115fa9e4066Sahrens */ 116fa9e4066Sahrens ASSERT(dn->dn_assigned_txg == 0); 117fa9e4066Sahrens dn->dn_assigned_txg = tx->tx_txg; 118fa9e4066Sahrens (void) refcount_add(&dn->dn_tx_holds, tx); 119fa9e4066Sahrens mutex_exit(&dn->dn_mtx); 120fa9e4066Sahrens } 121fa9e4066Sahrens } 122fa9e4066Sahrens 1238a2f1b91Sahrens txh = kmem_zalloc(sizeof (dmu_tx_hold_t), KM_SLEEP); 1248a2f1b91Sahrens txh->txh_tx = tx; 1258a2f1b91Sahrens txh->txh_dnode = dn; 1268a2f1b91Sahrens #ifdef ZFS_DEBUG 1278a2f1b91Sahrens txh->txh_type = type; 1288a2f1b91Sahrens txh->txh_arg1 = arg1; 1298a2f1b91Sahrens txh->txh_arg2 = arg2; 1308a2f1b91Sahrens #endif 1318a2f1b91Sahrens list_insert_tail(&tx->tx_holds, txh); 132ea8dc4b6Seschrock 1338a2f1b91Sahrens return (txh); 134fa9e4066Sahrens } 135fa9e4066Sahrens 136fa9e4066Sahrens void 137fa9e4066Sahrens dmu_tx_add_new_object(dmu_tx_t *tx, objset_t *os, uint64_t object) 138fa9e4066Sahrens { 139fa9e4066Sahrens /* 140fa9e4066Sahrens * If we're syncing, they can manipulate any object anyhow, and 141fa9e4066Sahrens * the hold on the dnode_t can cause problems. 
142fa9e4066Sahrens */ 143fa9e4066Sahrens if (!dmu_tx_is_syncing(tx)) { 1448a2f1b91Sahrens (void) dmu_tx_hold_object_impl(tx, os, 1458a2f1b91Sahrens object, THT_NEWOBJECT, 0, 0); 146fa9e4066Sahrens } 147fa9e4066Sahrens } 148fa9e4066Sahrens 149ea8dc4b6Seschrock static int 150ea8dc4b6Seschrock dmu_tx_check_ioerr(zio_t *zio, dnode_t *dn, int level, uint64_t blkid) 151ea8dc4b6Seschrock { 152ea8dc4b6Seschrock int err; 153ea8dc4b6Seschrock dmu_buf_impl_t *db; 154ea8dc4b6Seschrock 155ea8dc4b6Seschrock rw_enter(&dn->dn_struct_rwlock, RW_READER); 156ea8dc4b6Seschrock db = dbuf_hold_level(dn, level, blkid, FTAG); 157ea8dc4b6Seschrock rw_exit(&dn->dn_struct_rwlock); 158ea8dc4b6Seschrock if (db == NULL) 159ea8dc4b6Seschrock return (EIO); 160*1ab7f2deSmaybee err = dbuf_read(db, zio, DB_RF_CANFAIL | DB_RF_NOPREFETCH); 161ea8dc4b6Seschrock dbuf_rele(db, FTAG); 162ea8dc4b6Seschrock return (err); 163ea8dc4b6Seschrock } 164ea8dc4b6Seschrock 165fa9e4066Sahrens /* ARGSUSED */ 166fa9e4066Sahrens static void 1678a2f1b91Sahrens dmu_tx_count_write(dmu_tx_hold_t *txh, uint64_t off, uint64_t len) 168fa9e4066Sahrens { 1698a2f1b91Sahrens dnode_t *dn = txh->txh_dnode; 1708a2f1b91Sahrens uint64_t start, end, i; 171fa9e4066Sahrens int min_bs, max_bs, min_ibs, max_ibs, epbs, bits; 1728a2f1b91Sahrens int err = 0; 173fa9e4066Sahrens 174fa9e4066Sahrens if (len == 0) 175fa9e4066Sahrens return; 176fa9e4066Sahrens 177fa9e4066Sahrens min_bs = SPA_MINBLOCKSHIFT; 178fa9e4066Sahrens max_bs = SPA_MAXBLOCKSHIFT; 179fa9e4066Sahrens min_ibs = DN_MIN_INDBLKSHIFT; 180fa9e4066Sahrens max_ibs = DN_MAX_INDBLKSHIFT; 181fa9e4066Sahrens 1828a2f1b91Sahrens 183ea8dc4b6Seschrock /* 184ea8dc4b6Seschrock * For i/o error checking, read the first and last level-0 18599653d4eSeschrock * blocks (if they are not aligned), and all the level-1 blocks. 
186ea8dc4b6Seschrock */ 187ea8dc4b6Seschrock 1888a2f1b91Sahrens if (dn) { 189ea8dc4b6Seschrock if (dn->dn_maxblkid == 0) { 190ea8dc4b6Seschrock err = dmu_tx_check_ioerr(NULL, dn, 0, 0); 1918a2f1b91Sahrens if (err) 1928a2f1b91Sahrens goto out; 193ea8dc4b6Seschrock } else { 1948a2f1b91Sahrens zio_t *zio = zio_root(dn->dn_objset->os_spa, 195ea8dc4b6Seschrock NULL, NULL, ZIO_FLAG_CANFAIL); 196ea8dc4b6Seschrock 197ea8dc4b6Seschrock /* first level-0 block */ 19899653d4eSeschrock start = off >> dn->dn_datablkshift; 19999653d4eSeschrock if (P2PHASE(off, dn->dn_datablksz) || 20099653d4eSeschrock len < dn->dn_datablksz) { 20199653d4eSeschrock err = dmu_tx_check_ioerr(zio, dn, 0, start); 2028a2f1b91Sahrens if (err) 2038a2f1b91Sahrens goto out; 204ea8dc4b6Seschrock } 205ea8dc4b6Seschrock 206ea8dc4b6Seschrock /* last level-0 block */ 20799653d4eSeschrock end = (off+len-1) >> dn->dn_datablkshift; 20899653d4eSeschrock if (end != start && 20999653d4eSeschrock P2PHASE(off+len, dn->dn_datablksz)) { 210ea8dc4b6Seschrock err = dmu_tx_check_ioerr(zio, dn, 0, end); 2118a2f1b91Sahrens if (err) 2128a2f1b91Sahrens goto out; 213ea8dc4b6Seschrock } 214ea8dc4b6Seschrock 215ea8dc4b6Seschrock /* level-1 blocks */ 216ea8dc4b6Seschrock if (dn->dn_nlevels > 1) { 217ea8dc4b6Seschrock start >>= dn->dn_indblkshift - SPA_BLKPTRSHIFT; 218ea8dc4b6Seschrock end >>= dn->dn_indblkshift - SPA_BLKPTRSHIFT; 219ea8dc4b6Seschrock for (i = start+1; i < end; i++) { 220ea8dc4b6Seschrock err = dmu_tx_check_ioerr(zio, dn, 1, i); 2218a2f1b91Sahrens if (err) 2228a2f1b91Sahrens goto out; 223ea8dc4b6Seschrock } 224ea8dc4b6Seschrock } 225ea8dc4b6Seschrock 226ea8dc4b6Seschrock err = zio_wait(zio); 2278a2f1b91Sahrens if (err) 2288a2f1b91Sahrens goto out; 229ea8dc4b6Seschrock } 230ea8dc4b6Seschrock } 231ea8dc4b6Seschrock 232fa9e4066Sahrens /* 233fa9e4066Sahrens * If there's more than one block, the blocksize can't change, 234fa9e4066Sahrens * so we can make a more precise estimate. 
Alternatively, 235fa9e4066Sahrens * if the dnode's ibs is larger than max_ibs, always use that. 236fa9e4066Sahrens * This ensures that if we reduce DN_MAX_INDBLKSHIFT, 237fa9e4066Sahrens * the code will still work correctly on existing pools. 238fa9e4066Sahrens */ 239fa9e4066Sahrens if (dn && (dn->dn_maxblkid != 0 || dn->dn_indblkshift > max_ibs)) { 240fa9e4066Sahrens min_ibs = max_ibs = dn->dn_indblkshift; 241fa9e4066Sahrens if (dn->dn_datablkshift != 0) 242fa9e4066Sahrens min_bs = max_bs = dn->dn_datablkshift; 243fa9e4066Sahrens } 244fa9e4066Sahrens 245fa9e4066Sahrens /* 246fa9e4066Sahrens * 'end' is the last thing we will access, not one past. 247fa9e4066Sahrens * This way we won't overflow when accessing the last byte. 248fa9e4066Sahrens */ 249fa9e4066Sahrens start = P2ALIGN(off, 1ULL << max_bs); 250fa9e4066Sahrens end = P2ROUNDUP(off + len, 1ULL << max_bs) - 1; 2518a2f1b91Sahrens txh->txh_space_towrite += end - start + 1; 252fa9e4066Sahrens 253fa9e4066Sahrens start >>= min_bs; 254fa9e4066Sahrens end >>= min_bs; 255fa9e4066Sahrens 256fa9e4066Sahrens epbs = min_ibs - SPA_BLKPTRSHIFT; 257fa9e4066Sahrens 258fa9e4066Sahrens /* 259fa9e4066Sahrens * The object contains at most 2^(64 - min_bs) blocks, 260fa9e4066Sahrens * and each indirect level maps 2^epbs. 261fa9e4066Sahrens */ 262fa9e4066Sahrens for (bits = 64 - min_bs; bits >= 0; bits -= epbs) { 263fa9e4066Sahrens start >>= epbs; 264fa9e4066Sahrens end >>= epbs; 265fa9e4066Sahrens /* 266fa9e4066Sahrens * If we increase the number of levels of indirection, 267fa9e4066Sahrens * we'll need new blkid=0 indirect blocks. If start == 0, 268fa9e4066Sahrens * we're already accounting for that blocks; and if end == 0, 269fa9e4066Sahrens * we can't increase the number of levels beyond that. 
270fa9e4066Sahrens */ 271fa9e4066Sahrens if (start != 0 && end != 0) 2728a2f1b91Sahrens txh->txh_space_towrite += 1ULL << max_ibs; 2738a2f1b91Sahrens txh->txh_space_towrite += (end - start + 1) << max_ibs; 274fa9e4066Sahrens } 275fa9e4066Sahrens 2768a2f1b91Sahrens ASSERT(txh->txh_space_towrite < 2 * DMU_MAX_ACCESS); 277fa9e4066Sahrens 2788a2f1b91Sahrens out: 2798a2f1b91Sahrens if (err) 2808a2f1b91Sahrens txh->txh_tx->tx_err = err; 281fa9e4066Sahrens } 282fa9e4066Sahrens 283fa9e4066Sahrens static void 2848a2f1b91Sahrens dmu_tx_count_dnode(dmu_tx_hold_t *txh) 285fa9e4066Sahrens { 2868a2f1b91Sahrens dnode_t *dn = txh->txh_dnode; 2878a2f1b91Sahrens dnode_t *mdn = txh->txh_tx->tx_objset->os->os_meta_dnode; 2888a2f1b91Sahrens uint64_t space = mdn->dn_datablksz + 2898a2f1b91Sahrens ((mdn->dn_nlevels-1) << mdn->dn_indblkshift); 290fa9e4066Sahrens 291fa9e4066Sahrens if (dn && dn->dn_dbuf->db_blkptr && 292fa9e4066Sahrens dsl_dataset_block_freeable(dn->dn_objset->os_dsl_dataset, 293ea8dc4b6Seschrock dn->dn_dbuf->db_blkptr->blk_birth)) { 2948a2f1b91Sahrens txh->txh_space_tooverwrite += space; 2958a2f1b91Sahrens } else { 2968a2f1b91Sahrens txh->txh_space_towrite += space; 297a9799022Sck if (dn && dn->dn_dbuf->db_blkptr) 298a9799022Sck txh->txh_space_tounref += space; 299fa9e4066Sahrens } 300fa9e4066Sahrens } 301fa9e4066Sahrens 302fa9e4066Sahrens void 303fa9e4066Sahrens dmu_tx_hold_write(dmu_tx_t *tx, uint64_t object, uint64_t off, int len) 304fa9e4066Sahrens { 3058a2f1b91Sahrens dmu_tx_hold_t *txh; 3068a2f1b91Sahrens 307fa9e4066Sahrens ASSERT(tx->tx_txg == 0); 308ea8dc4b6Seschrock ASSERT(len < DMU_MAX_ACCESS); 309dd6ef538Smaybee ASSERT(len == 0 || UINT64_MAX - off >= len - 1); 310fa9e4066Sahrens 3118a2f1b91Sahrens txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, 3128a2f1b91Sahrens object, THT_WRITE, off, len); 3138a2f1b91Sahrens if (txh == NULL) 3148a2f1b91Sahrens return; 3158a2f1b91Sahrens 3168a2f1b91Sahrens dmu_tx_count_write(txh, off, len); 3178a2f1b91Sahrens 
dmu_tx_count_dnode(txh); 318fa9e4066Sahrens } 319fa9e4066Sahrens 320fa9e4066Sahrens static void 3218a2f1b91Sahrens dmu_tx_count_free(dmu_tx_hold_t *txh, uint64_t off, uint64_t len) 322fa9e4066Sahrens { 323fa9e4066Sahrens uint64_t blkid, nblks; 324a9799022Sck uint64_t space = 0, unref = 0; 3258a2f1b91Sahrens dnode_t *dn = txh->txh_dnode; 326fa9e4066Sahrens dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset; 3278a2f1b91Sahrens spa_t *spa = txh->txh_tx->tx_pool->dp_spa; 328347a31bcSahrens int dirty; 329fa9e4066Sahrens 330c543ec06Sahrens /* 331347a31bcSahrens * We don't need to use any locking to check for dirtyness 332347a31bcSahrens * because it's OK if we get stale data -- the dnode may become 33331fd60d3Sahrens * dirty immediately after our check anyway. This is just a 33431fd60d3Sahrens * means to avoid the expensive count when we aren't sure we 335347a31bcSahrens * need it. We need to be able to deal with a dirty dnode. 336c543ec06Sahrens */ 337347a31bcSahrens dirty = list_link_active(&dn->dn_dirty_link[0]) | 338c543ec06Sahrens list_link_active(&dn->dn_dirty_link[1]) | 339c543ec06Sahrens list_link_active(&dn->dn_dirty_link[2]) | 340347a31bcSahrens list_link_active(&dn->dn_dirty_link[3]); 3418a2f1b91Sahrens if (dirty || dn->dn_assigned_txg || dn->dn_phys->dn_nlevels == 0) 342fa9e4066Sahrens return; 343c543ec06Sahrens 344fa9e4066Sahrens /* 345c543ec06Sahrens * the struct_rwlock protects us against dn_phys->dn_nlevels 346c543ec06Sahrens * changing, in case (against all odds) we manage to dirty & 347c543ec06Sahrens * sync out the changes after we check for being dirty. 348c543ec06Sahrens * also, dbuf_hold_impl() wants us to have the struct_rwlock. 349c543ec06Sahrens * 350c543ec06Sahrens * It's fine to use dn_datablkshift rather than the dn_phys 351c543ec06Sahrens * equivalent because if it is changing, maxblkid==0 and we will 352c543ec06Sahrens * bail. 
353fa9e4066Sahrens */ 354fa9e4066Sahrens rw_enter(&dn->dn_struct_rwlock, RW_READER); 355c543ec06Sahrens if (dn->dn_phys->dn_maxblkid == 0) { 356c543ec06Sahrens if (off == 0 && len >= dn->dn_datablksz) { 357c543ec06Sahrens blkid = 0; 358c543ec06Sahrens nblks = 1; 359c543ec06Sahrens } else { 360c543ec06Sahrens rw_exit(&dn->dn_struct_rwlock); 361c543ec06Sahrens return; 362c543ec06Sahrens } 363c543ec06Sahrens } else { 364c543ec06Sahrens blkid = off >> dn->dn_datablkshift; 365c543ec06Sahrens nblks = (off + len) >> dn->dn_datablkshift; 366fa9e4066Sahrens 367c543ec06Sahrens if (blkid >= dn->dn_phys->dn_maxblkid) { 368c543ec06Sahrens rw_exit(&dn->dn_struct_rwlock); 369c543ec06Sahrens return; 370c543ec06Sahrens } 371c543ec06Sahrens if (blkid + nblks > dn->dn_phys->dn_maxblkid) 372c543ec06Sahrens nblks = dn->dn_phys->dn_maxblkid - blkid; 373fa9e4066Sahrens 374c543ec06Sahrens /* don't bother after 128,000 blocks */ 375c543ec06Sahrens nblks = MIN(nblks, 128*1024); 376c543ec06Sahrens } 377fa9e4066Sahrens 378fa9e4066Sahrens if (dn->dn_phys->dn_nlevels == 1) { 379fa9e4066Sahrens int i; 380fa9e4066Sahrens for (i = 0; i < nblks; i++) { 381fa9e4066Sahrens blkptr_t *bp = dn->dn_phys->dn_blkptr; 382fa9e4066Sahrens ASSERT3U(blkid + i, <, dn->dn_phys->dn_nblkptr); 383fa9e4066Sahrens bp += blkid + i; 384ea8dc4b6Seschrock if (dsl_dataset_block_freeable(ds, bp->blk_birth)) { 385fa9e4066Sahrens dprintf_bp(bp, "can free old%s", ""); 38699653d4eSeschrock space += bp_get_dasize(spa, bp); 387fa9e4066Sahrens } 388a9799022Sck unref += BP_GET_ASIZE(bp); 389fa9e4066Sahrens } 390ea8dc4b6Seschrock nblks = 0; 391fa9e4066Sahrens } 392fa9e4066Sahrens 393fa9e4066Sahrens while (nblks) { 394fa9e4066Sahrens dmu_buf_impl_t *dbuf; 395fa9e4066Sahrens int err, epbs, blkoff, tochk; 396fa9e4066Sahrens 397fa9e4066Sahrens epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; 398fa9e4066Sahrens blkoff = P2PHASE(blkid, 1<<epbs); 399fa9e4066Sahrens tochk = MIN((1<<epbs) - blkoff, nblks); 400fa9e4066Sahrens 401fa9e4066Sahrens 
err = dbuf_hold_impl(dn, 1, blkid >> epbs, TRUE, FTAG, &dbuf); 402fa9e4066Sahrens if (err == 0) { 403fa9e4066Sahrens int i; 404fa9e4066Sahrens blkptr_t *bp; 405fa9e4066Sahrens 406ea8dc4b6Seschrock err = dbuf_read(dbuf, NULL, 407ea8dc4b6Seschrock DB_RF_HAVESTRUCT | DB_RF_CANFAIL); 408ea8dc4b6Seschrock if (err != 0) { 4098a2f1b91Sahrens txh->txh_tx->tx_err = err; 410ea8dc4b6Seschrock dbuf_rele(dbuf, FTAG); 411ea8dc4b6Seschrock break; 412ea8dc4b6Seschrock } 413fa9e4066Sahrens 414fa9e4066Sahrens bp = dbuf->db.db_data; 415fa9e4066Sahrens bp += blkoff; 416fa9e4066Sahrens 417fa9e4066Sahrens for (i = 0; i < tochk; i++) { 418fa9e4066Sahrens if (dsl_dataset_block_freeable(ds, 419ea8dc4b6Seschrock bp[i].blk_birth)) { 420fa9e4066Sahrens dprintf_bp(&bp[i], 421fa9e4066Sahrens "can free old%s", ""); 42299653d4eSeschrock space += bp_get_dasize(spa, &bp[i]); 423fa9e4066Sahrens } 424a9799022Sck unref += BP_GET_ASIZE(bp); 425fa9e4066Sahrens } 426ea8dc4b6Seschrock dbuf_rele(dbuf, FTAG); 427c543ec06Sahrens } 4288a2f1b91Sahrens if (err && err != ENOENT) { 4298a2f1b91Sahrens txh->txh_tx->tx_err = err; 430c543ec06Sahrens break; 431fa9e4066Sahrens } 432fa9e4066Sahrens 433fa9e4066Sahrens blkid += tochk; 434fa9e4066Sahrens nblks -= tochk; 435fa9e4066Sahrens } 436fa9e4066Sahrens rw_exit(&dn->dn_struct_rwlock); 437fa9e4066Sahrens 4388a2f1b91Sahrens txh->txh_space_tofree += space; 439a9799022Sck txh->txh_space_tounref += unref; 440fa9e4066Sahrens } 441fa9e4066Sahrens 4428a2f1b91Sahrens void 4438a2f1b91Sahrens dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off, uint64_t len) 444fa9e4066Sahrens { 4458a2f1b91Sahrens dmu_tx_hold_t *txh; 4468a2f1b91Sahrens dnode_t *dn; 447ea8dc4b6Seschrock uint64_t start, end, i; 448c543ec06Sahrens int err, shift; 449ea8dc4b6Seschrock zio_t *zio; 450fa9e4066Sahrens 4518a2f1b91Sahrens ASSERT(tx->tx_txg == 0); 4528a2f1b91Sahrens 4538a2f1b91Sahrens txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, 4548a2f1b91Sahrens object, THT_FREE, off, len); 
4558a2f1b91Sahrens if (txh == NULL) 4568a2f1b91Sahrens return; 4578a2f1b91Sahrens dn = txh->txh_dnode; 4588a2f1b91Sahrens 459fa9e4066Sahrens /* first block */ 46098572ac1Sahrens if (off != 0) 4618a2f1b91Sahrens dmu_tx_count_write(txh, off, 1); 462fa9e4066Sahrens /* last block */ 463fa9e4066Sahrens if (len != DMU_OBJECT_END) 4648a2f1b91Sahrens dmu_tx_count_write(txh, off+len, 1); 465fa9e4066Sahrens 466fa9e4066Sahrens if (off >= (dn->dn_maxblkid+1) * dn->dn_datablksz) 467fa9e4066Sahrens return; 468fa9e4066Sahrens if (len == DMU_OBJECT_END) 469fa9e4066Sahrens len = (dn->dn_maxblkid+1) * dn->dn_datablksz - off; 470fa9e4066Sahrens 471ea8dc4b6Seschrock /* 472ea8dc4b6Seschrock * For i/o error checking, read the first and last level-0 473ea8dc4b6Seschrock * blocks, and all the level-1 blocks. The above count_write's 474ea8dc4b6Seschrock * will take care of the level-0 blocks. 475ea8dc4b6Seschrock */ 47698572ac1Sahrens if (dn->dn_nlevels > 1) { 47798572ac1Sahrens shift = dn->dn_datablkshift + dn->dn_indblkshift - 47898572ac1Sahrens SPA_BLKPTRSHIFT; 47998572ac1Sahrens start = off >> shift; 48098572ac1Sahrens end = dn->dn_datablkshift ? 
((off+len) >> shift) : 0; 48198572ac1Sahrens 48298572ac1Sahrens zio = zio_root(tx->tx_pool->dp_spa, 48398572ac1Sahrens NULL, NULL, ZIO_FLAG_CANFAIL); 48498572ac1Sahrens for (i = start; i <= end; i++) { 48598572ac1Sahrens uint64_t ibyte = i << shift; 4866754306eSahrens err = dnode_next_offset(dn, FALSE, &ibyte, 2, 1, 0); 48798572ac1Sahrens i = ibyte >> shift; 48898572ac1Sahrens if (err == ESRCH) 48998572ac1Sahrens break; 49098572ac1Sahrens if (err) { 49198572ac1Sahrens tx->tx_err = err; 49298572ac1Sahrens return; 49398572ac1Sahrens } 494ea8dc4b6Seschrock 49598572ac1Sahrens err = dmu_tx_check_ioerr(zio, dn, 1, i); 49698572ac1Sahrens if (err) { 49798572ac1Sahrens tx->tx_err = err; 49898572ac1Sahrens return; 49998572ac1Sahrens } 50098572ac1Sahrens } 50198572ac1Sahrens err = zio_wait(zio); 502ea8dc4b6Seschrock if (err) { 503ea8dc4b6Seschrock tx->tx_err = err; 504ea8dc4b6Seschrock return; 505ea8dc4b6Seschrock } 506ea8dc4b6Seschrock } 507ea8dc4b6Seschrock 5088a2f1b91Sahrens dmu_tx_count_dnode(txh); 5098a2f1b91Sahrens dmu_tx_count_free(txh, off, len); 510fa9e4066Sahrens } 511fa9e4066Sahrens 512fa9e4066Sahrens void 5138a2f1b91Sahrens dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, char *name) 514fa9e4066Sahrens { 5158a2f1b91Sahrens dmu_tx_hold_t *txh; 5168a2f1b91Sahrens dnode_t *dn; 517fa9e4066Sahrens uint64_t nblocks; 518ea8dc4b6Seschrock int epbs, err; 519fa9e4066Sahrens 5208a2f1b91Sahrens ASSERT(tx->tx_txg == 0); 5218a2f1b91Sahrens 5228a2f1b91Sahrens txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, 5238a2f1b91Sahrens object, THT_ZAP, add, (uintptr_t)name); 5248a2f1b91Sahrens if (txh == NULL) 5258a2f1b91Sahrens return; 5268a2f1b91Sahrens dn = txh->txh_dnode; 5278a2f1b91Sahrens 5288a2f1b91Sahrens dmu_tx_count_dnode(txh); 529fa9e4066Sahrens 530fa9e4066Sahrens if (dn == NULL) { 531fa9e4066Sahrens /* 532ea8dc4b6Seschrock * We will be able to fit a new object's entries into one leaf 533fa9e4066Sahrens * block. 
So there will be at most 2 blocks total, 534fa9e4066Sahrens * including the header block. 535fa9e4066Sahrens */ 5368a2f1b91Sahrens dmu_tx_count_write(txh, 0, 2 << fzap_default_block_shift); 537fa9e4066Sahrens return; 538fa9e4066Sahrens } 539fa9e4066Sahrens 540fa9e4066Sahrens ASSERT3P(dmu_ot[dn->dn_type].ot_byteswap, ==, zap_byteswap); 541fa9e4066Sahrens 542ea8dc4b6Seschrock if (dn->dn_maxblkid == 0 && !add) { 543fa9e4066Sahrens /* 544fa9e4066Sahrens * If there is only one block (i.e. this is a micro-zap) 545ea8dc4b6Seschrock * and we are not adding anything, the accounting is simple. 546fa9e4066Sahrens */ 547ea8dc4b6Seschrock err = dmu_tx_check_ioerr(NULL, dn, 0, 0); 548ea8dc4b6Seschrock if (err) { 549ea8dc4b6Seschrock tx->tx_err = err; 550ea8dc4b6Seschrock return; 551ea8dc4b6Seschrock } 552ea8dc4b6Seschrock 553b6130eadSmaybee /* 554b6130eadSmaybee * Use max block size here, since we don't know how much 555b6130eadSmaybee * the size will change between now and the dbuf dirty call. 556b6130eadSmaybee */ 557fa9e4066Sahrens if (dsl_dataset_block_freeable(dn->dn_objset->os_dsl_dataset, 558a9799022Sck dn->dn_phys->dn_blkptr[0].blk_birth)) { 559b6130eadSmaybee txh->txh_space_tooverwrite += SPA_MAXBLOCKSIZE; 560a9799022Sck } else { 561b6130eadSmaybee txh->txh_space_towrite += SPA_MAXBLOCKSIZE; 562a9799022Sck txh->txh_space_tounref += 563a9799022Sck BP_GET_ASIZE(dn->dn_phys->dn_blkptr); 564a9799022Sck } 565fa9e4066Sahrens return; 566fa9e4066Sahrens } 567fa9e4066Sahrens 568ea8dc4b6Seschrock if (dn->dn_maxblkid > 0 && name) { 569ea8dc4b6Seschrock /* 570ea8dc4b6Seschrock * access the name in this fat-zap so that we'll check 571ea8dc4b6Seschrock * for i/o errors to the leaf blocks, etc. 
572ea8dc4b6Seschrock */ 573ea8dc4b6Seschrock err = zap_lookup(&dn->dn_objset->os, dn->dn_object, name, 574ea8dc4b6Seschrock 8, 0, NULL); 575ea8dc4b6Seschrock if (err == EIO) { 576ea8dc4b6Seschrock tx->tx_err = err; 577ea8dc4b6Seschrock return; 578ea8dc4b6Seschrock } 579ea8dc4b6Seschrock } 580ea8dc4b6Seschrock 581fa9e4066Sahrens /* 582ea8dc4b6Seschrock * 3 blocks overwritten: target leaf, ptrtbl block, header block 583ea8dc4b6Seschrock * 3 new blocks written if adding: new split leaf, 2 grown ptrtbl blocks 584fa9e4066Sahrens */ 5858a2f1b91Sahrens dmu_tx_count_write(txh, dn->dn_maxblkid * dn->dn_datablksz, 586ea8dc4b6Seschrock (3 + add ? 3 : 0) << dn->dn_datablkshift); 587fa9e4066Sahrens 588fa9e4066Sahrens /* 589fa9e4066Sahrens * If the modified blocks are scattered to the four winds, 590fa9e4066Sahrens * we'll have to modify an indirect twig for each. 591fa9e4066Sahrens */ 592fa9e4066Sahrens epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; 593fa9e4066Sahrens for (nblocks = dn->dn_maxblkid >> epbs; nblocks != 0; nblocks >>= epbs) 5948a2f1b91Sahrens txh->txh_space_towrite += 3 << dn->dn_indblkshift; 595fa9e4066Sahrens } 596fa9e4066Sahrens 597fa9e4066Sahrens void 598fa9e4066Sahrens dmu_tx_hold_bonus(dmu_tx_t *tx, uint64_t object) 599fa9e4066Sahrens { 6008a2f1b91Sahrens dmu_tx_hold_t *txh; 601fa9e4066Sahrens 6028a2f1b91Sahrens ASSERT(tx->tx_txg == 0); 603fa9e4066Sahrens 6048a2f1b91Sahrens txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, 6058a2f1b91Sahrens object, THT_BONUS, 0, 0); 6068a2f1b91Sahrens if (txh) 6078a2f1b91Sahrens dmu_tx_count_dnode(txh); 608fa9e4066Sahrens } 609fa9e4066Sahrens 610fa9e4066Sahrens void 611fa9e4066Sahrens dmu_tx_hold_space(dmu_tx_t *tx, uint64_t space) 612fa9e4066Sahrens { 6138a2f1b91Sahrens dmu_tx_hold_t *txh; 614fa9e4066Sahrens ASSERT(tx->tx_txg == 0); 615fa9e4066Sahrens 6168a2f1b91Sahrens txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, 6178a2f1b91Sahrens DMU_NEW_OBJECT, THT_SPACE, space, 0); 6188a2f1b91Sahrens 6198a2f1b91Sahrens 
txh->txh_space_towrite += space; 620fa9e4066Sahrens } 621fa9e4066Sahrens 622fa9e4066Sahrens int 623fa9e4066Sahrens dmu_tx_holds(dmu_tx_t *tx, uint64_t object) 624fa9e4066Sahrens { 6258a2f1b91Sahrens dmu_tx_hold_t *txh; 626fa9e4066Sahrens int holds = 0; 627fa9e4066Sahrens 628fa9e4066Sahrens /* 629fa9e4066Sahrens * By asserting that the tx is assigned, we're counting the 630fa9e4066Sahrens * number of dn_tx_holds, which is the same as the number of 631fa9e4066Sahrens * dn_holds. Otherwise, we'd be counting dn_holds, but 632fa9e4066Sahrens * dn_tx_holds could be 0. 633fa9e4066Sahrens */ 634fa9e4066Sahrens ASSERT(tx->tx_txg != 0); 635fa9e4066Sahrens 636fa9e4066Sahrens /* if (tx->tx_anyobj == TRUE) */ 637fa9e4066Sahrens /* return (0); */ 638fa9e4066Sahrens 6398a2f1b91Sahrens for (txh = list_head(&tx->tx_holds); txh; 6408a2f1b91Sahrens txh = list_next(&tx->tx_holds, txh)) { 6418a2f1b91Sahrens if (txh->txh_dnode && txh->txh_dnode->dn_object == object) 642fa9e4066Sahrens holds++; 643fa9e4066Sahrens } 644fa9e4066Sahrens 645fa9e4066Sahrens return (holds); 646fa9e4066Sahrens } 647fa9e4066Sahrens 6489c9dc39aSek #ifdef ZFS_DEBUG 649fa9e4066Sahrens void 650fa9e4066Sahrens dmu_tx_dirty_buf(dmu_tx_t *tx, dmu_buf_impl_t *db) 651fa9e4066Sahrens { 6528a2f1b91Sahrens dmu_tx_hold_t *txh; 653fa9e4066Sahrens int match_object = FALSE, match_offset = FALSE; 654fa9e4066Sahrens dnode_t *dn = db->db_dnode; 655fa9e4066Sahrens 656fa9e4066Sahrens ASSERT(tx->tx_txg != 0); 657fa9e4066Sahrens ASSERT(tx->tx_objset == NULL || dn->dn_objset == tx->tx_objset->os); 658fa9e4066Sahrens ASSERT3U(dn->dn_object, ==, db->db.db_object); 659fa9e4066Sahrens 660fa9e4066Sahrens if (tx->tx_anyobj) 661fa9e4066Sahrens return; 662fa9e4066Sahrens 663fa9e4066Sahrens /* XXX No checking on the meta dnode for now */ 664ea8dc4b6Seschrock if (db->db.db_object == DMU_META_DNODE_OBJECT) 665fa9e4066Sahrens return; 666fa9e4066Sahrens 6678a2f1b91Sahrens for (txh = list_head(&tx->tx_holds); txh; 6688a2f1b91Sahrens txh = 
list_next(&tx->tx_holds, txh)) { 669fa9e4066Sahrens ASSERT(dn == NULL || dn->dn_assigned_txg == tx->tx_txg); 6708a2f1b91Sahrens if (txh->txh_dnode == dn && txh->txh_type != THT_NEWOBJECT) 671fa9e4066Sahrens match_object = TRUE; 6728a2f1b91Sahrens if (txh->txh_dnode == NULL || txh->txh_dnode == dn) { 673fa9e4066Sahrens int datablkshift = dn->dn_datablkshift ? 674fa9e4066Sahrens dn->dn_datablkshift : SPA_MAXBLOCKSHIFT; 675fa9e4066Sahrens int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; 676fa9e4066Sahrens int shift = datablkshift + epbs * db->db_level; 677fa9e4066Sahrens uint64_t beginblk = shift >= 64 ? 0 : 6788a2f1b91Sahrens (txh->txh_arg1 >> shift); 679fa9e4066Sahrens uint64_t endblk = shift >= 64 ? 0 : 6808a2f1b91Sahrens ((txh->txh_arg1 + txh->txh_arg2 - 1) >> shift); 681fa9e4066Sahrens uint64_t blkid = db->db_blkid; 682fa9e4066Sahrens 6838a2f1b91Sahrens /* XXX txh_arg2 better not be zero... */ 684fa9e4066Sahrens 6858a2f1b91Sahrens dprintf("found txh type %x beginblk=%llx endblk=%llx\n", 6868a2f1b91Sahrens txh->txh_type, beginblk, endblk); 687fa9e4066Sahrens 6888a2f1b91Sahrens switch (txh->txh_type) { 689fa9e4066Sahrens case THT_WRITE: 690fa9e4066Sahrens if (blkid >= beginblk && blkid <= endblk) 691fa9e4066Sahrens match_offset = TRUE; 692fa9e4066Sahrens /* 693fa9e4066Sahrens * We will let this hold work for the bonus 694fa9e4066Sahrens * buffer so that we don't need to hold it 695fa9e4066Sahrens * when creating a new object. 696fa9e4066Sahrens */ 697fa9e4066Sahrens if (blkid == DB_BONUS_BLKID) 698fa9e4066Sahrens match_offset = TRUE; 699fa9e4066Sahrens /* 700fa9e4066Sahrens * They might have to increase nlevels, 701fa9e4066Sahrens * thus dirtying the new TLIBs. Or the 702fa9e4066Sahrens * might have to change the block size, 703fa9e4066Sahrens * thus dirying the new lvl=0 blk=0. 
704fa9e4066Sahrens */ 705fa9e4066Sahrens if (blkid == 0) 706fa9e4066Sahrens match_offset = TRUE; 707fa9e4066Sahrens break; 708fa9e4066Sahrens case THT_FREE: 709fa9e4066Sahrens if (blkid == beginblk && 7108a2f1b91Sahrens (txh->txh_arg1 != 0 || 711fa9e4066Sahrens dn->dn_maxblkid == 0)) 712fa9e4066Sahrens match_offset = TRUE; 713fa9e4066Sahrens if (blkid == endblk && 7148a2f1b91Sahrens txh->txh_arg2 != DMU_OBJECT_END) 715fa9e4066Sahrens match_offset = TRUE; 716fa9e4066Sahrens break; 717fa9e4066Sahrens case THT_BONUS: 718fa9e4066Sahrens if (blkid == DB_BONUS_BLKID) 719fa9e4066Sahrens match_offset = TRUE; 720fa9e4066Sahrens break; 721fa9e4066Sahrens case THT_ZAP: 722fa9e4066Sahrens match_offset = TRUE; 723fa9e4066Sahrens break; 724fa9e4066Sahrens case THT_NEWOBJECT: 725fa9e4066Sahrens match_object = TRUE; 726fa9e4066Sahrens break; 727fa9e4066Sahrens default: 7288a2f1b91Sahrens ASSERT(!"bad txh_type"); 729fa9e4066Sahrens } 730fa9e4066Sahrens } 731fa9e4066Sahrens if (match_object && match_offset) 732fa9e4066Sahrens return; 733fa9e4066Sahrens } 734fa9e4066Sahrens panic("dirtying dbuf obj=%llx lvl=%u blkid=%llx but not tx_held\n", 735fa9e4066Sahrens (u_longlong_t)db->db.db_object, db->db_level, 736fa9e4066Sahrens (u_longlong_t)db->db_blkid); 737fa9e4066Sahrens } 7389c9dc39aSek #endif 739fa9e4066Sahrens 740fa9e4066Sahrens static int 7418a2f1b91Sahrens dmu_tx_try_assign(dmu_tx_t *tx, uint64_t txg_how) 742fa9e4066Sahrens { 7438a2f1b91Sahrens dmu_tx_hold_t *txh; 7440a4e9518Sgw spa_t *spa = tx->tx_pool->dp_spa; 745a9799022Sck uint64_t lsize, asize, fsize, usize; 746a9799022Sck uint64_t towrite, tofree, tooverwrite, tounref; 747fa9e4066Sahrens 7488a2f1b91Sahrens ASSERT3U(tx->tx_txg, ==, 0); 7490a4e9518Sgw 7508a2f1b91Sahrens if (tx->tx_err) 7518a2f1b91Sahrens return (tx->tx_err); 752fa9e4066Sahrens 7530a4e9518Sgw if (spa_state(spa) == POOL_STATE_IO_FAILURE) { 7540a4e9518Sgw /* 7550a4e9518Sgw * If the user has indicated a blocking failure mode 7560a4e9518Sgw * then return ERESTART 
which will block in dmu_tx_wait(). 7570a4e9518Sgw * Otherwise, return EIO so that an error can get 7580a4e9518Sgw * propagated back to the VOP calls. 7590a4e9518Sgw * 7600a4e9518Sgw * Note that we always honor the txg_how flag regardless 7610a4e9518Sgw * of the failuremode setting. 7620a4e9518Sgw */ 7630a4e9518Sgw if (spa_get_failmode(spa) == ZIO_FAILURE_MODE_CONTINUE && 7640a4e9518Sgw txg_how != TXG_WAIT) 7650a4e9518Sgw return (EIO); 7660a4e9518Sgw 7670a4e9518Sgw return (ERESTART); 7680a4e9518Sgw } 7690a4e9518Sgw 770fa9e4066Sahrens tx->tx_txg = txg_hold_open(tx->tx_pool, &tx->tx_txgh); 7718a2f1b91Sahrens tx->tx_needassign_txh = NULL; 772fa9e4066Sahrens 7738a2f1b91Sahrens /* 7748a2f1b91Sahrens * NB: No error returns are allowed after txg_hold_open, but 7758a2f1b91Sahrens * before processing the dnode holds, due to the 7768a2f1b91Sahrens * dmu_tx_unassign() logic. 7778a2f1b91Sahrens */ 778fa9e4066Sahrens 779a9799022Sck towrite = tofree = tooverwrite = tounref = 0; 7808a2f1b91Sahrens for (txh = list_head(&tx->tx_holds); txh; 7818a2f1b91Sahrens txh = list_next(&tx->tx_holds, txh)) { 7828a2f1b91Sahrens dnode_t *dn = txh->txh_dnode; 783fa9e4066Sahrens if (dn != NULL) { 784fa9e4066Sahrens mutex_enter(&dn->dn_mtx); 7858a2f1b91Sahrens if (dn->dn_assigned_txg == tx->tx_txg - 1) { 7868a2f1b91Sahrens mutex_exit(&dn->dn_mtx); 7878a2f1b91Sahrens tx->tx_needassign_txh = txh; 7888a2f1b91Sahrens return (ERESTART); 789fa9e4066Sahrens } 7908a2f1b91Sahrens if (dn->dn_assigned_txg == 0) 791fa9e4066Sahrens dn->dn_assigned_txg = tx->tx_txg; 7928a2f1b91Sahrens ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg); 793fa9e4066Sahrens (void) refcount_add(&dn->dn_tx_holds, tx); 794fa9e4066Sahrens mutex_exit(&dn->dn_mtx); 795fa9e4066Sahrens } 7968a2f1b91Sahrens towrite += txh->txh_space_towrite; 7978a2f1b91Sahrens tofree += txh->txh_space_tofree; 7988a2f1b91Sahrens tooverwrite += txh->txh_space_tooverwrite; 799a9799022Sck tounref += txh->txh_space_tounref; 800ea8dc4b6Seschrock } 801ea8dc4b6Seschrock 
8028a2f1b91Sahrens	/*
8038a2f1b91Sahrens	 * NB: This check must be after we've held the dnodes, so that
8048a2f1b91Sahrens	 * the dmu_tx_unassign() logic will work properly
8058a2f1b91Sahrens	 */
8068a2f1b91Sahrens	if (txg_how >= TXG_INITIAL && txg_how != tx->tx_txg)
8078a2f1b91Sahrens		return (ERESTART);
8088a2f1b91Sahrens
809ea8dc4b6Seschrock	/*
810ea8dc4b6Seschrock	 * If a snapshot has been taken since we made our estimates,
811ea8dc4b6Seschrock	 * assume that we won't be able to free or overwrite anything.
812ea8dc4b6Seschrock	 */
813ea8dc4b6Seschrock	if (tx->tx_objset &&
814ea8dc4b6Seschrock	    dsl_dataset_prev_snap_txg(tx->tx_objset->os->os_dsl_dataset) >
815ea8dc4b6Seschrock	    tx->tx_lastsnap_txg) {
8168a2f1b91Sahrens		towrite += tooverwrite;
8178a2f1b91Sahrens		tooverwrite = tofree = 0;
818fa9e4066Sahrens	}
819fa9e4066Sahrens
820fa9e4066Sahrens	/*
821fa9e4066Sahrens	 * Convert logical size to worst-case allocated size.
822fa9e4066Sahrens	 */
8238a2f1b91Sahrens	fsize = spa_get_asize(tx->tx_pool->dp_spa, tooverwrite) + tofree;
8248a2f1b91Sahrens	lsize = towrite + tooverwrite;
825fa9e4066Sahrens	asize = spa_get_asize(tx->tx_pool->dp_spa, lsize);
826a9799022Sck	usize = spa_get_asize(tx->tx_pool->dp_spa, tounref);
8278a2f1b91Sahrens
8288a2f1b91Sahrens	#ifdef ZFS_DEBUG
829fa9e4066Sahrens	tx->tx_space_towrite = asize;
8308a2f1b91Sahrens	tx->tx_space_tofree = tofree;
8318a2f1b91Sahrens	tx->tx_space_tooverwrite = tooverwrite;
832a9799022Sck	tx->tx_space_tounref = tounref;
8338a2f1b91Sahrens	#endif
834fa9e4066Sahrens
	/*
	 * Temporarily reserve the worst-case allocated space from the
	 * dsl_dir; a failure here (e.g. over quota or out of space --
	 * see dsl_dir_tempreserve_space()) fails the assignment.
	 */
835fa9e4066Sahrens	if (tx->tx_dir && asize != 0) {
836fa9e4066Sahrens		int err = dsl_dir_tempreserve_space(tx->tx_dir,
837a9799022Sck		    lsize, asize, fsize, usize, &tx->tx_tempreserve_cookie, tx);
8388a2f1b91Sahrens		if (err)
839fa9e4066Sahrens			return (err);
840fa9e4066Sahrens	}
841fa9e4066Sahrens
842fa9e4066Sahrens	return (0);
843fa9e4066Sahrens }
844fa9e4066Sahrens
/*
 * Undo a (possibly partial) dmu_tx_try_assign(): release the txg hold
 * and drop the dnode holds that were taken, stopping at the hold that
 * caused the ERESTART (tx_needassign_txh), if any.
 */
8458a2f1b91Sahrens static void
8468a2f1b91Sahrens dmu_tx_unassign(dmu_tx_t *tx)
847fa9e4066Sahrens {
8488a2f1b91Sahrens	dmu_tx_hold_t *txh;
849fa9e4066Sahrens
	/* Nothing to undo if the tx never got assigned to a txg. */
8508a2f1b91Sahrens	if (tx->tx_txg == 0)
8518a2f1b91Sahrens		return;
852fa9e4066Sahrens
853fa9e4066Sahrens	txg_rele_to_quiesce(&tx->tx_txgh);
854fa9e4066Sahrens
	/*
	 * Walk the holds up to (but not including) tx_needassign_txh --
	 * the hold that made dmu_tx_try_assign() bail -- releasing each
	 * dnode's assignment and waking anyone waiting in dmu_tx_wait()
	 * once the last tx hold on a dnode is gone.
	 */
8558a2f1b91Sahrens	for (txh = list_head(&tx->tx_holds); txh != tx->tx_needassign_txh;
8568a2f1b91Sahrens	    txh = list_next(&tx->tx_holds, txh)) {
8578a2f1b91Sahrens		dnode_t *dn = txh->txh_dnode;
858fa9e4066Sahrens
859fa9e4066Sahrens		if (dn == NULL)
860fa9e4066Sahrens			continue;
861fa9e4066Sahrens		mutex_enter(&dn->dn_mtx);
8628a2f1b91Sahrens		ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg);
863fa9e4066Sahrens
864fa9e4066Sahrens		if (refcount_remove(&dn->dn_tx_holds, tx) == 0) {
865fa9e4066Sahrens			dn->dn_assigned_txg = 0;
866fa9e4066Sahrens			cv_broadcast(&dn->dn_notxholds);
867fa9e4066Sahrens		}
868fa9e4066Sahrens		mutex_exit(&dn->dn_mtx);
869fa9e4066Sahrens	}
870fa9e4066Sahrens
871fa9e4066Sahrens	txg_rele_to_sync(&tx->tx_txgh);
872fa9e4066Sahrens
	/* Remember the txg we failed on so dmu_tx_wait() knows what to wait for. */
8738a2f1b91Sahrens	tx->tx_lasttried_txg = tx->tx_txg;
874fa9e4066Sahrens	tx->tx_txg = 0;
875fa9e4066Sahrens }
876fa9e4066Sahrens
877fa9e4066Sahrens /*
878fa9e4066Sahrens  * Assign tx to a transaction group. txg_how can be one of:
879fa9e4066Sahrens  *
880fa9e4066Sahrens  * (1) TXG_WAIT.  If the current open txg is full, waits until there's
881fa9e4066Sahrens  * a new one.  This should be used when you're not holding locks.
882fa9e4066Sahrens  * It will only fail if we're truly out of space (or over quota).
883fa9e4066Sahrens  *
884fa9e4066Sahrens  * (2) TXG_NOWAIT.  If we can't assign into the current open txg without
885fa9e4066Sahrens  * blocking, returns immediately with ERESTART.  This should be used
886fa9e4066Sahrens  * whenever you're holding locks.  On an ERESTART error, the caller
8878a2f1b91Sahrens  * should drop locks, do a dmu_tx_wait(tx), and try again.
888fa9e4066Sahrens  *
889fa9e4066Sahrens  * (3) A specific txg.  Use this if you need to ensure that multiple
890fa9e4066Sahrens  * transactions all sync in the same txg.
Like TXG_NOWAIT, it
891fa9e4066Sahrens  * returns ERESTART if it can't assign you into the requested txg.
892fa9e4066Sahrens  */
893fa9e4066Sahrens int
894fa9e4066Sahrens dmu_tx_assign(dmu_tx_t *tx, uint64_t txg_how)
895fa9e4066Sahrens {
896fa9e4066Sahrens	int err;
897fa9e4066Sahrens
898fa9e4066Sahrens	ASSERT(tx->tx_txg == 0);
899fa9e4066Sahrens	ASSERT(txg_how != 0);
900fa9e4066Sahrens	ASSERT(!dsl_pool_sync_context(tx->tx_pool));
901fa9e4066Sahrens
	/*
	 * Keep trying to assign; each failed attempt is fully unwound by
	 * dmu_tx_unassign().  Only TXG_WAIT callers block (in
	 * dmu_tx_wait()) and retry on ERESTART; everyone else gets the
	 * error back immediately.
	 */
9028a2f1b91Sahrens	while ((err = dmu_tx_try_assign(tx, txg_how)) != 0) {
9038a2f1b91Sahrens		dmu_tx_unassign(tx);
904fa9e4066Sahrens
905fa9e4066Sahrens		if (err != ERESTART || txg_how != TXG_WAIT)
906fa9e4066Sahrens			return (err);
907fa9e4066Sahrens
9088a2f1b91Sahrens		dmu_tx_wait(tx);
909fa9e4066Sahrens	}
910fa9e4066Sahrens
	/* Drop the quiesce hold; the sync hold is released at commit. */
911fa9e4066Sahrens	txg_rele_to_quiesce(&tx->tx_txgh);
912fa9e4066Sahrens
913fa9e4066Sahrens	return (0);
914fa9e4066Sahrens }
915fa9e4066Sahrens
/*
 * Block until the condition that caused the last failed assignment of
 * this tx has (probably) cleared.  Must be called on an unassigned tx.
 */
9168a2f1b91Sahrens void
9178a2f1b91Sahrens dmu_tx_wait(dmu_tx_t *tx)
9188a2f1b91Sahrens {
9190a4e9518Sgw	spa_t *spa = tx->tx_pool->dp_spa;
9200a4e9518Sgw
9218a2f1b91Sahrens	ASSERT(tx->tx_txg == 0);
9228a2f1b91Sahrens
9230a4e9518Sgw	/*
9240a4e9518Sgw	 * It's possible that the pool has become active after this thread
9250a4e9518Sgw	 * has tried to obtain a tx.  If that's the case then its
9260a4e9518Sgw	 * tx_lasttried_txg would not have been assigned.
9270a4e9518Sgw	 */
	/*
	 * Three cases:
	 *  - the pool is in I/O-failure state (or we never actually tried
	 *    an assignment): wait for the next txg to sync;
	 *  - a dnode hold blocked the assignment: wait for that dnode's
	 *    holds on the previous txg to drain;
	 *  - otherwise the open txg was full: wait for the next txg to open.
	 */
9280a4e9518Sgw	if (spa_state(spa) == POOL_STATE_IO_FAILURE ||
9290a4e9518Sgw	    tx->tx_lasttried_txg == 0) {
9300a4e9518Sgw		txg_wait_synced(tx->tx_pool, spa_last_synced_txg(spa) + 1);
9310a4e9518Sgw	} else if (tx->tx_needassign_txh) {
9328a2f1b91Sahrens		dnode_t *dn = tx->tx_needassign_txh->txh_dnode;
9338a2f1b91Sahrens
9348a2f1b91Sahrens		mutex_enter(&dn->dn_mtx);
9358a2f1b91Sahrens		while (dn->dn_assigned_txg == tx->tx_lasttried_txg - 1)
9368a2f1b91Sahrens			cv_wait(&dn->dn_notxholds, &dn->dn_mtx);
9378a2f1b91Sahrens		mutex_exit(&dn->dn_mtx);
9388a2f1b91Sahrens		tx->tx_needassign_txh = NULL;
9398a2f1b91Sahrens	} else {
9408a2f1b91Sahrens		txg_wait_open(tx->tx_pool, tx->tx_lasttried_txg + 1);
9418a2f1b91Sahrens	}
9428a2f1b91Sahrens }
9438a2f1b91Sahrens
/*
 * Debug-only space accounting: record that this tx will consume
 * (delta > 0) or free (delta < 0) that many bytes.  Compiled away
 * entirely unless ZFS_DEBUG is defined.
 */
944fa9e4066Sahrens void
945fa9e4066Sahrens dmu_tx_willuse_space(dmu_tx_t *tx, int64_t delta)
946fa9e4066Sahrens {
9478a2f1b91Sahrens	#ifdef ZFS_DEBUG
948fa9e4066Sahrens	if (tx->tx_dir == NULL || delta == 0)
949fa9e4066Sahrens		return;
950fa9e4066Sahrens
951fa9e4066Sahrens	if (delta > 0) {
		/* Writes must stay within the estimate made at assign time. */
952fa9e4066Sahrens		ASSERT3U(refcount_count(&tx->tx_space_written) + delta, <=,
953fa9e4066Sahrens		    tx->tx_space_towrite);
954fa9e4066Sahrens		(void) refcount_add_many(&tx->tx_space_written, delta, NULL);
955fa9e4066Sahrens	} else {
956fa9e4066Sahrens		(void) refcount_add_many(&tx->tx_space_freed, -delta, NULL);
957fa9e4066Sahrens	}
9588a2f1b91Sahrens	#endif
959fa9e4066Sahrens }
960fa9e4066Sahrens
/*
 * Commit the transaction: release every dnode hold, clear the temporary
 * space reservation, drop the txg hold, and free the tx.
 */
961fa9e4066Sahrens void
962fa9e4066Sahrens dmu_tx_commit(dmu_tx_t *tx)
963fa9e4066Sahrens {
9648a2f1b91Sahrens	dmu_tx_hold_t *txh;
965fa9e4066Sahrens
966fa9e4066Sahrens	ASSERT(tx->tx_txg != 0);
967fa9e4066Sahrens
9688a2f1b91Sahrens	while (txh = list_head(&tx->tx_holds)) {
9698a2f1b91Sahrens		dnode_t *dn = txh->txh_dnode;
970fa9e4066Sahrens
9718a2f1b91Sahrens		list_remove(&tx->tx_holds, txh);
9728a2f1b91Sahrens		kmem_free(txh, sizeof (dmu_tx_hold_t));
973fa9e4066Sahrens		if (dn == NULL)
974fa9e4066Sahrens			continue;
975fa9e4066Sahrens
		/*
		 * Drop this tx's hold on the dnode; when the last tx hold
		 * goes away, clear the txg assignment and wake any thread
		 * waiting in dmu_tx_wait().
		 */
		mutex_enter(&dn->dn_mtx);
976fa9e4066Sahrens		ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg);
977fa9e4066Sahrens
978fa9e4066Sahrens		if (refcount_remove(&dn->dn_tx_holds, tx) == 0) {
979fa9e4066Sahrens			dn->dn_assigned_txg = 0;
980fa9e4066Sahrens			cv_broadcast(&dn->dn_notxholds);
981fa9e4066Sahrens		}
982fa9e4066Sahrens		mutex_exit(&dn->dn_mtx);
983fa9e4066Sahrens		dnode_rele(dn, tx);
984fa9e4066Sahrens	}
985fa9e4066Sahrens
9868a2f1b91Sahrens	if (tx->tx_tempreserve_cookie)
987fa9e4066Sahrens		dsl_dir_tempreserve_clear(tx->tx_tempreserve_cookie, tx);
988fa9e4066Sahrens
	/* Only non-anyobj transactions are holding the txg open for sync. */
989fa9e4066Sahrens	if (tx->tx_anyobj == FALSE)
990fa9e4066Sahrens		txg_rele_to_sync(&tx->tx_txgh);
9918f38d419Sek	list_destroy(&tx->tx_holds);
9928a2f1b91Sahrens	#ifdef ZFS_DEBUG
993fa9e4066Sahrens	dprintf("towrite=%llu written=%llu tofree=%llu freed=%llu\n",
994fa9e4066Sahrens	    tx->tx_space_towrite, refcount_count(&tx->tx_space_written),
995fa9e4066Sahrens	    tx->tx_space_tofree, refcount_count(&tx->tx_space_freed));
996fa9e4066Sahrens	refcount_destroy_many(&tx->tx_space_written,
997fa9e4066Sahrens	    refcount_count(&tx->tx_space_written));
998fa9e4066Sahrens	refcount_destroy_many(&tx->tx_space_freed,
999fa9e4066Sahrens	    refcount_count(&tx->tx_space_freed));
1000fa9e4066Sahrens	#endif
1001fa9e4066Sahrens	kmem_free(tx, sizeof (dmu_tx_t));
1002fa9e4066Sahrens }
1003fa9e4066Sahrens
/*
 * Abort an unassigned transaction (tx_txg must still be 0): release the
 * dnode holds and free the tx without touching any txg state.
 */
1004fa9e4066Sahrens void
1005fa9e4066Sahrens dmu_tx_abort(dmu_tx_t *tx)
1006fa9e4066Sahrens {
10078a2f1b91Sahrens	dmu_tx_hold_t *txh;
1008fa9e4066Sahrens
1009fa9e4066Sahrens	ASSERT(tx->tx_txg == 0);
1010fa9e4066Sahrens
10118a2f1b91Sahrens	while (txh = list_head(&tx->tx_holds)) {
10128a2f1b91Sahrens		dnode_t *dn = txh->txh_dnode;
1013fa9e4066Sahrens
10148a2f1b91Sahrens		list_remove(&tx->tx_holds, txh);
10158a2f1b91Sahrens		kmem_free(txh, sizeof (dmu_tx_hold_t));
1016fa9e4066Sahrens		if (dn != NULL)
1017fa9e4066Sahrens			dnode_rele(dn, tx);
1018fa9e4066Sahrens	}
10198f38d419Sek	list_destroy(&tx->tx_holds);
10208a2f1b91Sahrens	#ifdef ZFS_DEBUG
1021fa9e4066Sahrens	refcount_destroy_many(&tx->tx_space_written,
1022fa9e4066Sahrens	    refcount_count(&tx->tx_space_written));
1023fa9e4066Sahrens	refcount_destroy_many(&tx->tx_space_freed,
1024fa9e4066Sahrens	    refcount_count(&tx->tx_space_freed));
1025fa9e4066Sahrens	#endif
1026fa9e4066Sahrens	kmem_free(tx, sizeof (dmu_tx_t));
1027fa9e4066Sahrens }
1028fa9e4066Sahrens
/*
 * Return the txg this transaction is assigned to; only legal while the
 * tx is assigned (tx_txg != 0).
 */
1029fa9e4066Sahrens uint64_t
1030fa9e4066Sahrens dmu_tx_get_txg(dmu_tx_t *tx)
1031fa9e4066Sahrens {
1032fa9e4066Sahrens	ASSERT(tx->tx_txg != 0);
1033fa9e4066Sahrens	return (tx->tx_txg);
1034fa9e4066Sahrens }
1035