fs/zfs/dmu_tx.c

fa9e4066Sahrens/*
fa9e4066Sahrens * CDDL HEADER START
fa9e4066Sahrens *
fa9e4066Sahrens * The contents of this file are subject to the terms of the
f65e61c0Sahrens * Common Development and Distribution License (the "License").
f65e61c0Sahrens * You may not use this file except in compliance with the License.
fa9e4066Sahrens *
fa9e4066Sahrens * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
fa9e4066Sahrens * or http://www.opensolaris.org/os/licensing.
fa9e4066Sahrens * See the License for the specific language governing permissions
fa9e4066Sahrens * and limitations under the License.
fa9e4066Sahrens *
fa9e4066Sahrens * When distributing Covered Code, include this CDDL HEADER in each
fa9e4066Sahrens * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
fa9e4066Sahrens * If applicable, add the following below this CDDL HEADER, with the
fa9e4066Sahrens * fields enclosed by brackets "[]" replaced with your own identifying
fa9e4066Sahrens * information: Portions Copyright [yyyy] [name of copyright owner]
fa9e4066Sahrens *
fa9e4066Sahrens * CDDL HEADER END
fa9e4066Sahrens */
fa9e4066Sahrens/*
01025c89SJohn Harres * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
9dccfd2aSAlbert Lee * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
b7b2590dSMatthew Ahrens * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
c3d26abcSMatthew Ahrens * Copyright (c) 2014 Integros [integros.com]
9dccfd2aSAlbert Lee */
fa9e4066Sahrens
fa9e4066Sahrens#include <sys/dmu.h>
fa9e4066Sahrens#include <sys/dmu_impl.h>
fa9e4066Sahrens#include <sys/dbuf.h>
fa9e4066Sahrens#include <sys/dmu_tx.h>
fa9e4066Sahrens#include <sys/dmu_objset.h>
61e255ceSMatthew Ahrens#include <sys/dsl_dataset.h>
61e255ceSMatthew Ahrens#include <sys/dsl_dir.h>
fa9e4066Sahrens#include <sys/dsl_pool.h>
61e255ceSMatthew Ahrens#include <sys/zap_impl.h>
fa9e4066Sahrens#include <sys/spa.h>
0a586ceaSMark Shellenbaum#include <sys/sa.h>
0a586ceaSMark Shellenbaum#include <sys/sa_impl.h>
fa9e4066Sahrens#include <sys/zfs_context.h>
0a586ceaSMark Shellenbaum#include <sys/varargs.h>
fa9e4066Sahrens
/*
 * Function-pointer type taking a transaction, a dnode and two 64-bit
 * arguments.  It is not referenced in the visible portion of this file.
 */
typedef void (*dmu_tx_hold_func_t)(dmu_tx_t *tx, struct dnode *dn,
    uint64_t arg1, uint64_t arg2);
ea8dc4b6Seschrock
fa9e4066Sahrens
fa9e4066Sahrensdmu_tx_t *
1d452cf5Sahrensdmu_tx_create_dd(dsl_dir_t *dd)
fa9e4066Sahrens{
fa9e4066Sahrens	dmu_tx_t *tx = kmem_zalloc(sizeof (dmu_tx_t), KM_SLEEP);
fa9e4066Sahrens	tx->tx_dir = dd;
4445fffbSMatthew Ahrens	if (dd != NULL)
fa9e4066Sahrens		tx->tx_pool = dd->dd_pool;
fa9e4066Sahrens	list_create(&tx->tx_holds, sizeof (dmu_tx_hold_t),
8a2f1b91Sahrens	    offsetof(dmu_tx_hold_t, txh_node));
d20e665cSRicardo M. Correia	list_create(&tx->tx_callbacks, sizeof (dmu_tx_callback_t),
d20e665cSRicardo M. Correia	    offsetof(dmu_tx_callback_t, dcb_node));
69962b56SMatthew Ahrens	tx->tx_start = gethrtime();
fa9e4066Sahrens	return (tx);
fa9e4066Sahrens}
fa9e4066Sahrens
fa9e4066Sahrensdmu_tx_t *
fa9e4066Sahrensdmu_tx_create(objset_t *os)
fa9e4066Sahrens{
503ad85cSMatthew Ahrens	dmu_tx_t *tx = dmu_tx_create_dd(os->os_dsl_dataset->ds_dir);
fa9e4066Sahrens	tx->tx_objset = os;
fa9e4066Sahrens	return (tx);
fa9e4066Sahrens}
fa9e4066Sahrens
fa9e4066Sahrensdmu_tx_t *
fa9e4066Sahrensdmu_tx_create_assigned(struct dsl_pool *dp, uint64_t txg)
fa9e4066Sahrens{
1d452cf5Sahrens	dmu_tx_t *tx = dmu_tx_create_dd(NULL);
fa9e4066Sahrens
b7b2590dSMatthew Ahrens	txg_verify(dp->dp_spa, txg);
fa9e4066Sahrens	tx->tx_pool = dp;
fa9e4066Sahrens	tx->tx_txg = txg;
fa9e4066Sahrens	tx->tx_anyobj = TRUE;
fa9e4066Sahrens
fa9e4066Sahrens	return (tx);
fa9e4066Sahrens}
fa9e4066Sahrens
/*
 * Return nonzero if the transaction has tx_anyobj set, which
 * dmu_tx_create_assigned() does for transactions created directly in a txg.
 */
int
dmu_tx_is_syncing(dmu_tx_t *tx)
{
	int anyobj = tx->tx_anyobj;

	return (anyobj);
}
fa9e4066Sahrens
/*
 * Return nonzero if the transaction has tx_anyobj set.  This currently
 * reports exactly the same flag as dmu_tx_is_syncing().
 */
int
dmu_tx_private_ok(dmu_tx_t *tx)
{
	int anyobj = tx->tx_anyobj;

	return (anyobj);
}
fa9e4066Sahrens
8a2f1b91Sahrensstatic dmu_tx_hold_t *
b0c42cd4Sbzzzdmu_tx_hold_dnode_impl(dmu_tx_t *tx, dnode_t *dn, enum dmu_tx_hold_type type,
b0c42cd4Sbzzz    uint64_t arg1, uint64_t arg2)
fa9e4066Sahrens{
8a2f1b91Sahrens	dmu_tx_hold_t *txh;
fa9e4066Sahrens
b0c42cd4Sbzzz	if (dn != NULL) {
*e914ace2STim Schumacher		(void) zfs_refcount_add(&dn->dn_holds, tx);
b0c42cd4Sbzzz		if (tx->tx_txg != 0) {
fa9e4066Sahrens			mutex_enter(&dn->dn_mtx);
fa9e4066Sahrens			/*
fa9e4066Sahrens			 * dn->dn_assigned_txg == tx->tx_txg doesn't pose a
fa9e4066Sahrens			 * problem, but there's no way for it to happen (for
fa9e4066Sahrens			 * now, at least).
fa9e4066Sahrens			 */
fa9e4066Sahrens			ASSERT(dn->dn_assigned_txg == 0);
fa9e4066Sahrens			dn->dn_assigned_txg = tx->tx_txg;
*e914ace2STim Schumacher			(void) zfs_refcount_add(&dn->dn_tx_holds, tx);
fa9e4066Sahrens			mutex_exit(&dn->dn_mtx);
fa9e4066Sahrens		}
fa9e4066Sahrens	}
fa9e4066Sahrens
8a2f1b91Sahrens	txh = kmem_zalloc(sizeof (dmu_tx_hold_t), KM_SLEEP);
8a2f1b91Sahrens	txh->txh_tx = tx;
8a2f1b91Sahrens	txh->txh_dnode = dn;
*e914ace2STim Schumacher	zfs_refcount_create(&txh->txh_space_towrite);
*e914ace2STim Schumacher	zfs_refcount_create(&txh->txh_memory_tohold);
8a2f1b91Sahrens	txh->txh_type = type;
8a2f1b91Sahrens	txh->txh_arg1 = arg1;
8a2f1b91Sahrens	txh->txh_arg2 = arg2;
8a2f1b91Sahrens	list_insert_tail(&tx->tx_holds, txh);
ea8dc4b6Seschrock
8a2f1b91Sahrens	return (txh);
fa9e4066Sahrens}
fa9e4066Sahrens
b0c42cd4Sbzzzstatic dmu_tx_hold_t *
b0c42cd4Sbzzzdmu_tx_hold_object_impl(dmu_tx_t *tx, objset_t *os, uint64_t object,
b0c42cd4Sbzzz    enum dmu_tx_hold_type type, uint64_t arg1, uint64_t arg2)
b0c42cd4Sbzzz{
b0c42cd4Sbzzz	dnode_t *dn = NULL;
b0c42cd4Sbzzz	dmu_tx_hold_t *txh;
b0c42cd4Sbzzz	int err;
b0c42cd4Sbzzz
b0c42cd4Sbzzz	if (object != DMU_NEW_OBJECT) {
b0c42cd4Sbzzz		err = dnode_hold(os, object, FTAG, &dn);
b0c42cd4Sbzzz		if (err != 0) {
b0c42cd4Sbzzz			tx->tx_err = err;
b0c42cd4Sbzzz			return (NULL);
b0c42cd4Sbzzz		}
b0c42cd4Sbzzz	}
b0c42cd4Sbzzz	txh = dmu_tx_hold_dnode_impl(tx, dn, type, arg1, arg2);
b0c42cd4Sbzzz	if (dn != NULL)
b0c42cd4Sbzzz		dnode_rele(dn, FTAG);
b0c42cd4Sbzzz	return (txh);
b0c42cd4Sbzzz}
b0c42cd4Sbzzz
/*
 * Record a THT_NEWOBJECT hold on a freshly created dnode.
 */
void
dmu_tx_add_new_object(dmu_tx_t *tx, dnode_t *dn)
{
	/*
	 * If we're syncing, they can manipulate any object anyhow, and
	 * the hold on the dnode_t can cause problems.
	 */
	if (dmu_tx_is_syncing(tx))
		return;

	(void) dmu_tx_hold_dnode_impl(tx, dn, THT_NEWOBJECT, 0, 0);
}
fa9e4066Sahrens
61e255ceSMatthew Ahrens/*
61e255ceSMatthew Ahrens * This function reads specified data from disk.  The specified data will
61e255ceSMatthew Ahrens * be needed to perform the transaction -- i.e, it will be read after
61e255ceSMatthew Ahrens * we do dmu_tx_assign().  There are two reasons that we read the data now
61e255ceSMatthew Ahrens * (before dmu_tx_assign()):
61e255ceSMatthew Ahrens *
61e255ceSMatthew Ahrens * 1. Reading it now has potentially better performance.  The transaction
61e255ceSMatthew Ahrens * has not yet been assigned, so the TXG is not held open, and also the
61e255ceSMatthew Ahrens * caller typically has less locks held when calling dmu_tx_hold_*() than
61e255ceSMatthew Ahrens * after the transaction has been assigned.  This reduces the lock (and txg)
61e255ceSMatthew Ahrens * hold times, thus reducing lock contention.
61e255ceSMatthew Ahrens *
61e255ceSMatthew Ahrens * 2. It is easier for callers (primarily the ZPL) to handle i/o errors
61e255ceSMatthew Ahrens * that are detected before they start making changes to the DMU state
61e255ceSMatthew Ahrens * (i.e. now).  Once the transaction has been assigned, and some DMU
61e255ceSMatthew Ahrens * state has been changed, it can be difficult to recover from an i/o
61e255ceSMatthew Ahrens * error (e.g. to undo the changes already made in memory at the DMU
61e255ceSMatthew Ahrens * layer).  Typically code to do so does not exist in the caller -- it
61e255ceSMatthew Ahrens * assumes that the data has already been cached and thus i/o errors are
61e255ceSMatthew Ahrens * not possible.
61e255ceSMatthew Ahrens *
61e255ceSMatthew Ahrens * It has been observed that the i/o initiated here can be a performance
61e255ceSMatthew Ahrens * problem, and it appears to be optional, because we don't look at the
61e255ceSMatthew Ahrens * data which is read.  However, removing this read would only serve to
61e255ceSMatthew Ahrens * move the work elsewhere (after the dmu_tx_assign()), where it may
61e255ceSMatthew Ahrens * have a greater impact on performance (in addition to the impact on
61e255ceSMatthew Ahrens * fault tolerance noted above).
61e255ceSMatthew Ahrens */
ea8dc4b6Seschrockstatic int
ea8dc4b6Seschrockdmu_tx_check_ioerr(zio_t *zio, dnode_t *dn, int level, uint64_t blkid)
ea8dc4b6Seschrock{
ea8dc4b6Seschrock	int err;
ea8dc4b6Seschrock	dmu_buf_impl_t *db;
ea8dc4b6Seschrock
ea8dc4b6Seschrock	rw_enter(&dn->dn_struct_rwlock, RW_READER);
ea8dc4b6Seschrock	db = dbuf_hold_level(dn, level, blkid, FTAG);
ea8dc4b6Seschrock	rw_exit(&dn->dn_struct_rwlock);
ea8dc4b6Seschrock	if (db == NULL)
be6fd75aSMatthew Ahrens		return (SET_ERROR(EIO));
1ab7f2deSmaybee	err = dbuf_read(db, zio, DB_RF_CANFAIL | DB_RF_NOPREFETCH);
ea8dc4b6Seschrock	dbuf_rele(db, FTAG);
ea8dc4b6Seschrock	return (err);
ea8dc4b6Seschrock}
ea8dc4b6Seschrock
fa9e4066Sahrens/* ARGSUSED */
fa9e4066Sahrensstatic void
8a2f1b91Sahrensdmu_tx_count_write(dmu_tx_hold_t *txh, uint64_t off, uint64_t len)
fa9e4066Sahrens{
8a2f1b91Sahrens	dnode_t *dn = txh->txh_dnode;
8a2f1b91Sahrens	int err = 0;
fa9e4066Sahrens
fa9e4066Sahrens	if (len == 0)
fa9e4066Sahrens		return;
fa9e4066Sahrens
*e914ace2STim Schumacher	(void) zfs_refcount_add_many(&txh->txh_space_towrite, len, FTAG);
4a7f2a75SMark Maybee
*e914ace2STim Schumacher	if (zfs_refcount_count(&txh->txh_space_towrite) > 2 * DMU_MAX_ACCESS)
61e255ceSMatthew Ahrens		err = SET_ERROR(EFBIG);
ea8dc4b6Seschrock
61e255ceSMatthew Ahrens	if (dn == NULL)
61e255ceSMatthew Ahrens		return;
ea8dc4b6Seschrock
61e255ceSMatthew Ahrens	/*
61e255ceSMatthew Ahrens	 * For i/o error checking, read the blocks that will be needed
61e255ceSMatthew Ahrens	 * to perform the write: the first and last level-0 blocks (if
61e255ceSMatthew Ahrens	 * they are not aligned, i.e. if they are partial-block writes),
61e255ceSMatthew Ahrens	 * and all the level-1 blocks.
61e255ceSMatthew Ahrens	 */
61e255ceSMatthew Ahrens	if (dn->dn_maxblkid == 0) {
61e255ceSMatthew Ahrens		if (off < dn->dn_datablksz &&
61e255ceSMatthew Ahrens		    (off > 0 || len < dn->dn_datablksz)) {
61e255ceSMatthew Ahrens			err = dmu_tx_check_ioerr(NULL, dn, 0, 0);
61e255ceSMatthew Ahrens			if (err != 0) {
61e255ceSMatthew Ahrens				txh->txh_tx->tx_err = err;
ea8dc4b6Seschrock			}
4a7f2a75SMark Maybee		}
61e255ceSMatthew Ahrens	} else {
61e255ceSMatthew Ahrens		zio_t *zio = zio_root(dn->dn_objset->os_spa,
61e255ceSMatthew Ahrens		    NULL, NULL, ZIO_FLAG_CANFAIL);
4a7f2a75SMark Maybee
61e255ceSMatthew Ahrens		/* first level-0 block */
61e255ceSMatthew Ahrens		uint64_t start = off >> dn->dn_datablkshift;
61e255ceSMatthew Ahrens		if (P2PHASE(off, dn->dn_datablksz) || len < dn->dn_datablksz) {
61e255ceSMatthew Ahrens			err = dmu_tx_check_ioerr(zio, dn, 0, start);
61e255ceSMatthew Ahrens			if (err != 0) {
61e255ceSMatthew Ahrens				txh->txh_tx->tx_err = err;
61e255ceSMatthew Ahrens			}
b24ab676SJeff Bonwick		}
4a7f2a75SMark Maybee
61e255ceSMatthew Ahrens		/* last level-0 block */
61e255ceSMatthew Ahrens		uint64_t end = (off + len - 1) >> dn->dn_datablkshift;
61e255ceSMatthew Ahrens		if (end != start && end <= dn->dn_maxblkid &&
61e255ceSMatthew Ahrens		    P2PHASE(off + len, dn->dn_datablksz)) {
61e255ceSMatthew Ahrens			err = dmu_tx_check_ioerr(zio, dn, 0, end);
61e255ceSMatthew Ahrens			if (err != 0) {
01025c89SJohn Harres				txh->txh_tx->tx_err = err;
01025c89SJohn Harres			}
61e255ceSMatthew Ahrens		}
01025c89SJohn Harres
61e255ceSMatthew Ahrens		/* level-1 blocks */
61e255ceSMatthew Ahrens		if (dn->dn_nlevels > 1) {
61e255ceSMatthew Ahrens			int shft = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
61e255ceSMatthew Ahrens			for (uint64_t i = (start >> shft) + 1;
61e255ceSMatthew Ahrens			    i < end >> shft; i++) {
61e255ceSMatthew Ahrens				err = dmu_tx_check_ioerr(zio, dn, 1, i);
61e255ceSMatthew Ahrens				if (err != 0) {
61e255ceSMatthew Ahrens					txh->txh_tx->tx_err = err;
0c779ad4SMatthew Ahrens				}
4a7f2a75SMark Maybee			}
4a7f2a75SMark Maybee		}
fa9e4066Sahrens
61e255ceSMatthew Ahrens		err = zio_wait(zio);
61e255ceSMatthew Ahrens		if (err != 0) {
61e255ceSMatthew Ahrens			txh->txh_tx->tx_err = err;
4a7f2a75SMark Maybee		}
fa9e4066Sahrens	}
fa9e4066Sahrens}
fa9e4066Sahrens
fa9e4066Sahrensstatic void
8a2f1b91Sahrensdmu_tx_count_dnode(dmu_tx_hold_t *txh)
fa9e4066Sahrens{
*e914ace2STim Schumacher	(void) zfs_refcount_add_many(&txh->txh_space_towrite, DNODE_MIN_SIZE,
*e914ace2STim Schumacher	    FTAG);
fa9e4066Sahrens}
fa9e4066Sahrens
/*
 * Declare that the transaction will write "len" bytes at offset "off" of
 * "object".  Must be called before the transaction is assigned to a txg.
 * If the object cannot be held, the error is left in tx->tx_err and
 * nothing is counted.
 */
void
dmu_tx_hold_write(dmu_tx_t *tx, uint64_t object, uint64_t off, int len)
{
	dmu_tx_hold_t *hold;

	ASSERT0(tx->tx_txg);
	ASSERT3U(len, <=, DMU_MAX_ACCESS);
	ASSERT(len == 0 || UINT64_MAX - off >= len - 1);

	hold = dmu_tx_hold_object_impl(tx, tx->tx_objset, object,
	    THT_WRITE, off, len);
	if (hold == NULL)
		return;

	dmu_tx_count_write(hold, off, len);
	dmu_tx_count_dnode(hold);
}
8a2f1b91Sahrens
5cabbc6bSPrashanth Sreenivasavoid
5cabbc6bSPrashanth Sreenivasadmu_tx_hold_remap_l1indirect(dmu_tx_t *tx, uint64_t object)
5cabbc6bSPrashanth Sreenivasa{
5cabbc6bSPrashanth Sreenivasa	dmu_tx_hold_t *txh;
5cabbc6bSPrashanth Sreenivasa
5cabbc6bSPrashanth Sreenivasa	ASSERT(tx->tx_txg == 0);
5cabbc6bSPrashanth Sreenivasa	txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
5cabbc6bSPrashanth Sreenivasa	    object, THT_WRITE, 0, 0);
5cabbc6bSPrashanth Sreenivasa	if (txh == NULL)
5cabbc6bSPrashanth Sreenivasa		return;
5cabbc6bSPrashanth Sreenivasa
5cabbc6bSPrashanth Sreenivasa	dnode_t *dn = txh->txh_dnode;
*e914ace2STim Schumacher	(void) zfs_refcount_add_many(&txh->txh_space_towrite,
5cabbc6bSPrashanth Sreenivasa	    1ULL << dn->dn_indblkshift, FTAG);
5cabbc6bSPrashanth Sreenivasa	dmu_tx_count_dnode(txh);
5cabbc6bSPrashanth Sreenivasa}
5cabbc6bSPrashanth Sreenivasa
/*
 * Same as dmu_tx_hold_write(), but for a caller that already has the
 * dnode in hand, so no object lookup is done.
 */
void
dmu_tx_hold_write_by_dnode(dmu_tx_t *tx, dnode_t *dn, uint64_t off, int len)
{
	dmu_tx_hold_t *hold;

	ASSERT0(tx->tx_txg);
	ASSERT3U(len, <=, DMU_MAX_ACCESS);
	ASSERT(len == 0 || UINT64_MAX - off >= len - 1);

	hold = dmu_tx_hold_dnode_impl(tx, dn, THT_WRITE, off, len);
	if (hold == NULL)
		return;

	dmu_tx_count_write(hold, off, len);
	dmu_tx_count_dnode(hold);
}
fa9e4066Sahrens
4bb73804SMatthew Ahrens/*
4bb73804SMatthew Ahrens * This function marks the transaction as being a "net free".  The end
4bb73804SMatthew Ahrens * result is that refquotas will be disabled for this transaction, and
4bb73804SMatthew Ahrens * this transaction will be able to use half of the pool space overhead
4bb73804SMatthew Ahrens * (see dsl_pool_adjustedsize()).  Therefore this function should only
4bb73804SMatthew Ahrens * be called for transactions that we expect will not cause a net increase
4bb73804SMatthew Ahrens * in the amount of space used (but it's OK if that is occasionally not true).
4bb73804SMatthew Ahrens */
4bb73804SMatthew Ahrensvoid
4bb73804SMatthew Ahrensdmu_tx_mark_netfree(dmu_tx_t *tx)
4bb73804SMatthew Ahrens{
61e255ceSMatthew Ahrens	tx->tx_netfree = B_TRUE;
4bb73804SMatthew Ahrens}
4bb73804SMatthew Ahrens
/*
 * Do the accounting and i/o-error pre-reads for freeing the byte range
 * [off, off + len) of the dnode attached to hold "txh".  "len" may be
 * DMU_OBJECT_END to mean "through the end of the object".  Any error is
 * recorded in the transaction's tx_err.
 */
static void
dmu_tx_hold_free_impl(dmu_tx_hold_t *txh, uint64_t off, uint64_t len)
{
	dmu_tx_t *tx = txh->txh_tx;
	dnode_t *dn = txh->txh_dnode;
	zio_t *zio;
	int err;

	ASSERT(tx->tx_txg == 0);

	dmu_tx_count_dnode(txh);

	/* Nothing more to do if the range starts past the last block. */
	if (off >= (dn->dn_maxblkid + 1) * dn->dn_datablksz)
		return;
	if (len == DMU_OBJECT_END)
		len = (dn->dn_maxblkid + 1) * dn->dn_datablksz - off;

	/*
	 * For i/o error checking, we read the first and last level-0
	 * blocks if they are not aligned, and all the level-1 blocks.
	 *
	 * Note:  dbuf_free_range() assumes that we have not instantiated
	 * any level-0 dbufs that will be completely freed.  Therefore we must
	 * exercise care to not read or count the first and last blocks
	 * if they are blocksize-aligned.
	 */
	if (dn->dn_datablkshift != 0) {
		/* first block will be modified if it is not aligned */
		if (!IS_P2ALIGNED(off, 1 << dn->dn_datablkshift))
			dmu_tx_count_write(txh, off, 1);
		/* last block will be modified if it is not aligned */
		if (!IS_P2ALIGNED(off + len, 1 << dn->dn_datablkshift))
			dmu_tx_count_write(txh, off + len, 1);
	} else if (off != 0 || len < dn->dn_datablksz) {
		dmu_tx_count_write(txh, 0, dn->dn_datablksz);
	}

	/*
	 * Check level-1 blocks; there are none without indirection.
	 */
	if (dn->dn_nlevels <= 1)
		return;

	int shift = dn->dn_datablkshift + dn->dn_indblkshift -
	    SPA_BLKPTRSHIFT;
	uint64_t first = off >> shift;
	uint64_t last = (off + len) >> shift;

	ASSERT(dn->dn_indblkshift != 0);

	/*
	 * dnode_reallocate() can result in an object with indirect
	 * blocks having an odd data block size.  In this case,
	 * just check the single block.
	 */
	if (dn->dn_datablkshift == 0) {
		first = 0;
		last = 0;
	}

	zio = zio_root(tx->tx_pool->dp_spa, NULL, NULL, ZIO_FLAG_CANFAIL);

	uint64_t blk = first;
	while (blk <= last) {
		uint64_t ibyte = blk << shift;

		/* Skip ahead to the next level-1 block reported by the dnode. */
		err = dnode_next_offset(dn, 0, &ibyte, 2, 1, 0);
		blk = ibyte >> shift;
		if (err == ESRCH || blk > last)
			break;
		if (err != 0)
			goto fail;

		(void) zfs_refcount_add_many(&txh->txh_memory_tohold,
		    1 << dn->dn_indblkshift, FTAG);

		err = dmu_tx_check_ioerr(zio, dn, 1, blk);
		if (err != 0)
			goto fail;

		blk++;
	}

	err = zio_wait(zio);
	if (err != 0)
		tx->tx_err = err;
	return;

fail:
	/* Record the first failure, then reap the root zio. */
	tx->tx_err = err;
	(void) zio_wait(zio);
}
fa9e4066Sahrens
/*
 * Declare that the transaction will free [off, off + len) of "object".
 * If the object cannot be held, the error is left in tx->tx_err.
 */
void
dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off, uint64_t len)
{
	dmu_tx_hold_t *hold;

	hold = dmu_tx_hold_object_impl(tx, tx->tx_objset, object,
	    THT_FREE, off, len);
	if (hold == NULL)
		return;

	dmu_tx_hold_free_impl(hold, off, len);
}
b0c42cd4Sbzzz
/*
 * Same as dmu_tx_hold_free(), but for a caller that already has the dnode.
 */
void
dmu_tx_hold_free_by_dnode(dmu_tx_t *tx, dnode_t *dn, uint64_t off, uint64_t len)
{
	dmu_tx_hold_t *hold;

	hold = dmu_tx_hold_dnode_impl(tx, dn, THT_FREE, off, len);
	if (hold == NULL)
		return;

	dmu_tx_hold_free_impl(hold, off, len);
}
b0c42cd4Sbzzz
b0c42cd4Sbzzzstatic void
411be58aSMatthew Ahrensdmu_tx_hold_zap_impl(dmu_tx_hold_t *txh, const char *name)
fa9e4066Sahrens{
b0c42cd4Sbzzz	dmu_tx_t *tx = txh->txh_tx;
b0c42cd4Sbzzz	dnode_t *dn;
0c779ad4SMatthew Ahrens	int err;
fa9e4066Sahrens
8a2f1b91Sahrens	ASSERT(tx->tx_txg == 0);
8a2f1b91Sahrens
b0c42cd4Sbzzz	dn = txh->txh_dnode;
8a2f1b91Sahrens
8a2f1b91Sahrens	dmu_tx_count_dnode(txh);
fa9e4066Sahrens
61e255ceSMatthew Ahrens	/*
61e255ceSMatthew Ahrens	 * Modifying a almost-full microzap is around the worst case (128KB)
61e255ceSMatthew Ahrens	 *
61e255ceSMatthew Ahrens	 * If it is a fat zap, the worst case would be 7*16KB=112KB:
61e255ceSMatthew Ahrens	 * - 3 blocks overwritten: target leaf, ptrtbl block, header block
61e255ceSMatthew Ahrens	 * - 4 new blocks written if adding:
61e255ceSMatthew Ahrens	 *    - 2 blocks for possibly split leaves,
61e255ceSMatthew Ahrens	 *    - 2 grown ptrtbl blocks
61e255ceSMatthew Ahrens	 */
*e914ace2STim Schumacher	(void) zfs_refcount_add_many(&txh->txh_space_towrite,
61e255ceSMatthew Ahrens	    MZAP_MAX_BLKSZ, FTAG);
61e255ceSMatthew Ahrens
61e255ceSMatthew Ahrens	if (dn == NULL)
fa9e4066Sahrens		return;
fa9e4066Sahrens
ad135b5dSChristopher Siden	ASSERT3P(DMU_OT_BYTESWAP(dn->dn_type), ==, DMU_BSWAP_ZAP);
fa9e4066Sahrens
61e255ceSMatthew Ahrens	if (dn->dn_maxblkid == 0 || name == NULL) {
fa9e4066Sahrens		/*
61e255ceSMatthew Ahrens		 * This is a microzap (only one block), or we don't know
61e255ceSMatthew Ahrens		 * the name.  Check the first block for i/o errors.
fa9e4066Sahrens		 */
ea8dc4b6Seschrock		err = dmu_tx_check_ioerr(NULL, dn, 0, 0);
61e255ceSMatthew Ahrens		if (err != 0) {
ea8dc4b6Seschrock			tx->tx_err = err;
0c779ad4SMatthew Ahrens		}
61e255ceSMatthew Ahrens	} else {
ea8dc4b6Seschrock		/*
61e255ceSMatthew Ahrens		 * Access the name so that we'll check for i/o errors to
61e255ceSMatthew Ahrens		 * the leaf blocks, etc.  We ignore ENOENT, as this name
61e255ceSMatthew Ahrens		 * may not yet exist.
ea8dc4b6Seschrock		 */
79d72832SMatthew Ahrens		err = zap_lookup_by_dnode(dn, name, 8, 0, NULL);
61e255ceSMatthew Ahrens		if (err == EIO || err == ECKSUM || err == ENXIO) {
ea8dc4b6Seschrock			tx->tx_err = err;
0c779ad4SMatthew Ahrens		}
0c779ad4SMatthew Ahrens	}
fa9e4066Sahrens}
fa9e4066Sahrens
/*
 * Declare that the transaction will modify ZAP object "object".  "add" and
 * the "name" pointer are stored in the hold record as its two arguments.
 * If the object cannot be held, the error is left in tx->tx_err.
 */
void
dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, const char *name)
{
	dmu_tx_hold_t *hold;

	ASSERT0(tx->tx_txg);

	hold = dmu_tx_hold_object_impl(tx, tx->tx_objset, object,
	    THT_ZAP, add, (uintptr_t)name);
	if (hold == NULL)
		return;

	dmu_tx_hold_zap_impl(hold, name);
}
b0c42cd4Sbzzz
/*
 * Same as dmu_tx_hold_zap(), but for a caller that already has the dnode.
 * "dn" must not be NULL.
 */
void
dmu_tx_hold_zap_by_dnode(dmu_tx_t *tx, dnode_t *dn, int add, const char *name)
{
	dmu_tx_hold_t *hold;

	ASSERT0(tx->tx_txg);
	ASSERT(dn != NULL);

	hold = dmu_tx_hold_dnode_impl(tx, dn, THT_ZAP, add, (uintptr_t)name);
	if (hold == NULL)
		return;

	dmu_tx_hold_zap_impl(hold, name);
}
b0c42cd4Sbzzz
/*
 * Declare that the transaction will modify the bonus buffer of "object".
 * Only the dnode itself is charged.  If the object cannot be held, the
 * error is left in tx->tx_err.
 */
void
dmu_tx_hold_bonus(dmu_tx_t *tx, uint64_t object)
{
	dmu_tx_hold_t *hold;

	ASSERT(tx->tx_txg == 0);

	hold = dmu_tx_hold_object_impl(tx, tx->tx_objset, object,
	    THT_BONUS, 0, 0);
	if (hold != NULL) {
		dmu_tx_count_dnode(hold);
	}
}
fa9e4066Sahrens
/*
 * Same as dmu_tx_hold_bonus(), but for a caller that already has the dnode.
 */
void
dmu_tx_hold_bonus_by_dnode(dmu_tx_t *tx, dnode_t *dn)
{
	dmu_tx_hold_t *hold;

	ASSERT0(tx->tx_txg);

	hold = dmu_tx_hold_dnode_impl(tx, dn, THT_BONUS, 0, 0);
	if (hold != NULL) {
		dmu_tx_count_dnode(hold);
	}
}
b0c42cd4Sbzzz
/*
 * Charge "space" bytes of write space to the transaction without tying
 * them to an existing object.
 *
 * The hold is created with DMU_NEW_OBJECT, a path on which
 * dmu_tx_hold_object_impl() performs no dnode lookup and so does not
 * return NULL.
 */
void
dmu_tx_hold_space(dmu_tx_t *tx, uint64_t space)
{
	dmu_tx_hold_t *hold;

	ASSERT(tx->tx_txg == 0);

	hold = dmu_tx_hold_object_impl(tx, tx->tx_objset, DMU_NEW_OBJECT,
	    THT_SPACE, space, 0);

	(void) zfs_refcount_add_many(&hold->txh_space_towrite, space, FTAG);
}
fa9e4066Sahrens
#ifdef ZFS_DEBUG
/*
 * Debug-only check that dirtying dbuf "db" is covered by a hold in the
 * assigned transaction "tx".
 *
 * The hold list is scanned for a hold that matches the dbuf's dnode and a
 * hold whose type/range covers the dbuf's block id.  If both are found the
 * function returns; otherwise it panics.  Transactions with tx_anyobj set
 * and dbufs of the meta dnode are not checked.
 */
void
dmu_tx_dirty_buf(dmu_tx_t *tx, dmu_buf_impl_t *db)
{
	boolean_t match_object = B_FALSE;
	boolean_t match_offset = B_FALSE;

	DB_DNODE_ENTER(db);
	dnode_t *dn = DB_DNODE(db);
	ASSERT(tx->tx_txg != 0);
	ASSERT(tx->tx_objset == NULL || dn->dn_objset == tx->tx_objset);
	ASSERT3U(dn->dn_object, ==, db->db.db_object);

	if (tx->tx_anyobj) {
		DB_DNODE_EXIT(db);
		return;
	}

	/* XXX No checking on the meta dnode for now */
	if (db->db.db_object == DMU_META_DNODE_OBJECT) {
		DB_DNODE_EXIT(db);
		return;
	}

	for (dmu_tx_hold_t *txh = list_head(&tx->tx_holds); txh != NULL;
	    txh = list_next(&tx->tx_holds, txh)) {
		ASSERT(dn == NULL || dn->dn_assigned_txg == tx->tx_txg);
		if (txh->txh_dnode == dn && txh->txh_type != THT_NEWOBJECT)
			match_object = B_TRUE;
		if (txh->txh_dnode == NULL || txh->txh_dnode == dn) {
			/*
			 * Convert the hold's byte range (arg1 = offset,
			 * arg2 = length) into block ids at the dbuf's level.
			 */
			int datablkshift = dn->dn_datablkshift ?
			    dn->dn_datablkshift : SPA_MAXBLOCKSHIFT;
			int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
			int shift = datablkshift + epbs * db->db_level;
			uint64_t beginblk = shift >= 64 ? 0 :
			    (txh->txh_arg1 >> shift);
			uint64_t endblk = shift >= 64 ? 0 :
			    ((txh->txh_arg1 + txh->txh_arg2 - 1) >> shift);
			uint64_t blkid = db->db_blkid;

			/* XXX txh_arg2 better not be zero... */

			/* Cast so the arguments match the %llx conversions. */
			dprintf("found txh type %x beginblk=%llx endblk=%llx\n",
			    txh->txh_type, (u_longlong_t)beginblk,
			    (u_longlong_t)endblk);

			switch (txh->txh_type) {
			case THT_WRITE:
				if (blkid >= beginblk && blkid <= endblk)
					match_offset = B_TRUE;
				/*
				 * We will let this hold work for the bonus
				 * or spill buffer so that we don't need to
				 * hold it when creating a new object.
				 */
				if (blkid == DMU_BONUS_BLKID ||
				    blkid == DMU_SPILL_BLKID)
					match_offset = B_TRUE;
				/*
				 * They might have to increase nlevels,
				 * thus dirtying the new TLIBs.  Or they
				 * might have to change the block size,
				 * thus dirtying the new lvl=0 blk=0.
				 */
				if (blkid == 0)
					match_offset = B_TRUE;
				break;
			case THT_FREE:
				/*
				 * We will dirty all the level 1 blocks in
				 * the free range and perhaps the first and
				 * last level 0 block.
				 */
				if (blkid >= beginblk && (blkid <= endblk ||
				    txh->txh_arg2 == DMU_OBJECT_END))
					match_offset = B_TRUE;
				break;
			case THT_SPILL:
				if (blkid == DMU_SPILL_BLKID)
					match_offset = B_TRUE;
				break;
			case THT_BONUS:
				if (blkid == DMU_BONUS_BLKID)
					match_offset = B_TRUE;
				break;
			case THT_ZAP:
				/* A ZAP hold covers every block of the object. */
				match_offset = B_TRUE;
				break;
			case THT_NEWOBJECT:
				match_object = B_TRUE;
				break;
			default:
				ASSERT(!"bad txh_type");
			}
		}
		if (match_object && match_offset) {
			DB_DNODE_EXIT(db);
			return;
		}
	}
	DB_DNODE_EXIT(db);
	panic("dirtying dbuf obj=%llx lvl=%u blkid=%llx but not tx_held\n",
	    (u_longlong_t)db->db.db_object, db->db_level,
	    (u_longlong_t)db->db_blkid);
}
#endif
fa9e4066Sahrens
69962b56SMatthew Ahrens/*
69962b56SMatthew Ahrens * If we can't do 10 iops, something is wrong.  Let us go ahead
69962b56SMatthew Ahrens * and hit zfs_dirty_data_max.
69962b56SMatthew Ahrens */
69962b56SMatthew Ahrenshrtime_t zfs_delay_max_ns = MSEC2NSEC(100);
69962b56SMatthew Ahrensint zfs_delay_resolution_ns = 100 * 1000; /* 100 microseconds */
69962b56SMatthew Ahrens
69962b56SMatthew Ahrens/*
69962b56SMatthew Ahrens * We delay transactions when we've determined that the backend storage
69962b56SMatthew Ahrens * isn't able to accommodate the rate of incoming writes.
69962b56SMatthew Ahrens *
69962b56SMatthew Ahrens * If there is already a transaction waiting, we delay relative to when
69962b56SMatthew Ahrens * that transaction finishes waiting.  This way the calculated min_time
69962b56SMatthew Ahrens * is independent of the number of threads concurrently executing
69962b56SMatthew Ahrens * transactions.
69962b56SMatthew Ahrens *
69962b56SMatthew Ahrens * If we are the only waiter, wait relative to when the transaction
69962b56SMatthew Ahrens * started, rather than the current time.  This credits the transaction for
69962b56SMatthew Ahrens * "time already served", e.g. reading indirect blocks.
69962b56SMatthew Ahrens *
69962b56SMatthew Ahrens * The minimum time for a transaction to take is calculated as:
69962b56SMatthew Ahrens *     min_time = scale * (dirty - min) / (max - dirty)
69962b56SMatthew Ahrens *     min_time is then capped at zfs_delay_max_ns.
69962b56SMatthew Ahrens *
69962b56SMatthew Ahrens * The delay has two degrees of freedom that can be adjusted via tunables.
69962b56SMatthew Ahrens * The percentage of dirty data at which we start to delay is defined by
69962b56SMatthew Ahrens * zfs_delay_min_dirty_percent. This should typically be at or above
69962b56SMatthew Ahrens * zfs_vdev_async_write_active_max_dirty_percent so that we only start to
69962b56SMatthew Ahrens * delay after writing at full speed has failed to keep up with the incoming
69962b56SMatthew Ahrens * write rate. The scale of the curve is defined by zfs_delay_scale. Roughly
69962b56SMatthew Ahrens * speaking, this variable determines the amount of delay at the midpoint of
69962b56SMatthew Ahrens * the curve.
69962b56SMatthew Ahrens *
69962b56SMatthew Ahrens * delay
69962b56SMatthew Ahrens *  10ms +-------------------------------------------------------------*+
69962b56SMatthew Ahrens *       |                                                             *|
69962b56SMatthew Ahrens *   9ms +                                                             *+
69962b56SMatthew Ahrens *       |                                                             *|
69962b56SMatthew Ahrens *   8ms +                                                             *+
69962b56SMatthew Ahrens *       |                                                            * |
69962b56SMatthew Ahrens *   7ms +                                                            * +
69962b56SMatthew Ahrens *       |                                                            * |
69962b56SMatthew Ahrens *   6ms +                                                            * +
69962b56SMatthew Ahrens *       |                                                            * |
69962b56SMatthew Ahrens *   5ms +                                                           *  +
69962b56SMatthew Ahrens *       |                                                           *  |
69962b56SMatthew Ahrens *   4ms +                                                           *  +
69962b56SMatthew Ahrens *       |                                                           *  |
69962b56SMatthew Ahrens *   3ms +                                                          *   +
69962b56SMatthew Ahrens *       |                                                          *   |
69962b56SMatthew Ahrens *   2ms +                                              (midpoint) *    +
69962b56SMatthew Ahrens *       |                                                  |    **     |
69962b56SMatthew Ahrens *   1ms +                                                  v ***       +
69962b56SMatthew Ahrens *       |             zfs_delay_scale ---------->     ********         |
69962b56SMatthew Ahrens *     0 +-------------------------------------*********----------------+
69962b56SMatthew Ahrens *       0%                    <- zfs_dirty_data_max ->               100%
69962b56SMatthew Ahrens *
69962b56SMatthew Ahrens * Note that since the delay is added to the outstanding time remaining on the
69962b56SMatthew Ahrens * most recent transaction, the delay is effectively the inverse of IOPS.
69962b56SMatthew Ahrens * Here the midpoint of 500us translates to 2000 IOPS. The shape of the curve
69962b56SMatthew Ahrens * was chosen such that small changes in the amount of accumulated dirty data
69962b56SMatthew Ahrens * in the first 3/4 of the curve yield relatively small differences in the
69962b56SMatthew Ahrens * amount of delay.
69962b56SMatthew Ahrens *
69962b56SMatthew Ahrens * The effects can be easier to understand when the amount of delay is
69962b56SMatthew Ahrens * represented on a log scale:
69962b56SMatthew Ahrens *
69962b56SMatthew Ahrens * delay
69962b56SMatthew Ahrens * 100ms +-------------------------------------------------------------++
69962b56SMatthew Ahrens *       +                                                              +
69962b56SMatthew Ahrens *       |                                                              |
69962b56SMatthew Ahrens *       +                                                             *+
69962b56SMatthew Ahrens *  10ms +                                                             *+
69962b56SMatthew Ahrens *       +                                                           ** +
69962b56SMatthew Ahrens *       |                                              (midpoint)  **  |
69962b56SMatthew Ahrens *       +                                                  |     **    +
69962b56SMatthew Ahrens *   1ms +                                                  v ****      +
69962b56SMatthew Ahrens *       +             zfs_delay_scale ---------->        *****         +
69962b56SMatthew Ahrens *       |                                             ****             |
69962b56SMatthew Ahrens *       +                                          ****                +
69962b56SMatthew Ahrens * 100us +                                        **                    +
69962b56SMatthew Ahrens *       +                                       *                      +
69962b56SMatthew Ahrens *       |                                      *                       |
69962b56SMatthew Ahrens *       +                                     *                        +
69962b56SMatthew Ahrens *  10us +                                     *                        +
69962b56SMatthew Ahrens *       +                                                              +
69962b56SMatthew Ahrens *       |                                                              |
69962b56SMatthew Ahrens *       +                                                              +
69962b56SMatthew Ahrens *       +--------------------------------------------------------------+
69962b56SMatthew Ahrens *       0%                    <- zfs_dirty_data_max ->               100%
69962b56SMatthew Ahrens *
69962b56SMatthew Ahrens * Note here that only as the amount of dirty data approaches its limit does
69962b56SMatthew Ahrens * the delay start to increase rapidly. The goal of a properly tuned system
69962b56SMatthew Ahrens * should be to keep the amount of dirty data out of that range by first
69962b56SMatthew Ahrens * ensuring that the appropriate limits are set for the I/O scheduler to reach
69962b56SMatthew Ahrens * optimal throughput on the backend storage, and then by changing the value
69962b56SMatthew Ahrens * of zfs_delay_scale to increase the steepness of the curve.
69962b56SMatthew Ahrens */
69962b56SMatthew Ahrensstatic void
69962b56SMatthew Ahrensdmu_tx_delay(dmu_tx_t *tx, uint64_t dirty)
69962b56SMatthew Ahrens{
69962b56SMatthew Ahrens	dsl_pool_t *dp = tx->tx_pool;
69962b56SMatthew Ahrens	uint64_t delay_min_bytes =
69962b56SMatthew Ahrens	    zfs_dirty_data_max * zfs_delay_min_dirty_percent / 100;
69962b56SMatthew Ahrens	hrtime_t wakeup, min_tx_time, now;
69962b56SMatthew Ahrens
69962b56SMatthew Ahrens	if (dirty <= delay_min_bytes)
69962b56SMatthew Ahrens		return;
69962b56SMatthew Ahrens
69962b56SMatthew Ahrens	/*
69962b56SMatthew Ahrens	 * The caller has already waited until we are under the max.
69962b56SMatthew Ahrens	 * We make them pass us the amount of dirty data so we don't
69962b56SMatthew Ahrens	 * have to handle the case of it being >= the max, which could
69962b56SMatthew Ahrens	 * cause a divide-by-zero if it's == the max.
69962b56SMatthew Ahrens	 */
69962b56SMatthew Ahrens	ASSERT3U(dirty, <, zfs_dirty_data_max);
69962b56SMatthew Ahrens
69962b56SMatthew Ahrens	now = gethrtime();
69962b56SMatthew Ahrens	min_tx_time = zfs_delay_scale *
69962b56SMatthew Ahrens	    (dirty - delay_min_bytes) / (zfs_dirty_data_max - dirty);
69962b56SMatthew Ahrens	if (now > tx->tx_start + min_tx_time)
69962b56SMatthew Ahrens		return;
69962b56SMatthew Ahrens
69962b56SMatthew Ahrens	min_tx_time = MIN(min_tx_time, zfs_delay_max_ns);
69962b56SMatthew Ahrens
69962b56SMatthew Ahrens	DTRACE_PROBE3(delay__mintime, dmu_tx_t *, tx, uint64_t, dirty,
69962b56SMatthew Ahrens	    uint64_t, min_tx_time);
69962b56SMatthew Ahrens
69962b56SMatthew Ahrens	mutex_enter(&dp->dp_lock);
69962b56SMatthew Ahrens	wakeup = MAX(tx->tx_start + min_tx_time,
69962b56SMatthew Ahrens	    dp->dp_last_wakeup + min_tx_time);
69962b56SMatthew Ahrens	dp->dp_last_wakeup = wakeup;
69962b56SMatthew Ahrens	mutex_exit(&dp->dp_lock);
69962b56SMatthew Ahrens
69962b56SMatthew Ahrens#ifdef _KERNEL
69962b56SMatthew Ahrens	mutex_enter(&curthread->t_delay_lock);
69962b56SMatthew Ahrens	while (cv_timedwait_hires(&curthread->t_delay_cv,
69962b56SMatthew Ahrens	    &curthread->t_delay_lock, wakeup, zfs_delay_resolution_ns,
69962b56SMatthew Ahrens	    CALLOUT_FLAG_ABSOLUTE | CALLOUT_FLAG_ROUNDUP) > 0)
69962b56SMatthew Ahrens		continue;
69962b56SMatthew Ahrens	mutex_exit(&curthread->t_delay_lock);
69962b56SMatthew Ahrens#else
69962b56SMatthew Ahrens	hrtime_t delta = wakeup - gethrtime();
69962b56SMatthew Ahrens	struct timespec ts;
69962b56SMatthew Ahrens	ts.tv_sec = delta / NANOSEC;
69962b56SMatthew Ahrens	ts.tv_nsec = delta % NANOSEC;
69962b56SMatthew Ahrens	(void) nanosleep(&ts, NULL);
69962b56SMatthew Ahrens#endif
69962b56SMatthew Ahrens}
69962b56SMatthew Ahrens
61e255ceSMatthew Ahrens/*
61e255ceSMatthew Ahrens * This routine attempts to assign the transaction to a transaction group.
61e255ceSMatthew Ahrens * To do so, we must determine if there is sufficient free space on disk.
61e255ceSMatthew Ahrens *
61e255ceSMatthew Ahrens * If this is a "netfree" transaction (i.e. we called dmu_tx_mark_netfree()
61e255ceSMatthew Ahrens * on it), then it is assumed that there is sufficient free space,
61e255ceSMatthew Ahrens * unless there's insufficient slop space in the pool (see the comment
61e255ceSMatthew Ahrens * above spa_slop_shift in spa_misc.c).
61e255ceSMatthew Ahrens *
61e255ceSMatthew Ahrens * If it is not a "netfree" transaction, then if the data already on disk
61e255ceSMatthew Ahrens * is over the allowed usage (e.g. quota), this will fail with EDQUOT or
61e255ceSMatthew Ahrens * ENOSPC.  Otherwise, if the current rough estimate of pending changes,
61e255ceSMatthew Ahrens * plus the rough estimate of this transaction's changes, may exceed the
61e255ceSMatthew Ahrens * allowed usage, then this will fail with ERESTART, which will cause the
61e255ceSMatthew Ahrens * caller to wait for the pending changes to be written to disk (by waiting
61e255ceSMatthew Ahrens * for the next TXG to open), and then check the space usage again.
61e255ceSMatthew Ahrens *
61e255ceSMatthew Ahrens * The rough estimate of pending changes is comprised of the sum of:
61e255ceSMatthew Ahrens *
61e255ceSMatthew Ahrens *  - this transaction's holds' txh_space_towrite
61e255ceSMatthew Ahrens *
61e255ceSMatthew Ahrens *  - dd_tempreserved[], which is the sum of in-flight transactions'
61e255ceSMatthew Ahrens *    holds' txh_space_towrite (i.e. those transactions that have called
61e255ceSMatthew Ahrens *    dmu_tx_assign() but not yet called dmu_tx_commit()).
61e255ceSMatthew Ahrens *
61e255ceSMatthew Ahrens *  - dd_space_towrite[], which is the amount of dirtied dbufs.
61e255ceSMatthew Ahrens *
61e255ceSMatthew Ahrens * Note that all of these values are inflated by spa_get_worst_case_asize(),
61e255ceSMatthew Ahrens * which means that we may get ERESTART well before we are actually in danger
61e255ceSMatthew Ahrens * of running out of space, but this also mitigates any small inaccuracies
61e255ceSMatthew Ahrens * in the rough estimate (e.g. txh_space_towrite doesn't take into account
61e255ceSMatthew Ahrens * indirect blocks, and dd_space_towrite[] doesn't take into account changes
61e255ceSMatthew Ahrens * to the MOS).
61e255ceSMatthew Ahrens *
61e255ceSMatthew Ahrens * Note that due to this algorithm, it is possible to exceed the allowed
61e255ceSMatthew Ahrens * usage by one transaction.  Also, as we approach the allowed usage,
61e255ceSMatthew Ahrens * we will allow a very limited amount of changes into each TXG, thus
61e255ceSMatthew Ahrens * decreasing performance.
61e255ceSMatthew Ahrens */
fa9e4066Sahrensstatic int
f864f99eSPrakash Suryadmu_tx_try_assign(dmu_tx_t *tx, uint64_t txg_how)
fa9e4066Sahrens{
0a4e9518Sgw	spa_t *spa = tx->tx_pool->dp_spa;
fa9e4066Sahrens
fb09f5aaSMadhav Suresh	ASSERT0(tx->tx_txg);
0a4e9518Sgw
8a2f1b91Sahrens	if (tx->tx_err)
8a2f1b91Sahrens		return (tx->tx_err);
fa9e4066Sahrens
e14bb325SJeff Bonwick	if (spa_suspended(spa)) {
0a4e9518Sgw		/*
0a4e9518Sgw		 * If the user has indicated a blocking failure mode
0a4e9518Sgw		 * then return ERESTART which will block in dmu_tx_wait().
0a4e9518Sgw		 * Otherwise, return EIO so that an error can get
0a4e9518Sgw		 * propagated back to the VOP calls.
0a4e9518Sgw		 *
0a4e9518Sgw		 * Note that we always honor the txg_how flag regardless
0a4e9518Sgw		 * of the failuremode setting.
0a4e9518Sgw		 */
0a4e9518Sgw		if (spa_get_failmode(spa) == ZIO_FAILURE_MODE_CONTINUE &&
f864f99eSPrakash Surya		    !(txg_how & TXG_WAIT))
be6fd75aSMatthew Ahrens			return (SET_ERROR(EIO));
0a4e9518Sgw
be6fd75aSMatthew Ahrens		return (SET_ERROR(ERESTART));
0a4e9518Sgw	}
0a4e9518Sgw
f864f99eSPrakash Surya	if (!tx->tx_dirty_delayed &&
69962b56SMatthew Ahrens	    dsl_pool_need_dirty_delay(tx->tx_pool)) {
69962b56SMatthew Ahrens		tx->tx_wait_dirty = B_TRUE;
69962b56SMatthew Ahrens		return (SET_ERROR(ERESTART));
69962b56SMatthew Ahrens	}
69962b56SMatthew Ahrens
fa9e4066Sahrens	tx->tx_txg = txg_hold_open(tx->tx_pool, &tx->tx_txgh);
8a2f1b91Sahrens	tx->tx_needassign_txh = NULL;
fa9e4066Sahrens
8a2f1b91Sahrens	/*
8a2f1b91Sahrens	 * NB: No error returns are allowed after txg_hold_open, but
8a2f1b91Sahrens	 * before processing the dnode holds, due to the
8a2f1b91Sahrens	 * dmu_tx_unassign() logic.
8a2f1b91Sahrens	 */
fa9e4066Sahrens
61e255ceSMatthew Ahrens	uint64_t towrite = 0;
61e255ceSMatthew Ahrens	uint64_t tohold = 0;
61e255ceSMatthew Ahrens	for (dmu_tx_hold_t *txh = list_head(&tx->tx_holds); txh != NULL;
8a2f1b91Sahrens	    txh = list_next(&tx->tx_holds, txh)) {
8a2f1b91Sahrens		dnode_t *dn = txh->txh_dnode;
fa9e4066Sahrens		if (dn != NULL) {
fa9e4066Sahrens			mutex_enter(&dn->dn_mtx);
8a2f1b91Sahrens			if (dn->dn_assigned_txg == tx->tx_txg - 1) {
8a2f1b91Sahrens				mutex_exit(&dn->dn_mtx);
8a2f1b91Sahrens				tx->tx_needassign_txh = txh;
be6fd75aSMatthew Ahrens				return (SET_ERROR(ERESTART));
fa9e4066Sahrens			}
8a2f1b91Sahrens			if (dn->dn_assigned_txg == 0)
fa9e4066Sahrens				dn->dn_assigned_txg = tx->tx_txg;
8a2f1b91Sahrens			ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg);
*e914ace2STim Schumacher			(void) zfs_refcount_add(&dn->dn_tx_holds, tx);
fa9e4066Sahrens			mutex_exit(&dn->dn_mtx);
fa9e4066Sahrens		}
*e914ace2STim Schumacher		towrite += zfs_refcount_count(&txh->txh_space_towrite);
*e914ace2STim Schumacher		tohold += zfs_refcount_count(&txh->txh_memory_tohold);
fa9e4066Sahrens	}
fa9e4066Sahrens
cdb0ab79Smaybee	/* needed allocation: worst-case estimate of write space */
61e255ceSMatthew Ahrens	uint64_t asize = spa_get_worst_case_asize(tx->tx_pool->dp_spa, towrite);
cdb0ab79Smaybee	/* calculate memory footprint estimate */
61e255ceSMatthew Ahrens	uint64_t memory = towrite + tohold;
fa9e4066Sahrens
61e255ceSMatthew Ahrens	if (tx->tx_dir != NULL && asize != 0) {
cdb0ab79Smaybee		int err = dsl_dir_tempreserve_space(tx->tx_dir, memory,
61e255ceSMatthew Ahrens		    asize, tx->tx_netfree, &tx->tx_tempreserve_cookie, tx);
61e255ceSMatthew Ahrens		if (err != 0)
fa9e4066Sahrens			return (err);
fa9e4066Sahrens	}
fa9e4066Sahrens
fa9e4066Sahrens	return (0);
fa9e4066Sahrens}
fa9e4066Sahrens
8a2f1b91Sahrensstatic void
8a2f1b91Sahrensdmu_tx_unassign(dmu_tx_t *tx)
fa9e4066Sahrens{
8a2f1b91Sahrens	if (tx->tx_txg == 0)
8a2f1b91Sahrens		return;
fa9e4066Sahrens
fa9e4066Sahrens	txg_rele_to_quiesce(&tx->tx_txgh);
fa9e4066Sahrens
3e30c24aSWill Andrews	/*
3e30c24aSWill Andrews	 * Walk the transaction's hold list, removing the hold on the
3e30c24aSWill Andrews	 * associated dnode, and notifying waiters if the refcount drops to 0.
3e30c24aSWill Andrews	 */
61e255ceSMatthew Ahrens	for (dmu_tx_hold_t *txh = list_head(&tx->tx_holds);
61e255ceSMatthew Ahrens	    txh != tx->tx_needassign_txh;
8a2f1b91Sahrens	    txh = list_next(&tx->tx_holds, txh)) {
8a2f1b91Sahrens		dnode_t *dn = txh->txh_dnode;
fa9e4066Sahrens
fa9e4066Sahrens		if (dn == NULL)
fa9e4066Sahrens			continue;
fa9e4066Sahrens		mutex_enter(&dn->dn_mtx);
8a2f1b91Sahrens		ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg);
fa9e4066Sahrens
*e914ace2STim Schumacher		if (zfs_refcount_remove(&dn->dn_tx_holds, tx) == 0) {
fa9e4066Sahrens			dn->dn_assigned_txg = 0;
fa9e4066Sahrens			cv_broadcast(&dn->dn_notxholds);
fa9e4066Sahrens		}
fa9e4066Sahrens		mutex_exit(&dn->dn_mtx);
fa9e4066Sahrens	}
fa9e4066Sahrens
fa9e4066Sahrens	txg_rele_to_sync(&tx->tx_txgh);
fa9e4066Sahrens
8a2f1b91Sahrens	tx->tx_lasttried_txg = tx->tx_txg;
fa9e4066Sahrens	tx->tx_txg = 0;
fa9e4066Sahrens}
fa9e4066Sahrens
fa9e4066Sahrens/*
f864f99eSPrakash Surya * Assign tx to a transaction group; txg_how is a bitmask:
fa9e4066Sahrens *
f864f99eSPrakash Surya * If TXG_WAIT is set and the currently open txg is full, this function
f864f99eSPrakash Surya * will wait until there's a new txg. This should be used when no locks
f864f99eSPrakash Surya * are being held. With this bit set, this function will only fail if
f864f99eSPrakash Surya * we're truly out of space (or over quota).
fa9e4066Sahrens *
f864f99eSPrakash Surya * If TXG_WAIT is *not* set and we can't assign into the currently open
f864f99eSPrakash Surya * txg without blocking, this function will return immediately with
f864f99eSPrakash Surya * ERESTART. This should be used whenever locks are being held.  On an
f864f99eSPrakash Surya * ERESTART error, the caller should drop all locks, call dmu_tx_wait(),
f864f99eSPrakash Surya * and try again.
69962b56SMatthew Ahrens *
f864f99eSPrakash Surya * If TXG_NOTHROTTLE is set, this indicates that this tx should not be
 * delayed due to the ZFS Write Throttle (see comments in dsl_pool.c for
f864f99eSPrakash Surya * details on the throttle). This is used by the VFS operations, after
f864f99eSPrakash Surya * they have already called dmu_tx_wait() (though most likely on a
f864f99eSPrakash Surya * different tx).
fa9e4066Sahrens */
fa9e4066Sahrensint
f864f99eSPrakash Suryadmu_tx_assign(dmu_tx_t *tx, uint64_t txg_how)
fa9e4066Sahrens{
fa9e4066Sahrens	int err;
fa9e4066Sahrens
fa9e4066Sahrens	ASSERT(tx->tx_txg == 0);
f864f99eSPrakash Surya	ASSERT0(txg_how & ~(TXG_WAIT | TXG_NOTHROTTLE));
fa9e4066Sahrens	ASSERT(!dsl_pool_sync_context(tx->tx_pool));
fa9e4066Sahrens
3b2aab18SMatthew Ahrens	/* If we might wait, we must not hold the config lock. */
f864f99eSPrakash Surya	IMPLY((txg_how & TXG_WAIT), !dsl_pool_config_held(tx->tx_pool));
3b2aab18SMatthew Ahrens
f864f99eSPrakash Surya	if ((txg_how & TXG_NOTHROTTLE))
f864f99eSPrakash Surya		tx->tx_dirty_delayed = B_TRUE;
69962b56SMatthew Ahrens
8a2f1b91Sahrens	while ((err = dmu_tx_try_assign(tx, txg_how)) != 0) {
8a2f1b91Sahrens		dmu_tx_unassign(tx);
fa9e4066Sahrens
f864f99eSPrakash Surya		if (err != ERESTART || !(txg_how & TXG_WAIT))
fa9e4066Sahrens			return (err);
fa9e4066Sahrens
8a2f1b91Sahrens		dmu_tx_wait(tx);
fa9e4066Sahrens	}
fa9e4066Sahrens
fa9e4066Sahrens	txg_rele_to_quiesce(&tx->tx_txgh);
fa9e4066Sahrens
fa9e4066Sahrens	return (0);
fa9e4066Sahrens}
fa9e4066Sahrens
8a2f1b91Sahrensvoid
8a2f1b91Sahrensdmu_tx_wait(dmu_tx_t *tx)
8a2f1b91Sahrens{
0a4e9518Sgw	spa_t *spa = tx->tx_pool->dp_spa;
69962b56SMatthew Ahrens	dsl_pool_t *dp = tx->tx_pool;
0a4e9518Sgw
8a2f1b91Sahrens	ASSERT(tx->tx_txg == 0);
3b2aab18SMatthew Ahrens	ASSERT(!dsl_pool_config_held(tx->tx_pool));
8a2f1b91Sahrens
69962b56SMatthew Ahrens	if (tx->tx_wait_dirty) {
69962b56SMatthew Ahrens		/*
69962b56SMatthew Ahrens		 * dmu_tx_try_assign() has determined that we need to wait
69962b56SMatthew Ahrens		 * because we've consumed much or all of the dirty buffer
69962b56SMatthew Ahrens		 * space.
69962b56SMatthew Ahrens		 */
69962b56SMatthew Ahrens		mutex_enter(&dp->dp_lock);
69962b56SMatthew Ahrens		while (dp->dp_dirty_total >= zfs_dirty_data_max)
69962b56SMatthew Ahrens			cv_wait(&dp->dp_spaceavail_cv, &dp->dp_lock);
69962b56SMatthew Ahrens		uint64_t dirty = dp->dp_dirty_total;
69962b56SMatthew Ahrens		mutex_exit(&dp->dp_lock);
69962b56SMatthew Ahrens
69962b56SMatthew Ahrens		dmu_tx_delay(tx, dirty);
69962b56SMatthew Ahrens
69962b56SMatthew Ahrens		tx->tx_wait_dirty = B_FALSE;
69962b56SMatthew Ahrens
69962b56SMatthew Ahrens		/*
f864f99eSPrakash Surya		 * Note: setting tx_dirty_delayed only has effect if the
f864f99eSPrakash Surya		 * caller used TX_WAIT.  Otherwise they are going to
f864f99eSPrakash Surya		 * destroy this tx and try again.  The common case,
f864f99eSPrakash Surya		 * zfs_write(), uses TX_WAIT.
69962b56SMatthew Ahrens		 */
f864f99eSPrakash Surya		tx->tx_dirty_delayed = B_TRUE;
69962b56SMatthew Ahrens	} else if (spa_suspended(spa) || tx->tx_lasttried_txg == 0) {
69962b56SMatthew Ahrens		/*
69962b56SMatthew Ahrens		 * If the pool is suspended we need to wait until it
69962b56SMatthew Ahrens		 * is resumed.  Note that it's possible that the pool
69962b56SMatthew Ahrens		 * has become active after this thread has tried to
69962b56SMatthew Ahrens		 * obtain a tx.  If that's the case then tx_lasttried_txg
69962b56SMatthew Ahrens		 * would not have been set.
69962b56SMatthew Ahrens		 */
69962b56SMatthew Ahrens		txg_wait_synced(dp, spa_last_synced_txg(spa) + 1);
0a4e9518Sgw	} else if (tx->tx_needassign_txh) {
69962b56SMatthew Ahrens		/*
69962b56SMatthew Ahrens		 * A dnode is assigned to the quiescing txg.  Wait for its
69962b56SMatthew Ahrens		 * transaction to complete.
69962b56SMatthew Ahrens		 */
8a2f1b91Sahrens		dnode_t *dn = tx->tx_needassign_txh->txh_dnode;
8a2f1b91Sahrens
8a2f1b91Sahrens		mutex_enter(&dn->dn_mtx);
8a2f1b91Sahrens		while (dn->dn_assigned_txg == tx->tx_lasttried_txg - 1)
8a2f1b91Sahrens			cv_wait(&dn->dn_notxholds, &dn->dn_mtx);
8a2f1b91Sahrens		mutex_exit(&dn->dn_mtx);
8a2f1b91Sahrens		tx->tx_needassign_txh = NULL;
8a2f1b91Sahrens	} else {
fa41d87dSSerapheim Dimitropoulos		/*
fa41d87dSSerapheim Dimitropoulos		 * If we have a lot of dirty data just wait until we sync
fa41d87dSSerapheim Dimitropoulos		 * out a TXG at which point we'll hopefully have synced
fa41d87dSSerapheim Dimitropoulos		 * a portion of the changes.
fa41d87dSSerapheim Dimitropoulos		 */
fa41d87dSSerapheim Dimitropoulos		txg_wait_synced(dp, spa_last_synced_txg(spa) + 1);
8a2f1b91Sahrens	}
8a2f1b91Sahrens}
8a2f1b91Sahrens
0c779ad4SMatthew Ahrensstatic void
0c779ad4SMatthew Ahrensdmu_tx_destroy(dmu_tx_t *tx)
fa9e4066Sahrens{
8a2f1b91Sahrens	dmu_tx_hold_t *txh;
fa9e4066Sahrens
0c779ad4SMatthew Ahrens	while ((txh = list_head(&tx->tx_holds)) != NULL) {
0c779ad4SMatthew Ahrens		dnode_t *dn = txh->txh_dnode;
0c779ad4SMatthew Ahrens
0c779ad4SMatthew Ahrens		list_remove(&tx->tx_holds, txh);
*e914ace2STim Schumacher		zfs_refcount_destroy_many(&txh->txh_space_towrite,
*e914ace2STim Schumacher		    zfs_refcount_count(&txh->txh_space_towrite));
*e914ace2STim Schumacher		zfs_refcount_destroy_many(&txh->txh_memory_tohold,
*e914ace2STim Schumacher		    zfs_refcount_count(&txh->txh_memory_tohold));
0c779ad4SMatthew Ahrens		kmem_free(txh, sizeof (dmu_tx_hold_t));
0c779ad4SMatthew Ahrens		if (dn != NULL)
0c779ad4SMatthew Ahrens			dnode_rele(dn, tx);
0c779ad4SMatthew Ahrens	}
0c779ad4SMatthew Ahrens
0c779ad4SMatthew Ahrens	list_destroy(&tx->tx_callbacks);
0c779ad4SMatthew Ahrens	list_destroy(&tx->tx_holds);
0c779ad4SMatthew Ahrens	kmem_free(tx, sizeof (dmu_tx_t));
0c779ad4SMatthew Ahrens}
0c779ad4SMatthew Ahrens
0c779ad4SMatthew Ahrensvoid
0c779ad4SMatthew Ahrensdmu_tx_commit(dmu_tx_t *tx)
0c779ad4SMatthew Ahrens{
fa9e4066Sahrens	ASSERT(tx->tx_txg != 0);
fa9e4066Sahrens
3e30c24aSWill Andrews	/*
3e30c24aSWill Andrews	 * Go through the transaction's hold list and remove holds on
3e30c24aSWill Andrews	 * associated dnodes, notifying waiters if no holds remain.
3e30c24aSWill Andrews	 */
0c779ad4SMatthew Ahrens	for (dmu_tx_hold_t *txh = list_head(&tx->tx_holds); txh != NULL;
0c779ad4SMatthew Ahrens	    txh = list_next(&tx->tx_holds, txh)) {
8a2f1b91Sahrens		dnode_t *dn = txh->txh_dnode;
fa9e4066Sahrens
fa9e4066Sahrens		if (dn == NULL)
fa9e4066Sahrens			continue;
0c779ad4SMatthew Ahrens
fa9e4066Sahrens		mutex_enter(&dn->dn_mtx);
fa9e4066Sahrens		ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg);
fa9e4066Sahrens
*e914ace2STim Schumacher		if (zfs_refcount_remove(&dn->dn_tx_holds, tx) == 0) {
fa9e4066Sahrens			dn->dn_assigned_txg = 0;
fa9e4066Sahrens			cv_broadcast(&dn->dn_notxholds);
fa9e4066Sahrens		}
fa9e4066Sahrens		mutex_exit(&dn->dn_mtx);
fa9e4066Sahrens	}
fa9e4066Sahrens
8a2f1b91Sahrens	if (tx->tx_tempreserve_cookie)
fa9e4066Sahrens		dsl_dir_tempreserve_clear(tx->tx_tempreserve_cookie, tx);
fa9e4066Sahrens
d20e665cSRicardo M. Correia	if (!list_is_empty(&tx->tx_callbacks))
d20e665cSRicardo M. Correia		txg_register_callbacks(&tx->tx_txgh, &tx->tx_callbacks);
d20e665cSRicardo M. Correia
fa9e4066Sahrens	if (tx->tx_anyobj == FALSE)
fa9e4066Sahrens		txg_rele_to_sync(&tx->tx_txgh);
d20e665cSRicardo M. Correia
0c779ad4SMatthew Ahrens	dmu_tx_destroy(tx);
fa9e4066Sahrens}
fa9e4066Sahrens
/*
 * Abandon a tx that is not assigned to a txg.  Registered callbacks
 * are invoked with ECANCELED, then the tx is freed.
 */
void
dmu_tx_abort(dmu_tx_t *tx)
{
	list_t *cbs = &tx->tx_callbacks;

	ASSERT(tx->tx_txg == 0);

	/* Tell every registered callback that the tx was cancelled. */
	if (!list_is_empty(cbs))
		dmu_tx_do_callbacks(cbs, ECANCELED);

	dmu_tx_destroy(tx);
}
fa9e4066Sahrens
/*
 * Return the txg this tx is assigned to.  The tx must already be
 * assigned (tx_txg != 0).
 */
uint64_t
dmu_tx_get_txg(dmu_tx_t *tx)
{
	ASSERT(tx->tx_txg != 0);
	return (tx->tx_txg);
}
d20e665cSRicardo M. Correia
/*
 * Return the dsl_pool_t this tx belongs to; it is asserted to be non-NULL.
 */
dsl_pool_t *
dmu_tx_pool(dmu_tx_t *tx)
{
	ASSERT(tx->tx_pool != NULL);
	return (tx->tx_pool);
}
3b2aab18SMatthew Ahrens
d20e665cSRicardo M. Correiavoid
d20e665cSRicardo M. Correiadmu_tx_callback_register(dmu_tx_t *tx, dmu_tx_callback_func_t *func, void *data)
d20e665cSRicardo M. Correia{
d20e665cSRicardo M. Correia	dmu_tx_callback_t *dcb;
d20e665cSRicardo M. Correia
d20e665cSRicardo M. Correia	dcb = kmem_alloc(sizeof (dmu_tx_callback_t), KM_SLEEP);
d20e665cSRicardo M. Correia
d20e665cSRicardo M. Correia	dcb->dcb_func = func;
d20e665cSRicardo M. Correia	dcb->dcb_data = data;
d20e665cSRicardo M. Correia
d20e665cSRicardo M. Correia	list_insert_tail(&tx->tx_callbacks, dcb);
d20e665cSRicardo M. Correia}
d20e665cSRicardo M. Correia
d20e665cSRicardo M. Correia/*
d20e665cSRicardo M. Correia * Call all the commit callbacks on a list, with a given error code.
d20e665cSRicardo M. Correia */
d20e665cSRicardo M. Correiavoid
d20e665cSRicardo M. Correiadmu_tx_do_callbacks(list_t *cb_list, int error)
d20e665cSRicardo M. Correia{
d20e665cSRicardo M. Correia	dmu_tx_callback_t *dcb;
d20e665cSRicardo M. Correia
0c779ad4SMatthew Ahrens	while ((dcb = list_head(cb_list)) != NULL) {
d20e665cSRicardo M. Correia		list_remove(cb_list, dcb);
d20e665cSRicardo M. Correia		dcb->dcb_func(dcb->dcb_data, error);
d20e665cSRicardo M. Correia		kmem_free(dcb, sizeof (dmu_tx_callback_t));
d20e665cSRicardo M. Correia	}
d20e665cSRicardo M. Correia}
0a586ceaSMark Shellenbaum
/*
 * Interface to hold a bunch of attributes; used when creating new files.
 * attrsize is the total size of all attributes to be added during
 * object creation.
 *
 * For updating/adding a single attribute dmu_tx_hold_sa() should be used.
 */
0a586ceaSMark Shellenbaum
0a586ceaSMark Shellenbaum/*
0a586ceaSMark Shellenbaum * hold necessary attribute name for attribute registration.
0a586ceaSMark Shellenbaum * should be a very rare case where this is needed.  If it does
0a586ceaSMark Shellenbaum * happen it would only happen on the first write to the file system.
0a586ceaSMark Shellenbaum */
0a586ceaSMark Shellenbaumstatic void
0a586ceaSMark Shellenbaumdmu_tx_sa_registration_hold(sa_os_t *sa, dmu_tx_t *tx)
0a586ceaSMark Shellenbaum{
0a586ceaSMark Shellenbaum	if (!sa->sa_need_attr_registration)
0a586ceaSMark Shellenbaum		return;
0a586ceaSMark Shellenbaum
61e255ceSMatthew Ahrens	for (int i = 0; i != sa->sa_num_attrs; i++) {
0a586ceaSMark Shellenbaum		if (!sa->sa_attr_table[i].sa_registered) {
0a586ceaSMark Shellenbaum			if (sa->sa_reg_attr_obj)
0a586ceaSMark Shellenbaum				dmu_tx_hold_zap(tx, sa->sa_reg_attr_obj,
0a586ceaSMark Shellenbaum				    B_TRUE, sa->sa_attr_table[i].sa_name);
0a586ceaSMark Shellenbaum			else
0a586ceaSMark Shellenbaum				dmu_tx_hold_zap(tx, DMU_NEW_OBJECT,
0a586ceaSMark Shellenbaum				    B_TRUE, sa->sa_attr_table[i].sa_name);
0a586ceaSMark Shellenbaum		}
0a586ceaSMark Shellenbaum	}
0a586ceaSMark Shellenbaum}
0a586ceaSMark Shellenbaum
0a586ceaSMark Shellenbaumvoid
0a586ceaSMark Shellenbaumdmu_tx_hold_spill(dmu_tx_t *tx, uint64_t object)
0a586ceaSMark Shellenbaum{
54811da5SToomas Soome	dmu_tx_hold_t *txh;
0a586ceaSMark Shellenbaum
54811da5SToomas Soome	txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, object,
54811da5SToomas Soome	    THT_SPILL, 0, 0);
54811da5SToomas Soome	if (txh != NULL)
*e914ace2STim Schumacher		(void) zfs_refcount_add_many(&txh->txh_space_towrite,
54811da5SToomas Soome		    SPA_OLD_MAXBLOCKSIZE, FTAG);
0a586ceaSMark Shellenbaum}
0a586ceaSMark Shellenbaum
0a586ceaSMark Shellenbaumvoid
0a586ceaSMark Shellenbaumdmu_tx_hold_sa_create(dmu_tx_t *tx, int attrsize)
0a586ceaSMark Shellenbaum{
0a586ceaSMark Shellenbaum	sa_os_t *sa = tx->tx_objset->os_sa;
0a586ceaSMark Shellenbaum
0a586ceaSMark Shellenbaum	dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
0a586ceaSMark Shellenbaum
0a586ceaSMark Shellenbaum	if (tx->tx_objset->os_sa->sa_master_obj == 0)
0a586ceaSMark Shellenbaum		return;
0a586ceaSMark Shellenbaum
61e255ceSMatthew Ahrens	if (tx->tx_objset->os_sa->sa_layout_attr_obj) {
0a586ceaSMark Shellenbaum		dmu_tx_hold_zap(tx, sa->sa_layout_attr_obj, B_TRUE, NULL);
61e255ceSMatthew Ahrens	} else {
0a586ceaSMark Shellenbaum		dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_LAYOUTS);
0a586ceaSMark Shellenbaum		dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_REGISTRY);
0a586ceaSMark Shellenbaum		dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL);
0a586ceaSMark Shellenbaum		dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL);
0a586ceaSMark Shellenbaum	}
0a586ceaSMark Shellenbaum
0a586ceaSMark Shellenbaum	dmu_tx_sa_registration_hold(sa, tx);
0a586ceaSMark Shellenbaum
54811da5SToomas Soome	if (attrsize <= DN_OLD_MAX_BONUSLEN && !sa->sa_force_spill)
0a586ceaSMark Shellenbaum		return;
0a586ceaSMark Shellenbaum
0a586ceaSMark Shellenbaum	(void) dmu_tx_hold_object_impl(tx, tx->tx_objset, DMU_NEW_OBJECT,
0a586ceaSMark Shellenbaum	    THT_SPILL, 0, 0);
0a586ceaSMark Shellenbaum}
0a586ceaSMark Shellenbaum
0a586ceaSMark Shellenbaum/*
0a586ceaSMark Shellenbaum * Hold SA attribute
0a586ceaSMark Shellenbaum *
0a586ceaSMark Shellenbaum * dmu_tx_hold_sa(dmu_tx_t *tx, sa_handle_t *, attribute, add, size)
0a586ceaSMark Shellenbaum *
0a586ceaSMark Shellenbaum * variable_size is the total size of all variable sized attributes
0a586ceaSMark Shellenbaum * passed to this function.  It is not the total size of all
0a586ceaSMark Shellenbaum * variable size attributes that *may* exist on this object.
0a586ceaSMark Shellenbaum */
0a586ceaSMark Shellenbaumvoid
0a586ceaSMark Shellenbaumdmu_tx_hold_sa(dmu_tx_t *tx, sa_handle_t *hdl, boolean_t may_grow)
0a586ceaSMark Shellenbaum{
0a586ceaSMark Shellenbaum	uint64_t object;
0a586ceaSMark Shellenbaum	sa_os_t *sa = tx->tx_objset->os_sa;
0a586ceaSMark Shellenbaum
0a586ceaSMark Shellenbaum	ASSERT(hdl != NULL);
0a586ceaSMark Shellenbaum
0a586ceaSMark Shellenbaum	object = sa_handle_object(hdl);
0a586ceaSMark Shellenbaum
0a586ceaSMark Shellenbaum	dmu_tx_hold_bonus(tx, object);
0a586ceaSMark Shellenbaum
0a586ceaSMark Shellenbaum	if (tx->tx_objset->os_sa->sa_master_obj == 0)
0a586ceaSMark Shellenbaum		return;
0a586ceaSMark Shellenbaum
0a586ceaSMark Shellenbaum	if (tx->tx_objset->os_sa->sa_reg_attr_obj == 0 ||
0a586ceaSMark Shellenbaum	    tx->tx_objset->os_sa->sa_layout_attr_obj == 0) {
0a586ceaSMark Shellenbaum		dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_LAYOUTS);
0a586ceaSMark Shellenbaum		dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_REGISTRY);
0a586ceaSMark Shellenbaum		dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL);
0a586ceaSMark Shellenbaum		dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL);
0a586ceaSMark Shellenbaum	}
0a586ceaSMark Shellenbaum
0a586ceaSMark Shellenbaum	dmu_tx_sa_registration_hold(sa, tx);
0a586ceaSMark Shellenbaum
0a586ceaSMark Shellenbaum	if (may_grow && tx->tx_objset->os_sa->sa_layout_attr_obj)
0a586ceaSMark Shellenbaum		dmu_tx_hold_zap(tx, sa->sa_layout_attr_obj, B_TRUE, NULL);
0a586ceaSMark Shellenbaum
744947dcSTom Erickson	if (sa->sa_force_spill || may_grow || hdl->sa_spill) {
0a586ceaSMark Shellenbaum		ASSERT(tx->tx_txg == 0);
0a586ceaSMark Shellenbaum		dmu_tx_hold_spill(tx, object);
744947dcSTom Erickson	} else {
744947dcSTom Erickson		dmu_buf_impl_t *db = (dmu_buf_impl_t *)hdl->sa_bonus;
744947dcSTom Erickson		dnode_t *dn;
744947dcSTom Erickson
744947dcSTom Erickson		DB_DNODE_ENTER(db);
744947dcSTom Erickson		dn = DB_DNODE(db);
744947dcSTom Erickson		if (dn->dn_have_spill) {
744947dcSTom Erickson			ASSERT(tx->tx_txg == 0);
744947dcSTom Erickson			dmu_tx_hold_spill(tx, object);
744947dcSTom Erickson		}
744947dcSTom Erickson		DB_DNODE_EXIT(db);
0a586ceaSMark Shellenbaum	}
0a586ceaSMark Shellenbaum}