xref: /illumos-gate/usr/src/uts/common/fs/zfs/dmu_tx.c (revision ea8dc4b6d2251b437950c0056bc626b311c73c27)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 #include <sys/dmu.h>
29 #include <sys/dmu_impl.h>
30 #include <sys/dbuf.h>
31 #include <sys/dmu_tx.h>
32 #include <sys/dmu_objset.h>
33 #include <sys/dsl_dataset.h> /* for dsl_dataset_block_freeable() */
34 #include <sys/dsl_dir.h> /* for dsl_dir_tempreserve_*() */
35 #include <sys/dsl_pool.h>
36 #include <sys/zap_impl.h>	/* for ZAP_BLOCK_SHIFT */
37 #include <sys/spa.h>
38 #include <sys/zfs_context.h>
39 
40 typedef void (*dmu_tx_hold_func_t)(dmu_tx_t *tx, struct dnode *dn,
41     uint64_t arg1, uint64_t arg2);
42 
43 #ifdef ZFS_DEBUG
44 int dmu_use_tx_debug_bufs = 1;
45 #endif
46 
47 dmu_tx_t *
48 dmu_tx_create_ds(dsl_dir_t *dd)
49 {
50 	dmu_tx_t *tx = kmem_zalloc(sizeof (dmu_tx_t), KM_SLEEP);
51 	tx->tx_dir = dd;
52 	if (dd)
53 		tx->tx_pool = dd->dd_pool;
54 	list_create(&tx->tx_holds, sizeof (dmu_tx_hold_t),
55 	    offsetof(dmu_tx_hold_t, dth_node));
56 	refcount_create(&tx->tx_space_written);
57 	refcount_create(&tx->tx_space_freed);
58 	return (tx);
59 }
60 
61 dmu_tx_t *
62 dmu_tx_create(objset_t *os)
63 {
64 	dmu_tx_t *tx = dmu_tx_create_ds(os->os->os_dsl_dataset->ds_dir);
65 	tx->tx_objset = os;
66 	tx->tx_lastsnap_txg = dsl_dataset_prev_snap_txg(os->os->os_dsl_dataset);
67 	return (tx);
68 }
69 
70 dmu_tx_t *
71 dmu_tx_create_assigned(struct dsl_pool *dp, uint64_t txg)
72 {
73 	dmu_tx_t *tx = dmu_tx_create_ds(NULL);
74 
75 	ASSERT3U(txg, <=, dp->dp_tx.tx_open_txg);
76 	tx->tx_pool = dp;
77 	tx->tx_txg = txg;
78 	tx->tx_anyobj = TRUE;
79 
80 	return (tx);
81 }
82 
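/*
 * Transactions created with dmu_tx_create_assigned() (i.e. from syncing
 * context) have tx_anyobj set; they may touch any object without taking
 * per-dnode tx holds, so both predicates below simply report that flag.
 */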
83 int
84 dmu_tx_is_syncing(dmu_tx_t *tx)
85 {
86 	return (tx->tx_anyobj);
87 }
88 
89 int
90 dmu_tx_private_ok(dmu_tx_t *tx)
91 {
92 	return (tx->tx_anyobj);
93 }
94 
95 static void
96 dmu_tx_hold_object_impl(dmu_tx_t *tx, objset_t *os, uint64_t object,
97     enum dmu_tx_hold_type type, dmu_tx_hold_func_t func,
98     uint64_t arg1, uint64_t arg2)
99 {
100 	dmu_tx_hold_t *dth;
101 	dnode_t *dn = NULL;
102 	int err;
103 
104 	if (object != DMU_NEW_OBJECT) {
105 		err = dnode_hold(os->os, object, tx, &dn);
106 		if (err) {
107 			tx->tx_err = err;
108 			return;
109 		}
110 
111 		if (err == 0 && tx->tx_txg != 0) {
112 			mutex_enter(&dn->dn_mtx);
113 			/*
114 			 * dn->dn_assigned_txg == tx->tx_txg doesn't pose a
115 			 * problem, but there's no way for it to happen (for
116 			 * now, at least).
117 			 */
118 			ASSERT(dn->dn_assigned_txg == 0);
119 			ASSERT(dn->dn_assigned_tx == NULL);
120 			dn->dn_assigned_txg = tx->tx_txg;
121 			dn->dn_assigned_tx = tx;
122 			(void) refcount_add(&dn->dn_tx_holds, tx);
123 			mutex_exit(&dn->dn_mtx);
124 		}
125 	}
126 
127 	dth = kmem_zalloc(sizeof (dmu_tx_hold_t), KM_SLEEP);
128 	dth->dth_dnode = dn;
129 	dth->dth_type = type;
130 	dth->dth_arg1 = arg1;
131 	dth->dth_arg2 = arg2;
132 	list_insert_tail(&tx->tx_holds, dth);
133 
134 	if (func)
135 		func(tx, dn, arg1, arg2);
136 }
137 
138 void
139 dmu_tx_add_new_object(dmu_tx_t *tx, objset_t *os, uint64_t object)
140 {
141 	/*
142 	 * If we're syncing, they can manipulate any object anyhow, and
143 	 * the hold on the dnode_t can cause problems.
144 	 */
145 	if (!dmu_tx_is_syncing(tx)) {
146 		dmu_tx_hold_object_impl(tx, os, object, THT_NEWOBJECT,
147 		    NULL, 0, 0);
148 	}
149 }
150 
151 static int
152 dmu_tx_check_ioerr(zio_t *zio, dnode_t *dn, int level, uint64_t blkid)
153 {
154 	int err;
155 	dmu_buf_impl_t *db;
156 
157 	rw_enter(&dn->dn_struct_rwlock, RW_READER);
158 	db = dbuf_hold_level(dn, level, blkid, FTAG);
159 	rw_exit(&dn->dn_struct_rwlock);
160 	if (db == NULL)
161 		return (EIO);
162 	err = dbuf_read(db, zio, DB_RF_CANFAIL);
163 	dbuf_rele(db, FTAG);
164 	return (err);
165 }
166 
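/*
 * Estimate the worst-case space needed to write the range [off, off + len)
 * of this dnode: the data blocks themselves plus the indirect blocks above
 * them.  While we are here, read the first and last level-0 blocks and the
 * level-1 blocks covering the range, so that any i/o error is caught now
 * and returned from dmu_tx_assign() rather than discovered in syncing
 * context.
 */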
167 /* ARGSUSED */
168 static void
169 dmu_tx_count_write(dmu_tx_t *tx, dnode_t *dn, uint64_t off, uint64_t len)
170 {
171 	uint64_t start, end, i, space;
172 	int min_bs, max_bs, min_ibs, max_ibs, epbs, bits;
173 
174 	if (len == 0)
175 		return;
176 
177 	min_bs = SPA_MINBLOCKSHIFT;
178 	max_bs = SPA_MAXBLOCKSHIFT;
179 	min_ibs = DN_MIN_INDBLKSHIFT;
180 	max_ibs = DN_MAX_INDBLKSHIFT;
181 
182 	/*
183 	 * For i/o error checking, read the first and last level-0
184 	 * blocks, and all the level-1 blocks.  We needn't do this on
185 	 * the meta-dnode, because we've already read it in.
186 	 */
187 
188 	if (dn && dn->dn_object != DMU_META_DNODE_OBJECT) {
189 		int err;
190 
191 		if (dn->dn_maxblkid == 0) {
192 			err = dmu_tx_check_ioerr(NULL, dn, 0, 0);
193 			if (err) {
194 				tx->tx_err = err;
195 				return;
196 			}
197 		} else {
198 			zio_t *zio = zio_root(tx->tx_pool->dp_spa,
199 			    NULL, NULL, ZIO_FLAG_CANFAIL);
200 
201 			/* first level-0 block */
202 			start = off/dn->dn_datablksz;
203 			err = dmu_tx_check_ioerr(zio, dn, 0, start);
204 			if (err) {
205 				tx->tx_err = err;
206 				return;
207 			}
208 
209 			/* last level-0 block */
210 			end = (off+len)/dn->dn_datablksz;
211 			if (end != start) {
212 				err = dmu_tx_check_ioerr(zio, dn, 0, end);
213 				if (err) {
214 					tx->tx_err = err;
215 					return;
216 				}
217 			}
218 
219 			/* level-1 blocks */
220 			if (dn->dn_nlevels > 1) {
221 				start >>= dn->dn_indblkshift - SPA_BLKPTRSHIFT;
222 				end >>= dn->dn_indblkshift - SPA_BLKPTRSHIFT;
223 				for (i = start+1; i < end; i++) {
224 					err = dmu_tx_check_ioerr(zio, dn, 1, i);
225 					if (err) {
226 						tx->tx_err = err;
227 						return;
228 					}
229 				}
230 			}
231 
232 			err = zio_wait(zio);
233 			if (err) {
234 				tx->tx_err = err;
235 				return;
236 			}
237 		}
238 	}
239 
240 	/*
241 	 * If there's more than one block, the blocksize can't change,
242 	 * so we can make a more precise estimate.  Alternatively,
243 	 * if the dnode's ibs is larger than max_ibs, always use that.
244 	 * This ensures that if we reduce DN_MAX_INDBLKSHIFT,
245 	 * the code will still work correctly on existing pools.
246 	 */
247 	if (dn && (dn->dn_maxblkid != 0 || dn->dn_indblkshift > max_ibs)) {
248 		min_ibs = max_ibs = dn->dn_indblkshift;
249 		if (dn->dn_datablkshift != 0)
250 			min_bs = max_bs = dn->dn_datablkshift;
251 	}
252 
253 	/*
254 	 * 'end' is the last thing we will access, not one past.
255 	 * This way we won't overflow when accessing the last byte.
256 	 */
257 	start = P2ALIGN(off, 1ULL << max_bs);
258 	end = P2ROUNDUP(off + len, 1ULL << max_bs) - 1;
259 	space = end - start + 1;
260 
261 	start >>= min_bs;
262 	end >>= min_bs;
263 
264 	epbs = min_ibs - SPA_BLKPTRSHIFT;
265 
266 	/*
267 	 * The object contains at most 2^(64 - min_bs) blocks,
268 	 * and each indirect level maps 2^epbs.
269 	 */
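	/*
	 * For example, an object with 512-byte data blocks (min_bs == 9)
	 * can span at most 2^55 level-0 blocks.  Each pass of the loop
	 * below climbs one level of indirection (each level fans out by
	 * 2^epbs block pointers), charging one max_ibs-sized block for
	 * every indirect block the [start, end] range could touch at that
	 * level, plus one extra blkid=0 indirect block in case the number
	 * of levels has to grow.
	 */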
270 	for (bits = 64 - min_bs; bits >= 0; bits -= epbs) {
271 		start >>= epbs;
272 		end >>= epbs;
273 		/*
274 		 * If we increase the number of levels of indirection,
275 		 * we'll need new blkid=0 indirect blocks.  If start == 0,
276 		 * we're already accounting for those blocks; and if end == 0,
277 		 * we can't increase the number of levels beyond that.
278 		 */
279 		if (start != 0 && end != 0)
280 			space += 1ULL << max_ibs;
281 		space += (end - start + 1) << max_ibs;
282 	}
283 
284 	ASSERT(space < 2 * DMU_MAX_ACCESS);
285 
286 	tx->tx_space_towrite += space;
287 }
288 
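/*
 * Charge the write needed to dirty this dnode's slot in the meta-dnode.
 * If the block currently holding the dnode was born after the most recent
 * snapshot (i.e. it is freeable), reclassify the charge as an overwrite,
 * since rewriting that block reclaims as much space as it consumes.
 */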
289 static void
290 dmu_tx_count_dnode(dmu_tx_t *tx, dnode_t *dn)
291 {
292 	dnode_t *mdn = tx->tx_objset->os->os_meta_dnode;
293 	uint64_t object = dn ? dn->dn_object : DN_MAX_OBJECT - 1;
294 	uint64_t pre_write_space;
295 
296 	ASSERT(object < DN_MAX_OBJECT);
297 	pre_write_space = tx->tx_space_towrite;
298 	dmu_tx_count_write(tx, mdn, object << DNODE_SHIFT, 1 << DNODE_SHIFT);
299 	if (dn && dn->dn_dbuf->db_blkptr &&
300 	    dsl_dataset_block_freeable(dn->dn_objset->os_dsl_dataset,
301 	    dn->dn_dbuf->db_blkptr->blk_birth)) {
302 		tx->tx_space_tooverwrite +=
303 			tx->tx_space_towrite - pre_write_space;
304 		tx->tx_space_towrite = pre_write_space;
305 	}
306 }
307 
308 /* ARGSUSED */
309 static void
310 dmu_tx_hold_write_impl(dmu_tx_t *tx, dnode_t *dn, uint64_t off, uint64_t len)
311 {
312 	dmu_tx_count_write(tx, dn, off, len);
313 	dmu_tx_count_dnode(tx, dn);
314 }
315 
316 void
317 dmu_tx_hold_write(dmu_tx_t *tx, uint64_t object, uint64_t off, int len)
318 {
319 	ASSERT(tx->tx_txg == 0);
320 	ASSERT(len < DMU_MAX_ACCESS);
321 	ASSERT(UINT64_MAX - off >= len - 1);
322 
323 	dmu_tx_hold_object_impl(tx, tx->tx_objset, object, THT_WRITE,
324 	    dmu_tx_hold_write_impl, off, len);
325 }
326 
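/*
 * Estimate how much space freeing the range [off, off + len) of this
 * dnode would reclaim: walk the existing block pointers and sum the
 * allocated size of every block born after the most recent snapshot,
 * since only those blocks can actually be freed.
 */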
327 static void
328 dmu_tx_count_free(dmu_tx_t *tx, dnode_t *dn, uint64_t off, uint64_t len)
329 {
330 	uint64_t blkid, nblks;
331 	uint64_t space = 0;
332 	dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset;
333 
334 	if (dn->dn_datablkshift == 0)
335 		return;
336 	/*
337 	 * Not that the dnode can change, since it isn't dirty, but
338 	 * dbuf_hold_impl() wants us to have the struct_rwlock.
339 	 * We also need it to protect dn_maxblkid.
340 	 */
341 	rw_enter(&dn->dn_struct_rwlock, RW_READER);
342 	blkid = off >> dn->dn_datablkshift;
343 	nblks = (off + len) >> dn->dn_datablkshift;
344 
345 	if (blkid >= dn->dn_maxblkid) {
346 		rw_exit(&dn->dn_struct_rwlock);
347 		return;
348 	}
349 	if (blkid + nblks > dn->dn_maxblkid)
350 		nblks = dn->dn_maxblkid - blkid;
351 
352 	/* don't bother counting past the first 128K (131,072) blocks */
353 	nblks = MIN(nblks, 128*1024);
354 
355 	if (dn->dn_phys->dn_nlevels == 1) {
356 		int i;
357 		for (i = 0; i < nblks; i++) {
358 			blkptr_t *bp = dn->dn_phys->dn_blkptr;
359 			ASSERT3U(blkid + i, <, dn->dn_phys->dn_nblkptr);
360 			bp += blkid + i;
361 			if (dsl_dataset_block_freeable(ds, bp->blk_birth)) {
362 				dprintf_bp(bp, "can free old%s", "");
363 				space += BP_GET_ASIZE(bp);
364 			}
365 		}
366 		nblks = 0;
367 	}
368 
369 	while (nblks) {
370 		dmu_buf_impl_t *dbuf;
371 		int err, epbs, blkoff, tochk;
372 
373 		epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
374 		blkoff = P2PHASE(blkid, 1<<epbs);
375 		tochk = MIN((1<<epbs) - blkoff, nblks);
376 
377 		err = dbuf_hold_impl(dn, 1, blkid >> epbs, TRUE, FTAG, &dbuf);
378 		if (err == 0) {
379 			int i;
380 			blkptr_t *bp;
381 
382 			err = dbuf_read(dbuf, NULL,
383 			    DB_RF_HAVESTRUCT | DB_RF_CANFAIL);
384 			if (err != 0) {
385 				tx->tx_err = err;
386 				dbuf_rele(dbuf, FTAG);
387 				break;
388 			}
389 
390 			bp = dbuf->db.db_data;
391 			bp += blkoff;
392 
393 			for (i = 0; i < tochk; i++) {
394 				if (dsl_dataset_block_freeable(ds,
395 				    bp[i].blk_birth)) {
396 					dprintf_bp(&bp[i],
397 					    "can free old%s", "");
398 					space += BP_GET_ASIZE(&bp[i]);
399 				}
400 			}
401 			dbuf_rele(dbuf, FTAG);
402 		} else {
403 			/* the indirect block is sparse */
404 			ASSERT(err == ENOENT);
405 		}
406 
407 		blkid += tochk;
408 		nblks -= tochk;
409 	}
410 	rw_exit(&dn->dn_struct_rwlock);
411 
412 	tx->tx_space_tofree += space;
413 }
414 
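/*
 * Freeing a range still dirties the first and last (possibly partial)
 * blocks and the indirect blocks above the range, so charge those as
 * writes, probe the level-1 blocks in between for i/o errors, and then
 * estimate the space the free is expected to reclaim.
 */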
415 static void
416 dmu_tx_hold_free_impl(dmu_tx_t *tx, dnode_t *dn, uint64_t off, uint64_t len)
417 {
418 	uint64_t start, end, i;
419 	int dirty, err, shift;
420 	zio_t *zio;
421 
422 	/* first block */
423 	if (off != 0 /* || dn->dn_maxblkid == 0 */)
424 		dmu_tx_count_write(tx, dn, off, 1);
425 	/* last block */
426 	if (len != DMU_OBJECT_END)
427 		dmu_tx_count_write(tx, dn, off+len, 1);
428 
429 	if (off >= (dn->dn_maxblkid+1) * dn->dn_datablksz)
430 		return;
431 	if (len == DMU_OBJECT_END)
432 		len = (dn->dn_maxblkid+1) * dn->dn_datablksz - off;
433 
434 	/*
435 	 * For i/o error checking, read the first and last level-0
436 	 * blocks, and all the level-1 blocks.  The count_write calls above
437 	 * will take care of the level-0 blocks.
438 	 */
439 	shift = dn->dn_datablkshift + dn->dn_indblkshift - SPA_BLKPTRSHIFT;
440 	start = off >> shift;
441 	end = dn->dn_datablkshift ? ((off+len) >> shift) : 0;
442 
443 	zio = zio_root(tx->tx_pool->dp_spa, NULL, NULL, ZIO_FLAG_CANFAIL);
444 	for (i = start+1; i < end; i++) {
445 		uint64_t ibyte = i << shift;
446 		err = dnode_next_offset(dn, FALSE, &ibyte, 2, 1);
447 		i = ibyte >> shift;
448 		if (err == ESRCH)
449 			break;
450 		if (err) {
451 			tx->tx_err = err;
452 			return;
453 		}
454 
455 		err = dmu_tx_check_ioerr(zio, dn, 1, i);
456 		if (err) {
457 			tx->tx_err = err;
458 			return;
459 		}
460 	}
461 	err = zio_wait(zio);
462 	if (err) {
463 		tx->tx_err = err;
464 		return;
465 	}
466 
467 	dmu_tx_count_dnode(tx, dn);
468 
469 	/* XXX locking */
470 	dirty = dn->dn_dirtyblksz[0] | dn->dn_dirtyblksz[1] |
471 	    dn->dn_dirtyblksz[2] | dn->dn_dirtyblksz[3];
472 	if (dn->dn_assigned_tx != NULL && !dirty)
473 		dmu_tx_count_free(tx, dn, off, len);
474 }
475 
476 void
477 dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off, uint64_t len)
478 {
479 	ASSERT(tx->tx_txg == 0);
480 
481 	dmu_tx_hold_object_impl(tx, tx->tx_objset, object, THT_FREE,
482 	    dmu_tx_hold_free_impl, off, len);
483 }
484 
485 /* ARGSUSED */
486 static void
487 dmu_tx_hold_zap_impl(dmu_tx_t *tx, dnode_t *dn, uint64_t add, uint64_t iname)
488 {
489 	uint64_t nblocks;
490 	int epbs, err;
491 	char *name = (char *)(uintptr_t)iname;
492 
493 	dmu_tx_count_dnode(tx, dn);
494 
495 	if (dn == NULL) {
496 		/*
497 		 * We will be able to fit a new object's entries into one leaf
498 		 * block.  So there will be at most 2 blocks total,
499 		 * including the header block.
500 		 */
501 		dmu_tx_count_write(tx, dn, 0, 2 << fzap_default_block_shift);
502 		return;
503 	}
504 
505 	ASSERT3P(dmu_ot[dn->dn_type].ot_byteswap, ==, zap_byteswap);
506 
507 	if (dn->dn_maxblkid == 0 && !add) {
508 		/*
509 		 * If there is only one block (i.e. this is a micro-zap)
510 		 * and we are not adding anything, the accounting is simple.
511 		 */
512 		err = dmu_tx_check_ioerr(NULL, dn, 0, 0);
513 		if (err) {
514 			tx->tx_err = err;
515 			return;
516 		}
517 
518 		if (dsl_dataset_block_freeable(dn->dn_objset->os_dsl_dataset,
519 		    dn->dn_phys->dn_blkptr[0].blk_birth))
520 			tx->tx_space_tooverwrite += dn->dn_datablksz;
521 		else
522 			tx->tx_space_towrite += dn->dn_datablksz;
523 		return;
524 	}
525 
526 	if (dn->dn_maxblkid > 0 && name) {
527 		/*
528 		 * access the name in this fat-zap so that we'll check
529 		 * for i/o errors to the leaf blocks, etc.
530 		 */
531 		err = zap_lookup(&dn->dn_objset->os, dn->dn_object, name,
532 		    8, 0, NULL);
533 		if (err == EIO) {
534 			tx->tx_err = err;
535 			return;
536 		}
537 	}
538 
539 	/*
540 	 * 3 blocks overwritten: target leaf, ptrtbl block, header block
541 	 * 3 new blocks written if adding: new split leaf, 2 grown ptrtbl blocks
542 	 */
543 	dmu_tx_count_write(tx, dn, dn->dn_maxblkid * dn->dn_datablksz,
544 	    (3 + (add ? 3 : 0)) << dn->dn_datablkshift);
545 
546 	/*
547 	 * If the modified blocks are scattered to the four winds,
548 	 * we'll have to modify an indirect twig for each.
549 	 */
550 	epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
551 	for (nblocks = dn->dn_maxblkid >> epbs; nblocks != 0; nblocks >>= epbs)
552 		tx->tx_space_towrite += 3 << dn->dn_indblkshift;
553 }
554 
555 void
556 dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, char *name)
557 {
558 	ASSERT(tx->tx_txg == 0);
559 
560 	dmu_tx_hold_object_impl(tx, tx->tx_objset, object, THT_ZAP,
561 	    dmu_tx_hold_zap_impl, add, (uintptr_t)name);
562 }
563 
564 void
565 dmu_tx_hold_bonus(dmu_tx_t *tx, uint64_t object)
566 {
567 	ASSERT(tx->tx_txg == 0);
568 
569 	dmu_tx_hold_object_impl(tx, tx->tx_objset, object, THT_BONUS,
570 	    dmu_tx_hold_write_impl, 0, 0);
571 }
572 
573 
574 /* ARGSUSED */
575 static void
576 dmu_tx_hold_space_impl(dmu_tx_t *tx, dnode_t *dn,
577     uint64_t space, uint64_t unused)
578 {
579 	tx->tx_space_towrite += space;
580 }
581 
582 void
583 dmu_tx_hold_space(dmu_tx_t *tx, uint64_t space)
584 {
585 	ASSERT(tx->tx_txg == 0);
586 
587 	dmu_tx_hold_object_impl(tx, tx->tx_objset, DMU_NEW_OBJECT, THT_SPACE,
588 	    dmu_tx_hold_space_impl, space, 0);
589 }
590 
591 int
592 dmu_tx_holds(dmu_tx_t *tx, uint64_t object)
593 {
594 	dmu_tx_hold_t *dth;
595 	int holds = 0;
596 
597 	/*
598 	 * By asserting that the tx is assigned, we're counting the
599 	 * number of dn_tx_holds, which is the same as the number of
600 	 * dn_holds.  Otherwise, we'd be counting dn_holds, but
601 	 * dn_tx_holds could be 0.
602 	 */
603 	ASSERT(tx->tx_txg != 0);
604 
605 	/* if (tx->tx_anyobj == TRUE) */
606 		/* return (0); */
607 
608 	for (dth = list_head(&tx->tx_holds); dth;
609 	    dth = list_next(&tx->tx_holds, dth)) {
610 		if (dth->dth_dnode && dth->dth_dnode->dn_object == object)
611 			holds++;
612 	}
613 
614 	return (holds);
615 }
616 
617 #ifdef ZFS_DEBUG
618 void
619 dmu_tx_dirty_buf(dmu_tx_t *tx, dmu_buf_impl_t *db)
620 {
621 	dmu_tx_hold_t *dth;
622 	int match_object = FALSE, match_offset = FALSE;
623 	dnode_t *dn = db->db_dnode;
624 
625 	ASSERT(tx->tx_txg != 0);
626 	ASSERT(tx->tx_objset == NULL || dn->dn_objset == tx->tx_objset->os);
627 	ASSERT3U(dn->dn_object, ==, db->db.db_object);
628 
629 	if (tx->tx_anyobj)
630 		return;
631 
632 	/* XXX No checking on the meta dnode for now */
633 	if (db->db.db_object == DMU_META_DNODE_OBJECT)
634 		return;
635 
636 	for (dth = list_head(&tx->tx_holds); dth;
637 	    dth = list_next(&tx->tx_holds, dth)) {
638 		ASSERT(dn == NULL || dn->dn_assigned_txg == tx->tx_txg);
639 		if (dth->dth_dnode == dn && dth->dth_type != THT_NEWOBJECT)
640 			match_object = TRUE;
641 		if (dth->dth_dnode == NULL || dth->dth_dnode == dn) {
642 			int datablkshift = dn->dn_datablkshift ?
643 			    dn->dn_datablkshift : SPA_MAXBLOCKSHIFT;
644 			int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
645 			int shift = datablkshift + epbs * db->db_level;
646 			uint64_t beginblk = shift >= 64 ? 0 :
647 			    (dth->dth_arg1 >> shift);
648 			uint64_t endblk = shift >= 64 ? 0 :
649 			    ((dth->dth_arg1 + dth->dth_arg2 - 1) >> shift);
650 			uint64_t blkid = db->db_blkid;
651 
652 			/* XXX dth_arg2 better not be zero... */
653 
654 			dprintf("found dth type %x beginblk=%llx endblk=%llx\n",
655 			    dth->dth_type, beginblk, endblk);
656 
657 			switch (dth->dth_type) {
658 			case THT_WRITE:
659 				if (blkid >= beginblk && blkid <= endblk)
660 					match_offset = TRUE;
661 				/*
662 				 * We will let this hold work for the bonus
663 				 * buffer so that we don't need to hold it
664 				 * when creating a new object.
665 				 */
666 				if (blkid == DB_BONUS_BLKID)
667 					match_offset = TRUE;
668 				/*
669 				 * They might have to increase nlevels,
670 				 * thus dirtying the new TLIBs.  Or they
671 				 * might have to change the block size,
672 				 * thus dirtying the new lvl=0 blk=0.
673 				 */
674 				if (blkid == 0)
675 					match_offset = TRUE;
676 				break;
677 			case THT_FREE:
678 				if (blkid == beginblk &&
679 				    (dth->dth_arg1 != 0 ||
680 				    dn->dn_maxblkid == 0))
681 					match_offset = TRUE;
682 				if (blkid == endblk &&
683 				    dth->dth_arg2 != DMU_OBJECT_END)
684 					match_offset = TRUE;
685 				break;
686 			case THT_BONUS:
687 				if (blkid == DB_BONUS_BLKID)
688 					match_offset = TRUE;
689 				break;
690 			case THT_ZAP:
691 				match_offset = TRUE;
692 				break;
693 			case THT_NEWOBJECT:
694 				match_object = TRUE;
695 				break;
696 			default:
697 				ASSERT(!"bad dth_type");
698 			}
699 		}
700 		if (match_object && match_offset)
701 			return;
702 	}
703 	panic("dirtying dbuf obj=%llx lvl=%u blkid=%llx but not tx_held\n",
704 	    (u_longlong_t)db->db.db_object, db->db_level,
705 	    (u_longlong_t)db->db_blkid);
706 }
707 #endif
708 
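/*
 * Try to assign this tx to the currently open txg: take a tx hold on
 * every dnode in the hold list (waiting for a hold from the previous
 * txg to clear only if txg_how is TXG_WAIT), then temp-reserve the
 * worst-case allocated size with the dsl_dir.  On failure, *last_dth
 * tells dmu_tx_unassign() how far the hold list got processed.
 */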
709 static int
710 dmu_tx_try_assign(dmu_tx_t *tx, uint64_t txg_how, dmu_tx_hold_t **last_dth)
711 {
712 	dmu_tx_hold_t *dth;
713 	uint64_t lsize, asize, fsize, towrite;
714 
715 	*last_dth = NULL;
716 
717 	tx->tx_txg = txg_hold_open(tx->tx_pool, &tx->tx_txgh);
718 
719 	if (txg_how >= TXG_INITIAL && txg_how != tx->tx_txg)
720 		return (ERESTART);
721 	if (tx->tx_err)
722 		return (tx->tx_err);
723 
724 	for (dth = list_head(&tx->tx_holds); dth;
725 	    dth = list_next(&tx->tx_holds, dth)) {
726 		dnode_t *dn = dth->dth_dnode;
727 		if (dn != NULL) {
728 			mutex_enter(&dn->dn_mtx);
729 			while (dn->dn_assigned_txg == tx->tx_txg - 1) {
730 				if (txg_how != TXG_WAIT) {
731 					mutex_exit(&dn->dn_mtx);
732 					return (ERESTART);
733 				}
734 				cv_wait(&dn->dn_notxholds, &dn->dn_mtx);
735 			}
736 			if (dn->dn_assigned_txg == 0) {
737 				ASSERT(dn->dn_assigned_tx == NULL);
738 				dn->dn_assigned_txg = tx->tx_txg;
739 				dn->dn_assigned_tx = tx;
740 			} else {
741 				ASSERT(dn->dn_assigned_txg == tx->tx_txg);
742 				if (dn->dn_assigned_tx != tx)
743 					dn->dn_assigned_tx = NULL;
744 			}
745 			(void) refcount_add(&dn->dn_tx_holds, tx);
746 			mutex_exit(&dn->dn_mtx);
747 		}
748 		*last_dth = dth;
749 		if (tx->tx_err)
750 			return (tx->tx_err);
751 	}
752 
753 	/*
754 	 * If a snapshot has been taken since we made our estimates,
755 	 * assume that we won't be able to free or overwrite anything.
756 	 */
757 	if (tx->tx_objset &&
758 	    dsl_dataset_prev_snap_txg(tx->tx_objset->os->os_dsl_dataset) >
759 	    tx->tx_lastsnap_txg) {
760 		tx->tx_space_towrite += tx->tx_space_tooverwrite;
761 		tx->tx_space_tooverwrite = 0;
762 		tx->tx_space_tofree = 0;
763 	}
764 
765 	/*
766 	 * Convert logical size to worst-case allocated size.
767 	 */
768 	fsize = spa_get_asize(tx->tx_pool->dp_spa, tx->tx_space_tooverwrite) +
769 	    tx->tx_space_tofree;
770 	lsize = tx->tx_space_towrite + tx->tx_space_tooverwrite;
771 	asize = spa_get_asize(tx->tx_pool->dp_spa, lsize);
772 	towrite = tx->tx_space_towrite;
773 	tx->tx_space_towrite = asize;
774 
775 	if (tx->tx_dir && asize != 0) {
776 		int err = dsl_dir_tempreserve_space(tx->tx_dir,
777 		    lsize, asize, fsize, &tx->tx_tempreserve_cookie, tx);
778 		if (err) {
779 			tx->tx_space_towrite = towrite;
780 			return (err);
781 		}
782 	}
783 
784 	return (0);
785 }
786 
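/*
 * Back out a failed dmu_tx_try_assign(): walk the hold list backwards
 * from last_dth, dropping each dnode's tx hold (and waking any waiters
 * once the last hold is gone), release the txg, and return it so the
 * caller can txg_wait_open() past it before retrying.
 */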
787 static uint64_t
788 dmu_tx_unassign(dmu_tx_t *tx, dmu_tx_hold_t *last_dth)
789 {
790 	uint64_t txg = tx->tx_txg;
791 	dmu_tx_hold_t *dth;
792 
793 	ASSERT(txg != 0);
794 
795 	txg_rele_to_quiesce(&tx->tx_txgh);
796 
797 	for (dth = last_dth; dth; dth = list_prev(&tx->tx_holds, dth)) {
798 		dnode_t *dn = dth->dth_dnode;
799 
800 		if (dn == NULL)
801 			continue;
802 		mutex_enter(&dn->dn_mtx);
803 		ASSERT3U(dn->dn_assigned_txg, ==, txg);
804 
805 		if (refcount_remove(&dn->dn_tx_holds, tx) == 0) {
806 			dn->dn_assigned_txg = 0;
807 			dn->dn_assigned_tx = NULL;
808 			cv_broadcast(&dn->dn_notxholds);
809 		}
810 		mutex_exit(&dn->dn_mtx);
811 	}
812 
813 	txg_rele_to_sync(&tx->tx_txgh);
814 
815 	tx->tx_txg = 0;
816 	return (txg);
817 }
818 
819 /*
820  * Assign tx to a transaction group.  txg_how can be one of:
821  *
822  * (1)	TXG_WAIT.  If the current open txg is full, waits until there's
823  *	a new one.  This should be used when you're not holding locks.
824  *	If will only fail if we're truly out of space (or over quota).
825  *	It will only fail if we're truly out of space (or over quota).
826  * (2)	TXG_NOWAIT.  If we can't assign into the current open txg without
827  *	blocking, returns immediately with ERESTART.  This should be used
828  *	whenever you're holding locks.  On an ERESTART error, the caller
829  *	should drop locks, do a txg_wait_open(dp, 0), and try again.
830  *
831  * (3)	A specific txg.  Use this if you need to ensure that multiple
832  *	transactions all sync in the same txg.  Like TXG_NOWAIT, it
833  *	returns ERESTART if it can't assign you into the requested txg.
834  */
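/*
 * A typical non-syncing caller looks roughly like this (an illustrative
 * sketch only; os, object, off, len and buf stand for the caller's own
 * state, and unrelated error handling is omitted):
 *
 *	top:
 *		tx = dmu_tx_create(os);
 *		dmu_tx_hold_write(tx, object, off, len);
 *		err = dmu_tx_assign(tx, TXG_NOWAIT);
 *		if (err) {
 *			dmu_tx_abort(tx);
 *			if (err == ERESTART) {
 *				txg_wait_open(dp, 0);	/* dp: objset's pool */
 *				goto top;
 *			}
 *			return (err);
 *		}
 *		dmu_write(os, object, off, len, buf, tx);
 *		dmu_tx_commit(tx);
 */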
835 int
836 dmu_tx_assign(dmu_tx_t *tx, uint64_t txg_how)
837 {
838 	dmu_tx_hold_t *last_dth;
839 	int err;
840 
841 	ASSERT(tx->tx_txg == 0);
842 	ASSERT(txg_how != 0);
843 	ASSERT(!dsl_pool_sync_context(tx->tx_pool));
844 
845 	while ((err = dmu_tx_try_assign(tx, txg_how, &last_dth)) != 0) {
846 		uint64_t txg = dmu_tx_unassign(tx, last_dth);
847 
848 		if (err != ERESTART || txg_how != TXG_WAIT)
849 			return (err);
850 
851 		txg_wait_open(tx->tx_pool, txg + 1);
852 	}
853 
854 	txg_rele_to_quiesce(&tx->tx_txgh);
855 
856 	return (0);
857 }
858 
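/*
 * Called as dirty buffers accumulate under this tx: positive deltas are
 * charged against the space reserved at assign time (and must never
 * exceed it), negative deltas are accumulated as freed space.
 */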
859 void
860 dmu_tx_willuse_space(dmu_tx_t *tx, int64_t delta)
861 {
862 	if (tx->tx_dir == NULL || delta == 0)
863 		return;
864 
865 	if (delta > 0) {
866 		ASSERT3U(refcount_count(&tx->tx_space_written) + delta, <=,
867 		    tx->tx_space_towrite);
868 		(void) refcount_add_many(&tx->tx_space_written, delta, NULL);
869 	} else {
870 		(void) refcount_add_many(&tx->tx_space_freed, -delta, NULL);
871 	}
872 }
873 
874 void
875 dmu_tx_commit(dmu_tx_t *tx)
876 {
877 	dmu_tx_hold_t *dth;
878 
879 	ASSERT(tx->tx_txg != 0);
880 
881 	while (dth = list_head(&tx->tx_holds)) {
882 		dnode_t *dn = dth->dth_dnode;
883 
884 		list_remove(&tx->tx_holds, dth);
885 		kmem_free(dth, sizeof (dmu_tx_hold_t));
886 		if (dn == NULL)
887 			continue;
888 		mutex_enter(&dn->dn_mtx);
889 		ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg);
890 
891 		if (refcount_remove(&dn->dn_tx_holds, tx) == 0) {
892 			dn->dn_assigned_txg = 0;
893 			dn->dn_assigned_tx = NULL;
894 			cv_broadcast(&dn->dn_notxholds);
895 		}
896 		mutex_exit(&dn->dn_mtx);
897 		dnode_rele(dn, tx);
898 	}
899 
900 	if (tx->tx_dir && tx->tx_space_towrite > 0) {
901 		dsl_dir_tempreserve_clear(tx->tx_tempreserve_cookie, tx);
902 	}
903 
904 	if (tx->tx_anyobj == FALSE)
905 		txg_rele_to_sync(&tx->tx_txgh);
906 	dprintf("towrite=%llu written=%llu tofree=%llu freed=%llu\n",
907 	    tx->tx_space_towrite, refcount_count(&tx->tx_space_written),
908 	    tx->tx_space_tofree, refcount_count(&tx->tx_space_freed));
909 	refcount_destroy_many(&tx->tx_space_written,
910 	    refcount_count(&tx->tx_space_written));
911 	refcount_destroy_many(&tx->tx_space_freed,
912 	    refcount_count(&tx->tx_space_freed));
913 #ifdef ZFS_DEBUG
914 	if (tx->tx_debug_buf)
915 		kmem_free(tx->tx_debug_buf, 4096);
916 #endif
917 	kmem_free(tx, sizeof (dmu_tx_t));
918 }
919 
920 void
921 dmu_tx_abort(dmu_tx_t *tx)
922 {
923 	dmu_tx_hold_t *dth;
924 
925 	ASSERT(tx->tx_txg == 0);
926 
927 	while (dth = list_head(&tx->tx_holds)) {
928 		dnode_t *dn = dth->dth_dnode;
929 
930 		list_remove(&tx->tx_holds, dth);
931 		kmem_free(dth, sizeof (dmu_tx_hold_t));
932 		if (dn != NULL)
933 			dnode_rele(dn, tx);
934 	}
935 	refcount_destroy_many(&tx->tx_space_written,
936 	    refcount_count(&tx->tx_space_written));
937 	refcount_destroy_many(&tx->tx_space_freed,
938 	    refcount_count(&tx->tx_space_freed));
939 #ifdef ZFS_DEBUG
940 	if (tx->tx_debug_buf)
941 		kmem_free(tx->tx_debug_buf, 4096);
942 #endif
943 	kmem_free(tx, sizeof (dmu_tx_t));
944 }
945 
946 uint64_t
947 dmu_tx_get_txg(dmu_tx_t *tx)
948 {
949 	ASSERT(tx->tx_txg != 0);
950 	return (tx->tx_txg);
951 }
952