xref: /illumos-gate/usr/src/uts/common/fs/zfs/dbuf.c (revision cb92f4130ce5b2c4ae1fa5fa6c776f4d4dc28ad9)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23  * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
24  * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
25  * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
26  * Copyright (c) 2013, Joyent, Inc. All rights reserved.
27  * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
28  * Copyright (c) 2014 Integros [integros.com]
29  */
30 
31 #include <sys/zfs_context.h>
32 #include <sys/dmu.h>
33 #include <sys/dmu_send.h>
34 #include <sys/dmu_impl.h>
35 #include <sys/dbuf.h>
36 #include <sys/dmu_objset.h>
37 #include <sys/dsl_dataset.h>
38 #include <sys/dsl_dir.h>
39 #include <sys/dmu_tx.h>
40 #include <sys/spa.h>
41 #include <sys/zio.h>
42 #include <sys/dmu_zfetch.h>
43 #include <sys/sa.h>
44 #include <sys/sa_impl.h>
45 #include <sys/zfeature.h>
46 #include <sys/blkptr.h>
47 #include <sys/range_tree.h>
48 
49 /*
50  * Number of times that dbuf_free_range() took the slow path while doing
51  * a zfs receive.  A nonzero value indicates a potential performance problem.
52  */
53 uint64_t zfs_free_range_recv_miss;
54 
55 static void dbuf_destroy(dmu_buf_impl_t *db);
56 static boolean_t dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
57 static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx);
58 
59 #ifndef __lint
60 extern inline void dmu_buf_init_user(dmu_buf_user_t *dbu,
61     dmu_buf_evict_func_t *evict_func, dmu_buf_t **clear_on_evict_dbufp);
62 #endif /* ! __lint */
63 
64 /*
65  * Global data structures and functions for the dbuf cache.
66  */
67 static kmem_cache_t *dbuf_cache;
68 static taskq_t *dbu_evict_taskq;
69 
70 /* ARGSUSED */
71 static int
72 dbuf_cons(void *vdb, void *unused, int kmflag)
73 {
74 	dmu_buf_impl_t *db = vdb;
75 	bzero(db, sizeof (dmu_buf_impl_t));
76 
77 	mutex_init(&db->db_mtx, NULL, MUTEX_DEFAULT, NULL);
78 	cv_init(&db->db_changed, NULL, CV_DEFAULT, NULL);
79 	refcount_create(&db->db_holds);
80 
81 	return (0);
82 }
83 
84 /* ARGSUSED */
85 static void
86 dbuf_dest(void *vdb, void *unused)
87 {
88 	dmu_buf_impl_t *db = vdb;
89 	mutex_destroy(&db->db_mtx);
90 	cv_destroy(&db->db_changed);
91 	refcount_destroy(&db->db_holds);
92 }
93 
94 /*
95  * dbuf hash table routines
96  */
97 static dbuf_hash_table_t dbuf_hash_table;
98 
99 static uint64_t dbuf_hash_count;
100 
101 static uint64_t
102 dbuf_hash(void *os, uint64_t obj, uint8_t lvl, uint64_t blkid)
103 {
104 	uintptr_t osv = (uintptr_t)os;
105 	uint64_t crc = -1ULL;
106 
107 	ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
108 	crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (lvl)) & 0xFF];
109 	crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (osv >> 6)) & 0xFF];
110 	crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 0)) & 0xFF];
111 	crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 8)) & 0xFF];
112 	crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (blkid >> 0)) & 0xFF];
113 	crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (blkid >> 8)) & 0xFF];
114 
115 	crc ^= (osv>>14) ^ (obj>>16) ^ (blkid>>16);
116 
117 	return (crc);
118 }
119 
120 #define	DBUF_HASH(os, obj, level, blkid) dbuf_hash(os, obj, level, blkid)
121 
122 #define	DBUF_EQUAL(dbuf, os, obj, level, blkid)		\
123 	((dbuf)->db.db_object == (obj) &&		\
124 	(dbuf)->db_objset == (os) &&			\
125 	(dbuf)->db_level == (level) &&			\
126 	(dbuf)->db_blkid == (blkid))
127 
128 dmu_buf_impl_t *
129 dbuf_find(objset_t *os, uint64_t obj, uint8_t level, uint64_t blkid)
130 {
131 	dbuf_hash_table_t *h = &dbuf_hash_table;
132 	uint64_t hv = DBUF_HASH(os, obj, level, blkid);
133 	uint64_t idx = hv & h->hash_table_mask;
134 	dmu_buf_impl_t *db;
135 
136 	mutex_enter(DBUF_HASH_MUTEX(h, idx));
137 	for (db = h->hash_table[idx]; db != NULL; db = db->db_hash_next) {
138 		if (DBUF_EQUAL(db, os, obj, level, blkid)) {
139 			mutex_enter(&db->db_mtx);
140 			if (db->db_state != DB_EVICTING) {
141 				mutex_exit(DBUF_HASH_MUTEX(h, idx));
142 				return (db);
143 			}
144 			mutex_exit(&db->db_mtx);
145 		}
146 	}
147 	mutex_exit(DBUF_HASH_MUTEX(h, idx));
148 	return (NULL);
149 }
150 
151 static dmu_buf_impl_t *
152 dbuf_find_bonus(objset_t *os, uint64_t object)
153 {
154 	dnode_t *dn;
155 	dmu_buf_impl_t *db = NULL;
156 
157 	if (dnode_hold(os, object, FTAG, &dn) == 0) {
158 		rw_enter(&dn->dn_struct_rwlock, RW_READER);
159 		if (dn->dn_bonus != NULL) {
160 			db = dn->dn_bonus;
161 			mutex_enter(&db->db_mtx);
162 		}
163 		rw_exit(&dn->dn_struct_rwlock);
164 		dnode_rele(dn, FTAG);
165 	}
166 	return (db);
167 }
168 
169 /*
170  * Insert an entry into the hash table.  If there is already an element
171  * equal to elem in the hash table, then the already existing element
172  * will be returned and the new element will not be inserted.
173  * Otherwise returns NULL.
174  */
175 static dmu_buf_impl_t *
176 dbuf_hash_insert(dmu_buf_impl_t *db)
177 {
178 	dbuf_hash_table_t *h = &dbuf_hash_table;
179 	objset_t *os = db->db_objset;
180 	uint64_t obj = db->db.db_object;
181 	int level = db->db_level;
182 	uint64_t blkid = db->db_blkid;
183 	uint64_t hv = DBUF_HASH(os, obj, level, blkid);
184 	uint64_t idx = hv & h->hash_table_mask;
185 	dmu_buf_impl_t *dbf;
186 
187 	mutex_enter(DBUF_HASH_MUTEX(h, idx));
188 	for (dbf = h->hash_table[idx]; dbf != NULL; dbf = dbf->db_hash_next) {
189 		if (DBUF_EQUAL(dbf, os, obj, level, blkid)) {
190 			mutex_enter(&dbf->db_mtx);
191 			if (dbf->db_state != DB_EVICTING) {
192 				mutex_exit(DBUF_HASH_MUTEX(h, idx));
193 				return (dbf);
194 			}
195 			mutex_exit(&dbf->db_mtx);
196 		}
197 	}
198 
199 	mutex_enter(&db->db_mtx);
200 	db->db_hash_next = h->hash_table[idx];
201 	h->hash_table[idx] = db;
202 	mutex_exit(DBUF_HASH_MUTEX(h, idx));
203 	atomic_inc_64(&dbuf_hash_count);
204 
205 	return (NULL);
206 }
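
/*
 * Sketch of the intended caller pattern (illustrative only, not part of
 * the original file): a caller that has just constructed a new dbuf
 * inserts it and, if an equal dbuf already exists, must use the returned
 * one instead because the new dbuf was not linked into the table:
 *
 *	dmu_buf_impl_t *odb = dbuf_hash_insert(db);
 *	if (odb != NULL) {
 *		... an equal dbuf won the race; db was not inserted ...
 *	}
 */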
207 
208 /*
209  * Remove an entry from the hash table.  It must be in the EVICTING state.
210  */
211 static void
212 dbuf_hash_remove(dmu_buf_impl_t *db)
213 {
214 	dbuf_hash_table_t *h = &dbuf_hash_table;
215 	uint64_t hv = DBUF_HASH(db->db_objset, db->db.db_object,
216 	    db->db_level, db->db_blkid);
217 	uint64_t idx = hv & h->hash_table_mask;
218 	dmu_buf_impl_t *dbf, **dbp;
219 
220 	/*
221 	 * We mustn't hold db_mtx to maintain lock ordering:
222 	 * DBUF_HASH_MUTEX > db_mtx.
223 	 */
224 	ASSERT(refcount_is_zero(&db->db_holds));
225 	ASSERT(db->db_state == DB_EVICTING);
226 	ASSERT(!MUTEX_HELD(&db->db_mtx));
227 
228 	mutex_enter(DBUF_HASH_MUTEX(h, idx));
229 	dbp = &h->hash_table[idx];
230 	while ((dbf = *dbp) != db) {
231 		dbp = &dbf->db_hash_next;
232 		ASSERT(dbf != NULL);
233 	}
234 	*dbp = db->db_hash_next;
235 	db->db_hash_next = NULL;
236 	mutex_exit(DBUF_HASH_MUTEX(h, idx));
237 	atomic_dec_64(&dbuf_hash_count);
238 }
239 
240 static arc_evict_func_t dbuf_do_evict;
241 
242 typedef enum {
243 	DBVU_EVICTING,
244 	DBVU_NOT_EVICTING
245 } dbvu_verify_type_t;
246 
247 static void
248 dbuf_verify_user(dmu_buf_impl_t *db, dbvu_verify_type_t verify_type)
249 {
250 #ifdef ZFS_DEBUG
251 	int64_t holds;
252 
253 	if (db->db_user == NULL)
254 		return;
255 
256 	/* Only data blocks support the attachment of user data. */
257 	ASSERT(db->db_level == 0);
258 
259 	/* Clients must resolve a dbuf before attaching user data. */
260 	ASSERT(db->db.db_data != NULL);
261 	ASSERT3U(db->db_state, ==, DB_CACHED);
262 
263 	holds = refcount_count(&db->db_holds);
264 	if (verify_type == DBVU_EVICTING) {
265 		/*
266 		 * Immediate eviction occurs when holds == dirtycnt.
267 		 * For normal eviction buffers, holds is zero on
268 		 * eviction, except when dbuf_fix_old_data() calls
269 		 * dbuf_clear_data().  However, the hold count can grow
270 		 * during eviction even though db_mtx is held (see
271 		 * dmu_bonus_hold() for an example), so we can only
272 		 * test the generic invariant that holds >= dirtycnt.
273 		 */
274 		ASSERT3U(holds, >=, db->db_dirtycnt);
275 	} else {
276 		if (db->db_user_immediate_evict == TRUE)
277 			ASSERT3U(holds, >=, db->db_dirtycnt);
278 		else
279 			ASSERT3U(holds, >, 0);
280 	}
281 #endif
282 }
283 
284 static void
285 dbuf_evict_user(dmu_buf_impl_t *db)
286 {
287 	dmu_buf_user_t *dbu = db->db_user;
288 
289 	ASSERT(MUTEX_HELD(&db->db_mtx));
290 
291 	if (dbu == NULL)
292 		return;
293 
294 	dbuf_verify_user(db, DBVU_EVICTING);
295 	db->db_user = NULL;
296 
297 #ifdef ZFS_DEBUG
298 	if (dbu->dbu_clear_on_evict_dbufp != NULL)
299 		*dbu->dbu_clear_on_evict_dbufp = NULL;
300 #endif
301 
302 	/*
303 	 * Invoke the callback from a taskq to avoid lock order reversals
304 	 * and limit stack depth.
305 	 */
306 	taskq_dispatch_ent(dbu_evict_taskq, dbu->dbu_evict_func, dbu, 0,
307 	    &dbu->dbu_tqent);
308 }
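
/*
 * Illustrative note on the user-eviction API served by the taskq above
 * (helper names are taken from dmu.h and shown only as an example, not as
 * part of this file): a client embeds a dmu_buf_user_t in its own state,
 * initializes it with dmu_buf_init_user(), and attaches it to a dbuf
 * (e.g. via dmu_buf_set_user()).  When the dbuf is evicted,
 * dbuf_evict_user() above dispatches the registered dbu_evict_func on the
 * dbu_evict taskq, so the callback runs asynchronously and must not
 * assume it holds db_mtx or runs in the evicting thread's context.
 */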
309 
310 boolean_t
311 dbuf_is_metadata(dmu_buf_impl_t *db)
312 {
313 	if (db->db_level > 0) {
314 		return (B_TRUE);
315 	} else {
316 		boolean_t is_metadata;
317 
318 		DB_DNODE_ENTER(db);
319 		is_metadata = DMU_OT_IS_METADATA(DB_DNODE(db)->dn_type);
320 		DB_DNODE_EXIT(db);
321 
322 		return (is_metadata);
323 	}
324 }
325 
326 void
327 dbuf_evict(dmu_buf_impl_t *db)
328 {
329 	ASSERT(MUTEX_HELD(&db->db_mtx));
330 	ASSERT(db->db_buf == NULL);
331 	ASSERT(db->db_data_pending == NULL);
332 
333 	dbuf_clear(db);
334 	dbuf_destroy(db);
335 }
336 
337 void
338 dbuf_init(void)
339 {
340 	uint64_t hsize = 1ULL << 16;
341 	dbuf_hash_table_t *h = &dbuf_hash_table;
342 	int i;
343 
344 	/*
345 	 * The hash table is big enough to fill all of physical memory
346 	 * with an average 4K block size.  The table will take up
347 	 * totalmem*sizeof(void*)/4K (i.e. 2MB/GB with 8-byte pointers).
348 	 */
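	/*
	 * Illustrative arithmetic (not from the original source): with 8GB
	 * of physical memory, physmem * PAGESIZE is 2^33, so the loop below
	 * stops at hsize = 2^21 buckets; at 8 bytes per bucket pointer that
	 * is a 16MB table, matching the 2MB/GB estimate above.
	 */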
349 	while (hsize * 4096 < physmem * PAGESIZE)
350 		hsize <<= 1;
351 
352 retry:
353 	h->hash_table_mask = hsize - 1;
354 	h->hash_table = kmem_zalloc(hsize * sizeof (void *), KM_NOSLEEP);
355 	if (h->hash_table == NULL) {
356 		/* XXX - we should really return an error instead of assert */
357 		ASSERT(hsize > (1ULL << 10));
358 		hsize >>= 1;
359 		goto retry;
360 	}
361 
362 	dbuf_cache = kmem_cache_create("dmu_buf_impl_t",
363 	    sizeof (dmu_buf_impl_t),
364 	    0, dbuf_cons, dbuf_dest, NULL, NULL, NULL, 0);
365 
366 	for (i = 0; i < DBUF_MUTEXES; i++)
367 		mutex_init(&h->hash_mutexes[i], NULL, MUTEX_DEFAULT, NULL);
368 
369 	/*
370 	 * All entries are queued via taskq_dispatch_ent(), so min/maxalloc
371 	 * configuration is not required.
372 	 */
373 	dbu_evict_taskq = taskq_create("dbu_evict", 1, minclsyspri, 0, 0, 0);
374 }
375 
376 void
377 dbuf_fini(void)
378 {
379 	dbuf_hash_table_t *h = &dbuf_hash_table;
380 	int i;
381 
382 	for (i = 0; i < DBUF_MUTEXES; i++)
383 		mutex_destroy(&h->hash_mutexes[i]);
384 	kmem_free(h->hash_table, (h->hash_table_mask + 1) * sizeof (void *));
385 	kmem_cache_destroy(dbuf_cache);
386 	taskq_destroy(dbu_evict_taskq);
387 }
388 
389 /*
390  * Other stuff.
391  */
392 
393 #ifdef ZFS_DEBUG
394 static void
395 dbuf_verify(dmu_buf_impl_t *db)
396 {
397 	dnode_t *dn;
398 	dbuf_dirty_record_t *dr;
399 
400 	ASSERT(MUTEX_HELD(&db->db_mtx));
401 
402 	if (!(zfs_flags & ZFS_DEBUG_DBUF_VERIFY))
403 		return;
404 
405 	ASSERT(db->db_objset != NULL);
406 	DB_DNODE_ENTER(db);
407 	dn = DB_DNODE(db);
408 	if (dn == NULL) {
409 		ASSERT(db->db_parent == NULL);
410 		ASSERT(db->db_blkptr == NULL);
411 	} else {
412 		ASSERT3U(db->db.db_object, ==, dn->dn_object);
413 		ASSERT3P(db->db_objset, ==, dn->dn_objset);
414 		ASSERT3U(db->db_level, <, dn->dn_nlevels);
415 		ASSERT(db->db_blkid == DMU_BONUS_BLKID ||
416 		    db->db_blkid == DMU_SPILL_BLKID ||
417 		    !avl_is_empty(&dn->dn_dbufs));
418 	}
419 	if (db->db_blkid == DMU_BONUS_BLKID) {
420 		ASSERT(dn != NULL);
421 		ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
422 		ASSERT3U(db->db.db_offset, ==, DMU_BONUS_BLKID);
423 	} else if (db->db_blkid == DMU_SPILL_BLKID) {
424 		ASSERT(dn != NULL);
425 		ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
426 		ASSERT0(db->db.db_offset);
427 	} else {
428 		ASSERT3U(db->db.db_offset, ==, db->db_blkid * db->db.db_size);
429 	}
430 
431 	for (dr = db->db_data_pending; dr != NULL; dr = dr->dr_next)
432 		ASSERT(dr->dr_dbuf == db);
433 
434 	for (dr = db->db_last_dirty; dr != NULL; dr = dr->dr_next)
435 		ASSERT(dr->dr_dbuf == db);
436 
437 	/*
438 	 * We can't assert that db_size matches dn_datablksz because it
439 	 * can be momentarily different when another thread is doing
440 	 * dnode_set_blksz().
441 	 */
442 	if (db->db_level == 0 && db->db.db_object == DMU_META_DNODE_OBJECT) {
443 		dr = db->db_data_pending;
444 		/*
445 		 * It should only be modified in syncing context, so
446 		 * make sure we only have one copy of the data.
447 		 */
448 		ASSERT(dr == NULL || dr->dt.dl.dr_data == db->db_buf);
449 	}
450 
451 	/* verify db->db_blkptr */
452 	if (db->db_blkptr) {
453 		if (db->db_parent == dn->dn_dbuf) {
454 			/* db is pointed to by the dnode */
455 			/* ASSERT3U(db->db_blkid, <, dn->dn_nblkptr); */
456 			if (DMU_OBJECT_IS_SPECIAL(db->db.db_object))
457 				ASSERT(db->db_parent == NULL);
458 			else
459 				ASSERT(db->db_parent != NULL);
460 			if (db->db_blkid != DMU_SPILL_BLKID)
461 				ASSERT3P(db->db_blkptr, ==,
462 				    &dn->dn_phys->dn_blkptr[db->db_blkid]);
463 		} else {
464 			/* db is pointed to by an indirect block */
465 			int epb = db->db_parent->db.db_size >> SPA_BLKPTRSHIFT;
466 			ASSERT3U(db->db_parent->db_level, ==, db->db_level+1);
467 			ASSERT3U(db->db_parent->db.db_object, ==,
468 			    db->db.db_object);
469 			/*
470 			 * dnode_grow_indblksz() can make this fail if we don't
471 			 * have the struct_rwlock.  XXX indblksz no longer
472 			 * grows.  safe to do this now?
473 			 */
474 			if (RW_WRITE_HELD(&dn->dn_struct_rwlock)) {
475 				ASSERT3P(db->db_blkptr, ==,
476 				    ((blkptr_t *)db->db_parent->db.db_data +
477 				    db->db_blkid % epb));
478 			}
479 		}
480 	}
481 	if ((db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr)) &&
482 	    (db->db_buf == NULL || db->db_buf->b_data) &&
483 	    db->db.db_data && db->db_blkid != DMU_BONUS_BLKID &&
484 	    db->db_state != DB_FILL && !dn->dn_free_txg) {
485 		/*
486 		 * If the blkptr isn't set but they have nonzero data,
487 		 * it had better be dirty, otherwise we'll lose that
488 		 * data when we evict this buffer.
489 		 */
490 		if (db->db_dirtycnt == 0) {
491 			uint64_t *buf = db->db.db_data;
492 			int i;
493 
494 			for (i = 0; i < db->db.db_size >> 3; i++) {
495 				ASSERT(buf[i] == 0);
496 			}
497 		}
498 	}
499 	DB_DNODE_EXIT(db);
500 }
501 #endif
502 
503 static void
504 dbuf_clear_data(dmu_buf_impl_t *db)
505 {
506 	ASSERT(MUTEX_HELD(&db->db_mtx));
507 	dbuf_evict_user(db);
508 	db->db_buf = NULL;
509 	db->db.db_data = NULL;
510 	if (db->db_state != DB_NOFILL)
511 		db->db_state = DB_UNCACHED;
512 }
513 
514 static void
515 dbuf_set_data(dmu_buf_impl_t *db, arc_buf_t *buf)
516 {
517 	ASSERT(MUTEX_HELD(&db->db_mtx));
518 	ASSERT(buf != NULL);
519 
520 	db->db_buf = buf;
521 	ASSERT(buf->b_data != NULL);
522 	db->db.db_data = buf->b_data;
523 	if (!arc_released(buf))
524 		arc_set_callback(buf, dbuf_do_evict, db);
525 }
526 
527 /*
528  * Loan out an arc_buf for read.  Return the loaned arc_buf.
529  */
530 arc_buf_t *
531 dbuf_loan_arcbuf(dmu_buf_impl_t *db)
532 {
533 	arc_buf_t *abuf;
534 
535 	mutex_enter(&db->db_mtx);
536 	if (arc_released(db->db_buf) || refcount_count(&db->db_holds) > 1) {
537 		int blksz = db->db.db_size;
538 		spa_t *spa = db->db_objset->os_spa;
539 
540 		mutex_exit(&db->db_mtx);
541 		abuf = arc_loan_buf(spa, blksz);
542 		bcopy(db->db.db_data, abuf->b_data, blksz);
543 	} else {
544 		abuf = db->db_buf;
545 		arc_loan_inuse_buf(abuf, db);
546 		dbuf_clear_data(db);
547 		mutex_exit(&db->db_mtx);
548 	}
549 	return (abuf);
550 }
551 
552 /*
553  * Calculate which level n block references the data at the level 0 offset
554  * provided.
555  */
556 uint64_t
557 dbuf_whichblock(dnode_t *dn, int64_t level, uint64_t offset)
558 {
559 	if (dn->dn_datablkshift != 0 && dn->dn_indblkshift != 0) {
560 		/*
561 		 * The level n blkid is equal to the level 0 blkid divided by
562 		 * the number of level 0s in a level n block.
563 		 *
564 		 * The level 0 blkid is offset >> datablkshift =
565 		 * offset / 2^datablkshift.
566 		 *
567 		 * The number of level 0s in a level n is the number of block
568 		 * pointers in an indirect block, raised to the power of level.
569 		 * This is 2^(indblkshift - SPA_BLKPTRSHIFT)^level =
570 		 * 2^(level*(indblkshift - SPA_BLKPTRSHIFT)).
571 		 *
572 		 * Thus, the level n blkid is: offset /
573 		 * ((2^datablkshift)*(2^(level*(indblkshift - SPA_BLKPTRSHIFT)))
574 		 * = offset / 2^(datablkshift + level *
575 		 *   (indblkshift - SPA_BLKPTRSHIFT))
576 		 * = offset >> (datablkshift + level *
577 		 *   (indblkshift - SPA_BLKPTRSHIFT))
578 		 */
579 		return (offset >> (dn->dn_datablkshift + level *
580 		    (dn->dn_indblkshift - SPA_BLKPTRSHIFT)));
581 	} else {
582 		ASSERT3U(offset, <, dn->dn_datablksz);
583 		return (0);
584 	}
585 }
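
/*
 * Worked example for dbuf_whichblock() above (assumed geometry, not taken
 * from this file): with 128K data blocks (datablkshift == 17), 128K
 * indirect blocks (indblkshift == 17) and 128-byte block pointers
 * (SPA_BLKPTRSHIFT == 7), each indirect block holds 2^10 == 1024 pointers,
 * so the level-1 blkid is offset >> (17 + 1 * (17 - 7)) == offset >> 27,
 * i.e. each level-1 block maps 128MB of level-0 data.
 */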
586 
587 static void
588 dbuf_read_done(zio_t *zio, arc_buf_t *buf, void *vdb)
589 {
590 	dmu_buf_impl_t *db = vdb;
591 
592 	mutex_enter(&db->db_mtx);
593 	ASSERT3U(db->db_state, ==, DB_READ);
594 	/*
595 	 * All reads are synchronous, so we must have a hold on the dbuf
596 	 */
597 	ASSERT(refcount_count(&db->db_holds) > 0);
598 	ASSERT(db->db_buf == NULL);
599 	ASSERT(db->db.db_data == NULL);
600 	if (db->db_level == 0 && db->db_freed_in_flight) {
601 		/* we were freed in flight; disregard any error */
602 		arc_release(buf, db);
603 		bzero(buf->b_data, db->db.db_size);
604 		arc_buf_freeze(buf);
605 		db->db_freed_in_flight = FALSE;
606 		dbuf_set_data(db, buf);
607 		db->db_state = DB_CACHED;
608 	} else if (zio == NULL || zio->io_error == 0) {
609 		dbuf_set_data(db, buf);
610 		db->db_state = DB_CACHED;
611 	} else {
612 		ASSERT(db->db_blkid != DMU_BONUS_BLKID);
613 		ASSERT3P(db->db_buf, ==, NULL);
614 		VERIFY(arc_buf_remove_ref(buf, db));
615 		db->db_state = DB_UNCACHED;
616 	}
617 	cv_broadcast(&db->db_changed);
618 	dbuf_rele_and_unlock(db, NULL);
619 }
620 
621 static void
622 dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
623 {
624 	dnode_t *dn;
625 	zbookmark_phys_t zb;
626 	arc_flags_t aflags = ARC_FLAG_NOWAIT;
627 
628 	DB_DNODE_ENTER(db);
629 	dn = DB_DNODE(db);
630 	ASSERT(!refcount_is_zero(&db->db_holds));
631 	/* We need the struct_rwlock to prevent db_blkptr from changing. */
632 	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
633 	ASSERT(MUTEX_HELD(&db->db_mtx));
634 	ASSERT(db->db_state == DB_UNCACHED);
635 	ASSERT(db->db_buf == NULL);
636 
637 	if (db->db_blkid == DMU_BONUS_BLKID) {
638 		int bonuslen = MIN(dn->dn_bonuslen, dn->dn_phys->dn_bonuslen);
639 
640 		ASSERT3U(bonuslen, <=, db->db.db_size);
641 		db->db.db_data = zio_buf_alloc(DN_MAX_BONUSLEN);
642 		arc_space_consume(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
643 		if (bonuslen < DN_MAX_BONUSLEN)
644 			bzero(db->db.db_data, DN_MAX_BONUSLEN);
645 		if (bonuslen)
646 			bcopy(DN_BONUS(dn->dn_phys), db->db.db_data, bonuslen);
647 		DB_DNODE_EXIT(db);
648 		db->db_state = DB_CACHED;
649 		mutex_exit(&db->db_mtx);
650 		return;
651 	}
652 
653 	/*
654 	 * Recheck BP_IS_HOLE() after dnode_block_freed() in case dnode_sync()
655 	 * processes the delete record and clears the bp while we are waiting
656 	 * for the dn_mtx (resulting in a "no" from block_freed).
657 	 */
658 	if (db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr) ||
659 	    (db->db_level == 0 && (dnode_block_freed(dn, db->db_blkid) ||
660 	    BP_IS_HOLE(db->db_blkptr)))) {
661 		arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
662 
663 		DB_DNODE_EXIT(db);
664 		dbuf_set_data(db, arc_buf_alloc(db->db_objset->os_spa,
665 		    db->db.db_size, db, type));
666 		bzero(db->db.db_data, db->db.db_size);
667 		db->db_state = DB_CACHED;
668 		mutex_exit(&db->db_mtx);
669 		return;
670 	}
671 
672 	DB_DNODE_EXIT(db);
673 
674 	db->db_state = DB_READ;
675 	mutex_exit(&db->db_mtx);
676 
677 	if (DBUF_IS_L2CACHEABLE(db))
678 		aflags |= ARC_FLAG_L2CACHE;
679 	if (DBUF_IS_L2COMPRESSIBLE(db))
680 		aflags |= ARC_FLAG_L2COMPRESS;
681 
682 	SET_BOOKMARK(&zb, db->db_objset->os_dsl_dataset ?
683 	    db->db_objset->os_dsl_dataset->ds_object : DMU_META_OBJSET,
684 	    db->db.db_object, db->db_level, db->db_blkid);
685 
686 	dbuf_add_ref(db, NULL);
687 
688 	(void) arc_read(zio, db->db_objset->os_spa, db->db_blkptr,
689 	    dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ,
690 	    (flags & DB_RF_CANFAIL) ? ZIO_FLAG_CANFAIL : ZIO_FLAG_MUSTSUCCEED,
691 	    &aflags, &zb);
692 }
693 
694 int
695 dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
696 {
697 	int err = 0;
698 	boolean_t havepzio = (zio != NULL);
699 	boolean_t prefetch;
700 	dnode_t *dn;
701 
702 	/*
703 	 * We don't have to hold the mutex to check db_state because it
704 	 * can't be freed while we have a hold on the buffer.
705 	 */
706 	ASSERT(!refcount_is_zero(&db->db_holds));
707 
708 	if (db->db_state == DB_NOFILL)
709 		return (SET_ERROR(EIO));
710 
711 	DB_DNODE_ENTER(db);
712 	dn = DB_DNODE(db);
713 	if ((flags & DB_RF_HAVESTRUCT) == 0)
714 		rw_enter(&dn->dn_struct_rwlock, RW_READER);
715 
716 	prefetch = db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
717 	    (flags & DB_RF_NOPREFETCH) == 0 && dn != NULL &&
718 	    DBUF_IS_CACHEABLE(db);
719 
720 	mutex_enter(&db->db_mtx);
721 	if (db->db_state == DB_CACHED) {
722 		mutex_exit(&db->db_mtx);
723 		if (prefetch)
724 			dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE);
725 		if ((flags & DB_RF_HAVESTRUCT) == 0)
726 			rw_exit(&dn->dn_struct_rwlock);
727 		DB_DNODE_EXIT(db);
728 	} else if (db->db_state == DB_UNCACHED) {
729 		spa_t *spa = dn->dn_objset->os_spa;
730 
731 		if (zio == NULL)
732 			zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
733 		dbuf_read_impl(db, zio, flags);
734 
735 		/* dbuf_read_impl has dropped db_mtx for us */
736 
737 		if (prefetch)
738 			dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE);
739 
740 		if ((flags & DB_RF_HAVESTRUCT) == 0)
741 			rw_exit(&dn->dn_struct_rwlock);
742 		DB_DNODE_EXIT(db);
743 
744 		if (!havepzio)
745 			err = zio_wait(zio);
746 	} else {
747 		/*
748 		 * Another reader came in while the dbuf was in flight
749 		 * between UNCACHED and CACHED.  Either a writer will finish
750 		 * writing the buffer (sending the dbuf to CACHED) or the
751 		 * first reader's request will reach the read_done callback
752 		 * and send the dbuf to CACHED.  Otherwise, a failure
753 		 * occurred and the dbuf went to UNCACHED.
754 		 */
755 		mutex_exit(&db->db_mtx);
756 		if (prefetch)
757 			dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE);
758 		if ((flags & DB_RF_HAVESTRUCT) == 0)
759 			rw_exit(&dn->dn_struct_rwlock);
760 		DB_DNODE_EXIT(db);
761 
762 		/* Skip the wait per the caller's request. */
763 		mutex_enter(&db->db_mtx);
764 		if ((flags & DB_RF_NEVERWAIT) == 0) {
765 			while (db->db_state == DB_READ ||
766 			    db->db_state == DB_FILL) {
767 				ASSERT(db->db_state == DB_READ ||
768 				    (flags & DB_RF_HAVESTRUCT) == 0);
769 				DTRACE_PROBE2(blocked__read, dmu_buf_impl_t *,
770 				    db, zio_t *, zio);
771 				cv_wait(&db->db_changed, &db->db_mtx);
772 			}
773 			if (db->db_state == DB_UNCACHED)
774 				err = SET_ERROR(EIO);
775 		}
776 		mutex_exit(&db->db_mtx);
777 	}
778 
779 	ASSERT(err || havepzio || db->db_state == DB_CACHED);
780 	return (err);
781 }
782 
783 static void
784 dbuf_noread(dmu_buf_impl_t *db)
785 {
786 	ASSERT(!refcount_is_zero(&db->db_holds));
787 	ASSERT(db->db_blkid != DMU_BONUS_BLKID);
788 	mutex_enter(&db->db_mtx);
789 	while (db->db_state == DB_READ || db->db_state == DB_FILL)
790 		cv_wait(&db->db_changed, &db->db_mtx);
791 	if (db->db_state == DB_UNCACHED) {
792 		arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
793 		spa_t *spa = db->db_objset->os_spa;
794 
795 		ASSERT(db->db_buf == NULL);
796 		ASSERT(db->db.db_data == NULL);
797 		dbuf_set_data(db, arc_buf_alloc(spa, db->db.db_size, db, type));
798 		db->db_state = DB_FILL;
799 	} else if (db->db_state == DB_NOFILL) {
800 		dbuf_clear_data(db);
801 	} else {
802 		ASSERT3U(db->db_state, ==, DB_CACHED);
803 	}
804 	mutex_exit(&db->db_mtx);
805 }
806 
807 /*
808  * This is our just-in-time copy function.  It makes a copy of
809  * buffers that have been modified in a previous transaction
810  * group, before we modify them in the current active group.
811  *
812  * This function is used in two places: when we are dirtying a
813  * buffer for the first time in a txg, and when we are freeing
814  * a range in a dnode that includes this buffer.
815  *
816  * Note that when we are called from dbuf_free_range() we do
817  * not put a hold on the buffer, we just traverse the active
818  * dbuf list for the dnode.
819  */
820 static void
821 dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg)
822 {
823 	dbuf_dirty_record_t *dr = db->db_last_dirty;
824 
825 	ASSERT(MUTEX_HELD(&db->db_mtx));
826 	ASSERT(db->db.db_data != NULL);
827 	ASSERT(db->db_level == 0);
828 	ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT);
829 
830 	if (dr == NULL ||
831 	    (dr->dt.dl.dr_data !=
832 	    ((db->db_blkid  == DMU_BONUS_BLKID) ? db->db.db_data : db->db_buf)))
833 		return;
834 
835 	/*
836 	 * If the last dirty record for this dbuf has not yet synced
837 	 * and it's referencing the dbuf data, either:
838 	 *	reset the reference to point to a new copy,
839 	 * or (if there are no active holders)
840 	 *	just null out the current db_data pointer.
841 	 */
842 	ASSERT(dr->dr_txg >= txg - 2);
843 	if (db->db_blkid == DMU_BONUS_BLKID) {
844 		/* Note that the data bufs here are zio_bufs */
845 		dr->dt.dl.dr_data = zio_buf_alloc(DN_MAX_BONUSLEN);
846 		arc_space_consume(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
847 		bcopy(db->db.db_data, dr->dt.dl.dr_data, DN_MAX_BONUSLEN);
848 	} else if (refcount_count(&db->db_holds) > db->db_dirtycnt) {
849 		int size = db->db.db_size;
850 		arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
851 		spa_t *spa = db->db_objset->os_spa;
852 
853 		dr->dt.dl.dr_data = arc_buf_alloc(spa, size, db, type);
854 		bcopy(db->db.db_data, dr->dt.dl.dr_data->b_data, size);
855 	} else {
856 		dbuf_clear_data(db);
857 	}
858 }
859 
860 void
861 dbuf_unoverride(dbuf_dirty_record_t *dr)
862 {
863 	dmu_buf_impl_t *db = dr->dr_dbuf;
864 	blkptr_t *bp = &dr->dt.dl.dr_overridden_by;
865 	uint64_t txg = dr->dr_txg;
866 
867 	ASSERT(MUTEX_HELD(&db->db_mtx));
868 	ASSERT(dr->dt.dl.dr_override_state != DR_IN_DMU_SYNC);
869 	ASSERT(db->db_level == 0);
870 
871 	if (db->db_blkid == DMU_BONUS_BLKID ||
872 	    dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN)
873 		return;
874 
875 	ASSERT(db->db_data_pending != dr);
876 
877 	/* free this block */
878 	if (!BP_IS_HOLE(bp) && !dr->dt.dl.dr_nopwrite)
879 		zio_free(db->db_objset->os_spa, txg, bp);
880 
881 	dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
882 	dr->dt.dl.dr_nopwrite = B_FALSE;
883 
884 	/*
885 	 * Release the already-written buffer, so we leave it in
886 	 * a consistent dirty state.  Note that all callers are
887 	 * modifying the buffer, so they will immediately do
888 	 * another (redundant) arc_release().  Therefore, leave
889 	 * the buf thawed to save the effort of freezing &
890 	 * immediately re-thawing it.
891 	 */
892 	arc_release(dr->dt.dl.dr_data, db);
893 }
894 
895 /*
896  * Evict (if it's unreferenced) or clear (if it's referenced) any level-0
897  * data blocks in the free range, so that any future readers will find
898  * empty blocks.
899  *
900  * This is a no-op if the dataset is in the middle of an incremental
901  * receive; see comment below for details.
902  */
903 void
904 dbuf_free_range(dnode_t *dn, uint64_t start_blkid, uint64_t end_blkid,
905     dmu_tx_t *tx)
906 {
907 	dmu_buf_impl_t db_search;
908 	dmu_buf_impl_t *db, *db_next;
909 	uint64_t txg = tx->tx_txg;
910 	avl_index_t where;
911 	boolean_t freespill =
912 	    (start_blkid == DMU_SPILL_BLKID || end_blkid == DMU_SPILL_BLKID);
913 
914 	if (end_blkid > dn->dn_maxblkid && !freespill)
915 		end_blkid = dn->dn_maxblkid;
916 	dprintf_dnode(dn, "start=%llu end=%llu\n", start_blkid, end_blkid);
917 
918 	db_search.db_level = 0;
919 	db_search.db_blkid = start_blkid;
920 	db_search.db_state = DB_SEARCH;
921 
922 	mutex_enter(&dn->dn_dbufs_mtx);
923 	if (start_blkid >= dn->dn_unlisted_l0_blkid && !freespill) {
924 		/* There can't be any dbufs in this range; no need to search. */
925 #ifdef DEBUG
926 		db = avl_find(&dn->dn_dbufs, &db_search, &where);
927 		ASSERT3P(db, ==, NULL);
928 		db = avl_nearest(&dn->dn_dbufs, where, AVL_AFTER);
929 		ASSERT(db == NULL || db->db_level > 0);
930 #endif
931 		mutex_exit(&dn->dn_dbufs_mtx);
932 		return;
933 	} else if (dmu_objset_is_receiving(dn->dn_objset)) {
934 		/*
935 		 * If we are receiving, we expect there to be no dbufs in
936 		 * the range to be freed, because receive modifies each
937 		 * block at most once, and in offset order.  If this is
938 		 * not the case, it can lead to performance problems,
939 		 * so note that we unexpectedly took the slow path.
940 		 */
941 		atomic_inc_64(&zfs_free_range_recv_miss);
942 	}
943 
944 	db = avl_find(&dn->dn_dbufs, &db_search, &where);
945 	ASSERT3P(db, ==, NULL);
946 	db = avl_nearest(&dn->dn_dbufs, where, AVL_AFTER);
947 
948 	for (; db != NULL; db = db_next) {
949 		db_next = AVL_NEXT(&dn->dn_dbufs, db);
950 		ASSERT(db->db_blkid != DMU_BONUS_BLKID);
951 
952 		if (db->db_level != 0 || db->db_blkid > end_blkid) {
953 			break;
954 		}
955 		ASSERT3U(db->db_blkid, >=, start_blkid);
956 
957 		/* found a level 0 buffer in the range */
958 		mutex_enter(&db->db_mtx);
959 		if (dbuf_undirty(db, tx)) {
960 			/* mutex has been dropped and dbuf destroyed */
961 			continue;
962 		}
963 
964 		if (db->db_state == DB_UNCACHED ||
965 		    db->db_state == DB_NOFILL ||
966 		    db->db_state == DB_EVICTING) {
967 			ASSERT(db->db.db_data == NULL);
968 			mutex_exit(&db->db_mtx);
969 			continue;
970 		}
971 		if (db->db_state == DB_READ || db->db_state == DB_FILL) {
972 			/* will be handled in dbuf_read_done or dbuf_rele */
973 			db->db_freed_in_flight = TRUE;
974 			mutex_exit(&db->db_mtx);
975 			continue;
976 		}
977 		if (refcount_count(&db->db_holds) == 0) {
978 			ASSERT(db->db_buf);
979 			dbuf_clear(db);
980 			continue;
981 		}
982 		/* The dbuf is referenced */
983 
984 		if (db->db_last_dirty != NULL) {
985 			dbuf_dirty_record_t *dr = db->db_last_dirty;
986 
987 			if (dr->dr_txg == txg) {
988 				/*
989 				 * This buffer is "in-use"; re-adjust the file
990 				 * size to reflect that this buffer may
991 				 * contain new data when we sync.
992 				 */
993 				if (db->db_blkid != DMU_SPILL_BLKID &&
994 				    db->db_blkid > dn->dn_maxblkid)
995 					dn->dn_maxblkid = db->db_blkid;
996 				dbuf_unoverride(dr);
997 			} else {
998 				/*
999 				 * This dbuf is not dirty in the open context.
1000 				 * Either uncache it (if it's not referenced in
1001 				 * the open context) or reset its contents to
1002 				 * empty.
1003 				 */
1004 				dbuf_fix_old_data(db, txg);
1005 			}
1006 		}
1007 		/* clear the contents if it's cached */
1008 		if (db->db_state == DB_CACHED) {
1009 			ASSERT(db->db.db_data != NULL);
1010 			arc_release(db->db_buf, db);
1011 			bzero(db->db.db_data, db->db.db_size);
1012 			arc_buf_freeze(db->db_buf);
1013 		}
1014 
1015 		mutex_exit(&db->db_mtx);
1016 	}
1017 	mutex_exit(&dn->dn_dbufs_mtx);
1018 }
1019 
1020 static int
1021 dbuf_block_freeable(dmu_buf_impl_t *db)
1022 {
1023 	dsl_dataset_t *ds = db->db_objset->os_dsl_dataset;
1024 	uint64_t birth_txg = 0;
1025 
1026 	/*
1027 	 * We don't need any locking to protect db_blkptr:
1028 	 * If it's syncing, then db_last_dirty will be set
1029 	 * so we'll ignore db_blkptr.
1030 	 *
1031 	 * This logic ensures that only block births for
1032 	 * filled blocks are considered.
1033 	 */
1034 	ASSERT(MUTEX_HELD(&db->db_mtx));
1035 	if (db->db_last_dirty && (db->db_blkptr == NULL ||
1036 	    !BP_IS_HOLE(db->db_blkptr))) {
1037 		birth_txg = db->db_last_dirty->dr_txg;
1038 	} else if (db->db_blkptr != NULL && !BP_IS_HOLE(db->db_blkptr)) {
1039 		birth_txg = db->db_blkptr->blk_birth;
1040 	}
1041 
1042 	/*
1043 	 * If this block doesn't exist or is in a snapshot, it can't be freed.
1044 	 * Don't pass the bp to dsl_dataset_block_freeable() since we
1045 	 * are holding the db_mtx lock and might deadlock if we are
1046 	 * prefetching a dedup-ed block.
1047 	 */
1048 	if (birth_txg != 0)
1049 		return (ds == NULL ||
1050 		    dsl_dataset_block_freeable(ds, NULL, birth_txg));
1051 	else
1052 		return (B_FALSE);
1053 }
1054 
1055 void
1056 dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx)
1057 {
1058 	arc_buf_t *buf, *obuf;
1059 	int osize = db->db.db_size;
1060 	arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
1061 	dnode_t *dn;
1062 
1063 	ASSERT(db->db_blkid != DMU_BONUS_BLKID);
1064 
1065 	DB_DNODE_ENTER(db);
1066 	dn = DB_DNODE(db);
1067 
1068 	/* XXX does *this* func really need the lock? */
1069 	ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));
1070 
1071 	/*
1072 	 * This call to dmu_buf_will_dirty() with the dn_struct_rwlock held
1073 	 * is OK, because there can be no other references to the db
1074 	 * when we are changing its size, so no concurrent DB_FILL can
1075 	 * be happening.
1076 	 */
1077 	/*
1078 	 * XXX we should be doing a dbuf_read, checking the return
1079 	 * value and returning that up to our callers
1080 	 */
1081 	dmu_buf_will_dirty(&db->db, tx);
1082 
1083 	/* create the data buffer for the new block */
1084 	buf = arc_buf_alloc(dn->dn_objset->os_spa, size, db, type);
1085 
1086 	/* copy old block data to the new block */
1087 	obuf = db->db_buf;
1088 	bcopy(obuf->b_data, buf->b_data, MIN(osize, size));
1089 	/* zero the remainder */
1090 	if (size > osize)
1091 		bzero((uint8_t *)buf->b_data + osize, size - osize);
1092 
1093 	mutex_enter(&db->db_mtx);
1094 	dbuf_set_data(db, buf);
1095 	VERIFY(arc_buf_remove_ref(obuf, db));
1096 	db->db.db_size = size;
1097 
1098 	if (db->db_level == 0) {
1099 		ASSERT3U(db->db_last_dirty->dr_txg, ==, tx->tx_txg);
1100 		db->db_last_dirty->dt.dl.dr_data = buf;
1101 	}
1102 	mutex_exit(&db->db_mtx);
1103 
1104 	dnode_willuse_space(dn, size-osize, tx);
1105 	DB_DNODE_EXIT(db);
1106 }
1107 
1108 void
1109 dbuf_release_bp(dmu_buf_impl_t *db)
1110 {
1111 	objset_t *os = db->db_objset;
1112 
1113 	ASSERT(dsl_pool_sync_context(dmu_objset_pool(os)));
1114 	ASSERT(arc_released(os->os_phys_buf) ||
1115 	    list_link_active(&os->os_dsl_dataset->ds_synced_link));
1116 	ASSERT(db->db_parent == NULL || arc_released(db->db_parent->db_buf));
1117 
1118 	(void) arc_release(db->db_buf, db);
1119 }
1120 
1121 /*
1122  * We already have a dirty record for this TXG, and we are being
1123  * dirtied again.
1124  */
1125 static void
1126 dbuf_redirty(dbuf_dirty_record_t *dr)
1127 {
1128 	dmu_buf_impl_t *db = dr->dr_dbuf;
1129 
1130 	ASSERT(MUTEX_HELD(&db->db_mtx));
1131 
1132 	if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID) {
1133 		/*
1134 		 * If this buffer has already been written out,
1135 		 * we now need to reset its state.
1136 		 */
1137 		dbuf_unoverride(dr);
1138 		if (db->db.db_object != DMU_META_DNODE_OBJECT &&
1139 		    db->db_state != DB_NOFILL) {
1140 			/* Already released on initial dirty, so just thaw. */
1141 			ASSERT(arc_released(db->db_buf));
1142 			arc_buf_thaw(db->db_buf);
1143 		}
1144 	}
1145 }
1146 
1147 dbuf_dirty_record_t *
1148 dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
1149 {
1150 	dnode_t *dn;
1151 	objset_t *os;
1152 	dbuf_dirty_record_t **drp, *dr;
1153 	int drop_struct_lock = FALSE;
1154 	boolean_t do_free_accounting = B_FALSE;
1155 	int txgoff = tx->tx_txg & TXG_MASK;
1156 
1157 	ASSERT(tx->tx_txg != 0);
1158 	ASSERT(!refcount_is_zero(&db->db_holds));
1159 	DMU_TX_DIRTY_BUF(tx, db);
1160 
1161 	DB_DNODE_ENTER(db);
1162 	dn = DB_DNODE(db);
1163 	/*
1164 	 * Shouldn't dirty a regular buffer in syncing context.  Private
1165 	 * objects may be dirtied in syncing context, but only if they
1166 	 * were already pre-dirtied in open context.
1167 	 */
1168 	ASSERT(!dmu_tx_is_syncing(tx) ||
1169 	    BP_IS_HOLE(dn->dn_objset->os_rootbp) ||
1170 	    DMU_OBJECT_IS_SPECIAL(dn->dn_object) ||
1171 	    dn->dn_objset->os_dsl_dataset == NULL);
1172 	/*
1173 	 * We make this assert for private objects as well, but after we
1174 	 * check if we're already dirty.  They are allowed to re-dirty
1175 	 * in syncing context.
1176 	 */
1177 	ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT ||
1178 	    dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx ==
1179 	    (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN));
1180 
1181 	mutex_enter(&db->db_mtx);
1182 	/*
1183 	 * XXX make this true for indirects too?  The problem is that
1184 	 * transactions created with dmu_tx_create_assigned() from
1185 	 * syncing context don't bother holding ahead.
1186 	 */
1187 	ASSERT(db->db_level != 0 ||
1188 	    db->db_state == DB_CACHED || db->db_state == DB_FILL ||
1189 	    db->db_state == DB_NOFILL);
1190 
1191 	mutex_enter(&dn->dn_mtx);
1192 	/*
1193 	 * Don't set dirtyctx to SYNC if we're just modifying this as we
1194 	 * initialize the objset.
1195 	 */
1196 	if (dn->dn_dirtyctx == DN_UNDIRTIED &&
1197 	    !BP_IS_HOLE(dn->dn_objset->os_rootbp)) {
1198 		dn->dn_dirtyctx =
1199 		    (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN);
1200 		ASSERT(dn->dn_dirtyctx_firstset == NULL);
1201 		dn->dn_dirtyctx_firstset = kmem_alloc(1, KM_SLEEP);
1202 	}
1203 	mutex_exit(&dn->dn_mtx);
1204 
1205 	if (db->db_blkid == DMU_SPILL_BLKID)
1206 		dn->dn_have_spill = B_TRUE;
1207 
1208 	/*
1209 	 * If this buffer is already dirty, we're done.
1210 	 */
1211 	drp = &db->db_last_dirty;
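	/* Dirty records are kept newest-txg-first; walk to this txg's slot. */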
1212 	ASSERT(*drp == NULL || (*drp)->dr_txg <= tx->tx_txg ||
1213 	    db->db.db_object == DMU_META_DNODE_OBJECT);
1214 	while ((dr = *drp) != NULL && dr->dr_txg > tx->tx_txg)
1215 		drp = &dr->dr_next;
1216 	if (dr && dr->dr_txg == tx->tx_txg) {
1217 		DB_DNODE_EXIT(db);
1218 
1219 		dbuf_redirty(dr);
1220 		mutex_exit(&db->db_mtx);
1221 		return (dr);
1222 	}
1223 
1224 	/*
1225 	 * Only valid if not already dirty.
1226 	 */
1227 	ASSERT(dn->dn_object == 0 ||
1228 	    dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx ==
1229 	    (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN));
1230 
1231 	ASSERT3U(dn->dn_nlevels, >, db->db_level);
1232 	ASSERT((dn->dn_phys->dn_nlevels == 0 && db->db_level == 0) ||
1233 	    dn->dn_phys->dn_nlevels > db->db_level ||
1234 	    dn->dn_next_nlevels[txgoff] > db->db_level ||
1235 	    dn->dn_next_nlevels[(tx->tx_txg-1) & TXG_MASK] > db->db_level ||
1236 	    dn->dn_next_nlevels[(tx->tx_txg-2) & TXG_MASK] > db->db_level);
1237 
1238 	/*
1239 	 * We should only be dirtying in syncing context if it's the
1240 	 * mos or we're initializing the os or it's a special object.
1241 	 * However, we are allowed to dirty in syncing context provided
1242 	 * we already dirtied it in open context.  Hence we must make
1243 	 * this assertion only if we're not already dirty.
1244 	 */
1245 	os = dn->dn_objset;
1246 	ASSERT(!dmu_tx_is_syncing(tx) || DMU_OBJECT_IS_SPECIAL(dn->dn_object) ||
1247 	    os->os_dsl_dataset == NULL || BP_IS_HOLE(os->os_rootbp));
1248 	ASSERT(db->db.db_size != 0);
1249 
1250 	dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);
1251 
1252 	if (db->db_blkid != DMU_BONUS_BLKID) {
1253 		/*
1254 		 * Update the accounting.
1255 		 * Note: we delay "free accounting" until after we drop
1256 		 * the db_mtx.  This keeps us from grabbing other locks
1257 		 * (and possibly deadlocking) in bp_get_dsize() while
1258 		 * also holding the db_mtx.
1259 		 */
1260 		dnode_willuse_space(dn, db->db.db_size, tx);
1261 		do_free_accounting = dbuf_block_freeable(db);
1262 	}
1263 
1264 	/*
1265 	 * If this buffer is dirty in an old transaction group we need
1266 	 * to make a copy of it so that the changes we make in this
1267 	 * transaction group won't leak out when we sync the older txg.
1268 	 */
1269 	dr = kmem_zalloc(sizeof (dbuf_dirty_record_t), KM_SLEEP);
1270 	if (db->db_level == 0) {
1271 		void *data_old = db->db_buf;
1272 
1273 		if (db->db_state != DB_NOFILL) {
1274 			if (db->db_blkid == DMU_BONUS_BLKID) {
1275 				dbuf_fix_old_data(db, tx->tx_txg);
1276 				data_old = db->db.db_data;
1277 			} else if (db->db.db_object != DMU_META_DNODE_OBJECT) {
1278 				/*
1279 				 * Release the data buffer from the cache so
1280 				 * that we can modify it without impacting
1281 				 * possible other users of this cached data
1282 				 * block.  Note that indirect blocks and
1283 				 * private objects are not released until the
1284 				 * syncing state (since they are only modified
1285 				 * then).
1286 				 */
1287 				arc_release(db->db_buf, db);
1288 				dbuf_fix_old_data(db, tx->tx_txg);
1289 				data_old = db->db_buf;
1290 			}
1291 			ASSERT(data_old != NULL);
1292 		}
1293 		dr->dt.dl.dr_data = data_old;
1294 	} else {
1295 		mutex_init(&dr->dt.di.dr_mtx, NULL, MUTEX_DEFAULT, NULL);
1296 		list_create(&dr->dt.di.dr_children,
1297 		    sizeof (dbuf_dirty_record_t),
1298 		    offsetof(dbuf_dirty_record_t, dr_dirty_node));
1299 	}
1300 	if (db->db_blkid != DMU_BONUS_BLKID && os->os_dsl_dataset != NULL)
1301 		dr->dr_accounted = db->db.db_size;
1302 	dr->dr_dbuf = db;
1303 	dr->dr_txg = tx->tx_txg;
1304 	dr->dr_next = *drp;
1305 	*drp = dr;
1306 
1307 	/*
1308 	 * We could have been freed_in_flight between the dbuf_noread
1309 	 * and dbuf_dirty.  We win, as though the dbuf_noread() had
1310 	 * happened after the free.
1311 	 */
1312 	if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
1313 	    db->db_blkid != DMU_SPILL_BLKID) {
1314 		mutex_enter(&dn->dn_mtx);
1315 		if (dn->dn_free_ranges[txgoff] != NULL) {
1316 			range_tree_clear(dn->dn_free_ranges[txgoff],
1317 			    db->db_blkid, 1);
1318 		}
1319 		mutex_exit(&dn->dn_mtx);
1320 		db->db_freed_in_flight = FALSE;
1321 	}
1322 
1323 	/*
1324 	 * This buffer is now part of this txg
1325 	 */
1326 	dbuf_add_ref(db, (void *)(uintptr_t)tx->tx_txg);
1327 	db->db_dirtycnt += 1;
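	/*
	 * A dbuf can carry dirty records for at most three txgs at once
	 * (the open txg plus the two that may still be quiescing/syncing),
	 * hence the bound asserted below.
	 */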
1328 	ASSERT3U(db->db_dirtycnt, <=, 3);
1329 
1330 	mutex_exit(&db->db_mtx);
1331 
1332 	if (db->db_blkid == DMU_BONUS_BLKID ||
1333 	    db->db_blkid == DMU_SPILL_BLKID) {
1334 		mutex_enter(&dn->dn_mtx);
1335 		ASSERT(!list_link_active(&dr->dr_dirty_node));
1336 		list_insert_tail(&dn->dn_dirty_records[txgoff], dr);
1337 		mutex_exit(&dn->dn_mtx);
1338 		dnode_setdirty(dn, tx);
1339 		DB_DNODE_EXIT(db);
1340 		return (dr);
1341 	} else if (do_free_accounting) {
1342 		blkptr_t *bp = db->db_blkptr;
1343 		int64_t willfree = (bp && !BP_IS_HOLE(bp)) ?
1344 		    bp_get_dsize(os->os_spa, bp) : db->db.db_size;
1345 		/*
1346 		 * This is only a guess -- if the dbuf is dirty
1347 		 * in a previous txg, we don't know how much
1348 		 * space it will use on disk yet.  We should
1349 		 * really have the struct_rwlock to access
1350 		 * db_blkptr, but since this is just a guess,
1351 		 * it's OK if we get an odd answer.
1352 		 */
1353 		ddt_prefetch(os->os_spa, bp);
1354 		dnode_willuse_space(dn, -willfree, tx);
1355 	}
1356 
1357 	if (!RW_WRITE_HELD(&dn->dn_struct_rwlock)) {
1358 		rw_enter(&dn->dn_struct_rwlock, RW_READER);
1359 		drop_struct_lock = TRUE;
1360 	}
1361 
1362 	if (db->db_level == 0) {
1363 		dnode_new_blkid(dn, db->db_blkid, tx, drop_struct_lock);
1364 		ASSERT(dn->dn_maxblkid >= db->db_blkid);
1365 	}
1366 
1367 	if (db->db_level+1 < dn->dn_nlevels) {
1368 		dmu_buf_impl_t *parent = db->db_parent;
1369 		dbuf_dirty_record_t *di;
1370 		int parent_held = FALSE;
1371 
1372 		if (db->db_parent == NULL || db->db_parent == dn->dn_dbuf) {
1373 			int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
1374 
1375 			parent = dbuf_hold_level(dn, db->db_level+1,
1376 			    db->db_blkid >> epbs, FTAG);
1377 			ASSERT(parent != NULL);
1378 			parent_held = TRUE;
1379 		}
1380 		if (drop_struct_lock)
1381 			rw_exit(&dn->dn_struct_rwlock);
1382 		ASSERT3U(db->db_level+1, ==, parent->db_level);
1383 		di = dbuf_dirty(parent, tx);
1384 		if (parent_held)
1385 			dbuf_rele(parent, FTAG);
1386 
1387 		mutex_enter(&db->db_mtx);
1388 		/*
1389 		 * Since we've dropped the mutex, it's possible that
1390 		 * dbuf_undirty() might have changed this out from under us.
1391 		 */
1392 		if (db->db_last_dirty == dr ||
1393 		    dn->dn_object == DMU_META_DNODE_OBJECT) {
1394 			mutex_enter(&di->dt.di.dr_mtx);
1395 			ASSERT3U(di->dr_txg, ==, tx->tx_txg);
1396 			ASSERT(!list_link_active(&dr->dr_dirty_node));
1397 			list_insert_tail(&di->dt.di.dr_children, dr);
1398 			mutex_exit(&di->dt.di.dr_mtx);
1399 			dr->dr_parent = di;
1400 		}
1401 		mutex_exit(&db->db_mtx);
1402 	} else {
1403 		ASSERT(db->db_level+1 == dn->dn_nlevels);
1404 		ASSERT(db->db_blkid < dn->dn_nblkptr);
1405 		ASSERT(db->db_parent == NULL || db->db_parent == dn->dn_dbuf);
1406 		mutex_enter(&dn->dn_mtx);
1407 		ASSERT(!list_link_active(&dr->dr_dirty_node));
1408 		list_insert_tail(&dn->dn_dirty_records[txgoff], dr);
1409 		mutex_exit(&dn->dn_mtx);
1410 		if (drop_struct_lock)
1411 			rw_exit(&dn->dn_struct_rwlock);
1412 	}
1413 
1414 	dnode_setdirty(dn, tx);
1415 	DB_DNODE_EXIT(db);
1416 	return (dr);
1417 }
1418 
1419 /*
1420  * Undirty a buffer in the transaction group referenced by the given
1421  * transaction.  Return whether this evicted the dbuf.
1422  */
1423 static boolean_t
1424 dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
1425 {
1426 	dnode_t *dn;
1427 	uint64_t txg = tx->tx_txg;
1428 	dbuf_dirty_record_t *dr, **drp;
1429 
1430 	ASSERT(txg != 0);
1431 
1432 	/*
1433 	 * Due to our use of dn_nlevels below, this can only be called
1434 	 * in open context, unless we are operating on the MOS.
1435 	 * From syncing context, dn_nlevels may be different from the
1436 	 * dn_nlevels used when dbuf was dirtied.
1437 	 */
1438 	ASSERT(db->db_objset ==
1439 	    dmu_objset_pool(db->db_objset)->dp_meta_objset ||
1440 	    txg != spa_syncing_txg(dmu_objset_spa(db->db_objset)));
1441 	ASSERT(db->db_blkid != DMU_BONUS_BLKID);
1442 	ASSERT0(db->db_level);
1443 	ASSERT(MUTEX_HELD(&db->db_mtx));
1444 
1445 	/*
1446 	 * If this buffer is not dirty, we're done.
1447 	 */
1448 	for (drp = &db->db_last_dirty; (dr = *drp) != NULL; drp = &dr->dr_next)
1449 		if (dr->dr_txg <= txg)
1450 			break;
1451 	if (dr == NULL || dr->dr_txg < txg)
1452 		return (B_FALSE);
1453 	ASSERT(dr->dr_txg == txg);
1454 	ASSERT(dr->dr_dbuf == db);
1455 
1456 	DB_DNODE_ENTER(db);
1457 	dn = DB_DNODE(db);
1458 
1459 	dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);
1460 
1461 	ASSERT(db->db.db_size != 0);
1462 
1463 	dsl_pool_undirty_space(dmu_objset_pool(dn->dn_objset),
1464 	    dr->dr_accounted, txg);
1465 
1466 	*drp = dr->dr_next;
1467 
1468 	/*
1469 	 * Note that there are three places in dbuf_dirty()
1470 	 * where this dirty record may be put on a list.
1471 	 * Make sure to do a list_remove corresponding to
1472 	 * every one of those list_insert calls.
1473 	 */
1474 	if (dr->dr_parent) {
1475 		mutex_enter(&dr->dr_parent->dt.di.dr_mtx);
1476 		list_remove(&dr->dr_parent->dt.di.dr_children, dr);
1477 		mutex_exit(&dr->dr_parent->dt.di.dr_mtx);
1478 	} else if (db->db_blkid == DMU_SPILL_BLKID ||
1479 	    db->db_level + 1 == dn->dn_nlevels) {
1480 		ASSERT(db->db_blkptr == NULL || db->db_parent == dn->dn_dbuf);
1481 		mutex_enter(&dn->dn_mtx);
1482 		list_remove(&dn->dn_dirty_records[txg & TXG_MASK], dr);
1483 		mutex_exit(&dn->dn_mtx);
1484 	}
1485 	DB_DNODE_EXIT(db);
1486 
1487 	if (db->db_state != DB_NOFILL) {
1488 		dbuf_unoverride(dr);
1489 
1490 		ASSERT(db->db_buf != NULL);
1491 		ASSERT(dr->dt.dl.dr_data != NULL);
1492 		if (dr->dt.dl.dr_data != db->db_buf)
1493 			VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data, db));
1494 	}
1495 
1496 	kmem_free(dr, sizeof (dbuf_dirty_record_t));
1497 
1498 	ASSERT(db->db_dirtycnt > 0);
1499 	db->db_dirtycnt -= 1;
1500 
1501 	if (refcount_remove(&db->db_holds, (void *)(uintptr_t)txg) == 0) {
1502 		arc_buf_t *buf = db->db_buf;
1503 
1504 		ASSERT(db->db_state == DB_NOFILL || arc_released(buf));
1505 		dbuf_clear_data(db);
1506 		VERIFY(arc_buf_remove_ref(buf, db));
1507 		dbuf_evict(db);
1508 		return (B_TRUE);
1509 	}
1510 
1511 	return (B_FALSE);
1512 }
1513 
1514 void
1515 dmu_buf_will_dirty(dmu_buf_t *db_fake, dmu_tx_t *tx)
1516 {
1517 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
1518 	int rf = DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH;
1519 
1520 	ASSERT(tx->tx_txg != 0);
1521 	ASSERT(!refcount_is_zero(&db->db_holds));
1522 
1523 	/*
1524 	 * Quick check for dirtiness.  For already dirty blocks, this
1525 	 * reduces runtime of this function by >90%, and overall performance
1526 	 * by 50% for some workloads (e.g. file deletion with indirect blocks
1527 	 * cached).
1528 	 */
1529 	mutex_enter(&db->db_mtx);
1530 	dbuf_dirty_record_t *dr;
1531 	for (dr = db->db_last_dirty;
1532 	    dr != NULL && dr->dr_txg >= tx->tx_txg; dr = dr->dr_next) {
1533 		/*
1534 		 * It's possible that it is already dirty but not cached,
1535 		 * because there are some calls to dbuf_dirty() that don't
1536 		 * go through dmu_buf_will_dirty().
1537 		 */
1538 		if (dr->dr_txg == tx->tx_txg && db->db_state == DB_CACHED) {
1539 			/* This dbuf is already dirty and cached. */
1540 			dbuf_redirty(dr);
1541 			mutex_exit(&db->db_mtx);
1542 			return;
1543 		}
1544 	}
1545 	mutex_exit(&db->db_mtx);
1546 
1547 	DB_DNODE_ENTER(db);
1548 	if (RW_WRITE_HELD(&DB_DNODE(db)->dn_struct_rwlock))
1549 		rf |= DB_RF_HAVESTRUCT;
1550 	DB_DNODE_EXIT(db);
1551 	(void) dbuf_read(db, NULL, rf);
1552 	(void) dbuf_dirty(db, tx);
1553 }
1554 
1555 void
1556 dmu_buf_will_not_fill(dmu_buf_t *db_fake, dmu_tx_t *tx)
1557 {
1558 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
1559 
1560 	db->db_state = DB_NOFILL;
1561 
1562 	dmu_buf_will_fill(db_fake, tx);
1563 }
1564 
1565 void
1566 dmu_buf_will_fill(dmu_buf_t *db_fake, dmu_tx_t *tx)
1567 {
1568 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
1569 
1570 	ASSERT(db->db_blkid != DMU_BONUS_BLKID);
1571 	ASSERT(tx->tx_txg != 0);
1572 	ASSERT(db->db_level == 0);
1573 	ASSERT(!refcount_is_zero(&db->db_holds));
1574 
1575 	ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT ||
1576 	    dmu_tx_private_ok(tx));
1577 
1578 	dbuf_noread(db);
1579 	(void) dbuf_dirty(db, tx);
1580 }
1581 
1582 #pragma weak dmu_buf_fill_done = dbuf_fill_done
1583 /* ARGSUSED */
1584 void
1585 dbuf_fill_done(dmu_buf_impl_t *db, dmu_tx_t *tx)
1586 {
1587 	mutex_enter(&db->db_mtx);
1588 	DBUF_VERIFY(db);
1589 
1590 	if (db->db_state == DB_FILL) {
1591 		if (db->db_level == 0 && db->db_freed_in_flight) {
1592 			ASSERT(db->db_blkid != DMU_BONUS_BLKID);
1593 			/* we were freed while filling */
1594 			/* XXX dbuf_undirty? */
1595 			bzero(db->db.db_data, db->db.db_size);
1596 			db->db_freed_in_flight = FALSE;
1597 		}
1598 		db->db_state = DB_CACHED;
1599 		cv_broadcast(&db->db_changed);
1600 	}
1601 	mutex_exit(&db->db_mtx);
1602 }
1603 
1604 void
1605 dmu_buf_write_embedded(dmu_buf_t *dbuf, void *data,
1606     bp_embedded_type_t etype, enum zio_compress comp,
1607     int uncompressed_size, int compressed_size, int byteorder,
1608     dmu_tx_t *tx)
1609 {
1610 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf;
1611 	struct dirty_leaf *dl;
1612 	dmu_object_type_t type;
1613 
1614 	if (etype == BP_EMBEDDED_TYPE_DATA) {
1615 		ASSERT(spa_feature_is_active(dmu_objset_spa(db->db_objset),
1616 		    SPA_FEATURE_EMBEDDED_DATA));
1617 	}
1618 
1619 	DB_DNODE_ENTER(db);
1620 	type = DB_DNODE(db)->dn_type;
1621 	DB_DNODE_EXIT(db);
1622 
1623 	ASSERT0(db->db_level);
1624 	ASSERT(db->db_blkid != DMU_BONUS_BLKID);
1625 
1626 	dmu_buf_will_not_fill(dbuf, tx);
1627 
1628 	ASSERT3U(db->db_last_dirty->dr_txg, ==, tx->tx_txg);
1629 	dl = &db->db_last_dirty->dt.dl;
1630 	encode_embedded_bp_compressed(&dl->dr_overridden_by,
1631 	    data, comp, uncompressed_size, compressed_size);
1632 	BPE_SET_ETYPE(&dl->dr_overridden_by, etype);
1633 	BP_SET_TYPE(&dl->dr_overridden_by, type);
1634 	BP_SET_LEVEL(&dl->dr_overridden_by, 0);
1635 	BP_SET_BYTEORDER(&dl->dr_overridden_by, byteorder);
1636 
1637 	dl->dr_override_state = DR_OVERRIDDEN;
1638 	dl->dr_overridden_by.blk_birth = db->db_last_dirty->dr_txg;
1639 }
1640 
1641 /*
1642  * Directly assign a provided arc buf to a given dbuf if it's not referenced
1643  * by anybody except our caller. Otherwise copy arcbuf's contents to dbuf.
1644  */
1645 void
1646 dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx)
1647 {
1648 	ASSERT(!refcount_is_zero(&db->db_holds));
1649 	ASSERT(db->db_blkid != DMU_BONUS_BLKID);
1650 	ASSERT(db->db_level == 0);
1651 	ASSERT(DBUF_GET_BUFC_TYPE(db) == ARC_BUFC_DATA);
1652 	ASSERT(buf != NULL);
1653 	ASSERT(arc_buf_size(buf) == db->db.db_size);
1654 	ASSERT(tx->tx_txg != 0);
1655 
1656 	arc_return_buf(buf, db);
1657 	ASSERT(arc_released(buf));
1658 
1659 	mutex_enter(&db->db_mtx);
1660 
1661 	while (db->db_state == DB_READ || db->db_state == DB_FILL)
1662 		cv_wait(&db->db_changed, &db->db_mtx);
1663 
1664 	ASSERT(db->db_state == DB_CACHED || db->db_state == DB_UNCACHED);
1665 
1666 	if (db->db_state == DB_CACHED &&
1667 	    refcount_count(&db->db_holds) - 1 > db->db_dirtycnt) {
1668 		mutex_exit(&db->db_mtx);
1669 		(void) dbuf_dirty(db, tx);
1670 		bcopy(buf->b_data, db->db.db_data, db->db.db_size);
1671 		VERIFY(arc_buf_remove_ref(buf, db));
1672 		xuio_stat_wbuf_copied();
1673 		return;
1674 	}
1675 
1676 	xuio_stat_wbuf_nocopy();
1677 	if (db->db_state == DB_CACHED) {
1678 		dbuf_dirty_record_t *dr = db->db_last_dirty;
1679 
1680 		ASSERT(db->db_buf != NULL);
1681 		if (dr != NULL && dr->dr_txg == tx->tx_txg) {
1682 			ASSERT(dr->dt.dl.dr_data == db->db_buf);
1683 			if (!arc_released(db->db_buf)) {
1684 				ASSERT(dr->dt.dl.dr_override_state ==
1685 				    DR_OVERRIDDEN);
1686 				arc_release(db->db_buf, db);
1687 			}
1688 			dr->dt.dl.dr_data = buf;
1689 			VERIFY(arc_buf_remove_ref(db->db_buf, db));
1690 		} else if (dr == NULL || dr->dt.dl.dr_data != db->db_buf) {
1691 			arc_release(db->db_buf, db);
1692 			VERIFY(arc_buf_remove_ref(db->db_buf, db));
1693 		}
1694 		db->db_buf = NULL;
1695 	}
1696 	ASSERT(db->db_buf == NULL);
1697 	dbuf_set_data(db, buf);
1698 	db->db_state = DB_FILL;
1699 	mutex_exit(&db->db_mtx);
1700 	(void) dbuf_dirty(db, tx);
1701 	dmu_buf_fill_done(&db->db, tx);
1702 }
1703 
1704 /*
1705  * "Clear" the contents of this dbuf.  This will mark the dbuf
1706  * EVICTING and clear *most* of its references.  Unfortunately,
1707  * when we are not holding the dn_dbufs_mtx, we can't clear the
1708  * entry in the dn_dbufs list.  We have to wait until dbuf_destroy()
1709  * in this case.  For callers from the DMU we will usually see:
1710  *	dbuf_clear()->arc_clear_callback()->dbuf_do_evict()->dbuf_destroy()
1711  * For the arc callback, we will usually see:
1712  *	dbuf_do_evict()->dbuf_clear();dbuf_destroy()
1713  * Sometimes, though, we will get a mix of these two:
1714  *	DMU: dbuf_clear()->arc_clear_callback()
1715  *	ARC: dbuf_do_evict()->dbuf_destroy()
1716  *
1717  * This routine will dissociate the dbuf from the arc, by calling
1718  * arc_clear_callback(), but will not evict the data from the ARC.
1719  */
1720 void
1721 dbuf_clear(dmu_buf_impl_t *db)
1722 {
1723 	dnode_t *dn;
1724 	dmu_buf_impl_t *parent = db->db_parent;
1725 	dmu_buf_impl_t *dndb;
1726 	boolean_t dbuf_gone = B_FALSE;
1727 
1728 	ASSERT(MUTEX_HELD(&db->db_mtx));
1729 	ASSERT(refcount_is_zero(&db->db_holds));
1730 
1731 	dbuf_evict_user(db);
1732 
1733 	if (db->db_state == DB_CACHED) {
1734 		ASSERT(db->db.db_data != NULL);
1735 		if (db->db_blkid == DMU_BONUS_BLKID) {
1736 			zio_buf_free(db->db.db_data, DN_MAX_BONUSLEN);
1737 			arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
1738 		}
1739 		db->db.db_data = NULL;
1740 		db->db_state = DB_UNCACHED;
1741 	}
1742 
1743 	ASSERT(db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL);
1744 	ASSERT(db->db_data_pending == NULL);
1745 
1746 	db->db_state = DB_EVICTING;
1747 	db->db_blkptr = NULL;
1748 
1749 	DB_DNODE_ENTER(db);
1750 	dn = DB_DNODE(db);
1751 	dndb = dn->dn_dbuf;
1752 	if (db->db_blkid != DMU_BONUS_BLKID && MUTEX_HELD(&dn->dn_dbufs_mtx)) {
1753 		avl_remove(&dn->dn_dbufs, db);
1754 		atomic_dec_32(&dn->dn_dbufs_count);
1755 		membar_producer();
1756 		DB_DNODE_EXIT(db);
1757 		/*
1758 		 * Decrementing the dbuf count means that the hold corresponding
1759 		 * to the removed dbuf is no longer discounted in dnode_move(),
1760 		 * so the dnode cannot be moved until after we release the hold.
1761 		 * The membar_producer() ensures visibility of the decremented
1762 		 * value in dnode_move(), since DB_DNODE_EXIT doesn't actually
1763 		 * release any lock.
1764 		 */
1765 		dnode_rele(dn, db);
1766 		db->db_dnode_handle = NULL;
1767 	} else {
1768 		DB_DNODE_EXIT(db);
1769 	}
1770 
1771 	if (db->db_buf)
1772 		dbuf_gone = arc_clear_callback(db->db_buf);
1773 
1774 	if (!dbuf_gone)
1775 		mutex_exit(&db->db_mtx);
1776 
1777 	/*
1778 	 * If this dbuf is referenced from an indirect dbuf,
1779 	 * decrement the ref count on the indirect dbuf.
1780 	 */
1781 	if (parent && parent != dndb)
1782 		dbuf_rele(parent, db);
1783 }
1784 
1785 /*
1786  * Note: While bpp will always be updated if the function returns success,
1787  * parentp will not be updated if the dnode does not have dn_dbuf filled in;
1788  * this happens when the dnode is the meta-dnode, or a userused or groupused
1789  * object.
1790  */
1791 static int
1792 dbuf_findbp(dnode_t *dn, int level, uint64_t blkid, int fail_sparse,
1793     dmu_buf_impl_t **parentp, blkptr_t **bpp)
1794 {
1795 	int nlevels, epbs;
1796 
1797 	*parentp = NULL;
1798 	*bpp = NULL;
1799 
1800 	ASSERT(blkid != DMU_BONUS_BLKID);
1801 
1802 	if (blkid == DMU_SPILL_BLKID) {
1803 		mutex_enter(&dn->dn_mtx);
1804 		if (dn->dn_have_spill &&
1805 		    (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR))
1806 			*bpp = &dn->dn_phys->dn_spill;
1807 		else
1808 			*bpp = NULL;
1809 		dbuf_add_ref(dn->dn_dbuf, NULL);
1810 		*parentp = dn->dn_dbuf;
1811 		mutex_exit(&dn->dn_mtx);
1812 		return (0);
1813 	}
1814 
1815 	if (dn->dn_phys->dn_nlevels == 0)
1816 		nlevels = 1;
1817 	else
1818 		nlevels = dn->dn_phys->dn_nlevels;
1819 
1820 	epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
1821 
1822 	ASSERT3U(level * epbs, <, 64);
1823 	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
1824 	if (level >= nlevels ||
1825 	    (blkid > (dn->dn_phys->dn_maxblkid >> (level * epbs)))) {
1826 		/* the buffer has no parent yet */
1827 		return (SET_ERROR(ENOENT));
1828 	} else if (level < nlevels-1) {
1829 		/* this block is referenced from an indirect block */
1830 		int err = dbuf_hold_impl(dn, level+1,
1831 		    blkid >> epbs, fail_sparse, FALSE, NULL, parentp);
1832 		if (err)
1833 			return (err);
1834 		err = dbuf_read(*parentp, NULL,
1835 		    (DB_RF_HAVESTRUCT | DB_RF_NOPREFETCH | DB_RF_CANFAIL));
1836 		if (err) {
1837 			dbuf_rele(*parentp, NULL);
1838 			*parentp = NULL;
1839 			return (err);
1840 		}
1841 		*bpp = ((blkptr_t *)(*parentp)->db.db_data) +
1842 		    (blkid & ((1ULL << epbs) - 1));
1843 		return (0);
1844 	} else {
1845 		/* the block is referenced from the dnode */
1846 		ASSERT3U(level, ==, nlevels-1);
1847 		ASSERT(dn->dn_phys->dn_nblkptr == 0 ||
1848 		    blkid < dn->dn_phys->dn_nblkptr);
1849 		if (dn->dn_dbuf) {
1850 			dbuf_add_ref(dn->dn_dbuf, NULL);
1851 			*parentp = dn->dn_dbuf;
1852 		}
1853 		*bpp = &dn->dn_phys->dn_blkptr[blkid];
1854 		return (0);
1855 	}
1856 }
1857 
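/*
 * Allocate and initialize a new dbuf for the given block of the given
 * dnode.  Except for the bonus buffer, the new dbuf is inserted into the
 * dbuf hash table and the dnode's dn_dbufs list, and a hold is taken on
 * the dnode on its behalf.  If another thread inserts the same dbuf
 * first, ours is freed and the existing one is returned instead.  The
 * caller must hold dn_struct_rwlock.
 */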
1858 static dmu_buf_impl_t *
1859 dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid,
1860     dmu_buf_impl_t *parent, blkptr_t *blkptr)
1861 {
1862 	objset_t *os = dn->dn_objset;
1863 	dmu_buf_impl_t *db, *odb;
1864 
1865 	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
1866 	ASSERT(dn->dn_type != DMU_OT_NONE);
1867 
1868 	db = kmem_cache_alloc(dbuf_cache, KM_SLEEP);
1869 
1870 	db->db_objset = os;
1871 	db->db.db_object = dn->dn_object;
1872 	db->db_level = level;
1873 	db->db_blkid = blkid;
1874 	db->db_last_dirty = NULL;
1875 	db->db_dirtycnt = 0;
1876 	db->db_dnode_handle = dn->dn_handle;
1877 	db->db_parent = parent;
1878 	db->db_blkptr = blkptr;
1879 
1880 	db->db_user = NULL;
1881 	db->db_user_immediate_evict = FALSE;
1882 	db->db_freed_in_flight = FALSE;
1883 	db->db_pending_evict = FALSE;
1884 
1885 	if (blkid == DMU_BONUS_BLKID) {
1886 		ASSERT3P(parent, ==, dn->dn_dbuf);
1887 		db->db.db_size = DN_MAX_BONUSLEN -
1888 		    (dn->dn_nblkptr-1) * sizeof (blkptr_t);
1889 		ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
1890 		db->db.db_offset = DMU_BONUS_BLKID;
1891 		db->db_state = DB_UNCACHED;
1892 		/* the bonus dbuf is not placed in the hash table */
1893 		arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);
1894 		return (db);
1895 	} else if (blkid == DMU_SPILL_BLKID) {
1896 		db->db.db_size = (blkptr != NULL) ?
1897 		    BP_GET_LSIZE(blkptr) : SPA_MINBLOCKSIZE;
1898 		db->db.db_offset = 0;
1899 	} else {
1900 		int blocksize =
1901 		    db->db_level ? 1 << dn->dn_indblkshift : dn->dn_datablksz;
1902 		db->db.db_size = blocksize;
1903 		db->db.db_offset = db->db_blkid * blocksize;
1904 	}
1905 
1906 	/*
1907 	 * Hold the dn_dbufs_mtx while the new dbuf is inserted into
1908 	 * the hash table *and* added to the dn_dbufs list.
1909 	 * This prevents a possible deadlock with someone
1910 	 * trying to look up this dbuf before it's added to the
1911 	 * dn_dbufs list.
1912 	 */
1913 	mutex_enter(&dn->dn_dbufs_mtx);
1914 	db->db_state = DB_EVICTING;
1915 	if ((odb = dbuf_hash_insert(db)) != NULL) {
1916 		/* someone else inserted it first */
1917 		kmem_cache_free(dbuf_cache, db);
1918 		mutex_exit(&dn->dn_dbufs_mtx);
1919 		return (odb);
1920 	}
1921 	avl_add(&dn->dn_dbufs, db);
1922 	if (db->db_level == 0 && db->db_blkid >=
1923 	    dn->dn_unlisted_l0_blkid)
1924 		dn->dn_unlisted_l0_blkid = db->db_blkid + 1;
1925 	db->db_state = DB_UNCACHED;
1926 	mutex_exit(&dn->dn_dbufs_mtx);
1927 	arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);
1928 
1929 	if (parent && parent != dn->dn_dbuf)
1930 		dbuf_add_ref(parent, db);
1931 
1932 	ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT ||
1933 	    refcount_count(&dn->dn_holds) > 0);
1934 	(void) refcount_add(&dn->dn_holds, db);
1935 	atomic_inc_32(&dn->dn_dbufs_count);
1936 
1937 	dprintf_dbuf(db, "db=%p\n", db);
1938 
1939 	return (db);
1940 }
1941 
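/*
 * ARC callback (installed via arc_set_callback()) invoked when the ARC is
 * done with this dbuf's buffer.  If the dbuf is not already marked
 * DB_EVICTING, evict it now; otherwise it has already been cleared, so
 * just destroy the dbuf structure.
 */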
1942 static int
1943 dbuf_do_evict(void *private)
1944 {
1945 	dmu_buf_impl_t *db = private;
1946 
1947 	if (!MUTEX_HELD(&db->db_mtx))
1948 		mutex_enter(&db->db_mtx);
1949 
1950 	ASSERT(refcount_is_zero(&db->db_holds));
1951 
1952 	if (db->db_state != DB_EVICTING) {
1953 		ASSERT(db->db_state == DB_CACHED);
1954 		DBUF_VERIFY(db);
1955 		db->db_buf = NULL;
1956 		dbuf_evict(db);
1957 	} else {
1958 		mutex_exit(&db->db_mtx);
1959 		dbuf_destroy(db);
1960 	}
1961 	return (0);
1962 }
1963 
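/*
 * Free the dbuf structure itself.  Unless this is the bonus buffer, the
 * dbuf is removed from the dnode's dn_dbufs list (if it is still on it)
 * and from the hash table, and the corresponding dnode hold is dropped,
 * before the structure is returned to the dbuf kmem cache.
 */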
1964 static void
1965 dbuf_destroy(dmu_buf_impl_t *db)
1966 {
1967 	ASSERT(refcount_is_zero(&db->db_holds));
1968 
1969 	if (db->db_blkid != DMU_BONUS_BLKID) {
1970 		/*
1971 		 * If this dbuf is still on the dn_dbufs list,
1972 		 * remove it from that list.
1973 		 */
1974 		if (db->db_dnode_handle != NULL) {
1975 			dnode_t *dn;
1976 
1977 			DB_DNODE_ENTER(db);
1978 			dn = DB_DNODE(db);
1979 			mutex_enter(&dn->dn_dbufs_mtx);
1980 			avl_remove(&dn->dn_dbufs, db);
1981 			atomic_dec_32(&dn->dn_dbufs_count);
1982 			mutex_exit(&dn->dn_dbufs_mtx);
1983 			DB_DNODE_EXIT(db);
1984 			/*
1985 			 * Decrementing the dbuf count means that the hold
1986 			 * corresponding to the removed dbuf is no longer
1987 			 * discounted in dnode_move(), so the dnode cannot be
1988 			 * moved until after we release the hold.
1989 			 */
1990 			dnode_rele(dn, db);
1991 			db->db_dnode_handle = NULL;
1992 		}
1993 		dbuf_hash_remove(db);
1994 	}
1995 	db->db_parent = NULL;
1996 	db->db_buf = NULL;
1997 
1998 	ASSERT(db->db.db_data == NULL);
1999 	ASSERT(db->db_hash_next == NULL);
2000 	ASSERT(db->db_blkptr == NULL);
2001 	ASSERT(db->db_data_pending == NULL);
2002 
2003 	kmem_cache_free(dbuf_cache, db);
2004 	arc_space_return(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);
2005 }
2006 
2007 typedef struct dbuf_prefetch_arg {
2008 	spa_t *dpa_spa;	/* The spa to issue the prefetch in. */
2009 	zbookmark_phys_t dpa_zb; /* The target block to prefetch. */
2010 	int dpa_epbs; /* Entries (blkptr_t's) Per Block Shift. */
2011 	int dpa_curlevel; /* The current level that we're reading */
2012 	zio_priority_t dpa_prio; /* The priority I/Os should be issued at. */
2013 	zio_t *dpa_zio; /* The parent zio_t for all prefetches. */
2014 	arc_flags_t dpa_aflags; /* Flags to pass to the final prefetch. */
2015 } dbuf_prefetch_arg_t;
2016 
2017 /*
2018  * Actually issue the prefetch read for the block given.
2019  */
2020 static void
2021 dbuf_issue_final_prefetch(dbuf_prefetch_arg_t *dpa, blkptr_t *bp)
2022 {
2023 	if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp))
2024 		return;
2025 
2026 	arc_flags_t aflags =
2027 	    dpa->dpa_aflags | ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH;
2028 
2029 	ASSERT3U(dpa->dpa_curlevel, ==, BP_GET_LEVEL(bp));
2030 	ASSERT3U(dpa->dpa_curlevel, ==, dpa->dpa_zb.zb_level);
2031 	ASSERT(dpa->dpa_zio != NULL);
2032 	(void) arc_read(dpa->dpa_zio, dpa->dpa_spa, bp, NULL, NULL,
2033 	    dpa->dpa_prio, ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
2034 	    &aflags, &dpa->dpa_zb);
2035 }
2036 
2037 /*
2038  * Called when an indirect block above our prefetch target is read in.  This
2039  * will either read in the next indirect block down the tree or issue the actual
2040  * prefetch if the next block down is our target.
2041  */
2042 static void
2043 dbuf_prefetch_indirect_done(zio_t *zio, arc_buf_t *abuf, void *private)
2044 {
2045 	dbuf_prefetch_arg_t *dpa = private;
2046 
2047 	ASSERT3S(dpa->dpa_zb.zb_level, <, dpa->dpa_curlevel);
2048 	ASSERT3S(dpa->dpa_curlevel, >, 0);
2049 	if (zio != NULL) {
2050 		ASSERT3S(BP_GET_LEVEL(zio->io_bp), ==, dpa->dpa_curlevel);
2051 		ASSERT3U(BP_GET_LSIZE(zio->io_bp), ==, zio->io_size);
2052 		ASSERT3P(zio->io_spa, ==, dpa->dpa_spa);
2053 	}
2054 
2055 	dpa->dpa_curlevel--;
2056 
2057 	uint64_t nextblkid = dpa->dpa_zb.zb_blkid >>
2058 	    (dpa->dpa_epbs * (dpa->dpa_curlevel - dpa->dpa_zb.zb_level));
2059 	blkptr_t *bp = ((blkptr_t *)abuf->b_data) +
2060 	    P2PHASE(nextblkid, 1ULL << dpa->dpa_epbs);
2061 	if (BP_IS_HOLE(bp) || (zio != NULL && zio->io_error != 0)) {
2062 		kmem_free(dpa, sizeof (*dpa));
2063 	} else if (dpa->dpa_curlevel == dpa->dpa_zb.zb_level) {
2064 		ASSERT3U(nextblkid, ==, dpa->dpa_zb.zb_blkid);
2065 		dbuf_issue_final_prefetch(dpa, bp);
2066 		kmem_free(dpa, sizeof (*dpa));
2067 	} else {
2068 		arc_flags_t iter_aflags = ARC_FLAG_NOWAIT;
2069 		zbookmark_phys_t zb;
2070 
2071 		ASSERT3U(dpa->dpa_curlevel, ==, BP_GET_LEVEL(bp));
2072 
2073 		SET_BOOKMARK(&zb, dpa->dpa_zb.zb_objset,
2074 		    dpa->dpa_zb.zb_object, dpa->dpa_curlevel, nextblkid);
2075 
2076 		(void) arc_read(dpa->dpa_zio, dpa->dpa_spa,
2077 		    bp, dbuf_prefetch_indirect_done, dpa, dpa->dpa_prio,
2078 		    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
2079 		    &iter_aflags, &zb);
2080 	}
2081 	(void) arc_buf_remove_ref(abuf, private);
2082 }
2083 
2084 /*
2085  * Issue prefetch reads for the given block on the given level.  If the indirect
2086  * blocks above that block are not in memory, we will read them in
2087  * asynchronously.  As a result, this call never blocks waiting for a read to
2088  * complete.
2089  */
2090 void
2091 dbuf_prefetch(dnode_t *dn, int64_t level, uint64_t blkid, zio_priority_t prio,
2092     arc_flags_t aflags)
2093 {
2094 	blkptr_t bp;
2095 	int epbs, nlevels, curlevel;
2096 	uint64_t curblkid;
2097 
2098 	ASSERT(blkid != DMU_BONUS_BLKID);
2099 	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
2100 
2101 	if (blkid > dn->dn_maxblkid)
2102 		return;
2103 
2104 	if (dnode_block_freed(dn, blkid))
2105 		return;
2106 
2107 	/*
2108 	 * This dnode hasn't been written to disk yet, so there's nothing to
2109 	 * prefetch.
2110 	 */
2111 	nlevels = dn->dn_phys->dn_nlevels;
2112 	if (level >= nlevels || dn->dn_phys->dn_nblkptr == 0)
2113 		return;
2114 
2115 	epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
2116 	if (dn->dn_phys->dn_maxblkid < blkid << (epbs * level))
2117 		return;
2118 
2119 	dmu_buf_impl_t *db = dbuf_find(dn->dn_objset, dn->dn_object,
2120 	    level, blkid);
2121 	if (db != NULL) {
2122 		mutex_exit(&db->db_mtx);
2123 		/*
2124 		 * This dbuf already exists.  It is either CACHED, or
2125 		 * (we assume) about to be read or filled.
2126 		 */
2127 		return;
2128 	}
2129 
2130 	/*
2131 	 * Find the closest ancestor (indirect block) of the target block
2132 	 * that is present in the cache.  In this indirect block, we will
2133 	 * find the bp that is at curlevel, curblkid.
2134 	 */
2135 	curlevel = level;
2136 	curblkid = blkid;
2137 	while (curlevel < nlevels - 1) {
2138 		int parent_level = curlevel + 1;
2139 		uint64_t parent_blkid = curblkid >> epbs;
2140 		dmu_buf_impl_t *db;
2141 
2142 		if (dbuf_hold_impl(dn, parent_level, parent_blkid,
2143 		    FALSE, TRUE, FTAG, &db) == 0) {
2144 			blkptr_t *bpp = db->db_buf->b_data;
2145 			bp = bpp[P2PHASE(curblkid, 1 << epbs)];
2146 			dbuf_rele(db, FTAG);
2147 			break;
2148 		}
2149 
2150 		curlevel = parent_level;
2151 		curblkid = parent_blkid;
2152 	}
2153 
2154 	if (curlevel == nlevels - 1) {
2155 		/* No cached indirect blocks found. */
2156 		ASSERT3U(curblkid, <, dn->dn_phys->dn_nblkptr);
2157 		bp = dn->dn_phys->dn_blkptr[curblkid];
2158 	}
2159 	if (BP_IS_HOLE(&bp))
2160 		return;
2161 
2162 	ASSERT3U(curlevel, ==, BP_GET_LEVEL(&bp));
2163 
2164 	zio_t *pio = zio_root(dmu_objset_spa(dn->dn_objset), NULL, NULL,
2165 	    ZIO_FLAG_CANFAIL);
2166 
2167 	dbuf_prefetch_arg_t *dpa = kmem_zalloc(sizeof (*dpa), KM_SLEEP);
2168 	dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset;
2169 	SET_BOOKMARK(&dpa->dpa_zb, ds != NULL ? ds->ds_object : DMU_META_OBJSET,
2170 	    dn->dn_object, level, blkid);
2171 	dpa->dpa_curlevel = curlevel;
2172 	dpa->dpa_prio = prio;
2173 	dpa->dpa_aflags = aflags;
2174 	dpa->dpa_spa = dn->dn_objset->os_spa;
2175 	dpa->dpa_epbs = epbs;
2176 	dpa->dpa_zio = pio;
2177 
2178 	/*
2179 	 * If we have the indirect just above us, no need to do the asynchronous
2180 	 * prefetch chain; we'll just run the last step ourselves.  If we're at
2181 	 * a higher level, though, we want to issue the prefetches for all the
2182 	 * indirect blocks asynchronously, so we can go on with whatever we were
2183 	 * doing.
2184 	 */
2185 	if (curlevel == level) {
2186 		ASSERT3U(curblkid, ==, blkid);
2187 		dbuf_issue_final_prefetch(dpa, &bp);
2188 		kmem_free(dpa, sizeof (*dpa));
2189 	} else {
2190 		arc_flags_t iter_aflags = ARC_FLAG_NOWAIT;
2191 		zbookmark_phys_t zb;
2192 
2193 		SET_BOOKMARK(&zb, ds != NULL ? ds->ds_object : DMU_META_OBJSET,
2194 		    dn->dn_object, curlevel, curblkid);
2195 		(void) arc_read(dpa->dpa_zio, dpa->dpa_spa,
2196 		    &bp, dbuf_prefetch_indirect_done, dpa, prio,
2197 		    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
2198 		    &iter_aflags, &zb);
2199 	}
2200 	/*
2201 	 * We use pio here instead of dpa_zio since it's possible that
2202 	 * dpa may have already been freed.
2203 	 */
2204 	zio_nowait(pio);
2205 }
2206 
2207 /*
2208  * Returns with db_holds incremented, and db_mtx not held.
2209  * Note: dn_struct_rwlock must be held.
2210  */
2211 int
2212 dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid,
2213     boolean_t fail_sparse, boolean_t fail_uncached,
2214     void *tag, dmu_buf_impl_t **dbp)
2215 {
2216 	dmu_buf_impl_t *db, *parent = NULL;
2217 
2218 	ASSERT(blkid != DMU_BONUS_BLKID);
2219 	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
2220 	ASSERT3U(dn->dn_nlevels, >, level);
2221 
2222 	*dbp = NULL;
2223 top:
2224 	/* dbuf_find() returns with db_mtx held */
2225 	db = dbuf_find(dn->dn_objset, dn->dn_object, level, blkid);
2226 
2227 	if (db == NULL) {
2228 		blkptr_t *bp = NULL;
2229 		int err;
2230 
2231 		if (fail_uncached)
2232 			return (SET_ERROR(ENOENT));
2233 
2234 		ASSERT3P(parent, ==, NULL);
2235 		err = dbuf_findbp(dn, level, blkid, fail_sparse, &parent, &bp);
2236 		if (fail_sparse) {
2237 			if (err == 0 && bp && BP_IS_HOLE(bp))
2238 				err = SET_ERROR(ENOENT);
2239 			if (err) {
2240 				if (parent)
2241 					dbuf_rele(parent, NULL);
2242 				return (err);
2243 			}
2244 		}
2245 		if (err && err != ENOENT)
2246 			return (err);
2247 		db = dbuf_create(dn, level, blkid, parent, bp);
2248 	}
2249 
2250 	if (fail_uncached && db->db_state != DB_CACHED) {
2251 		mutex_exit(&db->db_mtx);
2252 		return (SET_ERROR(ENOENT));
2253 	}
2254 
2255 	if (db->db_buf && refcount_is_zero(&db->db_holds)) {
2256 		arc_buf_add_ref(db->db_buf, db);
2257 		if (db->db_buf->b_data == NULL) {
2258 			dbuf_clear(db);
2259 			if (parent) {
2260 				dbuf_rele(parent, NULL);
2261 				parent = NULL;
2262 			}
2263 			goto top;
2264 		}
2265 		ASSERT3P(db->db.db_data, ==, db->db_buf->b_data);
2266 	}
2267 
2268 	ASSERT(db->db_buf == NULL || arc_referenced(db->db_buf));
2269 
2270 	/*
2271 	 * If this buffer is currently syncing out, and we are
2272 	 * still referencing it from db_data, we need to make a copy
2273 	 * of it in case we decide we want to dirty it again in this txg.
2274 	 */
2275 	if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
2276 	    dn->dn_object != DMU_META_DNODE_OBJECT &&
2277 	    db->db_state == DB_CACHED && db->db_data_pending) {
2278 		dbuf_dirty_record_t *dr = db->db_data_pending;
2279 
2280 		if (dr->dt.dl.dr_data == db->db_buf) {
2281 			arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
2282 
2283 			dbuf_set_data(db,
2284 			    arc_buf_alloc(dn->dn_objset->os_spa,
2285 			    db->db.db_size, db, type));
2286 			bcopy(dr->dt.dl.dr_data->b_data, db->db.db_data,
2287 			    db->db.db_size);
2288 		}
2289 	}
2290 
2291 	(void) refcount_add(&db->db_holds, tag);
2292 	DBUF_VERIFY(db);
2293 	mutex_exit(&db->db_mtx);
2294 
2295 	/* NOTE: we can't rele the parent until after we drop the db_mtx */
2296 	if (parent)
2297 		dbuf_rele(parent, NULL);
2298 
2299 	ASSERT3P(DB_DNODE(db), ==, dn);
2300 	ASSERT3U(db->db_blkid, ==, blkid);
2301 	ASSERT3U(db->db_level, ==, level);
2302 	*dbp = db;
2303 
2304 	return (0);
2305 }
2306 
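/*
 * Convenience wrappers around dbuf_hold_impl(): dbuf_hold() holds a
 * level-0 block and dbuf_hold_level() a block at an arbitrary level.
 * Both return NULL on failure rather than an error code.
 */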
2307 dmu_buf_impl_t *
2308 dbuf_hold(dnode_t *dn, uint64_t blkid, void *tag)
2309 {
2310 	return (dbuf_hold_level(dn, 0, blkid, tag));
2311 }
2312 
2313 dmu_buf_impl_t *
2314 dbuf_hold_level(dnode_t *dn, int level, uint64_t blkid, void *tag)
2315 {
2316 	dmu_buf_impl_t *db;
2317 	int err = dbuf_hold_impl(dn, level, blkid, FALSE, FALSE, tag, &db);
2318 	return (err ? NULL : db);
2319 }
2320 
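/*
 * Create the bonus buffer dbuf for this dnode.  The caller must hold
 * dn_struct_rwlock as writer, and the dnode must not already have a
 * bonus dbuf.
 */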
2321 void
2322 dbuf_create_bonus(dnode_t *dn)
2323 {
2324 	ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));
2325 
2326 	ASSERT(dn->dn_bonus == NULL);
2327 	dn->dn_bonus = dbuf_create(dn, 0, DMU_BONUS_BLKID, dn->dn_dbuf, NULL);
2328 }
2329 
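/*
 * Change the block size of a dnode's spill block.  A blksz of zero means
 * SPA_MINBLOCKSIZE; otherwise the size is rounded up to a multiple of
 * SPA_MINBLOCKSIZE and must not exceed the pool's maximum block size.
 * Returns ENOTSUP if this dbuf is not a spill block.
 */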
2330 int
2331 dbuf_spill_set_blksz(dmu_buf_t *db_fake, uint64_t blksz, dmu_tx_t *tx)
2332 {
2333 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
2334 	dnode_t *dn;
2335 
2336 	if (db->db_blkid != DMU_SPILL_BLKID)
2337 		return (SET_ERROR(ENOTSUP));
2338 	if (blksz == 0)
2339 		blksz = SPA_MINBLOCKSIZE;
2340 	ASSERT3U(blksz, <=, spa_maxblocksize(dmu_objset_spa(db->db_objset)));
2341 	blksz = P2ROUNDUP(blksz, SPA_MINBLOCKSIZE);
2342 
2343 	DB_DNODE_ENTER(db);
2344 	dn = DB_DNODE(db);
2345 	rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
2346 	dbuf_new_size(db, blksz, tx);
2347 	rw_exit(&dn->dn_struct_rwlock);
2348 	DB_DNODE_EXIT(db);
2349 
2350 	return (0);
2351 }
2352 
2353 void
2354 dbuf_rm_spill(dnode_t *dn, dmu_tx_t *tx)
2355 {
2356 	dbuf_free_range(dn, DMU_SPILL_BLKID, DMU_SPILL_BLKID, tx);
2357 }
2358 
2359 #pragma weak dmu_buf_add_ref = dbuf_add_ref
2360 void
2361 dbuf_add_ref(dmu_buf_impl_t *db, void *tag)
2362 {
2363 	int64_t holds = refcount_add(&db->db_holds, tag);
2364 	ASSERT(holds > 1);
2365 }
2366 
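/*
 * Take an additional hold on an existing dbuf, but only if the dbuf found
 * in the hash table for the given object and block is the same one the
 * caller already holds and it has holds beyond its dirty records.
 * Returns B_TRUE if the hold was added.
 */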
2367 #pragma weak dmu_buf_try_add_ref = dbuf_try_add_ref
2368 boolean_t
2369 dbuf_try_add_ref(dmu_buf_t *db_fake, objset_t *os, uint64_t obj, uint64_t blkid,
2370     void *tag)
2371 {
2372 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
2373 	dmu_buf_impl_t *found_db;
2374 	boolean_t result = B_FALSE;
2375 
2376 	if (db->db_blkid == DMU_BONUS_BLKID)
2377 		found_db = dbuf_find_bonus(os, obj);
2378 	else
2379 		found_db = dbuf_find(os, obj, 0, blkid);
2380 
2381 	if (found_db != NULL) {
2382 		if (db == found_db && dbuf_refcount(db) > db->db_dirtycnt) {
2383 			(void) refcount_add(&db->db_holds, tag);
2384 			result = B_TRUE;
2385 		}
2386 		mutex_exit(&db->db_mtx);
2387 	}
2388 	return (result);
2389 }
2390 
2391 /*
2392  * If you call dbuf_rele() you had better not be referencing the dnode handle
2393  * unless you have some other direct or indirect hold on the dnode. (An indirect
2394  * hold is a hold on one of the dnode's dbufs, including the bonus buffer.)
2395  * Without that, the dbuf_rele() could lead to a dnode_rele() followed by the
2396  * dnode's parent dbuf evicting its dnode handles.
2397  */
2398 void
2399 dbuf_rele(dmu_buf_impl_t *db, void *tag)
2400 {
2401 	mutex_enter(&db->db_mtx);
2402 	dbuf_rele_and_unlock(db, tag);
2403 }
2404 
2405 void
2406 dmu_buf_rele(dmu_buf_t *db, void *tag)
2407 {
2408 	dbuf_rele((dmu_buf_impl_t *)db, tag);
2409 }
2410 
2411 /*
2412  * dbuf_rele() for an already-locked dbuf.  This is necessary to allow
2413  * db_dirtycnt and db_holds to be updated atomically.
2414  */
2415 void
2416 dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag)
2417 {
2418 	int64_t holds;
2419 
2420 	ASSERT(MUTEX_HELD(&db->db_mtx));
2421 	DBUF_VERIFY(db);
2422 
2423 	/*
2424 	 * Remove the reference to the dbuf before removing its hold on the
2425 	 * dnode so we can guarantee in dnode_move() that a referenced bonus
2426 	 * buffer has a corresponding dnode hold.
2427 	 */
2428 	holds = refcount_remove(&db->db_holds, tag);
2429 	ASSERT(holds >= 0);
2430 
2431 	/*
2432 	 * We can't freeze indirects if there is a possibility that they
2433 	 * may be modified in the current syncing context.
2434 	 */
2435 	if (db->db_buf && holds == (db->db_level == 0 ? db->db_dirtycnt : 0))
2436 		arc_buf_freeze(db->db_buf);
2437 
2438 	if (holds == db->db_dirtycnt &&
2439 	    db->db_level == 0 && db->db_user_immediate_evict)
2440 		dbuf_evict_user(db);
2441 
2442 	if (holds == 0) {
2443 		if (db->db_blkid == DMU_BONUS_BLKID) {
2444 			dnode_t *dn;
2445 			boolean_t evict_dbuf = db->db_pending_evict;
2446 
2447 			/*
2448 			 * If the dnode moves here, we cannot cross this
2449 			 * barrier until the move completes.
2450 			 */
2451 			DB_DNODE_ENTER(db);
2452 
2453 			dn = DB_DNODE(db);
2454 			atomic_dec_32(&dn->dn_dbufs_count);
2455 
2456 			/*
2457 			 * Decrementing the dbuf count means that the bonus
2458 			 * buffer's dnode hold is no longer discounted in
2459 			 * dnode_move(). The dnode cannot move until after
2460 			 * the dnode_rele() below.
2461 			 */
2462 			DB_DNODE_EXIT(db);
2463 
2464 			/*
2465 			 * Do not reference db after its lock is dropped.
2466 			 * Another thread may evict it.
2467 			 */
2468 			mutex_exit(&db->db_mtx);
2469 
2470 			if (evict_dbuf)
2471 				dnode_evict_bonus(dn);
2472 
2473 			dnode_rele(dn, db);
2474 		} else if (db->db_buf == NULL) {
2475 			/*
2476 			 * This is a special case: we never associated this
2477 			 * dbuf with any data allocated from the ARC.
2478 			 */
2479 			ASSERT(db->db_state == DB_UNCACHED ||
2480 			    db->db_state == DB_NOFILL);
2481 			dbuf_evict(db);
2482 		} else if (arc_released(db->db_buf)) {
2483 			arc_buf_t *buf = db->db_buf;
2484 			/*
2485 			 * This dbuf has anonymous data associated with it.
2486 			 */
2487 			dbuf_clear_data(db);
2488 			VERIFY(arc_buf_remove_ref(buf, db));
2489 			dbuf_evict(db);
2490 		} else {
2491 			VERIFY(!arc_buf_remove_ref(db->db_buf, db));
2492 
2493 			/*
2494 			 * A dbuf will be eligible for eviction if either the
2495 			 * 'primarycache' property is set or a duplicate
2496 			 * copy of this buffer is already cached in the arc.
2497 			 *
2498 			 * In the case of the 'primarycache' property, a buffer
2499 			 * is considered for eviction if it matches the
2500 			 * criteria set in the property.
2501 			 *
2502 			 * To decide if our buffer is considered a
2503 			 * duplicate, we must call into the arc to determine
2504 			 * if multiple buffers are referencing the same
2505 			 * block on-disk. If so, then we simply evict
2506 			 * ourselves.
2507 			 */
2508 			if (!DBUF_IS_CACHEABLE(db)) {
2509 				if (db->db_blkptr != NULL &&
2510 				    !BP_IS_HOLE(db->db_blkptr) &&
2511 				    !BP_IS_EMBEDDED(db->db_blkptr)) {
2512 					spa_t *spa =
2513 					    dmu_objset_spa(db->db_objset);
2514 					blkptr_t bp = *db->db_blkptr;
2515 					dbuf_clear(db);
2516 					arc_freed(spa, &bp);
2517 				} else {
2518 					dbuf_clear(db);
2519 				}
2520 			} else if (db->db_pending_evict ||
2521 			    arc_buf_eviction_needed(db->db_buf)) {
2522 				dbuf_clear(db);
2523 			} else {
2524 				mutex_exit(&db->db_mtx);
2525 			}
2526 		}
2527 	} else {
2528 		mutex_exit(&db->db_mtx);
2529 	}
2530 }
2531 
2532 #pragma weak dmu_buf_refcount = dbuf_refcount
2533 uint64_t
2534 dbuf_refcount(dmu_buf_impl_t *db)
2535 {
2536 	return (refcount_count(&db->db_holds));
2537 }
2538 
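/*
 * Atomically replace the dbuf's user with new_user, but only if the
 * current user is old_user.  Returns the user that was attached before
 * the call, which is old_user if and only if the replacement happened.
 */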
2539 void *
2540 dmu_buf_replace_user(dmu_buf_t *db_fake, dmu_buf_user_t *old_user,
2541     dmu_buf_user_t *new_user)
2542 {
2543 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
2544 
2545 	mutex_enter(&db->db_mtx);
2546 	dbuf_verify_user(db, DBVU_NOT_EVICTING);
2547 	if (db->db_user == old_user)
2548 		db->db_user = new_user;
2549 	else
2550 		old_user = db->db_user;
2551 	dbuf_verify_user(db, DBVU_NOT_EVICTING);
2552 	mutex_exit(&db->db_mtx);
2553 
2554 	return (old_user);
2555 }
2556 
2557 void *
2558 dmu_buf_set_user(dmu_buf_t *db_fake, dmu_buf_user_t *user)
2559 {
2560 	return (dmu_buf_replace_user(db_fake, NULL, user));
2561 }
2562 
2563 void *
2564 dmu_buf_set_user_ie(dmu_buf_t *db_fake, dmu_buf_user_t *user)
2565 {
2566 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
2567 
2568 	db->db_user_immediate_evict = TRUE;
2569 	return (dmu_buf_set_user(db_fake, user));
2570 }
2571 
2572 void *
2573 dmu_buf_remove_user(dmu_buf_t *db_fake, dmu_buf_user_t *user)
2574 {
2575 	return (dmu_buf_replace_user(db_fake, user, NULL));
2576 }
2577 
2578 void *
2579 dmu_buf_get_user(dmu_buf_t *db_fake)
2580 {
2581 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
2582 
2583 	dbuf_verify_user(db, DBVU_NOT_EVICTING);
2584 	return (db->db_user);
2585 }
2586 
2587 void
2588 dmu_buf_user_evict_wait()
2589 {
2590 	taskq_wait(dbu_evict_taskq);
2591 }
2592 
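/*
 * Return B_TRUE if freeing or rewriting the block referenced by this dbuf
 * would actually free space in the pool, as determined by
 * dsl_dataset_block_freeable().
 */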
2593 boolean_t
2594 dmu_buf_freeable(dmu_buf_t *dbuf)
2595 {
2596 	boolean_t res = B_FALSE;
2597 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf;
2598 
2599 	if (db->db_blkptr)
2600 		res = dsl_dataset_block_freeable(db->db_objset->os_dsl_dataset,
2601 		    db->db_blkptr, db->db_blkptr->blk_birth);
2602 
2603 	return (res);
2604 }
2605 
2606 blkptr_t *
2607 dmu_buf_get_blkptr(dmu_buf_t *db)
2608 {
2609 	dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db;
2610 	return (dbi->db_blkptr);
2611 }
2612 
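/*
 * Make sure db->db_blkptr points at the location in the parent (the
 * dnode phys or the parent indirect block) that this dbuf will be
 * written to.  Called from syncing context; may drop and re-acquire
 * db_mtx in order to hold the parent indirect dbuf.
 */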
2613 static void
2614 dbuf_check_blkptr(dnode_t *dn, dmu_buf_impl_t *db)
2615 {
2616 	/* ASSERT(dmu_tx_is_syncing(tx)) */
2617 	ASSERT(MUTEX_HELD(&db->db_mtx));
2618 
2619 	if (db->db_blkptr != NULL)
2620 		return;
2621 
2622 	if (db->db_blkid == DMU_SPILL_BLKID) {
2623 		db->db_blkptr = &dn->dn_phys->dn_spill;
2624 		BP_ZERO(db->db_blkptr);
2625 		return;
2626 	}
2627 	if (db->db_level == dn->dn_phys->dn_nlevels-1) {
2628 		/*
2629 		 * This buffer was allocated at a time when there were
2630 		 * no available blkptrs from the dnode, or it was
2631 		 * inappropriate to hook it in (i.e., nlevels mismatch).
2632 		 */
2633 		ASSERT(db->db_blkid < dn->dn_phys->dn_nblkptr);
2634 		ASSERT(db->db_parent == NULL);
2635 		db->db_parent = dn->dn_dbuf;
2636 		db->db_blkptr = &dn->dn_phys->dn_blkptr[db->db_blkid];
2637 		DBUF_VERIFY(db);
2638 	} else {
2639 		dmu_buf_impl_t *parent = db->db_parent;
2640 		int epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
2641 
2642 		ASSERT(dn->dn_phys->dn_nlevels > 1);
2643 		if (parent == NULL) {
2644 			mutex_exit(&db->db_mtx);
2645 			rw_enter(&dn->dn_struct_rwlock, RW_READER);
2646 			parent = dbuf_hold_level(dn, db->db_level + 1,
2647 			    db->db_blkid >> epbs, db);
2648 			rw_exit(&dn->dn_struct_rwlock);
2649 			mutex_enter(&db->db_mtx);
2650 			db->db_parent = parent;
2651 		}
2652 		db->db_blkptr = (blkptr_t *)parent->db.db_data +
2653 		    (db->db_blkid & ((1ULL << epbs) - 1));
2654 		DBUF_VERIFY(db);
2655 	}
2656 }
2657 
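/*
 * Sync a dirty indirect block: read it in if necessary, set up its block
 * pointer, create its write zio, and then sync all of its dirty children
 * so that their writes become children of that zio before it is issued.
 */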
2658 static void
2659 dbuf_sync_indirect(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
2660 {
2661 	dmu_buf_impl_t *db = dr->dr_dbuf;
2662 	dnode_t *dn;
2663 	zio_t *zio;
2664 
2665 	ASSERT(dmu_tx_is_syncing(tx));
2666 
2667 	dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr);
2668 
2669 	mutex_enter(&db->db_mtx);
2670 
2671 	ASSERT(db->db_level > 0);
2672 	DBUF_VERIFY(db);
2673 
2674 	/* Read the block if it hasn't been read yet. */
2675 	if (db->db_buf == NULL) {
2676 		mutex_exit(&db->db_mtx);
2677 		(void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED);
2678 		mutex_enter(&db->db_mtx);
2679 	}
2680 	ASSERT3U(db->db_state, ==, DB_CACHED);
2681 	ASSERT(db->db_buf != NULL);
2682 
2683 	DB_DNODE_ENTER(db);
2684 	dn = DB_DNODE(db);
2685 	/* Indirect block size must match what the dnode thinks it is. */
2686 	ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
2687 	dbuf_check_blkptr(dn, db);
2688 	DB_DNODE_EXIT(db);
2689 
2690 	/* Provide the pending dirty record to child dbufs */
2691 	db->db_data_pending = dr;
2692 
2693 	mutex_exit(&db->db_mtx);
2694 	dbuf_write(dr, db->db_buf, tx);
2695 
2696 	zio = dr->dr_zio;
2697 	mutex_enter(&dr->dt.di.dr_mtx);
2698 	dbuf_sync_list(&dr->dt.di.dr_children, db->db_level - 1, tx);
2699 	ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
2700 	mutex_exit(&dr->dt.di.dr_mtx);
2701 	zio_nowait(zio);
2702 }
2703 
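/*
 * Sync a dirty level-0 block.  Bonus buffers are simply copied into the
 * dnode phys and their dirty record retired; all other buffers are handed
 * to dbuf_write(), making a private copy of the data first if the open
 * context still holds a reference to the buffer being written.
 */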
2704 static void
2705 dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
2706 {
2707 	arc_buf_t **datap = &dr->dt.dl.dr_data;
2708 	dmu_buf_impl_t *db = dr->dr_dbuf;
2709 	dnode_t *dn;
2710 	objset_t *os;
2711 	uint64_t txg = tx->tx_txg;
2712 
2713 	ASSERT(dmu_tx_is_syncing(tx));
2714 
2715 	dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr);
2716 
2717 	mutex_enter(&db->db_mtx);
2718 	/*
2719 	 * To be synced, we must be dirtied.  But we
2720 	 * might have been freed after the dirty.
2721 	 */
2722 	if (db->db_state == DB_UNCACHED) {
2723 		/* This buffer has been freed since it was dirtied */
2724 		ASSERT(db->db.db_data == NULL);
2725 	} else if (db->db_state == DB_FILL) {
2726 		/* This buffer was freed and is now being re-filled */
2727 		ASSERT(db->db.db_data != dr->dt.dl.dr_data);
2728 	} else {
2729 		ASSERT(db->db_state == DB_CACHED || db->db_state == DB_NOFILL);
2730 	}
2731 	DBUF_VERIFY(db);
2732 
2733 	DB_DNODE_ENTER(db);
2734 	dn = DB_DNODE(db);
2735 
2736 	if (db->db_blkid == DMU_SPILL_BLKID) {
2737 		mutex_enter(&dn->dn_mtx);
2738 		dn->dn_phys->dn_flags |= DNODE_FLAG_SPILL_BLKPTR;
2739 		mutex_exit(&dn->dn_mtx);
2740 	}
2741 
2742 	/*
2743 	 * If this is a bonus buffer, simply copy the bonus data into the
2744 	 * dnode.  It will be written out when the dnode is synced (and it
2745 	 * will be synced, since it must have been dirty for dbuf_sync to
2746 	 * be called).
2747 	 */
2748 	if (db->db_blkid == DMU_BONUS_BLKID) {
2749 		dbuf_dirty_record_t **drp;
2750 
2751 		ASSERT(*datap != NULL);
2752 		ASSERT0(db->db_level);
2753 		ASSERT3U(dn->dn_phys->dn_bonuslen, <=, DN_MAX_BONUSLEN);
2754 		bcopy(*datap, DN_BONUS(dn->dn_phys), dn->dn_phys->dn_bonuslen);
2755 		DB_DNODE_EXIT(db);
2756 
2757 		if (*datap != db->db.db_data) {
2758 			zio_buf_free(*datap, DN_MAX_BONUSLEN);
2759 			arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
2760 		}
2761 		db->db_data_pending = NULL;
2762 		drp = &db->db_last_dirty;
2763 		while (*drp != dr)
2764 			drp = &(*drp)->dr_next;
2765 		ASSERT(dr->dr_next == NULL);
2766 		ASSERT(dr->dr_dbuf == db);
2767 		*drp = dr->dr_next;
2768 		kmem_free(dr, sizeof (dbuf_dirty_record_t));
2769 		ASSERT(db->db_dirtycnt > 0);
2770 		db->db_dirtycnt -= 1;
2771 		dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg);
2772 		return;
2773 	}
2774 
2775 	os = dn->dn_objset;
2776 
2777 	/*
2778 	 * dbuf_check_blkptr() below may drop the db_mtx lock, allowing a
2779 	 * dmu_sync operation to sneak in. As a result, we need to ensure
2780 	 * that we don't check the dr_override_state until we have returned
2781 	 * from dbuf_check_blkptr().
2782 	 */
2783 	dbuf_check_blkptr(dn, db);
2784 
2785 	/*
2786 	 * If this buffer is in the middle of an immediate write,
2787 	 * wait for the synchronous IO to complete.
2788 	 */
2789 	while (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC) {
2790 		ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT);
2791 		cv_wait(&db->db_changed, &db->db_mtx);
2792 		ASSERT(dr->dt.dl.dr_override_state != DR_NOT_OVERRIDDEN);
2793 	}
2794 
2795 	if (db->db_state != DB_NOFILL &&
2796 	    dn->dn_object != DMU_META_DNODE_OBJECT &&
2797 	    refcount_count(&db->db_holds) > 1 &&
2798 	    dr->dt.dl.dr_override_state != DR_OVERRIDDEN &&
2799 	    *datap == db->db_buf) {
2800 		/*
2801 		 * If this buffer is currently "in use" (i.e., there
2802 		 * are active holds and db_data still references it),
2803 		 * then make a copy before we start the write so that
2804 		 * any modifications from the open txg will not leak
2805 		 * into this write.
2806 		 *
2807 		 * NOTE: this copy does not need to be made for
2808 		 * objects only modified in the syncing context (e.g.
2809 		 * meta-dnode blocks).
2810 		 */
2811 		int blksz = arc_buf_size(*datap);
2812 		arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
2813 		*datap = arc_buf_alloc(os->os_spa, blksz, db, type);
2814 		bcopy(db->db.db_data, (*datap)->b_data, blksz);
2815 	}
2816 	db->db_data_pending = dr;
2817 
2818 	mutex_exit(&db->db_mtx);
2819 
2820 	dbuf_write(dr, *datap, tx);
2821 
2822 	ASSERT(!list_link_active(&dr->dr_dirty_node));
2823 	if (dn->dn_object == DMU_META_DNODE_OBJECT) {
2824 		list_insert_tail(&dn->dn_dirty_records[txg&TXG_MASK], dr);
2825 		DB_DNODE_EXIT(db);
2826 	} else {
2827 		/*
2828 		 * Although zio_nowait() does not "wait for an IO", it does
2829 		 * initiate the IO. If this is an empty write it seems plausible
2830 		 * that the IO could actually be completed before the nowait
2831 		 * returns. We need to DB_DNODE_EXIT() first in case
2832 		 * zio_nowait() invalidates the dbuf.
2833 		 */
2834 		DB_DNODE_EXIT(db);
2835 		zio_nowait(dr->dr_zio);
2836 	}
2837 }
2838 
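/*
 * Sync every dirty record on the given list at the given level,
 * dispatching to dbuf_sync_indirect() or dbuf_sync_leaf() as appropriate.
 * Stop at the first record whose zio has already been created; that only
 * happens for the meta-dnode (see the comment in the loop below).
 */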
2839 void
2840 dbuf_sync_list(list_t *list, int level, dmu_tx_t *tx)
2841 {
2842 	dbuf_dirty_record_t *dr;
2843 
2844 	while ((dr = list_head(list)) != NULL) {
2845 		if (dr->dr_zio != NULL) {
2846 			/*
2847 			 * If we find an already initialized zio then we
2848 			 * are processing the meta-dnode, and we have finished.
2849 			 * The dbufs for all dnodes are put back on the list
2850 			 * during processing, so that we can zio_wait()
2851 			 * these IOs after initiating all child IOs.
2852 			 */
2853 			ASSERT3U(dr->dr_dbuf->db.db_object, ==,
2854 			    DMU_META_DNODE_OBJECT);
2855 			break;
2856 		}
2857 		if (dr->dr_dbuf->db_blkid != DMU_BONUS_BLKID &&
2858 		    dr->dr_dbuf->db_blkid != DMU_SPILL_BLKID) {
2859 			VERIFY3U(dr->dr_dbuf->db_level, ==, level);
2860 		}
2861 		list_remove(list, dr);
2862 		if (dr->dr_dbuf->db_level > 0)
2863 			dbuf_sync_indirect(dr, tx);
2864 		else
2865 			dbuf_sync_leaf(dr, tx);
2866 	}
2867 }
2868 
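/*
 * "Ready" callback for a dbuf's write zio: the block pointer is now
 * known, so update the dnode's space accounting, the on-disk maxblkid,
 * and the block pointer's fill count.
 */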
2869 /* ARGSUSED */
2870 static void
2871 dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
2872 {
2873 	dmu_buf_impl_t *db = vdb;
2874 	dnode_t *dn;
2875 	blkptr_t *bp = zio->io_bp;
2876 	blkptr_t *bp_orig = &zio->io_bp_orig;
2877 	spa_t *spa = zio->io_spa;
2878 	int64_t delta;
2879 	uint64_t fill = 0;
2880 	int i;
2881 
2882 	ASSERT3P(db->db_blkptr, ==, bp);
2883 
2884 	DB_DNODE_ENTER(db);
2885 	dn = DB_DNODE(db);
2886 	delta = bp_get_dsize_sync(spa, bp) - bp_get_dsize_sync(spa, bp_orig);
2887 	dnode_diduse_space(dn, delta - zio->io_prev_space_delta);
2888 	zio->io_prev_space_delta = delta;
2889 
2890 	if (bp->blk_birth != 0) {
2891 		ASSERT((db->db_blkid != DMU_SPILL_BLKID &&
2892 		    BP_GET_TYPE(bp) == dn->dn_type) ||
2893 		    (db->db_blkid == DMU_SPILL_BLKID &&
2894 		    BP_GET_TYPE(bp) == dn->dn_bonustype) ||
2895 		    BP_IS_EMBEDDED(bp));
2896 		ASSERT(BP_GET_LEVEL(bp) == db->db_level);
2897 	}
2898 
2899 	mutex_enter(&db->db_mtx);
2900 
2901 #ifdef ZFS_DEBUG
2902 	if (db->db_blkid == DMU_SPILL_BLKID) {
2903 		ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR);
2904 		ASSERT(!(BP_IS_HOLE(db->db_blkptr)) &&
2905 		    db->db_blkptr == &dn->dn_phys->dn_spill);
2906 	}
2907 #endif
2908 
2909 	if (db->db_level == 0) {
2910 		mutex_enter(&dn->dn_mtx);
2911 		if (db->db_blkid > dn->dn_phys->dn_maxblkid &&
2912 		    db->db_blkid != DMU_SPILL_BLKID)
2913 			dn->dn_phys->dn_maxblkid = db->db_blkid;
2914 		mutex_exit(&dn->dn_mtx);
2915 
2916 		if (dn->dn_type == DMU_OT_DNODE) {
2917 			dnode_phys_t *dnp = db->db.db_data;
2918 			for (i = db->db.db_size >> DNODE_SHIFT; i > 0;
2919 			    i--, dnp++) {
2920 				if (dnp->dn_type != DMU_OT_NONE)
2921 					fill++;
2922 			}
2923 		} else {
2924 			if (BP_IS_HOLE(bp)) {
2925 				fill = 0;
2926 			} else {
2927 				fill = 1;
2928 			}
2929 		}
2930 	} else {
2931 		blkptr_t *ibp = db->db.db_data;
2932 		ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
2933 		for (i = db->db.db_size >> SPA_BLKPTRSHIFT; i > 0; i--, ibp++) {
2934 			if (BP_IS_HOLE(ibp))
2935 				continue;
2936 			fill += BP_GET_FILL(ibp);
2937 		}
2938 	}
2939 	DB_DNODE_EXIT(db);
2940 
2941 	if (!BP_IS_EMBEDDED(bp))
2942 		bp->blk_fill = fill;
2943 
2944 	mutex_exit(&db->db_mtx);
2945 }
2946 
2947 /*
2948  * The SPA will call this callback several times for each zio - once
2949  * for every physical child i/o (zio->io_phys_children times).  This
2950  * allows the DMU to monitor the progress of each logical i/o.  For example,
2951  * there may be 2 copies of an indirect block, or many fragments of a RAID-Z
2952  * block.  There may be a long delay before all copies/fragments are completed,
2953  * so this callback allows us to retire dirty space gradually, as the physical
2954  * i/os complete.
2955  */
2956 /* ARGSUSED */
2957 static void
2958 dbuf_write_physdone(zio_t *zio, arc_buf_t *buf, void *arg)
2959 {
2960 	dmu_buf_impl_t *db = arg;
2961 	objset_t *os = db->db_objset;
2962 	dsl_pool_t *dp = dmu_objset_pool(os);
2963 	dbuf_dirty_record_t *dr;
2964 	int delta = 0;
2965 
2966 	dr = db->db_data_pending;
2967 	ASSERT3U(dr->dr_txg, ==, zio->io_txg);
2968 
2969 	/*
2970 	 * The callback will be called io_phys_children times.  Retire one
2971 	 * portion of our dirty space each time we are called.  Any rounding
2972 	 * error will be cleaned up by dsl_pool_sync()'s call to
2973 	 * dsl_pool_undirty_space().
2974 	 */
2975 	delta = dr->dr_accounted / zio->io_phys_children;
2976 	dsl_pool_undirty_space(dp, delta, zio->io_txg);
2977 }
2978 
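/*
 * "Done" callback for a dbuf's write zio: perform the dataset block
 * accounting (unless this was a nopwrite or rewrite), retire the dirty
 * record, and drop the hold that was taken when the dbuf was dirtied.
 */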
2979 /* ARGSUSED */
2980 static void
2981 dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
2982 {
2983 	dmu_buf_impl_t *db = vdb;
2984 	blkptr_t *bp_orig = &zio->io_bp_orig;
2985 	blkptr_t *bp = db->db_blkptr;
2986 	objset_t *os = db->db_objset;
2987 	dmu_tx_t *tx = os->os_synctx;
2988 	dbuf_dirty_record_t **drp, *dr;
2989 
2990 	ASSERT0(zio->io_error);
2991 	ASSERT(db->db_blkptr == bp);
2992 
2993 	/*
2994 	 * For nopwrites and rewrites we ensure that the bp matches our
2995 	 * original and bypass all the accounting.
2996 	 */
2997 	if (zio->io_flags & (ZIO_FLAG_IO_REWRITE | ZIO_FLAG_NOPWRITE)) {
2998 		ASSERT(BP_EQUAL(bp, bp_orig));
2999 	} else {
3000 		dsl_dataset_t *ds = os->os_dsl_dataset;
3001 		(void) dsl_dataset_block_kill(ds, bp_orig, tx, B_TRUE);
3002 		dsl_dataset_block_born(ds, bp, tx);
3003 	}
3004 
3005 	mutex_enter(&db->db_mtx);
3006 
3007 	DBUF_VERIFY(db);
3008 
3009 	drp = &db->db_last_dirty;
3010 	while ((dr = *drp) != db->db_data_pending)
3011 		drp = &dr->dr_next;
3012 	ASSERT(!list_link_active(&dr->dr_dirty_node));
3013 	ASSERT(dr->dr_dbuf == db);
3014 	ASSERT(dr->dr_next == NULL);
3015 	*drp = dr->dr_next;
3016 
3017 #ifdef ZFS_DEBUG
3018 	if (db->db_blkid == DMU_SPILL_BLKID) {
3019 		dnode_t *dn;
3020 
3021 		DB_DNODE_ENTER(db);
3022 		dn = DB_DNODE(db);
3023 		ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR);
3024 		ASSERT(!(BP_IS_HOLE(db->db_blkptr)) &&
3025 		    db->db_blkptr == &dn->dn_phys->dn_spill);
3026 		DB_DNODE_EXIT(db);
3027 	}
3028 #endif
3029 
3030 	if (db->db_level == 0) {
3031 		ASSERT(db->db_blkid != DMU_BONUS_BLKID);
3032 		ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN);
3033 		if (db->db_state != DB_NOFILL) {
3034 			if (dr->dt.dl.dr_data != db->db_buf)
3035 				VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data,
3036 				    db));
3037 			else if (!arc_released(db->db_buf))
3038 				arc_set_callback(db->db_buf, dbuf_do_evict, db);
3039 		}
3040 	} else {
3041 		dnode_t *dn;
3042 
3043 		DB_DNODE_ENTER(db);
3044 		dn = DB_DNODE(db);
3045 		ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
3046 		ASSERT3U(db->db.db_size, ==, 1 << dn->dn_phys->dn_indblkshift);
3047 		if (!BP_IS_HOLE(db->db_blkptr)) {
3048 			int epbs =
3049 			    dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
3050 			ASSERT3U(db->db_blkid, <=,
3051 			    dn->dn_phys->dn_maxblkid >> (db->db_level * epbs));
3052 			ASSERT3U(BP_GET_LSIZE(db->db_blkptr), ==,
3053 			    db->db.db_size);
3054 			if (!arc_released(db->db_buf))
3055 				arc_set_callback(db->db_buf, dbuf_do_evict, db);
3056 		}
3057 		DB_DNODE_EXIT(db);
3058 		mutex_destroy(&dr->dt.di.dr_mtx);
3059 		list_destroy(&dr->dt.di.dr_children);
3060 	}
3061 	kmem_free(dr, sizeof (dbuf_dirty_record_t));
3062 
3063 	cv_broadcast(&db->db_changed);
3064 	ASSERT(db->db_dirtycnt > 0);
3065 	db->db_dirtycnt -= 1;
3066 	db->db_data_pending = NULL;
3067 	dbuf_rele_and_unlock(db, (void *)(uintptr_t)tx->tx_txg);
3068 }
3069 
3070 static void
3071 dbuf_write_nofill_ready(zio_t *zio)
3072 {
3073 	dbuf_write_ready(zio, NULL, zio->io_private);
3074 }
3075 
3076 static void
3077 dbuf_write_nofill_done(zio_t *zio)
3078 {
3079 	dbuf_write_done(zio, NULL, zio->io_private);
3080 }
3081 
3082 static void
3083 dbuf_write_override_ready(zio_t *zio)
3084 {
3085 	dbuf_dirty_record_t *dr = zio->io_private;
3086 	dmu_buf_impl_t *db = dr->dr_dbuf;
3087 
3088 	dbuf_write_ready(zio, NULL, db);
3089 }
3090 
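/*
 * "Done" callback for an override write, where the block pointer was
 * provided by open context (e.g. by dmu_sync()).  If the final bp differs
 * from the overridden one, free the overridden block and arc_release()
 * the data buffer, then continue with the normal write-done handling.
 */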
3091 static void
3092 dbuf_write_override_done(zio_t *zio)
3093 {
3094 	dbuf_dirty_record_t *dr = zio->io_private;
3095 	dmu_buf_impl_t *db = dr->dr_dbuf;
3096 	blkptr_t *obp = &dr->dt.dl.dr_overridden_by;
3097 
3098 	mutex_enter(&db->db_mtx);
3099 	if (!BP_EQUAL(zio->io_bp, obp)) {
3100 		if (!BP_IS_HOLE(obp))
3101 			dsl_free(spa_get_dsl(zio->io_spa), zio->io_txg, obp);
3102 		arc_release(dr->dt.dl.dr_data, db);
3103 	}
3104 	mutex_exit(&db->db_mtx);
3105 
3106 	dbuf_write_done(zio, NULL, db);
3107 }
3108 
3109 /* Issue I/O to commit a dirty buffer to disk. */
3110 static void
3111 dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
3112 {
3113 	dmu_buf_impl_t *db = dr->dr_dbuf;
3114 	dnode_t *dn;
3115 	objset_t *os;
3116 	dmu_buf_impl_t *parent = db->db_parent;
3117 	uint64_t txg = tx->tx_txg;
3118 	zbookmark_phys_t zb;
3119 	zio_prop_t zp;
3120 	zio_t *zio;
3121 	int wp_flag = 0;
3122 
3123 	DB_DNODE_ENTER(db);
3124 	dn = DB_DNODE(db);
3125 	os = dn->dn_objset;
3126 
3127 	if (db->db_state != DB_NOFILL) {
3128 		if (db->db_level > 0 || dn->dn_type == DMU_OT_DNODE) {
3129 			/*
3130 			 * Private object buffers are released here rather
3131 			 * than in dbuf_dirty() since they are only modified
3132 			 * in the syncing context and we don't want the
3133 			 * overhead of making multiple copies of the data.
3134 			 */
3135 			if (BP_IS_HOLE(db->db_blkptr)) {
3136 				arc_buf_thaw(data);
3137 			} else {
3138 				dbuf_release_bp(db);
3139 			}
3140 		}
3141 	}
3142 
3143 	if (parent != dn->dn_dbuf) {
3144 		/* Our parent is an indirect block. */
3145 		/* We have a dirty parent that has been scheduled for write. */
3146 		ASSERT(parent && parent->db_data_pending);
3147 		/* Our parent's buffer is one level closer to the dnode. */
3148 		ASSERT(db->db_level == parent->db_level-1);
3149 		/*
3150 		 * We're about to modify our parent's db_data by modifying
3151 		 * our block pointer, so the parent must be released.
3152 		 */
3153 		ASSERT(arc_released(parent->db_buf));
3154 		zio = parent->db_data_pending->dr_zio;
3155 	} else {
3156 		/* Our parent is the dnode itself. */
3157 		ASSERT((db->db_level == dn->dn_phys->dn_nlevels-1 &&
3158 		    db->db_blkid != DMU_SPILL_BLKID) ||
3159 		    (db->db_blkid == DMU_SPILL_BLKID && db->db_level == 0));
3160 		if (db->db_blkid != DMU_SPILL_BLKID)
3161 			ASSERT3P(db->db_blkptr, ==,
3162 			    &dn->dn_phys->dn_blkptr[db->db_blkid]);
3163 		zio = dn->dn_zio;
3164 	}
3165 
3166 	ASSERT(db->db_level == 0 || data == db->db_buf);
3167 	ASSERT3U(db->db_blkptr->blk_birth, <=, txg);
3168 	ASSERT(zio);
3169 
3170 	SET_BOOKMARK(&zb, os->os_dsl_dataset ?
3171 	    os->os_dsl_dataset->ds_object : DMU_META_OBJSET,
3172 	    db->db.db_object, db->db_level, db->db_blkid);
3173 
3174 	if (db->db_blkid == DMU_SPILL_BLKID)
3175 		wp_flag = WP_SPILL;
3176 	wp_flag |= (db->db_state == DB_NOFILL) ? WP_NOFILL : 0;
3177 
3178 	dmu_write_policy(os, dn, db->db_level, wp_flag, &zp);
3179 	DB_DNODE_EXIT(db);
3180 
3181 	if (db->db_level == 0 &&
3182 	    dr->dt.dl.dr_override_state == DR_OVERRIDDEN) {
3183 		/*
3184 		 * The BP for this block has been provided by open context
3185 		 * (by dmu_sync() or dmu_buf_write_embedded()).
3186 		 */
3187 		void *contents = (data != NULL) ? data->b_data : NULL;
3188 
3189 		dr->dr_zio = zio_write(zio, os->os_spa, txg,
3190 		    db->db_blkptr, contents, db->db.db_size, &zp,
3191 		    dbuf_write_override_ready, NULL, dbuf_write_override_done,
3192 		    dr, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
3193 		mutex_enter(&db->db_mtx);
3194 		dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
3195 		zio_write_override(dr->dr_zio, &dr->dt.dl.dr_overridden_by,
3196 		    dr->dt.dl.dr_copies, dr->dt.dl.dr_nopwrite);
3197 		mutex_exit(&db->db_mtx);
3198 	} else if (db->db_state == DB_NOFILL) {
3199 		ASSERT(zp.zp_checksum == ZIO_CHECKSUM_OFF ||
3200 		    zp.zp_checksum == ZIO_CHECKSUM_NOPARITY);
3201 		dr->dr_zio = zio_write(zio, os->os_spa, txg,
3202 		    db->db_blkptr, NULL, db->db.db_size, &zp,
3203 		    dbuf_write_nofill_ready, NULL, dbuf_write_nofill_done, db,
3204 		    ZIO_PRIORITY_ASYNC_WRITE,
3205 		    ZIO_FLAG_MUSTSUCCEED | ZIO_FLAG_NODATA, &zb);
3206 	} else {
3207 		ASSERT(arc_released(data));
3208 		dr->dr_zio = arc_write(zio, os->os_spa, txg,
3209 		    db->db_blkptr, data, DBUF_IS_L2CACHEABLE(db),
3210 		    DBUF_IS_L2COMPRESSIBLE(db), &zp, dbuf_write_ready,
3211 		    dbuf_write_physdone, dbuf_write_done, db,
3212 		    ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
3213 	}
3214 }
3215