1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23 * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
24 * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
25 * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
26 * Copyright (c) 2013, Joyent, Inc. All rights reserved.
27 * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
28 * Copyright (c) 2014 Integros [integros.com]
29 */
30
31#include <sys/zfs_context.h>
32#include <sys/dmu.h>
33#include <sys/dmu_send.h>
34#include <sys/dmu_impl.h>
35#include <sys/dbuf.h>
36#include <sys/dmu_objset.h>
37#include <sys/dsl_dataset.h>
38#include <sys/dsl_dir.h>
39#include <sys/dmu_tx.h>
40#include <sys/spa.h>
41#include <sys/zio.h>
42#include <sys/dmu_zfetch.h>
43#include <sys/sa.h>
44#include <sys/sa_impl.h>
45#include <sys/zfeature.h>
46#include <sys/blkptr.h>
47#include <sys/range_tree.h>
48#include <sys/callb.h>
49#include <sys/abd.h>
50#include <sys/vdev.h>
51#include <sys/cityhash.h>
52#include <sys/spa_impl.h>
53
54static boolean_t dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
55static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx);
56
57#ifndef __lint
58extern inline void dmu_buf_init_user(dmu_buf_user_t *dbu,
59    dmu_buf_evict_func_t *evict_func_sync,
60    dmu_buf_evict_func_t *evict_func_async,
61    dmu_buf_t **clear_on_evict_dbufp);
62#endif /* ! __lint */
63
64/*
65 * Global data structures and functions for the dbuf cache.
66 */
67static kmem_cache_t *dbuf_kmem_cache;
68static taskq_t *dbu_evict_taskq;
69
70static kthread_t *dbuf_cache_evict_thread;
71static kmutex_t dbuf_evict_lock;
72static kcondvar_t dbuf_evict_cv;
73static boolean_t dbuf_evict_thread_exit;
74
75/*
76 * There are two dbuf caches; each dbuf can only be in one of them at a time.
77 *
78 * 1. Cache of metadata dbufs, to help make read-heavy administrative commands
79 *    from /sbin/zfs run faster. The "metadata cache" specifically stores dbufs
80 *    that represent the metadata that describes filesystems/snapshots/
81 *    bookmarks/properties/etc. We only evict from this cache when we export a
82 *    pool, to short-circuit as much I/O as possible for all administrative
83 *    commands that need the metadata. There is no eviction policy for this
84 *    cache, because we try to only include types in it which would occupy a
85 *    very small amount of space per object but create a large impact on the
86 *    performance of these commands. Instead, after it reaches a maximum size
87 *    (which should only happen on very small memory systems with a very large
88 *    number of filesystem objects), we stop taking new dbufs into the
89 *    metadata cache, instead putting them in the normal dbuf cache.
90 *
91 * 2. LRU cache of dbufs. The "dbuf cache" maintains a list of dbufs that
92 *    are not currently held but have been recently released. These dbufs
93 *    are not eligible for arc eviction until they are aged out of the cache.
94 *    Dbufs that are aged out of the cache will be immediately destroyed and
95 *    become eligible for arc eviction.
96 *
97 * Dbufs are added to these caches once the last hold is released. If a dbuf is
98 * later accessed and still exists in the dbuf cache, then it will be removed
99 * from the cache and later re-added to the head of the cache.
100 *
101 * If a given dbuf meets the requirements for the metadata cache, it will go
102 * there, otherwise it will be considered for the generic LRU dbuf cache. The
103 * caches and the refcounts tracking their sizes are stored in an array indexed
104 * by those caches' matching enum values (from dbuf_cached_state_t).
105 */
106typedef struct dbuf_cache {
107	multilist_t *cache;
108	zfs_refcount_t size;
109} dbuf_cache_t;
110dbuf_cache_t dbuf_caches[DB_CACHE_MAX];
111
112/* Size limits for the caches */
113uint64_t dbuf_cache_max_bytes = 0;
114uint64_t dbuf_metadata_cache_max_bytes = 0;
115/* Set the default sizes of the caches to log2 fraction of arc size */
116int dbuf_cache_shift = 5;
117int dbuf_metadata_cache_shift = 6;
118
119/*
120 * For diagnostic purposes, this is incremented whenever we can't add
121 * something to the metadata cache because it's full, and instead put
122 * the data in the regular dbuf cache.
123 */
124uint64_t dbuf_metadata_cache_overflow;
125
126/*
127 * The LRU dbuf cache uses a three-stage eviction policy:
128 *	- A low water marker designates when the dbuf eviction thread
129 *	should stop evicting from the dbuf cache.
130 *	- When we reach the maximum size (aka mid water mark), we
131 *	signal the eviction thread to run.
132 *	- The high water mark indicates when the eviction thread
133 *	is unable to keep up with the incoming load and eviction must
134 *	happen in the context of the calling thread.
135 *
136 * The dbuf cache:
137 *                                                 (max size)
138 *                                      low water   mid water   hi water
139 * +----------------------------------------+----------+----------+
140 * |                                        |          |          |
141 * |                                        |          |          |
142 * |                                        |          |          |
143 * |                                        |          |          |
144 * +----------------------------------------+----------+----------+
145 *                                        stop        signal     evict
146 *                                      evicting     eviction   directly
147 *                                                    thread
148 *
149 * The high and low water marks indicate the operating range for the eviction
150 * thread. The low water mark is, by default, 90% of the total size of the
151 * cache and the high water mark is at 110% (both of these percentages can be
152 * changed by setting dbuf_cache_lowater_pct and dbuf_cache_hiwater_pct,
153 * respectively). The eviction thread will try to ensure that the cache remains
154 * within this range by waking up every second and checking if the cache is
155 * above the low water mark. The thread can also be woken up by callers adding
156 * elements into the cache if the cache is larger than the mid water (i.e max
157 * cache size). Once the eviction thread is woken up and eviction is required,
158 * it will continue evicting buffers until it's able to reduce the cache size
159 * to the low water mark. If the cache size continues to grow and hits the high
160 * water mark, then callers adding elements to the cache will begin to evict
161 * directly from the cache until the cache is no longer above the high water
162 * mark.
163 */
164
165/*
166 * The percentage above and below the maximum cache size.
167 */
168uint_t dbuf_cache_hiwater_pct = 10;
169uint_t dbuf_cache_lowater_pct = 10;
170
171/* ARGSUSED */
172static int
173dbuf_cons(void *vdb, void *unused, int kmflag)
174{
175	dmu_buf_impl_t *db = vdb;
176	bzero(db, sizeof (dmu_buf_impl_t));
177
178	mutex_init(&db->db_mtx, NULL, MUTEX_DEFAULT, NULL);
179	cv_init(&db->db_changed, NULL, CV_DEFAULT, NULL);
180	multilist_link_init(&db->db_cache_link);
181	zfs_refcount_create(&db->db_holds);
182
183	return (0);
184}
185
186/* ARGSUSED */
187static void
188dbuf_dest(void *vdb, void *unused)
189{
190	dmu_buf_impl_t *db = vdb;
191	mutex_destroy(&db->db_mtx);
192	cv_destroy(&db->db_changed);
193	ASSERT(!multilist_link_active(&db->db_cache_link));
194	zfs_refcount_destroy(&db->db_holds);
195}
196
197/*
198 * dbuf hash table routines
199 */
200static dbuf_hash_table_t dbuf_hash_table;
201
202static uint64_t dbuf_hash_count;
203
204/*
205 * We use Cityhash for this. It's fast, and has good hash properties without
206 * requiring any large static buffers.
207 */
208static uint64_t
209dbuf_hash(void *os, uint64_t obj, uint8_t lvl, uint64_t blkid)
210{
211	return (cityhash4((uintptr_t)os, obj, (uint64_t)lvl, blkid));
212}
213
214#define	DBUF_EQUAL(dbuf, os, obj, level, blkid)		\
215	((dbuf)->db.db_object == (obj) &&		\
216	(dbuf)->db_objset == (os) &&			\
217	(dbuf)->db_level == (level) &&			\
218	(dbuf)->db_blkid == (blkid))
219
220dmu_buf_impl_t *
221dbuf_find(objset_t *os, uint64_t obj, uint8_t level, uint64_t blkid)
222{
223	dbuf_hash_table_t *h = &dbuf_hash_table;
224	uint64_t hv = dbuf_hash(os, obj, level, blkid);
225	uint64_t idx = hv & h->hash_table_mask;
226	dmu_buf_impl_t *db;
227
228	mutex_enter(DBUF_HASH_MUTEX(h, idx));
229	for (db = h->hash_table[idx]; db != NULL; db = db->db_hash_next) {
230		if (DBUF_EQUAL(db, os, obj, level, blkid)) {
231			mutex_enter(&db->db_mtx);
232			if (db->db_state != DB_EVICTING) {
233				mutex_exit(DBUF_HASH_MUTEX(h, idx));
234				return (db);
235			}
236			mutex_exit(&db->db_mtx);
237		}
238	}
239	mutex_exit(DBUF_HASH_MUTEX(h, idx));
240	return (NULL);
241}
242
243static dmu_buf_impl_t *
244dbuf_find_bonus(objset_t *os, uint64_t object)
245{
246	dnode_t *dn;
247	dmu_buf_impl_t *db = NULL;
248
249	if (dnode_hold(os, object, FTAG, &dn) == 0) {
250		rw_enter(&dn->dn_struct_rwlock, RW_READER);
251		if (dn->dn_bonus != NULL) {
252			db = dn->dn_bonus;
253			mutex_enter(&db->db_mtx);
254		}
255		rw_exit(&dn->dn_struct_rwlock);
256		dnode_rele(dn, FTAG);
257	}
258	return (db);
259}
260
261/*
262 * Insert an entry into the hash table.  If there is already an element
263 * equal to elem in the hash table, then the already existing element
264 * will be returned and the new element will not be inserted.
265 * Otherwise returns NULL.
266 */
267static dmu_buf_impl_t *
268dbuf_hash_insert(dmu_buf_impl_t *db)
269{
270	dbuf_hash_table_t *h = &dbuf_hash_table;
271	objset_t *os = db->db_objset;
272	uint64_t obj = db->db.db_object;
273	int level = db->db_level;
274	uint64_t blkid = db->db_blkid;
275	uint64_t hv = dbuf_hash(os, obj, level, blkid);
276	uint64_t idx = hv & h->hash_table_mask;
277	dmu_buf_impl_t *dbf;
278
279	mutex_enter(DBUF_HASH_MUTEX(h, idx));
280	for (dbf = h->hash_table[idx]; dbf != NULL; dbf = dbf->db_hash_next) {
281		if (DBUF_EQUAL(dbf, os, obj, level, blkid)) {
282			mutex_enter(&dbf->db_mtx);
283			if (dbf->db_state != DB_EVICTING) {
284				mutex_exit(DBUF_HASH_MUTEX(h, idx));
285				return (dbf);
286			}
287			mutex_exit(&dbf->db_mtx);
288		}
289	}
290
291	mutex_enter(&db->db_mtx);
292	db->db_hash_next = h->hash_table[idx];
293	h->hash_table[idx] = db;
294	mutex_exit(DBUF_HASH_MUTEX(h, idx));
295	atomic_inc_64(&dbuf_hash_count);
296
297	return (NULL);
298}
299
300/*
301 * Remove an entry from the hash table.  It must be in the EVICTING state.
302 */
303static void
304dbuf_hash_remove(dmu_buf_impl_t *db)
305{
306	dbuf_hash_table_t *h = &dbuf_hash_table;
307	uint64_t hv = dbuf_hash(db->db_objset, db->db.db_object,
308	    db->db_level, db->db_blkid);
309	uint64_t idx = hv & h->hash_table_mask;
310	dmu_buf_impl_t *dbf, **dbp;
311
312	/*
313	 * We mustn't hold db_mtx to maintain lock ordering:
314	 * DBUF_HASH_MUTEX > db_mtx.
315	 */
316	ASSERT(zfs_refcount_is_zero(&db->db_holds));
317	ASSERT(db->db_state == DB_EVICTING);
318	ASSERT(!MUTEX_HELD(&db->db_mtx));
319
320	mutex_enter(DBUF_HASH_MUTEX(h, idx));
321	dbp = &h->hash_table[idx];
322	while ((dbf = *dbp) != db) {
323		dbp = &dbf->db_hash_next;
324		ASSERT(dbf != NULL);
325	}
326	*dbp = db->db_hash_next;
327	db->db_hash_next = NULL;
328	mutex_exit(DBUF_HASH_MUTEX(h, idx));
329	atomic_dec_64(&dbuf_hash_count);
330}
331
332typedef enum {
333	DBVU_EVICTING,
334	DBVU_NOT_EVICTING
335} dbvu_verify_type_t;
336
337static void
338dbuf_verify_user(dmu_buf_impl_t *db, dbvu_verify_type_t verify_type)
339{
340#ifdef ZFS_DEBUG
341	int64_t holds;
342
343	if (db->db_user == NULL)
344		return;
345
346	/* Only data blocks support the attachment of user data. */
347	ASSERT(db->db_level == 0);
348
349	/* Clients must resolve a dbuf before attaching user data. */
350	ASSERT(db->db.db_data != NULL);
351	ASSERT3U(db->db_state, ==, DB_CACHED);
352
353	holds = zfs_refcount_count(&db->db_holds);
354	if (verify_type == DBVU_EVICTING) {
355		/*
356		 * Immediate eviction occurs when holds == dirtycnt.
357		 * For normal eviction buffers, holds is zero on
358		 * eviction, except when dbuf_fix_old_data() calls
359		 * dbuf_clear_data().  However, the hold count can grow
360		 * during eviction even though db_mtx is held (see
361		 * dmu_bonus_hold() for an example), so we can only
362		 * test the generic invariant that holds >= dirtycnt.
363		 */
364		ASSERT3U(holds, >=, db->db_dirtycnt);
365	} else {
366		if (db->db_user_immediate_evict == TRUE)
367			ASSERT3U(holds, >=, db->db_dirtycnt);
368		else
369			ASSERT3U(holds, >, 0);
370	}
371#endif
372}
373
374static void
375dbuf_evict_user(dmu_buf_impl_t *db)
376{
377	dmu_buf_user_t *dbu = db->db_user;
378
379	ASSERT(MUTEX_HELD(&db->db_mtx));
380
381	if (dbu == NULL)
382		return;
383
384	dbuf_verify_user(db, DBVU_EVICTING);
385	db->db_user = NULL;
386
387#ifdef ZFS_DEBUG
388	if (dbu->dbu_clear_on_evict_dbufp != NULL)
389		*dbu->dbu_clear_on_evict_dbufp = NULL;
390#endif
391
392	/*
393	 * There are two eviction callbacks - one that we call synchronously
394	 * and one that we invoke via a taskq.  The async one is useful for
395	 * avoiding lock order reversals and limiting stack depth.
396	 *
397	 * Note that if we have a sync callback but no async callback,
398	 * it's likely that the sync callback will free the structure
399	 * containing the dbu.  In that case we need to take care to not
400	 * dereference dbu after calling the sync evict func.
401	 */
402	boolean_t has_async = (dbu->dbu_evict_func_async != NULL);
403
404	if (dbu->dbu_evict_func_sync != NULL)
405		dbu->dbu_evict_func_sync(dbu);
406
407	if (has_async) {
408		taskq_dispatch_ent(dbu_evict_taskq, dbu->dbu_evict_func_async,
409		    dbu, 0, &dbu->dbu_tqent);
410	}
411}
412
413boolean_t
414dbuf_is_metadata(dmu_buf_impl_t *db)
415{
416	if (db->db_level > 0 || db->db_blkid == DMU_SPILL_BLKID) {
417		return (B_TRUE);
418	} else {
419		boolean_t is_metadata;
420
421		DB_DNODE_ENTER(db);
422		is_metadata = DMU_OT_IS_METADATA(DB_DNODE(db)->dn_type);
423		DB_DNODE_EXIT(db);
424
425		return (is_metadata);
426	}
427}
428
429/*
430 * This returns whether this dbuf should be stored in the metadata cache, which
431 * is based on whether it's from one of the dnode types that store data related
432 * to traversing dataset hierarchies.
433 */
434static boolean_t
435dbuf_include_in_metadata_cache(dmu_buf_impl_t *db)
436{
437	DB_DNODE_ENTER(db);
438	dmu_object_type_t type = DB_DNODE(db)->dn_type;
439	DB_DNODE_EXIT(db);
440
441	/* Check if this dbuf is one of the types we care about */
442	if (DMU_OT_IS_METADATA_CACHED(type)) {
443		/* If we hit this, then we set something up wrong in dmu_ot */
444		ASSERT(DMU_OT_IS_METADATA(type));
445
446		/*
447		 * Sanity check for small-memory systems: don't allocate too
448		 * much memory for this purpose.
449		 */
450		if (zfs_refcount_count(
451		    &dbuf_caches[DB_DBUF_METADATA_CACHE].size) >
452		    dbuf_metadata_cache_max_bytes) {
453			dbuf_metadata_cache_overflow++;
454			DTRACE_PROBE1(dbuf__metadata__cache__overflow,
455			    dmu_buf_impl_t *, db);
456			return (B_FALSE);
457		}
458
459		return (B_TRUE);
460	}
461
462	return (B_FALSE);
463}
464
465/*
466 * This function *must* return indices evenly distributed between all
467 * sublists of the multilist. This is needed due to how the dbuf eviction
468 * code is laid out; dbuf_evict_thread() assumes dbufs are evenly
469 * distributed between all sublists and uses this assumption when
470 * deciding which sublist to evict from and how much to evict from it.
471 */
472unsigned int
473dbuf_cache_multilist_index_func(multilist_t *ml, void *obj)
474{
475	dmu_buf_impl_t *db = obj;
476
477	/*
478	 * The assumption here, is the hash value for a given
479	 * dmu_buf_impl_t will remain constant throughout it's lifetime
480	 * (i.e. it's objset, object, level and blkid fields don't change).
481	 * Thus, we don't need to store the dbuf's sublist index
482	 * on insertion, as this index can be recalculated on removal.
483	 *
484	 * Also, the low order bits of the hash value are thought to be
485	 * distributed evenly. Otherwise, in the case that the multilist
486	 * has a power of two number of sublists, each sublists' usage
487	 * would not be evenly distributed.
488	 */
489	return (dbuf_hash(db->db_objset, db->db.db_object,
490	    db->db_level, db->db_blkid) %
491	    multilist_get_num_sublists(ml));
492}
493
494static inline boolean_t
495dbuf_cache_above_hiwater(void)
496{
497	uint64_t dbuf_cache_hiwater_bytes =
498	    (dbuf_cache_max_bytes * dbuf_cache_hiwater_pct) / 100;
499
500	return (zfs_refcount_count(&dbuf_caches[DB_DBUF_CACHE].size) >
501	    dbuf_cache_max_bytes + dbuf_cache_hiwater_bytes);
502}
503
504static inline boolean_t
505dbuf_cache_above_lowater(void)
506{
507	uint64_t dbuf_cache_lowater_bytes =
508	    (dbuf_cache_max_bytes * dbuf_cache_lowater_pct) / 100;
509
510	return (zfs_refcount_count(&dbuf_caches[DB_DBUF_CACHE].size) >
511	    dbuf_cache_max_bytes - dbuf_cache_lowater_bytes);
512}
513
514/*
515 * Evict the oldest eligible dbuf from the dbuf cache.
516 */
517static void
518dbuf_evict_one(void)
519{
520	int idx = multilist_get_random_index(dbuf_caches[DB_DBUF_CACHE].cache);
521	multilist_sublist_t *mls = multilist_sublist_lock(
522	    dbuf_caches[DB_DBUF_CACHE].cache, idx);
523
524	ASSERT(!MUTEX_HELD(&dbuf_evict_lock));
525
526	dmu_buf_impl_t *db = multilist_sublist_tail(mls);
527	while (db != NULL && mutex_tryenter(&db->db_mtx) == 0) {
528		db = multilist_sublist_prev(mls, db);
529	}
530
531	DTRACE_PROBE2(dbuf__evict__one, dmu_buf_impl_t *, db,
532	    multilist_sublist_t *, mls);
533
534	if (db != NULL) {
535		multilist_sublist_remove(mls, db);
536		multilist_sublist_unlock(mls);
537		(void) zfs_refcount_remove_many(
538		    &dbuf_caches[DB_DBUF_CACHE].size,
539		    db->db.db_size, db);
540		ASSERT3U(db->db_caching_status, ==, DB_DBUF_CACHE);
541		db->db_caching_status = DB_NO_CACHE;
542		dbuf_destroy(db);
543	} else {
544		multilist_sublist_unlock(mls);
545	}
546}
547
548/*
549 * The dbuf evict thread is responsible for aging out dbufs from the
550 * cache. Once the cache has reached it's maximum size, dbufs are removed
551 * and destroyed. The eviction thread will continue running until the size
552 * of the dbuf cache is at or below the maximum size. Once the dbuf is aged
553 * out of the cache it is destroyed and becomes eligible for arc eviction.
554 */
555/* ARGSUSED */
556static void
557dbuf_evict_thread(void *unused)
558{
559	callb_cpr_t cpr;
560
561	CALLB_CPR_INIT(&cpr, &dbuf_evict_lock, callb_generic_cpr, FTAG);
562
563	mutex_enter(&dbuf_evict_lock);
564	while (!dbuf_evict_thread_exit) {
565		while (!dbuf_cache_above_lowater() && !dbuf_evict_thread_exit) {
566			CALLB_CPR_SAFE_BEGIN(&cpr);
567			(void) cv_timedwait_hires(&dbuf_evict_cv,
568			    &dbuf_evict_lock, SEC2NSEC(1), MSEC2NSEC(1), 0);
569			CALLB_CPR_SAFE_END(&cpr, &dbuf_evict_lock);
570		}
571		mutex_exit(&dbuf_evict_lock);
572
573		/*
574		 * Keep evicting as long as we're above the low water mark
575		 * for the cache. We do this without holding the locks to
576		 * minimize lock contention.
577		 */
578		while (dbuf_cache_above_lowater() && !dbuf_evict_thread_exit) {
579			dbuf_evict_one();
580		}
581
582		mutex_enter(&dbuf_evict_lock);
583	}
584
585	dbuf_evict_thread_exit = B_FALSE;
586	cv_broadcast(&dbuf_evict_cv);
587	CALLB_CPR_EXIT(&cpr);	/* drops dbuf_evict_lock */
588	thread_exit();
589}
590
591/*
592 * Wake up the dbuf eviction thread if the dbuf cache is at its max size.
593 * If the dbuf cache is at its high water mark, then evict a dbuf from the
594 * dbuf cache using the callers context.
595 */
596static void
597dbuf_evict_notify(void)
598{
599	/*
600	 * We check if we should evict without holding the dbuf_evict_lock,
601	 * because it's OK to occasionally make the wrong decision here,
602	 * and grabbing the lock results in massive lock contention.
603	 */
604	if (zfs_refcount_count(&dbuf_caches[DB_DBUF_CACHE].size) >
605	    dbuf_cache_max_bytes) {
606		if (dbuf_cache_above_hiwater())
607			dbuf_evict_one();
608		cv_signal(&dbuf_evict_cv);
609	}
610}
611
612void
613dbuf_init(void)
614{
615	uint64_t hsize = 1ULL << 16;
616	dbuf_hash_table_t *h = &dbuf_hash_table;
617	int i;
618
619	/*
620	 * The hash table is big enough to fill all of physical memory
621	 * with an average 4K block size.  The table will take up
622	 * totalmem*sizeof(void*)/4K (i.e. 2MB/GB with 8-byte pointers).
623	 */
624	while (hsize * 4096 < physmem * PAGESIZE)
625		hsize <<= 1;
626
627retry:
628	h->hash_table_mask = hsize - 1;
629	h->hash_table = kmem_zalloc(hsize * sizeof (void *), KM_NOSLEEP);
630	if (h->hash_table == NULL) {
631		/* XXX - we should really return an error instead of assert */
632		ASSERT(hsize > (1ULL << 10));
633		hsize >>= 1;
634		goto retry;
635	}
636
637	dbuf_kmem_cache = kmem_cache_create("dmu_buf_impl_t",
638	    sizeof (dmu_buf_impl_t),
639	    0, dbuf_cons, dbuf_dest, NULL, NULL, NULL, 0);
640
641	for (i = 0; i < DBUF_MUTEXES; i++)
642		mutex_init(&h->hash_mutexes[i], NULL, MUTEX_DEFAULT, NULL);
643
644	/*
645	 * Setup the parameters for the dbuf caches. We set the sizes of the
646	 * dbuf cache and the metadata cache to 1/32nd and 1/16th (default)
647	 * of the size of the ARC, respectively. If the values are set in
648	 * /etc/system and they're not greater than the size of the ARC, then
649	 * we honor that value.
650	 */
651	if (dbuf_cache_max_bytes == 0 ||
652	    dbuf_cache_max_bytes >= arc_max_bytes())  {
653		dbuf_cache_max_bytes = arc_max_bytes() >> dbuf_cache_shift;
654	}
655	if (dbuf_metadata_cache_max_bytes == 0 ||
656	    dbuf_metadata_cache_max_bytes >= arc_max_bytes()) {
657		dbuf_metadata_cache_max_bytes =
658		    arc_max_bytes() >> dbuf_metadata_cache_shift;
659	}
660
661	/*
662	 * All entries are queued via taskq_dispatch_ent(), so min/maxalloc
663	 * configuration is not required.
664	 */
665	dbu_evict_taskq = taskq_create("dbu_evict", 1, minclsyspri, 0, 0, 0);
666
667	for (dbuf_cached_state_t dcs = 0; dcs < DB_CACHE_MAX; dcs++) {
668		dbuf_caches[dcs].cache =
669		    multilist_create(sizeof (dmu_buf_impl_t),
670		    offsetof(dmu_buf_impl_t, db_cache_link),
671		    dbuf_cache_multilist_index_func);
672		zfs_refcount_create(&dbuf_caches[dcs].size);
673	}
674
675	dbuf_evict_thread_exit = B_FALSE;
676	mutex_init(&dbuf_evict_lock, NULL, MUTEX_DEFAULT, NULL);
677	cv_init(&dbuf_evict_cv, NULL, CV_DEFAULT, NULL);
678	dbuf_cache_evict_thread = thread_create(NULL, 0, dbuf_evict_thread,
679	    NULL, 0, &p0, TS_RUN, minclsyspri);
680}
681
682void
683dbuf_fini(void)
684{
685	dbuf_hash_table_t *h = &dbuf_hash_table;
686	int i;
687
688	for (i = 0; i < DBUF_MUTEXES; i++)
689		mutex_destroy(&h->hash_mutexes[i]);
690	kmem_free(h->hash_table, (h->hash_table_mask + 1) * sizeof (void *));
691	kmem_cache_destroy(dbuf_kmem_cache);
692	taskq_destroy(dbu_evict_taskq);
693
694	mutex_enter(&dbuf_evict_lock);
695	dbuf_evict_thread_exit = B_TRUE;
696	while (dbuf_evict_thread_exit) {
697		cv_signal(&dbuf_evict_cv);
698		cv_wait(&dbuf_evict_cv, &dbuf_evict_lock);
699	}
700	mutex_exit(&dbuf_evict_lock);
701
702	mutex_destroy(&dbuf_evict_lock);
703	cv_destroy(&dbuf_evict_cv);
704
705	for (dbuf_cached_state_t dcs = 0; dcs < DB_CACHE_MAX; dcs++) {
706		zfs_refcount_destroy(&dbuf_caches[dcs].size);
707		multilist_destroy(dbuf_caches[dcs].cache);
708	}
709}
710
711/*
712 * Other stuff.
713 */
714
715#ifdef ZFS_DEBUG
716static void
717dbuf_verify(dmu_buf_impl_t *db)
718{
719	dnode_t *dn;
720	dbuf_dirty_record_t *dr;
721
722	ASSERT(MUTEX_HELD(&db->db_mtx));
723
724	if (!(zfs_flags & ZFS_DEBUG_DBUF_VERIFY))
725		return;
726
727	ASSERT(db->db_objset != NULL);
728	DB_DNODE_ENTER(db);
729	dn = DB_DNODE(db);
730	if (dn == NULL) {
731		ASSERT(db->db_parent == NULL);
732		ASSERT(db->db_blkptr == NULL);
733	} else {
734		ASSERT3U(db->db.db_object, ==, dn->dn_object);
735		ASSERT3P(db->db_objset, ==, dn->dn_objset);
736		ASSERT3U(db->db_level, <, dn->dn_nlevels);
737		ASSERT(db->db_blkid == DMU_BONUS_BLKID ||
738		    db->db_blkid == DMU_SPILL_BLKID ||
739		    !avl_is_empty(&dn->dn_dbufs));
740	}
741	if (db->db_blkid == DMU_BONUS_BLKID) {
742		ASSERT(dn != NULL);
743		ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
744		ASSERT3U(db->db.db_offset, ==, DMU_BONUS_BLKID);
745	} else if (db->db_blkid == DMU_SPILL_BLKID) {
746		ASSERT(dn != NULL);
747		ASSERT0(db->db.db_offset);
748	} else {
749		ASSERT3U(db->db.db_offset, ==, db->db_blkid * db->db.db_size);
750	}
751
752	for (dr = db->db_data_pending; dr != NULL; dr = dr->dr_next)
753		ASSERT(dr->dr_dbuf == db);
754
755	for (dr = db->db_last_dirty; dr != NULL; dr = dr->dr_next)
756		ASSERT(dr->dr_dbuf == db);
757
758	/*
759	 * We can't assert that db_size matches dn_datablksz because it
760	 * can be momentarily different when another thread is doing
761	 * dnode_set_blksz().
762	 */
763	if (db->db_level == 0 && db->db.db_object == DMU_META_DNODE_OBJECT) {
764		dr = db->db_data_pending;
765		/*
766		 * It should only be modified in syncing context, so
767		 * make sure we only have one copy of the data.
768		 */
769		ASSERT(dr == NULL || dr->dt.dl.dr_data == db->db_buf);
770	}
771
772	/* verify db->db_blkptr */
773	if (db->db_blkptr) {
774		if (db->db_parent == dn->dn_dbuf) {
775			/* db is pointed to by the dnode */
776			/* ASSERT3U(db->db_blkid, <, dn->dn_nblkptr); */
777			if (DMU_OBJECT_IS_SPECIAL(db->db.db_object))
778				ASSERT(db->db_parent == NULL);
779			else
780				ASSERT(db->db_parent != NULL);
781			if (db->db_blkid != DMU_SPILL_BLKID)
782				ASSERT3P(db->db_blkptr, ==,
783				    &dn->dn_phys->dn_blkptr[db->db_blkid]);
784		} else {
785			/* db is pointed to by an indirect block */
786			int epb = db->db_parent->db.db_size >> SPA_BLKPTRSHIFT;
787			ASSERT3U(db->db_parent->db_level, ==, db->db_level+1);
788			ASSERT3U(db->db_parent->db.db_object, ==,
789			    db->db.db_object);
790			/*
791			 * dnode_grow_indblksz() can make this fail if we don't
792			 * have the struct_rwlock.  XXX indblksz no longer
793			 * grows.  safe to do this now?
794			 */
795			if (RW_WRITE_HELD(&dn->dn_struct_rwlock)) {
796				ASSERT3P(db->db_blkptr, ==,
797				    ((blkptr_t *)db->db_parent->db.db_data +
798				    db->db_blkid % epb));
799			}
800		}
801	}
802	if ((db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr)) &&
803	    (db->db_buf == NULL || db->db_buf->b_data) &&
804	    db->db.db_data && db->db_blkid != DMU_BONUS_BLKID &&
805	    db->db_state != DB_FILL && !dn->dn_free_txg) {
806		/*
807		 * If the blkptr isn't set but they have nonzero data,
808		 * it had better be dirty, otherwise we'll lose that
809		 * data when we evict this buffer.
810		 *
811		 * There is an exception to this rule for indirect blocks; in
812		 * this case, if the indirect block is a hole, we fill in a few
813		 * fields on each of the child blocks (importantly, birth time)
814		 * to prevent hole birth times from being lost when you
815		 * partially fill in a hole.
816		 */
817		if (db->db_dirtycnt == 0) {
818			if (db->db_level == 0) {
819				uint64_t *buf = db->db.db_data;
820				int i;
821
822				for (i = 0; i < db->db.db_size >> 3; i++) {
823					ASSERT(buf[i] == 0);
824				}
825			} else {
826				blkptr_t *bps = db->db.db_data;
827				ASSERT3U(1 << DB_DNODE(db)->dn_indblkshift, ==,
828				    db->db.db_size);
829				/*
830				 * We want to verify that all the blkptrs in the
831				 * indirect block are holes, but we may have
832				 * automatically set up a few fields for them.
833				 * We iterate through each blkptr and verify
834				 * they only have those fields set.
835				 */
836				for (int i = 0;
837				    i < db->db.db_size / sizeof (blkptr_t);
838				    i++) {
839					blkptr_t *bp = &bps[i];
840					ASSERT(ZIO_CHECKSUM_IS_ZERO(
841					    &bp->blk_cksum));
842					ASSERT(
843					    DVA_IS_EMPTY(&bp->blk_dva[0]) &&
844					    DVA_IS_EMPTY(&bp->blk_dva[1]) &&
845					    DVA_IS_EMPTY(&bp->blk_dva[2]));
846					ASSERT0(bp->blk_fill);
847					ASSERT0(bp->blk_pad[0]);
848					ASSERT0(bp->blk_pad[1]);
849					ASSERT(!BP_IS_EMBEDDED(bp));
850					ASSERT(BP_IS_HOLE(bp));
851					ASSERT0(bp->blk_phys_birth);
852				}
853			}
854		}
855	}
856	DB_DNODE_EXIT(db);
857}
858#endif
859
860static void
861dbuf_clear_data(dmu_buf_impl_t *db)
862{
863	ASSERT(MUTEX_HELD(&db->db_mtx));
864	dbuf_evict_user(db);
865	ASSERT3P(db->db_buf, ==, NULL);
866	db->db.db_data = NULL;
867	if (db->db_state != DB_NOFILL)
868		db->db_state = DB_UNCACHED;
869}
870
871static void
872dbuf_set_data(dmu_buf_impl_t *db, arc_buf_t *buf)
873{
874	ASSERT(MUTEX_HELD(&db->db_mtx));
875	ASSERT(buf != NULL);
876
877	db->db_buf = buf;
878	ASSERT(buf->b_data != NULL);
879	db->db.db_data = buf->b_data;
880}
881
882/*
883 * Loan out an arc_buf for read.  Return the loaned arc_buf.
884 */
885arc_buf_t *
886dbuf_loan_arcbuf(dmu_buf_impl_t *db)
887{
888	arc_buf_t *abuf;
889
890	ASSERT(db->db_blkid != DMU_BONUS_BLKID);
891	mutex_enter(&db->db_mtx);
892	if (arc_released(db->db_buf) || zfs_refcount_count(&db->db_holds) > 1) {
893		int blksz = db->db.db_size;
894		spa_t *spa = db->db_objset->os_spa;
895
896		mutex_exit(&db->db_mtx);
897		abuf = arc_loan_buf(spa, B_FALSE, blksz);
898		bcopy(db->db.db_data, abuf->b_data, blksz);
899	} else {
900		abuf = db->db_buf;
901		arc_loan_inuse_buf(abuf, db);
902		db->db_buf = NULL;
903		dbuf_clear_data(db);
904		mutex_exit(&db->db_mtx);
905	}
906	return (abuf);
907}
908
909/*
910 * Calculate which level n block references the data at the level 0 offset
911 * provided.
912 */
913uint64_t
914dbuf_whichblock(dnode_t *dn, int64_t level, uint64_t offset)
915{
916	if (dn->dn_datablkshift != 0 && dn->dn_indblkshift != 0) {
917		/*
918		 * The level n blkid is equal to the level 0 blkid divided by
919		 * the number of level 0s in a level n block.
920		 *
921		 * The level 0 blkid is offset >> datablkshift =
922		 * offset / 2^datablkshift.
923		 *
924		 * The number of level 0s in a level n is the number of block
925		 * pointers in an indirect block, raised to the power of level.
926		 * This is 2^(indblkshift - SPA_BLKPTRSHIFT)^level =
927		 * 2^(level*(indblkshift - SPA_BLKPTRSHIFT)).
928		 *
929		 * Thus, the level n blkid is: offset /
930		 * ((2^datablkshift)*(2^(level*(indblkshift - SPA_BLKPTRSHIFT)))
931		 * = offset / 2^(datablkshift + level *
932		 *   (indblkshift - SPA_BLKPTRSHIFT))
933		 * = offset >> (datablkshift + level *
934		 *   (indblkshift - SPA_BLKPTRSHIFT))
935		 */
936		return (offset >> (dn->dn_datablkshift + level *
937		    (dn->dn_indblkshift - SPA_BLKPTRSHIFT)));
938	} else {
939		ASSERT3U(offset, <, dn->dn_datablksz);
940		return (0);
941	}
942}
943
944/* ARGSUSED */
945static void
946dbuf_read_done(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp,
947    arc_buf_t *buf, void *vdb)
948{
949	dmu_buf_impl_t *db = vdb;
950
951	mutex_enter(&db->db_mtx);
952	ASSERT3U(db->db_state, ==, DB_READ);
953	/*
954	 * All reads are synchronous, so we must have a hold on the dbuf
955	 */
956	ASSERT(zfs_refcount_count(&db->db_holds) > 0);
957	ASSERT(db->db_buf == NULL);
958	ASSERT(db->db.db_data == NULL);
959	if (buf == NULL) {
960		/* i/o error */
961		ASSERT(zio == NULL || zio->io_error != 0);
962		ASSERT(db->db_blkid != DMU_BONUS_BLKID);
963		ASSERT3P(db->db_buf, ==, NULL);
964		db->db_state = DB_UNCACHED;
965	} else if (db->db_level == 0 && db->db_freed_in_flight) {
966		/* we were freed in flight; disregard any error */
967		ASSERT(zio == NULL || zio->io_error == 0);
968		if (buf == NULL) {
969			buf = arc_alloc_buf(db->db_objset->os_spa,
970			    db, DBUF_GET_BUFC_TYPE(db), db->db.db_size);
971		}
972		arc_release(buf, db);
973		bzero(buf->b_data, db->db.db_size);
974		arc_buf_freeze(buf);
975		db->db_freed_in_flight = FALSE;
976		dbuf_set_data(db, buf);
977		db->db_state = DB_CACHED;
978	} else if (buf != NULL) {
979		/* success */
980		ASSERT(zio == NULL || zio->io_error == 0);
981		dbuf_set_data(db, buf);
982		db->db_state = DB_CACHED;
983	}
984	cv_broadcast(&db->db_changed);
985	dbuf_rele_and_unlock(db, NULL, B_FALSE);
986}
987
988
989/*
990 * This function ensures that, when doing a decrypting read of a block,
991 * we make sure we have decrypted the dnode associated with it. We must do
992 * this so that we ensure we are fully authenticating the checksum-of-MACs
993 * tree from the root of the objset down to this block. Indirect blocks are
994 * always verified against their secure checksum-of-MACs assuming that the
995 * dnode containing them is correct. Now that we are doing a decrypting read,
996 * we can be sure that the key is loaded and verify that assumption. This is
997 * especially important considering that we always read encrypted dnode
998 * blocks as raw data (without verifying their MACs) to start, and
999 * decrypt / authenticate them when we need to read an encrypted bonus buffer.
1000 */
1001static int
1002dbuf_read_verify_dnode_crypt(dmu_buf_impl_t *db, uint32_t flags)
1003{
1004	int err = 0;
1005	objset_t *os = db->db_objset;
1006	arc_buf_t *dnode_abuf;
1007	dnode_t *dn;
1008	zbookmark_phys_t zb;
1009
1010	ASSERT(MUTEX_HELD(&db->db_mtx));
1011
1012	if (!os->os_encrypted || os->os_raw_receive ||
1013	    (flags & DB_RF_NO_DECRYPT) != 0)
1014		return (0);
1015
1016	DB_DNODE_ENTER(db);
1017	dn = DB_DNODE(db);
1018	dnode_abuf = (dn->dn_dbuf != NULL) ? dn->dn_dbuf->db_buf : NULL;
1019
1020	if (dnode_abuf == NULL || !arc_is_encrypted(dnode_abuf)) {
1021		DB_DNODE_EXIT(db);
1022		return (0);
1023	}
1024
1025	SET_BOOKMARK(&zb, dmu_objset_id(os),
1026	    DMU_META_DNODE_OBJECT, 0, dn->dn_dbuf->db_blkid);
1027	err = arc_untransform(dnode_abuf, os->os_spa, &zb, B_TRUE);
1028
1029	/*
1030	 * An error code of EACCES tells us that the key is still not
1031	 * available. This is ok if we are only reading authenticated
1032	 * (and therefore non-encrypted) blocks.
1033	 */
1034	if (err == EACCES && ((db->db_blkid != DMU_BONUS_BLKID &&
1035	    !DMU_OT_IS_ENCRYPTED(dn->dn_type)) ||
1036	    (db->db_blkid == DMU_BONUS_BLKID &&
1037	    !DMU_OT_IS_ENCRYPTED(dn->dn_bonustype))))
1038		err = 0;
1039
1040	DB_DNODE_EXIT(db);
1041
1042	return (err);
1043}
1044
1045static int
1046dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
1047{
1048	dnode_t *dn;
1049	zbookmark_phys_t zb;
1050	arc_flags_t aflags = ARC_FLAG_NOWAIT;
1051	int err, zio_flags = 0;
1052
1053	DB_DNODE_ENTER(db);
1054	dn = DB_DNODE(db);
1055	ASSERT(!zfs_refcount_is_zero(&db->db_holds));
1056	/* We need the struct_rwlock to prevent db_blkptr from changing. */
1057	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
1058	ASSERT(MUTEX_HELD(&db->db_mtx));
1059	ASSERT(db->db_state == DB_UNCACHED);
1060	ASSERT(db->db_buf == NULL);
1061
1062	if (db->db_blkid == DMU_BONUS_BLKID) {
1063		/*
1064		 * The bonus length stored in the dnode may be less than
1065		 * the maximum available space in the bonus buffer.
1066		 */
1067		int bonuslen = MIN(dn->dn_bonuslen, dn->dn_phys->dn_bonuslen);
1068		int max_bonuslen = DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots);
1069
1070		/* if the underlying dnode block is encrypted, decrypt it */
1071		err = dbuf_read_verify_dnode_crypt(db, flags);
1072		if (err != 0) {
1073			DB_DNODE_EXIT(db);
1074			mutex_exit(&db->db_mtx);
1075			return (err);
1076		}
1077
1078		ASSERT3U(bonuslen, <=, db->db.db_size);
1079		db->db.db_data = zio_buf_alloc(max_bonuslen);
1080		arc_space_consume(max_bonuslen, ARC_SPACE_BONUS);
1081		if (bonuslen < max_bonuslen)
1082			bzero(db->db.db_data, max_bonuslen);
1083		if (bonuslen)
1084			bcopy(DN_BONUS(dn->dn_phys), db->db.db_data, bonuslen);
1085		DB_DNODE_EXIT(db);
1086		db->db_state = DB_CACHED;
1087		mutex_exit(&db->db_mtx);
1088		return (0);
1089	}
1090
1091	/*
1092	 * Recheck BP_IS_HOLE() after dnode_block_freed() in case dnode_sync()
1093	 * processes the delete record and clears the bp while we are waiting
1094	 * for the dn_mtx (resulting in a "no" from block_freed).
1095	 */
1096	if (db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr) ||
1097	    (db->db_level == 0 && (dnode_block_freed(dn, db->db_blkid) ||
1098	    BP_IS_HOLE(db->db_blkptr)))) {
1099		arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
1100
1101		dbuf_set_data(db, arc_alloc_buf(db->db_objset->os_spa, db, type,
1102		    db->db.db_size));
1103		bzero(db->db.db_data, db->db.db_size);
1104
1105		if (db->db_blkptr != NULL && db->db_level > 0 &&
1106		    BP_IS_HOLE(db->db_blkptr) &&
1107		    db->db_blkptr->blk_birth != 0) {
1108			blkptr_t *bps = db->db.db_data;
1109			for (int i = 0; i < ((1 <<
1110			    DB_DNODE(db)->dn_indblkshift) / sizeof (blkptr_t));
1111			    i++) {
1112				blkptr_t *bp = &bps[i];
1113				ASSERT3U(BP_GET_LSIZE(db->db_blkptr), ==,
1114				    1 << dn->dn_indblkshift);
1115				BP_SET_LSIZE(bp,
1116				    BP_GET_LEVEL(db->db_blkptr) == 1 ?
1117				    dn->dn_datablksz :
1118				    BP_GET_LSIZE(db->db_blkptr));
1119				BP_SET_TYPE(bp, BP_GET_TYPE(db->db_blkptr));
1120				BP_SET_LEVEL(bp,
1121				    BP_GET_LEVEL(db->db_blkptr) - 1);
1122				BP_SET_BIRTH(bp, db->db_blkptr->blk_birth, 0);
1123			}
1124		}
1125		DB_DNODE_EXIT(db);
1126		db->db_state = DB_CACHED;
1127		mutex_exit(&db->db_mtx);
1128		return (0);
1129	}
1130
1131	SET_BOOKMARK(&zb, dmu_objset_id(db->db_objset),
1132	    db->db.db_object, db->db_level, db->db_blkid);
1133
1134	/*
1135	 * All bps of an encrypted os should have the encryption bit set.
1136	 * If this is not true it indicates tampering and we report an error.
1137	 */
1138	if (db->db_objset->os_encrypted && !BP_USES_CRYPT(db->db_blkptr)) {
1139		spa_log_error(db->db_objset->os_spa, &zb);
1140		zfs_panic_recover("unencrypted block in encrypted "
1141		    "object set %llu", dmu_objset_id(db->db_objset));
1142		DB_DNODE_EXIT(db);
1143		mutex_exit(&db->db_mtx);
1144		return (SET_ERROR(EIO));
1145	}
1146
1147	err = dbuf_read_verify_dnode_crypt(db, flags);
1148	if (err != 0) {
1149		DB_DNODE_EXIT(db);
1150		mutex_exit(&db->db_mtx);
1151		return (err);
1152	}
1153
1154	DB_DNODE_EXIT(db);
1155
1156	db->db_state = DB_READ;
1157	mutex_exit(&db->db_mtx);
1158
1159	if (DBUF_IS_L2CACHEABLE(db))
1160		aflags |= ARC_FLAG_L2CACHE;
1161
1162	dbuf_add_ref(db, NULL);
1163
1164	zio_flags = (flags & DB_RF_CANFAIL) ?
1165	    ZIO_FLAG_CANFAIL : ZIO_FLAG_MUSTSUCCEED;
1166
1167	if ((flags & DB_RF_NO_DECRYPT) && BP_IS_PROTECTED(db->db_blkptr))
1168		zio_flags |= ZIO_FLAG_RAW;
1169
1170	err = arc_read(zio, db->db_objset->os_spa, db->db_blkptr,
1171	    dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ, zio_flags,
1172	    &aflags, &zb);
1173
1174	return (err);
1175}
1176
1177/*
1178 * This is our just-in-time copy function.  It makes a copy of buffers that
1179 * have been modified in a previous transaction group before we access them in
1180 * the current active group.
1181 *
1182 * This function is used in three places: when we are dirtying a buffer for the
1183 * first time in a txg, when we are freeing a range in a dnode that includes
1184 * this buffer, and when we are accessing a buffer which was received compressed
1185 * and later referenced in a WRITE_BYREF record.
1186 *
1187 * Note that when we are called from dbuf_free_range() we do not put a hold on
1188 * the buffer, we just traverse the active dbuf list for the dnode.
1189 */
1190static void
1191dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg)
1192{
1193	dbuf_dirty_record_t *dr = db->db_last_dirty;
1194
1195	ASSERT(MUTEX_HELD(&db->db_mtx));
1196	ASSERT(db->db.db_data != NULL);
1197	ASSERT(db->db_level == 0);
1198	ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT);
1199
1200	if (dr == NULL ||
1201	    (dr->dt.dl.dr_data !=
1202	    ((db->db_blkid  == DMU_BONUS_BLKID) ? db->db.db_data : db->db_buf)))
1203		return;
1204
1205	/*
1206	 * If the last dirty record for this dbuf has not yet synced
1207	 * and its referencing the dbuf data, either:
1208	 *	reset the reference to point to a new copy,
1209	 * or (if there a no active holders)
1210	 *	just null out the current db_data pointer.
1211	 */
1212	ASSERT3U(dr->dr_txg, >=, txg - 2);
1213	if (db->db_blkid == DMU_BONUS_BLKID) {
1214		/* Note that the data bufs here are zio_bufs */
1215		dnode_t *dn = DB_DNODE(db);
1216		int bonuslen = DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots);
1217		dr->dt.dl.dr_data = zio_buf_alloc(bonuslen);
1218		arc_space_consume(bonuslen, ARC_SPACE_BONUS);
1219		bcopy(db->db.db_data, dr->dt.dl.dr_data, bonuslen);
1220	} else if (zfs_refcount_count(&db->db_holds) > db->db_dirtycnt) {
1221		dnode_t *dn = DB_DNODE(db);
1222		int size = arc_buf_size(db->db_buf);
1223		arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
1224		spa_t *spa = db->db_objset->os_spa;
1225		enum zio_compress compress_type =
1226		    arc_get_compression(db->db_buf);
1227
1228		if (arc_is_encrypted(db->db_buf)) {
1229			boolean_t byteorder;
1230			uint8_t salt[ZIO_DATA_SALT_LEN];
1231			uint8_t iv[ZIO_DATA_IV_LEN];
1232			uint8_t mac[ZIO_DATA_MAC_LEN];
1233
1234			arc_get_raw_params(db->db_buf, &byteorder, salt,
1235			    iv, mac);
1236			dr->dt.dl.dr_data = arc_alloc_raw_buf(spa, db,
1237			    dmu_objset_id(dn->dn_objset), byteorder, salt, iv,
1238			    mac, dn->dn_type, size, arc_buf_lsize(db->db_buf),
1239			    compress_type);
1240		} else if (compress_type != ZIO_COMPRESS_OFF) {
1241			ASSERT3U(type, ==, ARC_BUFC_DATA);
1242			dr->dt.dl.dr_data = arc_alloc_compressed_buf(spa, db,
1243			    size, arc_buf_lsize(db->db_buf), compress_type);
1244		} else {
1245			dr->dt.dl.dr_data = arc_alloc_buf(spa, db, type, size);
1246		}
1247		bcopy(db->db.db_data, dr->dt.dl.dr_data->b_data, size);
1248	} else {
1249		db->db_buf = NULL;
1250		dbuf_clear_data(db);
1251	}
1252}
1253
1254int
1255dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
1256{
1257	int err = 0;
1258	boolean_t prefetch;
1259	dnode_t *dn;
1260
1261	/*
1262	 * We don't have to hold the mutex to check db_state because it
1263	 * can't be freed while we have a hold on the buffer.
1264	 */
1265	ASSERT(!zfs_refcount_is_zero(&db->db_holds));
1266
1267	if (db->db_state == DB_NOFILL)
1268		return (SET_ERROR(EIO));
1269
1270	DB_DNODE_ENTER(db);
1271	dn = DB_DNODE(db);
1272	if ((flags & DB_RF_HAVESTRUCT) == 0)
1273		rw_enter(&dn->dn_struct_rwlock, RW_READER);
1274
1275	prefetch = db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
1276	    (flags & DB_RF_NOPREFETCH) == 0 && dn != NULL &&
1277	    DBUF_IS_CACHEABLE(db);
1278
1279	mutex_enter(&db->db_mtx);
1280	if (db->db_state == DB_CACHED) {
1281		spa_t *spa = dn->dn_objset->os_spa;
1282
1283		/*
1284		 * Ensure that this block's dnode has been decrypted if
1285		 * the caller has requested decrypted data.
1286		 */
1287		err = dbuf_read_verify_dnode_crypt(db, flags);
1288
1289		/*
1290		 * If the arc buf is compressed or encrypted and the caller
1291		 * requested uncompressed data, we need to untransform it
1292		 * before returning. We also call arc_untransform() on any
1293		 * unauthenticated blocks, which will verify their MAC if
1294		 * the key is now available.
1295		 */
1296		if (err == 0 && db->db_buf != NULL &&
1297		    (flags & DB_RF_NO_DECRYPT) == 0 &&
1298		    (arc_is_encrypted(db->db_buf) ||
1299		    arc_is_unauthenticated(db->db_buf) ||
1300		    arc_get_compression(db->db_buf) != ZIO_COMPRESS_OFF)) {
1301			zbookmark_phys_t zb;
1302
1303			SET_BOOKMARK(&zb, dmu_objset_id(db->db_objset),
1304			    db->db.db_object, db->db_level, db->db_blkid);
1305			dbuf_fix_old_data(db, spa_syncing_txg(spa));
1306			err = arc_untransform(db->db_buf, spa, &zb, B_FALSE);
1307			dbuf_set_data(db, db->db_buf);
1308		}
1309		mutex_exit(&db->db_mtx);
1310		if (err == 0 && prefetch)
1311			dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE);
1312		if ((flags & DB_RF_HAVESTRUCT) == 0)
1313			rw_exit(&dn->dn_struct_rwlock);
1314		DB_DNODE_EXIT(db);
1315	} else if (db->db_state == DB_UNCACHED) {
1316		spa_t *spa = dn->dn_objset->os_spa;
1317		boolean_t need_wait = B_FALSE;
1318
1319		if (zio == NULL &&
1320		    db->db_blkptr != NULL && !BP_IS_HOLE(db->db_blkptr)) {
1321			zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
1322			need_wait = B_TRUE;
1323		}
1324		err = dbuf_read_impl(db, zio, flags);
1325
1326		/* dbuf_read_impl has dropped db_mtx for us */
1327
1328		if (!err && prefetch)
1329			dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE);
1330
1331		if ((flags & DB_RF_HAVESTRUCT) == 0)
1332			rw_exit(&dn->dn_struct_rwlock);
1333		DB_DNODE_EXIT(db);
1334
1335		if (!err && need_wait)
1336			err = zio_wait(zio);
1337	} else {
1338		/*
1339		 * Another reader came in while the dbuf was in flight
1340		 * between UNCACHED and CACHED.  Either a writer will finish
1341		 * writing the buffer (sending the dbuf to CACHED) or the
1342		 * first reader's request will reach the read_done callback
1343		 * and send the dbuf to CACHED.  Otherwise, a failure
1344		 * occurred and the dbuf went to UNCACHED.
1345		 */
1346		mutex_exit(&db->db_mtx);
1347		if (prefetch)
1348			dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE);
1349		if ((flags & DB_RF_HAVESTRUCT) == 0)
1350			rw_exit(&dn->dn_struct_rwlock);
1351		DB_DNODE_EXIT(db);
1352
1353		/* Skip the wait per the caller's request. */
1354		mutex_enter(&db->db_mtx);
1355		if ((flags & DB_RF_NEVERWAIT) == 0) {
1356			while (db->db_state == DB_READ ||
1357			    db->db_state == DB_FILL) {
1358				ASSERT(db->db_state == DB_READ ||
1359				    (flags & DB_RF_HAVESTRUCT) == 0);
1360				DTRACE_PROBE2(blocked__read, dmu_buf_impl_t *,
1361				    db, zio_t *, zio);
1362				cv_wait(&db->db_changed, &db->db_mtx);
1363			}
1364			if (db->db_state == DB_UNCACHED)
1365				err = SET_ERROR(EIO);
1366		}
1367		mutex_exit(&db->db_mtx);
1368	}
1369
1370	return (err);
1371}
1372
1373static void
1374dbuf_noread(dmu_buf_impl_t *db)
1375{
1376	ASSERT(!zfs_refcount_is_zero(&db->db_holds));
1377	ASSERT(db->db_blkid != DMU_BONUS_BLKID);
1378	mutex_enter(&db->db_mtx);
1379	while (db->db_state == DB_READ || db->db_state == DB_FILL)
1380		cv_wait(&db->db_changed, &db->db_mtx);
1381	if (db->db_state == DB_UNCACHED) {
1382		arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
1383		spa_t *spa = db->db_objset->os_spa;
1384
1385		ASSERT(db->db_buf == NULL);
1386		ASSERT(db->db.db_data == NULL);
1387		dbuf_set_data(db, arc_alloc_buf(spa, db, type, db->db.db_size));
1388		db->db_state = DB_FILL;
1389	} else if (db->db_state == DB_NOFILL) {
1390		dbuf_clear_data(db);
1391	} else {
1392		ASSERT3U(db->db_state, ==, DB_CACHED);
1393	}
1394	mutex_exit(&db->db_mtx);
1395}
1396
1397void
1398dbuf_unoverride(dbuf_dirty_record_t *dr)
1399{
1400	dmu_buf_impl_t *db = dr->dr_dbuf;
1401	blkptr_t *bp = &dr->dt.dl.dr_overridden_by;
1402	uint64_t txg = dr->dr_txg;
1403
1404	ASSERT(MUTEX_HELD(&db->db_mtx));
1405	/*
1406	 * This assert is valid because dmu_sync() expects to be called by
1407	 * a zilog's get_data while holding a range lock.  This call only
1408	 * comes from dbuf_dirty() callers who must also hold a range lock.
1409	 */
1410	ASSERT(dr->dt.dl.dr_override_state != DR_IN_DMU_SYNC);
1411	ASSERT(db->db_level == 0);
1412
1413	if (db->db_blkid == DMU_BONUS_BLKID ||
1414	    dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN)
1415		return;
1416
1417	ASSERT(db->db_data_pending != dr);
1418
1419	/* free this block */
1420	if (!BP_IS_HOLE(bp) && !dr->dt.dl.dr_nopwrite)
1421		zio_free(db->db_objset->os_spa, txg, bp);
1422
1423	dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
1424	dr->dt.dl.dr_nopwrite = B_FALSE;
1425	dr->dt.dl.dr_has_raw_params = B_FALSE;
1426
1427	/*
1428	 * Release the already-written buffer, so we leave it in
1429	 * a consistent dirty state.  Note that all callers are
1430	 * modifying the buffer, so they will immediately do
1431	 * another (redundant) arc_release().  Therefore, leave
1432	 * the buf thawed to save the effort of freezing &
1433	 * immediately re-thawing it.
1434	 */
1435	arc_release(dr->dt.dl.dr_data, db);
1436}
1437
1438/*
1439 * Evict (if its unreferenced) or clear (if its referenced) any level-0
1440 * data blocks in the free range, so that any future readers will find
1441 * empty blocks.
1442 */
1443void
1444dbuf_free_range(dnode_t *dn, uint64_t start_blkid, uint64_t end_blkid,
1445    dmu_tx_t *tx)
1446{
1447	dmu_buf_impl_t db_search;
1448	dmu_buf_impl_t *db, *db_next;
1449	uint64_t txg = tx->tx_txg;
1450	avl_index_t where;
1451
1452	if (end_blkid > dn->dn_maxblkid &&
1453	    !(start_blkid == DMU_SPILL_BLKID || end_blkid == DMU_SPILL_BLKID))
1454		end_blkid = dn->dn_maxblkid;
1455	dprintf_dnode(dn, "start=%llu end=%llu\n", start_blkid, end_blkid);
1456
1457	db_search.db_level = 0;
1458	db_search.db_blkid = start_blkid;
1459	db_search.db_state = DB_SEARCH;
1460
1461	mutex_enter(&dn->dn_dbufs_mtx);
1462	db = avl_find(&dn->dn_dbufs, &db_search, &where);
1463	ASSERT3P(db, ==, NULL);
1464
1465	db = avl_nearest(&dn->dn_dbufs, where, AVL_AFTER);
1466
1467	for (; db != NULL; db = db_next) {
1468		db_next = AVL_NEXT(&dn->dn_dbufs, db);
1469		ASSERT(db->db_blkid != DMU_BONUS_BLKID);
1470
1471		if (db->db_level != 0 || db->db_blkid > end_blkid) {
1472			break;
1473		}
1474		ASSERT3U(db->db_blkid, >=, start_blkid);
1475
1476		/* found a level 0 buffer in the range */
1477		mutex_enter(&db->db_mtx);
1478		if (dbuf_undirty(db, tx)) {
1479			/* mutex has been dropped and dbuf destroyed */
1480			continue;
1481		}
1482
1483		if (db->db_state == DB_UNCACHED ||
1484		    db->db_state == DB_NOFILL ||
1485		    db->db_state == DB_EVICTING) {
1486			ASSERT(db->db.db_data == NULL);
1487			mutex_exit(&db->db_mtx);
1488			continue;
1489		}
1490		if (db->db_state == DB_READ || db->db_state == DB_FILL) {
1491			/* will be handled in dbuf_read_done or dbuf_rele */
1492			db->db_freed_in_flight = TRUE;
1493			mutex_exit(&db->db_mtx);
1494			continue;
1495		}
1496		if (zfs_refcount_count(&db->db_holds) == 0) {
1497			ASSERT(db->db_buf);
1498			dbuf_destroy(db);
1499			continue;
1500		}
1501		/* The dbuf is referenced */
1502
1503		if (db->db_last_dirty != NULL) {
1504			dbuf_dirty_record_t *dr = db->db_last_dirty;
1505
1506			if (dr->dr_txg == txg) {
1507				/*
1508				 * This buffer is "in-use", re-adjust the file
1509				 * size to reflect that this buffer may
1510				 * contain new data when we sync.
1511				 */
1512				if (db->db_blkid != DMU_SPILL_BLKID &&
1513				    db->db_blkid > dn->dn_maxblkid)
1514					dn->dn_maxblkid = db->db_blkid;
1515				dbuf_unoverride(dr);
1516			} else {
1517				/*
1518				 * This dbuf is not dirty in the open context.
1519				 * Either uncache it (if its not referenced in
1520				 * the open context) or reset its contents to
1521				 * empty.
1522				 */
1523				dbuf_fix_old_data(db, txg);
1524			}
1525		}
1526		/* clear the contents if its cached */
1527		if (db->db_state == DB_CACHED) {
1528			ASSERT(db->db.db_data != NULL);
1529			arc_release(db->db_buf, db);
1530			bzero(db->db.db_data, db->db.db_size);
1531			arc_buf_freeze(db->db_buf);
1532		}
1533
1534		mutex_exit(&db->db_mtx);
1535	}
1536	mutex_exit(&dn->dn_dbufs_mtx);
1537}
1538
1539void
1540dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx)
1541{
1542	arc_buf_t *buf, *obuf;
1543	int osize = db->db.db_size;
1544	arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
1545	dnode_t *dn;
1546
1547	ASSERT(db->db_blkid != DMU_BONUS_BLKID);
1548
1549	DB_DNODE_ENTER(db);
1550	dn = DB_DNODE(db);
1551
1552	/* XXX does *this* func really need the lock? */
1553	ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));
1554
1555	/*
1556	 * This call to dmu_buf_will_dirty() with the dn_struct_rwlock held
1557	 * is OK, because there can be no other references to the db
1558	 * when we are changing its size, so no concurrent DB_FILL can
1559	 * be happening.
1560	 */
1561	/*
1562	 * XXX we should be doing a dbuf_read, checking the return
1563	 * value and returning that up to our callers
1564	 */
1565	dmu_buf_will_dirty(&db->db, tx);
1566
1567	/* create the data buffer for the new block */
1568	buf = arc_alloc_buf(dn->dn_objset->os_spa, db, type, size);
1569
1570	/* copy old block data to the new block */
1571	obuf = db->db_buf;
1572	bcopy(obuf->b_data, buf->b_data, MIN(osize, size));
1573	/* zero the remainder */
1574	if (size > osize)
1575		bzero((uint8_t *)buf->b_data + osize, size - osize);
1576
1577	mutex_enter(&db->db_mtx);
1578	dbuf_set_data(db, buf);
1579	arc_buf_destroy(obuf, db);
1580	db->db.db_size = size;
1581
1582	if (db->db_level == 0) {
1583		ASSERT3U(db->db_last_dirty->dr_txg, ==, tx->tx_txg);
1584		db->db_last_dirty->dt.dl.dr_data = buf;
1585	}
1586	mutex_exit(&db->db_mtx);
1587
1588	dmu_objset_willuse_space(dn->dn_objset, size - osize, tx);
1589	DB_DNODE_EXIT(db);
1590}
1591
1592void
1593dbuf_release_bp(dmu_buf_impl_t *db)
1594{
1595	objset_t *os = db->db_objset;
1596
1597	ASSERT(dsl_pool_sync_context(dmu_objset_pool(os)));
1598	ASSERT(arc_released(os->os_phys_buf) ||
1599	    list_link_active(&os->os_dsl_dataset->ds_synced_link));
1600	ASSERT(db->db_parent == NULL || arc_released(db->db_parent->db_buf));
1601
1602	(void) arc_release(db->db_buf, db);
1603}
1604
1605/*
1606 * We already have a dirty record for this TXG, and we are being
1607 * dirtied again.
1608 */
1609static void
1610dbuf_redirty(dbuf_dirty_record_t *dr)
1611{
1612	dmu_buf_impl_t *db = dr->dr_dbuf;
1613
1614	ASSERT(MUTEX_HELD(&db->db_mtx));
1615
1616	if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID) {
1617		/*
1618		 * If this buffer has already been written out,
1619		 * we now need to reset its state.
1620		 */
1621		dbuf_unoverride(dr);
1622		if (db->db.db_object != DMU_META_DNODE_OBJECT &&
1623		    db->db_state != DB_NOFILL) {
1624			/* Already released on initial dirty, so just thaw. */
1625			ASSERT(arc_released(db->db_buf));
1626			arc_buf_thaw(db->db_buf);
1627		}
1628	}
1629}
1630
1631dbuf_dirty_record_t *
1632dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
1633{
1634	dnode_t *dn;
1635	objset_t *os;
1636	dbuf_dirty_record_t **drp, *dr;
1637	int drop_struct_lock = FALSE;
1638	int txgoff = tx->tx_txg & TXG_MASK;
1639
1640	ASSERT(tx->tx_txg != 0);
1641	ASSERT(!zfs_refcount_is_zero(&db->db_holds));
1642	DMU_TX_DIRTY_BUF(tx, db);
1643
1644	DB_DNODE_ENTER(db);
1645	dn = DB_DNODE(db);
1646	/*
1647	 * Shouldn't dirty a regular buffer in syncing context.  Private
1648	 * objects may be dirtied in syncing context, but only if they
1649	 * were already pre-dirtied in open context.
1650	 */
1651#ifdef DEBUG
1652	if (dn->dn_objset->os_dsl_dataset != NULL) {
1653		rrw_enter(&dn->dn_objset->os_dsl_dataset->ds_bp_rwlock,
1654		    RW_READER, FTAG);
1655	}
1656	ASSERT(!dmu_tx_is_syncing(tx) ||
1657	    BP_IS_HOLE(dn->dn_objset->os_rootbp) ||
1658	    DMU_OBJECT_IS_SPECIAL(dn->dn_object) ||
1659	    dn->dn_objset->os_dsl_dataset == NULL);
1660	if (dn->dn_objset->os_dsl_dataset != NULL)
1661		rrw_exit(&dn->dn_objset->os_dsl_dataset->ds_bp_rwlock, FTAG);
1662#endif
1663	/*
1664	 * We make this assert for private objects as well, but after we
1665	 * check if we're already dirty.  They are allowed to re-dirty
1666	 * in syncing context.
1667	 */
1668	ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT ||
1669	    dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx ==
1670	    (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN));
1671
1672	mutex_enter(&db->db_mtx);
1673	/*
1674	 * XXX make this true for indirects too?  The problem is that
1675	 * transactions created with dmu_tx_create_assigned() from
1676	 * syncing context don't bother holding ahead.
1677	 */
1678	ASSERT(db->db_level != 0 ||
1679	    db->db_state == DB_CACHED || db->db_state == DB_FILL ||
1680	    db->db_state == DB_NOFILL);
1681
1682	mutex_enter(&dn->dn_mtx);
1683	/*
1684	 * Don't set dirtyctx to SYNC if we're just modifying this as we
1685	 * initialize the objset.
1686	 */
1687	if (dn->dn_dirtyctx == DN_UNDIRTIED) {
1688		if (dn->dn_objset->os_dsl_dataset != NULL) {
1689			rrw_enter(&dn->dn_objset->os_dsl_dataset->ds_bp_rwlock,
1690			    RW_READER, FTAG);
1691		}
1692		if (!BP_IS_HOLE(dn->dn_objset->os_rootbp)) {
1693			dn->dn_dirtyctx = (dmu_tx_is_syncing(tx) ?
1694			    DN_DIRTY_SYNC : DN_DIRTY_OPEN);
1695			ASSERT(dn->dn_dirtyctx_firstset == NULL);
1696			dn->dn_dirtyctx_firstset = kmem_alloc(1, KM_SLEEP);
1697		}
1698		if (dn->dn_objset->os_dsl_dataset != NULL) {
1699			rrw_exit(&dn->dn_objset->os_dsl_dataset->ds_bp_rwlock,
1700			    FTAG);
1701		}
1702	}
1703
1704	if (tx->tx_txg > dn->dn_dirty_txg)
1705		dn->dn_dirty_txg = tx->tx_txg;
1706	mutex_exit(&dn->dn_mtx);
1707
1708	if (db->db_blkid == DMU_SPILL_BLKID)
1709		dn->dn_have_spill = B_TRUE;
1710
1711	/*
1712	 * If this buffer is already dirty, we're done.
1713	 */
1714	drp = &db->db_last_dirty;
1715	ASSERT(*drp == NULL || (*drp)->dr_txg <= tx->tx_txg ||
1716	    db->db.db_object == DMU_META_DNODE_OBJECT);
1717	while ((dr = *drp) != NULL && dr->dr_txg > tx->tx_txg)
1718		drp = &dr->dr_next;
1719	if (dr && dr->dr_txg == tx->tx_txg) {
1720		DB_DNODE_EXIT(db);
1721
1722		dbuf_redirty(dr);
1723		mutex_exit(&db->db_mtx);
1724		return (dr);
1725	}
1726
1727	/*
1728	 * Only valid if not already dirty.
1729	 */
1730	ASSERT(dn->dn_object == 0 ||
1731	    dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx ==
1732	    (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN));
1733
1734	ASSERT3U(dn->dn_nlevels, >, db->db_level);
1735
1736	/*
1737	 * We should only be dirtying in syncing context if it's the
1738	 * mos or we're initializing the os or it's a special object.
1739	 * However, we are allowed to dirty in syncing context provided
1740	 * we already dirtied it in open context.  Hence we must make
1741	 * this assertion only if we're not already dirty.
1742	 */
1743	os = dn->dn_objset;
1744	VERIFY3U(tx->tx_txg, <=, spa_final_dirty_txg(os->os_spa));
1745#ifdef DEBUG
1746	if (dn->dn_objset->os_dsl_dataset != NULL)
1747		rrw_enter(&os->os_dsl_dataset->ds_bp_rwlock, RW_READER, FTAG);
1748	ASSERT(!dmu_tx_is_syncing(tx) || DMU_OBJECT_IS_SPECIAL(dn->dn_object) ||
1749	    os->os_dsl_dataset == NULL || BP_IS_HOLE(os->os_rootbp));
1750	if (dn->dn_objset->os_dsl_dataset != NULL)
1751		rrw_exit(&os->os_dsl_dataset->ds_bp_rwlock, FTAG);
1752#endif
1753	ASSERT(db->db.db_size != 0);
1754
1755	dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);
1756
1757	if (db->db_blkid != DMU_BONUS_BLKID) {
1758		dmu_objset_willuse_space(os, db->db.db_size, tx);
1759	}
1760
1761	/*
1762	 * If this buffer is dirty in an old transaction group we need
1763	 * to make a copy of it so that the changes we make in this
1764	 * transaction group won't leak out when we sync the older txg.
1765	 */
1766	dr = kmem_zalloc(sizeof (dbuf_dirty_record_t), KM_SLEEP);
1767	if (db->db_level == 0) {
1768		void *data_old = db->db_buf;
1769
1770		if (db->db_state != DB_NOFILL) {
1771			if (db->db_blkid == DMU_BONUS_BLKID) {
1772				dbuf_fix_old_data(db, tx->tx_txg);
1773				data_old = db->db.db_data;
1774			} else if (db->db.db_object != DMU_META_DNODE_OBJECT) {
1775				/*
1776				 * Release the data buffer from the cache so
1777				 * that we can modify it without impacting
1778				 * possible other users of this cached data
1779				 * block.  Note that indirect blocks and
1780				 * private objects are not released until the
1781				 * syncing state (since they are only modified
1782				 * then).
1783				 */
1784				arc_release(db->db_buf, db);
1785				dbuf_fix_old_data(db, tx->tx_txg);
1786				data_old = db->db_buf;
1787			}
1788			ASSERT(data_old != NULL);
1789		}
1790		dr->dt.dl.dr_data = data_old;
1791	} else {
1792		mutex_init(&dr->dt.di.dr_mtx, NULL, MUTEX_DEFAULT, NULL);
1793		list_create(&dr->dt.di.dr_children,
1794		    sizeof (dbuf_dirty_record_t),
1795		    offsetof(dbuf_dirty_record_t, dr_dirty_node));
1796	}
1797	if (db->db_blkid != DMU_BONUS_BLKID && os->os_dsl_dataset != NULL)
1798		dr->dr_accounted = db->db.db_size;
1799	dr->dr_dbuf = db;
1800	dr->dr_txg = tx->tx_txg;
1801	dr->dr_next = *drp;
1802	*drp = dr;
1803
1804	/*
1805	 * We could have been freed_in_flight between the dbuf_noread
1806	 * and dbuf_dirty.  We win, as though the dbuf_noread() had
1807	 * happened after the free.
1808	 */
1809	if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
1810	    db->db_blkid != DMU_SPILL_BLKID) {
1811		mutex_enter(&dn->dn_mtx);
1812		if (dn->dn_free_ranges[txgoff] != NULL) {
1813			range_tree_clear(dn->dn_free_ranges[txgoff],
1814			    db->db_blkid, 1);
1815		}
1816		mutex_exit(&dn->dn_mtx);
1817		db->db_freed_in_flight = FALSE;
1818	}
1819
1820	/*
1821	 * This buffer is now part of this txg
1822	 */
1823	dbuf_add_ref(db, (void *)(uintptr_t)tx->tx_txg);
1824	db->db_dirtycnt += 1;
1825	ASSERT3U(db->db_dirtycnt, <=, 3);
1826
1827	mutex_exit(&db->db_mtx);
1828
1829	if (db->db_blkid == DMU_BONUS_BLKID ||
1830	    db->db_blkid == DMU_SPILL_BLKID) {
1831		mutex_enter(&dn->dn_mtx);
1832		ASSERT(!list_link_active(&dr->dr_dirty_node));
1833		list_insert_tail(&dn->dn_dirty_records[txgoff], dr);
1834		mutex_exit(&dn->dn_mtx);
1835		dnode_setdirty(dn, tx);
1836		DB_DNODE_EXIT(db);
1837		return (dr);
1838	}
1839
1840	/*
1841	 * The dn_struct_rwlock prevents db_blkptr from changing
1842	 * due to a write from syncing context completing
1843	 * while we are running, so we want to acquire it before
1844	 * looking at db_blkptr.
1845	 */
1846	if (!RW_WRITE_HELD(&dn->dn_struct_rwlock)) {
1847		rw_enter(&dn->dn_struct_rwlock, RW_READER);
1848		drop_struct_lock = TRUE;
1849	}
1850
1851	/*
1852	 * We need to hold the dn_struct_rwlock to make this assertion,
1853	 * because it protects dn_phys / dn_next_nlevels from changing.
1854	 */
1855	ASSERT((dn->dn_phys->dn_nlevels == 0 && db->db_level == 0) ||
1856	    dn->dn_phys->dn_nlevels > db->db_level ||
1857	    dn->dn_next_nlevels[txgoff] > db->db_level ||
1858	    dn->dn_next_nlevels[(tx->tx_txg-1) & TXG_MASK] > db->db_level ||
1859	    dn->dn_next_nlevels[(tx->tx_txg-2) & TXG_MASK] > db->db_level);
1860
1861	/*
1862	 * If we are overwriting a dedup BP, then unless it is snapshotted,
1863	 * when we get to syncing context we will need to decrement its
1864	 * refcount in the DDT.  Prefetch the relevant DDT block so that
1865	 * syncing context won't have to wait for the i/o.
1866	 */
1867	ddt_prefetch(os->os_spa, db->db_blkptr);
1868
1869	if (db->db_level == 0) {
1870		ASSERT(!db->db_objset->os_raw_receive ||
1871		    dn->dn_maxblkid >= db->db_blkid);
1872		dnode_new_blkid(dn, db->db_blkid, tx,
1873		    drop_struct_lock, B_FALSE);
1874		ASSERT(dn->dn_maxblkid >= db->db_blkid);
1875	}
1876
1877	if (db->db_level+1 < dn->dn_nlevels) {
1878		dmu_buf_impl_t *parent = db->db_parent;
1879		dbuf_dirty_record_t *di;
1880		int parent_held = FALSE;
1881
1882		if (db->db_parent == NULL || db->db_parent == dn->dn_dbuf) {
1883			int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
1884
1885			parent = dbuf_hold_level(dn, db->db_level+1,
1886			    db->db_blkid >> epbs, FTAG);
1887			ASSERT(parent != NULL);
1888			parent_held = TRUE;
1889		}
1890		if (drop_struct_lock)
1891			rw_exit(&dn->dn_struct_rwlock);
1892		ASSERT3U(db->db_level+1, ==, parent->db_level);
1893		di = dbuf_dirty(parent, tx);
1894		if (parent_held)
1895			dbuf_rele(parent, FTAG);
1896
1897		mutex_enter(&db->db_mtx);
1898		/*
1899		 * Since we've dropped the mutex, it's possible that
1900		 * dbuf_undirty() might have changed this out from under us.
1901		 */
1902		if (db->db_last_dirty == dr ||
1903		    dn->dn_object == DMU_META_DNODE_OBJECT) {
1904			mutex_enter(&di->dt.di.dr_mtx);
1905			ASSERT3U(di->dr_txg, ==, tx->tx_txg);
1906			ASSERT(!list_link_active(&dr->dr_dirty_node));
1907			list_insert_tail(&di->dt.di.dr_children, dr);
1908			mutex_exit(&di->dt.di.dr_mtx);
1909			dr->dr_parent = di;
1910		}
1911		mutex_exit(&db->db_mtx);
1912	} else {
1913		ASSERT(db->db_level+1 == dn->dn_nlevels);
1914		ASSERT(db->db_blkid < dn->dn_nblkptr);
1915		ASSERT(db->db_parent == NULL || db->db_parent == dn->dn_dbuf);
1916		mutex_enter(&dn->dn_mtx);
1917		ASSERT(!list_link_active(&dr->dr_dirty_node));
1918		list_insert_tail(&dn->dn_dirty_records[txgoff], dr);
1919		mutex_exit(&dn->dn_mtx);
1920		if (drop_struct_lock)
1921			rw_exit(&dn->dn_struct_rwlock);
1922	}
1923
1924	dnode_setdirty(dn, tx);
1925	DB_DNODE_EXIT(db);
1926	return (dr);
1927}
1928
1929/*
1930 * Undirty a buffer in the transaction group referenced by the given
1931 * transaction.  Return whether this evicted the dbuf.
1932 */
1933static boolean_t
1934dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
1935{
1936	dnode_t *dn;
1937	uint64_t txg = tx->tx_txg;
1938	dbuf_dirty_record_t *dr, **drp;
1939
1940	ASSERT(txg != 0);
1941
1942	/*
1943	 * Due to our use of dn_nlevels below, this can only be called
1944	 * in open context, unless we are operating on the MOS.
1945	 * From syncing context, dn_nlevels may be different from the
1946	 * dn_nlevels used when dbuf was dirtied.
1947	 */
1948	ASSERT(db->db_objset ==
1949	    dmu_objset_pool(db->db_objset)->dp_meta_objset ||
1950	    txg != spa_syncing_txg(dmu_objset_spa(db->db_objset)));
1951	ASSERT(db->db_blkid != DMU_BONUS_BLKID);
1952	ASSERT0(db->db_level);
1953	ASSERT(MUTEX_HELD(&db->db_mtx));
1954
1955	/*
1956	 * If this buffer is not dirty, we're done.
1957	 */
1958	for (drp = &db->db_last_dirty; (dr = *drp) != NULL; drp = &dr->dr_next)
1959		if (dr->dr_txg <= txg)
1960			break;
1961	if (dr == NULL || dr->dr_txg < txg)
1962		return (B_FALSE);
1963	ASSERT(dr->dr_txg == txg);
1964	ASSERT(dr->dr_dbuf == db);
1965
1966	DB_DNODE_ENTER(db);
1967	dn = DB_DNODE(db);
1968
1969	dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);
1970
1971	ASSERT(db->db.db_size != 0);
1972
1973	dsl_pool_undirty_space(dmu_objset_pool(dn->dn_objset),
1974	    dr->dr_accounted, txg);
1975
1976	*drp = dr->dr_next;
1977
1978	/*
1979	 * Note that there are three places in dbuf_dirty()
1980	 * where this dirty record may be put on a list.
1981	 * Make sure to do a list_remove corresponding to
1982	 * every one of those list_insert calls.
1983	 */
1984	if (dr->dr_parent) {
1985		mutex_enter(&dr->dr_parent->dt.di.dr_mtx);
1986		list_remove(&dr->dr_parent->dt.di.dr_children, dr);
1987		mutex_exit(&dr->dr_parent->dt.di.dr_mtx);
1988	} else if (db->db_blkid == DMU_SPILL_BLKID ||
1989	    db->db_level + 1 == dn->dn_nlevels) {
1990		ASSERT(db->db_blkptr == NULL || db->db_parent == dn->dn_dbuf);
1991		mutex_enter(&dn->dn_mtx);
1992		list_remove(&dn->dn_dirty_records[txg & TXG_MASK], dr);
1993		mutex_exit(&dn->dn_mtx);
1994	}
1995	DB_DNODE_EXIT(db);
1996
1997	if (db->db_state != DB_NOFILL) {
1998		dbuf_unoverride(dr);
1999
2000		ASSERT(db->db_buf != NULL);
2001		ASSERT(dr->dt.dl.dr_data != NULL);
2002		if (dr->dt.dl.dr_data != db->db_buf)
2003			arc_buf_destroy(dr->dt.dl.dr_data, db);
2004	}
2005
2006	kmem_free(dr, sizeof (dbuf_dirty_record_t));
2007
2008	ASSERT(db->db_dirtycnt > 0);
2009	db->db_dirtycnt -= 1;
2010
2011	if (zfs_refcount_remove(&db->db_holds, (void *)(uintptr_t)txg) == 0) {
2012		ASSERT(db->db_state == DB_NOFILL || arc_released(db->db_buf));
2013		dbuf_destroy(db);
2014		return (B_TRUE);
2015	}
2016
2017	return (B_FALSE);
2018}
2019
2020static void
2021dmu_buf_will_dirty_impl(dmu_buf_t *db_fake, int flags, dmu_tx_t *tx)
2022{
2023	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
2024
2025	ASSERT(tx->tx_txg != 0);
2026	ASSERT(!zfs_refcount_is_zero(&db->db_holds));
2027
2028	/*
2029	 * Quick check for dirtyness.  For already dirty blocks, this
2030	 * reduces runtime of this function by >90%, and overall performance
2031	 * by 50% for some workloads (e.g. file deletion with indirect blocks
2032	 * cached).
2033	 */
2034	mutex_enter(&db->db_mtx);
2035	dbuf_dirty_record_t *dr;
2036	for (dr = db->db_last_dirty;
2037	    dr != NULL && dr->dr_txg >= tx->tx_txg; dr = dr->dr_next) {
2038		/*
2039		 * It's possible that it is already dirty but not cached,
2040		 * because there are some calls to dbuf_dirty() that don't
2041		 * go through dmu_buf_will_dirty().
2042		 */
2043		if (dr->dr_txg == tx->tx_txg && db->db_state == DB_CACHED) {
2044			/* This dbuf is already dirty and cached. */
2045			dbuf_redirty(dr);
2046			mutex_exit(&db->db_mtx);
2047			return;
2048		}
2049	}
2050	mutex_exit(&db->db_mtx);
2051
2052	DB_DNODE_ENTER(db);
2053	if (RW_WRITE_HELD(&DB_DNODE(db)->dn_struct_rwlock))
2054		flags |= DB_RF_HAVESTRUCT;
2055	DB_DNODE_EXIT(db);
2056	(void) dbuf_read(db, NULL, flags);
2057	(void) dbuf_dirty(db, tx);
2058}
2059
2060void
2061dmu_buf_will_dirty(dmu_buf_t *db_fake, dmu_tx_t *tx)
2062{
2063	dmu_buf_will_dirty_impl(db_fake,
2064	    DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH, tx);
2065}
2066
2067void
2068dmu_buf_will_not_fill(dmu_buf_t *db_fake, dmu_tx_t *tx)
2069{
2070	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
2071
2072	db->db_state = DB_NOFILL;
2073
2074	dmu_buf_will_fill(db_fake, tx);
2075}
2076
2077void
2078dmu_buf_will_fill(dmu_buf_t *db_fake, dmu_tx_t *tx)
2079{
2080	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
2081
2082	ASSERT(db->db_blkid != DMU_BONUS_BLKID);
2083	ASSERT(tx->tx_txg != 0);
2084	ASSERT(db->db_level == 0);
2085	ASSERT(!zfs_refcount_is_zero(&db->db_holds));
2086
2087	ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT ||
2088	    dmu_tx_private_ok(tx));
2089
2090	dbuf_noread(db);
2091	(void) dbuf_dirty(db, tx);
2092}
2093
2094/*
2095 * This function is effectively the same as dmu_buf_will_dirty(), but
2096 * indicates the caller expects raw encrypted data in the db, and provides
2097 * the crypt params (byteorder, salt, iv, mac) which should be stored in the
2098 * blkptr_t when this dbuf is written.  This is only used for blocks of
2099 * dnodes during a raw receive.
2100 */
2101void
2102dmu_buf_set_crypt_params(dmu_buf_t *db_fake, boolean_t byteorder,
2103    const uint8_t *salt, const uint8_t *iv, const uint8_t *mac, dmu_tx_t *tx)
2104{
2105	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
2106	dbuf_dirty_record_t *dr;
2107
2108	/*
2109	 * dr_has_raw_params is only processed for blocks of dnodes
2110	 * (see dbuf_sync_dnode_leaf_crypt()).
2111	 */
2112	ASSERT3U(db->db.db_object, ==, DMU_META_DNODE_OBJECT);
2113	ASSERT3U(db->db_level, ==, 0);
2114
2115	dmu_buf_will_dirty_impl(db_fake,
2116	    DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH | DB_RF_NO_DECRYPT, tx);
2117
2118	dr = db->db_last_dirty;
2119	while (dr != NULL && dr->dr_txg > tx->tx_txg)
2120		dr = dr->dr_next;
2121
2122	ASSERT3P(dr, !=, NULL);
2123	ASSERT3U(dr->dr_txg, ==, tx->tx_txg);
2124
2125	dr->dt.dl.dr_has_raw_params = B_TRUE;
2126	dr->dt.dl.dr_byteorder = byteorder;
2127	bcopy(salt, dr->dt.dl.dr_salt, ZIO_DATA_SALT_LEN);
2128	bcopy(iv, dr->dt.dl.dr_iv, ZIO_DATA_IV_LEN);
2129	bcopy(mac, dr->dt.dl.dr_mac, ZIO_DATA_MAC_LEN);
2130}
2131
2132#pragma weak dmu_buf_fill_done = dbuf_fill_done
2133/* ARGSUSED */
2134void
2135dbuf_fill_done(dmu_buf_impl_t *db, dmu_tx_t *tx)
2136{
2137	mutex_enter(&db->db_mtx);
2138	DBUF_VERIFY(db);
2139
2140	if (db->db_state == DB_FILL) {
2141		if (db->db_level == 0 && db->db_freed_in_flight) {
2142			ASSERT(db->db_blkid != DMU_BONUS_BLKID);
2143			/* we were freed while filling */
2144			/* XXX dbuf_undirty? */
2145			bzero(db->db.db_data, db->db.db_size);
2146			db->db_freed_in_flight = FALSE;
2147		}
2148		db->db_state = DB_CACHED;
2149		cv_broadcast(&db->db_changed);
2150	}
2151	mutex_exit(&db->db_mtx);
2152}
2153
2154void
2155dmu_buf_write_embedded(dmu_buf_t *dbuf, void *data,
2156    bp_embedded_type_t etype, enum zio_compress comp,
2157    int uncompressed_size, int compressed_size, int byteorder,
2158    dmu_tx_t *tx)
2159{
2160	dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf;
2161	struct dirty_leaf *dl;
2162	dmu_object_type_t type;
2163
2164	if (etype == BP_EMBEDDED_TYPE_DATA) {
2165		ASSERT(spa_feature_is_active(dmu_objset_spa(db->db_objset),
2166		    SPA_FEATURE_EMBEDDED_DATA));
2167	}
2168
2169	DB_DNODE_ENTER(db);
2170	type = DB_DNODE(db)->dn_type;
2171	DB_DNODE_EXIT(db);
2172
2173	ASSERT0(db->db_level);
2174	ASSERT(db->db_blkid != DMU_BONUS_BLKID);
2175
2176	dmu_buf_will_not_fill(dbuf, tx);
2177
2178	ASSERT3U(db->db_last_dirty->dr_txg, ==, tx->tx_txg);
2179	dl = &db->db_last_dirty->dt.dl;
2180	encode_embedded_bp_compressed(&dl->dr_overridden_by,
2181	    data, comp, uncompressed_size, compressed_size);
2182	BPE_SET_ETYPE(&dl->dr_overridden_by, etype);
2183	BP_SET_TYPE(&dl->dr_overridden_by, type);
2184	BP_SET_LEVEL(&dl->dr_overridden_by, 0);
2185	BP_SET_BYTEORDER(&dl->dr_overridden_by, byteorder);
2186
2187	dl->dr_override_state = DR_OVERRIDDEN;
2188	dl->dr_overridden_by.blk_birth = db->db_last_dirty->dr_txg;
2189}
2190
2191/*
2192 * Directly assign a provided arc buf to a given dbuf if it's not referenced
2193 * by anybody except our caller. Otherwise copy arcbuf's contents to dbuf.
2194 */
2195void
2196dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx)
2197{
2198	ASSERT(!zfs_refcount_is_zero(&db->db_holds));
2199	ASSERT(db->db_blkid != DMU_BONUS_BLKID);
2200	ASSERT(db->db_level == 0);
2201	ASSERT3U(dbuf_is_metadata(db), ==, arc_is_metadata(buf));
2202	ASSERT(buf != NULL);
2203	ASSERT3U(arc_buf_lsize(buf), ==, db->db.db_size);
2204	ASSERT(tx->tx_txg != 0);
2205
2206	arc_return_buf(buf, db);
2207	ASSERT(arc_released(buf));
2208
2209	mutex_enter(&db->db_mtx);
2210
2211	while (db->db_state == DB_READ || db->db_state == DB_FILL)
2212		cv_wait(&db->db_changed, &db->db_mtx);
2213
2214	ASSERT(db->db_state == DB_CACHED || db->db_state == DB_UNCACHED);
2215
2216	if (db->db_state == DB_CACHED &&
2217	    zfs_refcount_count(&db->db_holds) - 1 > db->db_dirtycnt) {
2218		/*
2219		 * In practice, we will never have a case where we have an
2220		 * encrypted arc buffer while additional holds exist on the
2221		 * dbuf. We don't handle this here so we simply assert that
2222		 * fact instead.
2223		 */
2224		ASSERT(!arc_is_encrypted(buf));
2225		mutex_exit(&db->db_mtx);
2226		(void) dbuf_dirty(db, tx);
2227		bcopy(buf->b_data, db->db.db_data, db->db.db_size);
2228		arc_buf_destroy(buf, db);
2229		xuio_stat_wbuf_copied();
2230		return;
2231	}
2232
2233	xuio_stat_wbuf_nocopy();
2234	if (db->db_state == DB_CACHED) {
2235		dbuf_dirty_record_t *dr = db->db_last_dirty;
2236
2237		ASSERT(db->db_buf != NULL);
2238		if (dr != NULL && dr->dr_txg == tx->tx_txg) {
2239			ASSERT(dr->dt.dl.dr_data == db->db_buf);
2240
2241			if (!arc_released(db->db_buf)) {
2242				ASSERT(dr->dt.dl.dr_override_state ==
2243				    DR_OVERRIDDEN);
2244				arc_release(db->db_buf, db);
2245			}
2246			dr->dt.dl.dr_data = buf;
2247			arc_buf_destroy(db->db_buf, db);
2248		} else if (dr == NULL || dr->dt.dl.dr_data != db->db_buf) {
2249			arc_release(db->db_buf, db);
2250			arc_buf_destroy(db->db_buf, db);
2251		}
2252		db->db_buf = NULL;
2253	}
2254	ASSERT(db->db_buf == NULL);
2255	dbuf_set_data(db, buf);
2256	db->db_state = DB_FILL;
2257	mutex_exit(&db->db_mtx);
2258	(void) dbuf_dirty(db, tx);
2259	dmu_buf_fill_done(&db->db, tx);
2260}
2261
2262void
2263dbuf_destroy(dmu_buf_impl_t *db)
2264{
2265	dnode_t *dn;
2266	dmu_buf_impl_t *parent = db->db_parent;
2267	dmu_buf_impl_t *dndb;
2268
2269	ASSERT(MUTEX_HELD(&db->db_mtx));
2270	ASSERT(zfs_refcount_is_zero(&db->db_holds));
2271
2272	if (db->db_buf != NULL) {
2273		arc_buf_destroy(db->db_buf, db);
2274		db->db_buf = NULL;
2275	}
2276
2277	if (db->db_blkid == DMU_BONUS_BLKID) {
2278		int slots = DB_DNODE(db)->dn_num_slots;
2279		int bonuslen = DN_SLOTS_TO_BONUSLEN(slots);
2280		if (db->db.db_data != NULL) {
2281			zio_buf_free(db->db.db_data, bonuslen);
2282			arc_space_return(bonuslen, ARC_SPACE_BONUS);
2283			db->db_state = DB_UNCACHED;
2284		}
2285	}
2286
2287	dbuf_clear_data(db);
2288
2289	if (multilist_link_active(&db->db_cache_link)) {
2290		ASSERT(db->db_caching_status == DB_DBUF_CACHE ||
2291		    db->db_caching_status == DB_DBUF_METADATA_CACHE);
2292
2293		multilist_remove(dbuf_caches[db->db_caching_status].cache, db);
2294		(void) zfs_refcount_remove_many(
2295		    &dbuf_caches[db->db_caching_status].size,
2296		    db->db.db_size, db);
2297
2298		db->db_caching_status = DB_NO_CACHE;
2299	}
2300
2301	ASSERT(db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL);
2302	ASSERT(db->db_data_pending == NULL);
2303
2304	db->db_state = DB_EVICTING;
2305	db->db_blkptr = NULL;
2306
2307	/*
2308	 * Now that db_state is DB_EVICTING, nobody else can find this via
2309	 * the hash table.  We can now drop db_mtx, which allows us to
2310	 * acquire the dn_dbufs_mtx.
2311	 */
2312	mutex_exit(&db->db_mtx);
2313
2314	DB_DNODE_ENTER(db);
2315	dn = DB_DNODE(db);
2316	dndb = dn->dn_dbuf;
2317	if (db->db_blkid != DMU_BONUS_BLKID) {
2318		boolean_t needlock = !MUTEX_HELD(&dn->dn_dbufs_mtx);
2319		if (needlock)
2320			mutex_enter(&dn->dn_dbufs_mtx);
2321		avl_remove(&dn->dn_dbufs, db);
2322		atomic_dec_32(&dn->dn_dbufs_count);
2323		membar_producer();
2324		DB_DNODE_EXIT(db);
2325		if (needlock)
2326			mutex_exit(&dn->dn_dbufs_mtx);
2327		/*
2328		 * Decrementing the dbuf count means that the hold corresponding
2329		 * to the removed dbuf is no longer discounted in dnode_move(),
2330		 * so the dnode cannot be moved until after we release the hold.
2331		 * The membar_producer() ensures visibility of the decremented
2332		 * value in dnode_move(), since DB_DNODE_EXIT doesn't actually
2333		 * release any lock.
2334		 */
2335		mutex_enter(&dn->dn_mtx);
2336		dnode_rele_and_unlock(dn, db, B_TRUE);
2337		db->db_dnode_handle = NULL;
2338
2339		dbuf_hash_remove(db);
2340	} else {
2341		DB_DNODE_EXIT(db);
2342	}
2343
2344	ASSERT(zfs_refcount_is_zero(&db->db_holds));
2345
2346	db->db_parent = NULL;
2347
2348	ASSERT(db->db_buf == NULL);
2349	ASSERT(db->db.db_data == NULL);
2350	ASSERT(db->db_hash_next == NULL);
2351	ASSERT(db->db_blkptr == NULL);
2352	ASSERT(db->db_data_pending == NULL);
2353	ASSERT3U(db->db_caching_status, ==, DB_NO_CACHE);
2354	ASSERT(!multilist_link_active(&db->db_cache_link));
2355
2356	kmem_cache_free(dbuf_kmem_cache, db);
2357	arc_space_return(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);
2358
2359	/*
2360	 * If this dbuf is referenced from an indirect dbuf,
2361	 * decrement the ref count on the indirect dbuf.
2362	 */
2363	if (parent && parent != dndb) {
2364		mutex_enter(&parent->db_mtx);
2365		dbuf_rele_and_unlock(parent, db, B_TRUE);
2366	}
2367}
2368
2369/*
2370 * Note: While bpp will always be updated if the function returns success,
2371 * parentp will not be updated if the dnode does not have dn_dbuf filled in;
2372 * this happens when the dnode is the meta-dnode, or {user|group|project}used
2373 * object.
2374 */
2375static int
2376dbuf_findbp(dnode_t *dn, int level, uint64_t blkid, int fail_sparse,
2377    dmu_buf_impl_t **parentp, blkptr_t **bpp)
2378{
2379	*parentp = NULL;
2380	*bpp = NULL;
2381
2382	ASSERT(blkid != DMU_BONUS_BLKID);
2383
2384	if (blkid == DMU_SPILL_BLKID) {
2385		mutex_enter(&dn->dn_mtx);
2386		if (dn->dn_have_spill &&
2387		    (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR))
2388			*bpp = DN_SPILL_BLKPTR(dn->dn_phys);
2389		else
2390			*bpp = NULL;
2391		dbuf_add_ref(dn->dn_dbuf, NULL);
2392		*parentp = dn->dn_dbuf;
2393		mutex_exit(&dn->dn_mtx);
2394		return (0);
2395	}
2396
2397	int nlevels =
2398	    (dn->dn_phys->dn_nlevels == 0) ? 1 : dn->dn_phys->dn_nlevels;
2399	int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
2400
2401	ASSERT3U(level * epbs, <, 64);
2402	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
2403	/*
2404	 * This assertion shouldn't trip as long as the max indirect block size
2405	 * is less than 1M.  The reason for this is that up to that point,
2406	 * the number of levels required to address an entire object with blocks
2407	 * of size SPA_MINBLOCKSIZE satisfies nlevels * epbs + 1 <= 64.  In
2408	 * other words, if N * epbs + 1 > 64, then if (N-1) * epbs + 1 > 55
2409	 * (i.e. we can address the entire object), objects will all use at most
2410	 * N-1 levels and the assertion won't overflow.  However, once epbs is
2411	 * 13, 4 * 13 + 1 = 53, but 5 * 13 + 1 = 66.  Then, 4 levels will not be
2412	 * enough to address an entire object, so objects will have 5 levels,
2413	 * but then this assertion will overflow.
2414	 *
2415	 * All this is to say that if we ever increase DN_MAX_INDBLKSHIFT, we
2416	 * need to redo this logic to handle overflows.
2417	 */
2418	ASSERT(level >= nlevels ||
2419	    ((nlevels - level - 1) * epbs) +
2420	    highbit64(dn->dn_phys->dn_nblkptr) <= 64);
2421	if (level >= nlevels ||
2422	    blkid >= ((uint64_t)dn->dn_phys->dn_nblkptr <<
2423	    ((nlevels - level - 1) * epbs)) ||
2424	    (fail_sparse &&
2425	    blkid > (dn->dn_phys->dn_maxblkid >> (level * epbs)))) {
2426		/* the buffer has no parent yet */
2427		return (SET_ERROR(ENOENT));
2428	} else if (level < nlevels-1) {
2429		/* this block is referenced from an indirect block */
2430		int err = dbuf_hold_impl(dn, level+1,
2431		    blkid >> epbs, fail_sparse, FALSE, NULL, parentp);
2432		if (err)
2433			return (err);
2434		err = dbuf_read(*parentp, NULL,
2435		    (DB_RF_HAVESTRUCT | DB_RF_NOPREFETCH | DB_RF_CANFAIL));
2436		if (err) {
2437			dbuf_rele(*parentp, NULL);
2438			*parentp = NULL;
2439			return (err);
2440		}
2441		*bpp = ((blkptr_t *)(*parentp)->db.db_data) +
2442		    (blkid & ((1ULL << epbs) - 1));
2443		if (blkid > (dn->dn_phys->dn_maxblkid >> (level * epbs)))
2444			ASSERT(BP_IS_HOLE(*bpp));
2445		return (0);
2446	} else {
2447		/* the block is referenced from the dnode */
2448		ASSERT3U(level, ==, nlevels-1);
2449		ASSERT(dn->dn_phys->dn_nblkptr == 0 ||
2450		    blkid < dn->dn_phys->dn_nblkptr);
2451		if (dn->dn_dbuf) {
2452			dbuf_add_ref(dn->dn_dbuf, NULL);
2453			*parentp = dn->dn_dbuf;
2454		}
2455		*bpp = &dn->dn_phys->dn_blkptr[blkid];
2456		return (0);
2457	}
2458}
2459
2460static dmu_buf_impl_t *
2461dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid,
2462    dmu_buf_impl_t *parent, blkptr_t *blkptr)
2463{
2464	objset_t *os = dn->dn_objset;
2465	dmu_buf_impl_t *db, *odb;
2466
2467	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
2468	ASSERT(dn->dn_type != DMU_OT_NONE);
2469
2470	db = kmem_cache_alloc(dbuf_kmem_cache, KM_SLEEP);
2471
2472	db->db_objset = os;
2473	db->db.db_object = dn->dn_object;
2474	db->db_level = level;
2475	db->db_blkid = blkid;
2476	db->db_last_dirty = NULL;
2477	db->db_dirtycnt = 0;
2478	db->db_dnode_handle = dn->dn_handle;
2479	db->db_parent = parent;
2480	db->db_blkptr = blkptr;
2481
2482	db->db_user = NULL;
2483	db->db_user_immediate_evict = FALSE;
2484	db->db_freed_in_flight = FALSE;
2485	db->db_pending_evict = FALSE;
2486
2487	if (blkid == DMU_BONUS_BLKID) {
2488		ASSERT3P(parent, ==, dn->dn_dbuf);
2489		db->db.db_size = DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots) -
2490		    (dn->dn_nblkptr-1) * sizeof (blkptr_t);
2491		ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
2492		db->db.db_offset = DMU_BONUS_BLKID;
2493		db->db_state = DB_UNCACHED;
2494		db->db_caching_status = DB_NO_CACHE;
2495		/* the bonus dbuf is not placed in the hash table */
2496		arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);
2497		return (db);
2498	} else if (blkid == DMU_SPILL_BLKID) {
2499		db->db.db_size = (blkptr != NULL) ?
2500		    BP_GET_LSIZE(blkptr) : SPA_MINBLOCKSIZE;
2501		db->db.db_offset = 0;
2502	} else {
2503		int blocksize =
2504		    db->db_level ? 1 << dn->dn_indblkshift : dn->dn_datablksz;
2505		db->db.db_size = blocksize;
2506		db->db.db_offset = db->db_blkid * blocksize;
2507	}
2508
2509	/*
2510	 * Hold the dn_dbufs_mtx while we get the new dbuf
2511	 * in the hash table *and* added to the dbufs list.
2512	 * This prevents a possible deadlock with someone
2513	 * trying to look up this dbuf before its added to the
2514	 * dn_dbufs list.
2515	 */
2516	mutex_enter(&dn->dn_dbufs_mtx);
2517	db->db_state = DB_EVICTING;
2518	if ((odb = dbuf_hash_insert(db)) != NULL) {
2519		/* someone else inserted it first */
2520		kmem_cache_free(dbuf_kmem_cache, db);
2521		mutex_exit(&dn->dn_dbufs_mtx);
2522		return (odb);
2523	}
2524	avl_add(&dn->dn_dbufs, db);
2525
2526	db->db_state = DB_UNCACHED;
2527	db->db_caching_status = DB_NO_CACHE;
2528	mutex_exit(&dn->dn_dbufs_mtx);
2529	arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);
2530
2531	if (parent && parent != dn->dn_dbuf)
2532		dbuf_add_ref(parent, db);
2533
2534	ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT ||
2535	    zfs_refcount_count(&dn->dn_holds) > 0);
2536	(void) zfs_refcount_add(&dn->dn_holds, db);
2537	atomic_inc_32(&dn->dn_dbufs_count);
2538
2539	dprintf_dbuf(db, "db=%p\n", db);
2540
2541	return (db);
2542}
2543
2544typedef struct dbuf_prefetch_arg {
2545	spa_t *dpa_spa;	/* The spa to issue the prefetch in. */
2546	zbookmark_phys_t dpa_zb; /* The target block to prefetch. */
2547	int dpa_epbs; /* Entries (blkptr_t's) Per Block Shift. */
2548	int dpa_curlevel; /* The current level that we're reading */
2549	dnode_t *dpa_dnode; /* The dnode associated with the prefetch */
2550	zio_priority_t dpa_prio; /* The priority I/Os should be issued at. */
2551	zio_t *dpa_zio; /* The parent zio_t for all prefetches. */
2552	arc_flags_t dpa_aflags; /* Flags to pass to the final prefetch. */
2553} dbuf_prefetch_arg_t;
2554
2555/*
2556 * Actually issue the prefetch read for the block given.
2557 */
2558static void
2559dbuf_issue_final_prefetch(dbuf_prefetch_arg_t *dpa, blkptr_t *bp)
2560{
2561	if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp))
2562		return;
2563
2564	int zio_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE;
2565	arc_flags_t aflags =
2566	    dpa->dpa_aflags | ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH;
2567
2568	/* dnodes are always read as raw and then converted later */
2569	if (BP_GET_TYPE(bp) == DMU_OT_DNODE && BP_IS_PROTECTED(bp) &&
2570	    dpa->dpa_curlevel == 0)
2571		zio_flags |= ZIO_FLAG_RAW;
2572
2573	ASSERT3U(dpa->dpa_curlevel, ==, BP_GET_LEVEL(bp));
2574	ASSERT3U(dpa->dpa_curlevel, ==, dpa->dpa_zb.zb_level);
2575	ASSERT(dpa->dpa_zio != NULL);
2576	(void) arc_read(dpa->dpa_zio, dpa->dpa_spa, bp, NULL, NULL,
2577	    dpa->dpa_prio, zio_flags, &aflags, &dpa->dpa_zb);
2578}
2579
2580/*
2581 * Called when an indirect block above our prefetch target is read in.  This
2582 * will either read in the next indirect block down the tree or issue the actual
2583 * prefetch if the next block down is our target.
2584 */
2585/* ARGSUSED */
2586static void
2587dbuf_prefetch_indirect_done(zio_t *zio, const zbookmark_phys_t *zb,
2588    const blkptr_t *iobp, arc_buf_t *abuf, void *private)
2589{
2590	dbuf_prefetch_arg_t *dpa = private;
2591
2592	ASSERT3S(dpa->dpa_zb.zb_level, <, dpa->dpa_curlevel);
2593	ASSERT3S(dpa->dpa_curlevel, >, 0);
2594
2595	if (abuf == NULL) {
2596		ASSERT(zio == NULL || zio->io_error != 0);
2597		kmem_free(dpa, sizeof (*dpa));
2598		return;
2599	}
2600	ASSERT(zio == NULL || zio->io_error == 0);
2601
2602	/*
2603	 * The dpa_dnode is only valid if we are called with a NULL
2604	 * zio. This indicates that the arc_read() returned without
2605	 * first calling zio_read() to issue a physical read. Once
2606	 * a physical read is made the dpa_dnode must be invalidated
2607	 * as the locks guarding it may have been dropped. If the
2608	 * dpa_dnode is still valid, then we want to add it to the dbuf
2609	 * cache. To do so, we must hold the dbuf associated with the block
2610	 * we just prefetched, read its contents so that we associate it
2611	 * with an arc_buf_t, and then release it.
2612	 */
2613	if (zio != NULL) {
2614		ASSERT3S(BP_GET_LEVEL(zio->io_bp), ==, dpa->dpa_curlevel);
2615		if (zio->io_flags & ZIO_FLAG_RAW_COMPRESS) {
2616			ASSERT3U(BP_GET_PSIZE(zio->io_bp), ==, zio->io_size);
2617		} else {
2618			ASSERT3U(BP_GET_LSIZE(zio->io_bp), ==, zio->io_size);
2619		}
2620		ASSERT3P(zio->io_spa, ==, dpa->dpa_spa);
2621
2622		dpa->dpa_dnode = NULL;
2623	} else if (dpa->dpa_dnode != NULL) {
2624		uint64_t curblkid = dpa->dpa_zb.zb_blkid >>
2625		    (dpa->dpa_epbs * (dpa->dpa_curlevel -
2626		    dpa->dpa_zb.zb_level));
2627		dmu_buf_impl_t *db = dbuf_hold_level(dpa->dpa_dnode,
2628		    dpa->dpa_curlevel, curblkid, FTAG);
2629		(void) dbuf_read(db, NULL,
2630		    DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH | DB_RF_HAVESTRUCT);
2631		dbuf_rele(db, FTAG);
2632	}
2633
2634	dpa->dpa_curlevel--;
2635	uint64_t nextblkid = dpa->dpa_zb.zb_blkid >>
2636	    (dpa->dpa_epbs * (dpa->dpa_curlevel - dpa->dpa_zb.zb_level));
2637	blkptr_t *bp = ((blkptr_t *)abuf->b_data) +
2638	    P2PHASE(nextblkid, 1ULL << dpa->dpa_epbs);
2639
2640	if (BP_IS_HOLE(bp)) {
2641		kmem_free(dpa, sizeof (*dpa));
2642	} else if (dpa->dpa_curlevel == dpa->dpa_zb.zb_level) {
2643		ASSERT3U(nextblkid, ==, dpa->dpa_zb.zb_blkid);
2644		dbuf_issue_final_prefetch(dpa, bp);
2645		kmem_free(dpa, sizeof (*dpa));
2646	} else {
2647		arc_flags_t iter_aflags = ARC_FLAG_NOWAIT;
2648		zbookmark_phys_t zb;
2649
2650		/* flag if L2ARC eligible, l2arc_noprefetch then decides */
2651		if (dpa->dpa_aflags & ARC_FLAG_L2CACHE)
2652			iter_aflags |= ARC_FLAG_L2CACHE;
2653
2654		ASSERT3U(dpa->dpa_curlevel, ==, BP_GET_LEVEL(bp));
2655
2656		SET_BOOKMARK(&zb, dpa->dpa_zb.zb_objset,
2657		    dpa->dpa_zb.zb_object, dpa->dpa_curlevel, nextblkid);
2658
2659		(void) arc_read(dpa->dpa_zio, dpa->dpa_spa,
2660		    bp, dbuf_prefetch_indirect_done, dpa, dpa->dpa_prio,
2661		    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
2662		    &iter_aflags, &zb);
2663	}
2664
2665	arc_buf_destroy(abuf, private);
2666}
2667
2668/*
2669 * Issue prefetch reads for the given block on the given level.  If the indirect
2670 * blocks above that block are not in memory, we will read them in
2671 * asynchronously.  As a result, this call never blocks waiting for a read to
2672 * complete. Note that the prefetch might fail if the dataset is encrypted and
2673 * the encryption key is unmapped before the IO completes.
2674 */
2675void
2676dbuf_prefetch(dnode_t *dn, int64_t level, uint64_t blkid, zio_priority_t prio,
2677    arc_flags_t aflags)
2678{
2679	blkptr_t bp;
2680	int epbs, nlevels, curlevel;
2681	uint64_t curblkid;
2682
2683	ASSERT(blkid != DMU_BONUS_BLKID);
2684	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
2685
2686	if (blkid > dn->dn_maxblkid)
2687		return;
2688
2689	if (dnode_block_freed(dn, blkid))
2690		return;
2691
2692	/*
2693	 * This dnode hasn't been written to disk yet, so there's nothing to
2694	 * prefetch.
2695	 */
2696	nlevels = dn->dn_phys->dn_nlevels;
2697	if (level >= nlevels || dn->dn_phys->dn_nblkptr == 0)
2698		return;
2699
2700	epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
2701	if (dn->dn_phys->dn_maxblkid < blkid << (epbs * level))
2702		return;
2703
2704	dmu_buf_impl_t *db = dbuf_find(dn->dn_objset, dn->dn_object,
2705	    level, blkid);
2706	if (db != NULL) {
2707		mutex_exit(&db->db_mtx);
2708		/*
2709		 * This dbuf already exists.  It is either CACHED, or
2710		 * (we assume) about to be read or filled.
2711		 */
2712		return;
2713	}
2714
2715	/*
2716	 * Find the closest ancestor (indirect block) of the target block
2717	 * that is present in the cache.  In this indirect block, we will
2718	 * find the bp that is at curlevel, curblkid.
2719	 */
2720	curlevel = level;
2721	curblkid = blkid;
2722	while (curlevel < nlevels - 1) {
2723		int parent_level = curlevel + 1;
2724		uint64_t parent_blkid = curblkid >> epbs;
2725		dmu_buf_impl_t *db;
2726
2727		if (dbuf_hold_impl(dn, parent_level, parent_blkid,
2728		    FALSE, TRUE, FTAG, &db) == 0) {
2729			blkptr_t *bpp = db->db_buf->b_data;
2730			bp = bpp[P2PHASE(curblkid, 1 << epbs)];
2731			dbuf_rele(db, FTAG);
2732			break;
2733		}
2734
2735		curlevel = parent_level;
2736		curblkid = parent_blkid;
2737	}
2738
2739	if (curlevel == nlevels - 1) {
2740		/* No cached indirect blocks found. */
2741		ASSERT3U(curblkid, <, dn->dn_phys->dn_nblkptr);
2742		bp = dn->dn_phys->dn_blkptr[curblkid];
2743	}
2744	if (BP_IS_HOLE(&bp))
2745		return;
2746
2747	ASSERT3U(curlevel, ==, BP_GET_LEVEL(&bp));
2748
2749	zio_t *pio = zio_root(dmu_objset_spa(dn->dn_objset), NULL, NULL,
2750	    ZIO_FLAG_CANFAIL);
2751
2752	dbuf_prefetch_arg_t *dpa = kmem_zalloc(sizeof (*dpa), KM_SLEEP);
2753	dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset;
2754	SET_BOOKMARK(&dpa->dpa_zb, ds != NULL ? ds->ds_object : DMU_META_OBJSET,
2755	    dn->dn_object, level, blkid);
2756	dpa->dpa_curlevel = curlevel;
2757	dpa->dpa_prio = prio;
2758	dpa->dpa_aflags = aflags;
2759	dpa->dpa_spa = dn->dn_objset->os_spa;
2760	dpa->dpa_dnode = dn;
2761	dpa->dpa_epbs = epbs;
2762	dpa->dpa_zio = pio;
2763
2764	/* flag if L2ARC eligible, l2arc_noprefetch then decides */
2765	if (DNODE_LEVEL_IS_L2CACHEABLE(dn, level))
2766		dpa->dpa_aflags |= ARC_FLAG_L2CACHE;
2767
2768	/*
2769	 * If we have the indirect just above us, no need to do the asynchronous
2770	 * prefetch chain; we'll just run the last step ourselves.  If we're at
2771	 * a higher level, though, we want to issue the prefetches for all the
2772	 * indirect blocks asynchronously, so we can go on with whatever we were
2773	 * doing.
2774	 */
2775	if (curlevel == level) {
2776		ASSERT3U(curblkid, ==, blkid);
2777		dbuf_issue_final_prefetch(dpa, &bp);
2778		kmem_free(dpa, sizeof (*dpa));
2779	} else {
2780		arc_flags_t iter_aflags = ARC_FLAG_NOWAIT;
2781		zbookmark_phys_t zb;
2782
2783		/* flag if L2ARC eligible, l2arc_noprefetch then decides */
2784		if (DNODE_LEVEL_IS_L2CACHEABLE(dn, level))
2785			iter_aflags |= ARC_FLAG_L2CACHE;
2786
2787		SET_BOOKMARK(&zb, ds != NULL ? ds->ds_object : DMU_META_OBJSET,
2788		    dn->dn_object, curlevel, curblkid);
2789		(void) arc_read(dpa->dpa_zio, dpa->dpa_spa,
2790		    &bp, dbuf_prefetch_indirect_done, dpa, prio,
2791		    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
2792		    &iter_aflags, &zb);
2793	}
2794	/*
2795	 * We use pio here instead of dpa_zio since it's possible that
2796	 * dpa may have already been freed.
2797	 */
2798	zio_nowait(pio);
2799}
2800
2801/*
2802 * Helper function for __dbuf_hold_impl() to copy a buffer. Handles
2803 * the case of encrypted, compressed and uncompressed buffers by
2804 * allocating the new buffer, respectively, with arc_alloc_raw_buf(),
2805 * arc_alloc_compressed_buf() or arc_alloc_buf().*
2806 *
2807 * NOTE: Declared noinline to avoid stack bloat in __dbuf_hold_impl().
2808 */
2809static void
2810dbuf_hold_copy(dnode_t *dn,	dmu_buf_impl_t *db, dbuf_dirty_record_t *dr)
2811{
2812	arc_buf_t *data = dr->dt.dl.dr_data;
2813	enum zio_compress compress_type = arc_get_compression(data);
2814
2815	if (arc_is_encrypted(data)) {
2816		boolean_t byteorder;
2817		uint8_t salt[ZIO_DATA_SALT_LEN];
2818		uint8_t iv[ZIO_DATA_IV_LEN];
2819		uint8_t mac[ZIO_DATA_MAC_LEN];
2820
2821		arc_get_raw_params(data, &byteorder, salt, iv, mac);
2822		dbuf_set_data(db, arc_alloc_raw_buf(dn->dn_objset->os_spa, db,
2823		    dmu_objset_id(dn->dn_objset), byteorder, salt, iv, mac,
2824		    dn->dn_type, arc_buf_size(data), arc_buf_lsize(data),
2825		    compress_type));
2826	} else if (compress_type != ZIO_COMPRESS_OFF) {
2827		dbuf_set_data(db, arc_alloc_compressed_buf(
2828		    dn->dn_objset->os_spa, db, arc_buf_size(data),
2829		    arc_buf_lsize(data), compress_type));
2830	} else {
2831		dbuf_set_data(db, arc_alloc_buf(dn->dn_objset->os_spa, db,
2832		    DBUF_GET_BUFC_TYPE(db), db->db.db_size));
2833	}
2834
2835	bcopy(data->b_data, db->db.db_data, arc_buf_size(data));
2836}
2837
2838/*
2839 * Returns with db_holds incremented, and db_mtx not held.
2840 * Note: dn_struct_rwlock must be held.
2841 */
2842int
2843dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid,
2844    boolean_t fail_sparse, boolean_t fail_uncached,
2845    void *tag, dmu_buf_impl_t **dbp)
2846{
2847	dmu_buf_impl_t *db, *parent = NULL;
2848
2849	ASSERT(blkid != DMU_BONUS_BLKID);
2850	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
2851	ASSERT3U(dn->dn_nlevels, >, level);
2852
2853	*dbp = NULL;
2854top:
2855	/* dbuf_find() returns with db_mtx held */
2856	db = dbuf_find(dn->dn_objset, dn->dn_object, level, blkid);
2857
2858	if (db == NULL) {
2859		blkptr_t *bp = NULL;
2860		int err;
2861
2862		if (fail_uncached)
2863			return (SET_ERROR(ENOENT));
2864
2865		ASSERT3P(parent, ==, NULL);
2866		err = dbuf_findbp(dn, level, blkid, fail_sparse, &parent, &bp);
2867		if (fail_sparse) {
2868			if (err == 0 && bp && BP_IS_HOLE(bp))
2869				err = SET_ERROR(ENOENT);
2870			if (err) {
2871				if (parent)
2872					dbuf_rele(parent, NULL);
2873				return (err);
2874			}
2875		}
2876		if (err && err != ENOENT)
2877			return (err);
2878		db = dbuf_create(dn, level, blkid, parent, bp);
2879	}
2880
2881	if (fail_uncached && db->db_state != DB_CACHED) {
2882		mutex_exit(&db->db_mtx);
2883		return (SET_ERROR(ENOENT));
2884	}
2885
2886	if (db->db_buf != NULL) {
2887		arc_buf_access(db->db_buf);
2888		ASSERT3P(db->db.db_data, ==, db->db_buf->b_data);
2889	}
2890
2891	ASSERT(db->db_buf == NULL || arc_referenced(db->db_buf));
2892
2893	/*
2894	 * If this buffer is currently syncing out, and we are are
2895	 * still referencing it from db_data, we need to make a copy
2896	 * of it in case we decide we want to dirty it again in this txg.
2897	 */
2898	if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
2899	    dn->dn_object != DMU_META_DNODE_OBJECT &&
2900	    db->db_state == DB_CACHED && db->db_data_pending) {
2901		dbuf_dirty_record_t *dr = db->db_data_pending;
2902		if (dr->dt.dl.dr_data == db->db_buf)
2903			dbuf_hold_copy(dn, db, dr);
2904	}
2905
2906	if (multilist_link_active(&db->db_cache_link)) {
2907		ASSERT(zfs_refcount_is_zero(&db->db_holds));
2908		ASSERT(db->db_caching_status == DB_DBUF_CACHE ||
2909		    db->db_caching_status == DB_DBUF_METADATA_CACHE);
2910
2911		multilist_remove(dbuf_caches[db->db_caching_status].cache, db);
2912		(void) zfs_refcount_remove_many(
2913		    &dbuf_caches[db->db_caching_status].size,
2914		    db->db.db_size, db);
2915
2916		db->db_caching_status = DB_NO_CACHE;
2917	}
2918	(void) zfs_refcount_add(&db->db_holds, tag);
2919	DBUF_VERIFY(db);
2920	mutex_exit(&db->db_mtx);
2921
2922	/* NOTE: we can't rele the parent until after we drop the db_mtx */
2923	if (parent)
2924		dbuf_rele(parent, NULL);
2925
2926	ASSERT3P(DB_DNODE(db), ==, dn);
2927	ASSERT3U(db->db_blkid, ==, blkid);
2928	ASSERT3U(db->db_level, ==, level);
2929	*dbp = db;
2930
2931	return (0);
2932}
2933
2934dmu_buf_impl_t *
2935dbuf_hold(dnode_t *dn, uint64_t blkid, void *tag)
2936{
2937	return (dbuf_hold_level(dn, 0, blkid, tag));
2938}
2939
2940dmu_buf_impl_t *
2941dbuf_hold_level(dnode_t *dn, int level, uint64_t blkid, void *tag)
2942{
2943	dmu_buf_impl_t *db;
2944	int err = dbuf_hold_impl(dn, level, blkid, FALSE, FALSE, tag, &db);
2945	return (err ? NULL : db);
2946}
2947
2948void
2949dbuf_create_bonus(dnode_t *dn)
2950{
2951	ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));
2952
2953	ASSERT(dn->dn_bonus == NULL);
2954	dn->dn_bonus = dbuf_create(dn, 0, DMU_BONUS_BLKID, dn->dn_dbuf, NULL);
2955}
2956
2957int
2958dbuf_spill_set_blksz(dmu_buf_t *db_fake, uint64_t blksz, dmu_tx_t *tx)
2959{
2960	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
2961	dnode_t *dn;
2962
2963	if (db->db_blkid != DMU_SPILL_BLKID)
2964		return (SET_ERROR(ENOTSUP));
2965	if (blksz == 0)
2966		blksz = SPA_MINBLOCKSIZE;
2967	ASSERT3U(blksz, <=, spa_maxblocksize(dmu_objset_spa(db->db_objset)));
2968	blksz = P2ROUNDUP(blksz, SPA_MINBLOCKSIZE);
2969
2970	DB_DNODE_ENTER(db);
2971	dn = DB_DNODE(db);
2972	rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
2973	dbuf_new_size(db, blksz, tx);
2974	rw_exit(&dn->dn_struct_rwlock);
2975	DB_DNODE_EXIT(db);
2976
2977	return (0);
2978}
2979
2980void
2981dbuf_rm_spill(dnode_t *dn, dmu_tx_t *tx)
2982{
2983	dbuf_free_range(dn, DMU_SPILL_BLKID, DMU_SPILL_BLKID, tx);
2984}
2985
2986#pragma weak dmu_buf_add_ref = dbuf_add_ref
2987void
2988dbuf_add_ref(dmu_buf_impl_t *db, void *tag)
2989{
2990	int64_t holds = zfs_refcount_add(&db->db_holds, tag);
2991	ASSERT3S(holds, >, 1);
2992}
2993
2994#pragma weak dmu_buf_try_add_ref = dbuf_try_add_ref
2995boolean_t
2996dbuf_try_add_ref(dmu_buf_t *db_fake, objset_t *os, uint64_t obj, uint64_t blkid,
2997    void *tag)
2998{
2999	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
3000	dmu_buf_impl_t *found_db;
3001	boolean_t result = B_FALSE;
3002
3003	if (db->db_blkid == DMU_BONUS_BLKID)
3004		found_db = dbuf_find_bonus(os, obj);
3005	else
3006		found_db = dbuf_find(os, obj, 0, blkid);
3007
3008	if (found_db != NULL) {
3009		if (db == found_db && dbuf_refcount(db) > db->db_dirtycnt) {
3010			(void) zfs_refcount_add(&db->db_holds, tag);
3011			result = B_TRUE;
3012		}
3013		mutex_exit(&db->db_mtx);
3014	}
3015	return (result);
3016}
3017
3018/*
3019 * If you call dbuf_rele() you had better not be referencing the dnode handle
3020 * unless you have some other direct or indirect hold on the dnode. (An indirect
3021 * hold is a hold on one of the dnode's dbufs, including the bonus buffer.)
3022 * Without that, the dbuf_rele() could lead to a dnode_rele() followed by the
3023 * dnode's parent dbuf evicting its dnode handles.
3024 */
3025void
3026dbuf_rele(dmu_buf_impl_t *db, void *tag)
3027{
3028	mutex_enter(&db->db_mtx);
3029	dbuf_rele_and_unlock(db, tag, B_FALSE);
3030}
3031
3032void
3033dmu_buf_rele(dmu_buf_t *db, void *tag)
3034{
3035	dbuf_rele((dmu_buf_impl_t *)db, tag);
3036}
3037
3038/*
3039 * dbuf_rele() for an already-locked dbuf.  This is necessary to allow
3040 * db_dirtycnt and db_holds to be updated atomically.  The 'evicting'
3041 * argument should be set if we are already in the dbuf-evicting code
3042 * path, in which case we don't want to recursively evict.  This allows us to
3043 * avoid deeply nested stacks that would have a call flow similar to this:
3044 *
3045 * dbuf_rele()-->dbuf_rele_and_unlock()-->dbuf_evict_notify()
3046 *	^						|
3047 *	|						|
3048 *	+-----dbuf_destroy()<--dbuf_evict_one()<--------+
3049 *
3050 */
3051void
3052dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag, boolean_t evicting)
3053{
3054	int64_t holds;
3055
3056	ASSERT(MUTEX_HELD(&db->db_mtx));
3057	DBUF_VERIFY(db);
3058
3059	/*
3060	 * Remove the reference to the dbuf before removing its hold on the
3061	 * dnode so we can guarantee in dnode_move() that a referenced bonus
3062	 * buffer has a corresponding dnode hold.
3063	 */
3064	holds = zfs_refcount_remove(&db->db_holds, tag);
3065	ASSERT(holds >= 0);
3066
3067	/*
3068	 * We can't freeze indirects if there is a possibility that they
3069	 * may be modified in the current syncing context.
3070	 */
3071	if (db->db_buf != NULL &&
3072	    holds == (db->db_level == 0 ? db->db_dirtycnt : 0)) {
3073		arc_buf_freeze(db->db_buf);
3074	}
3075
3076	if (holds == db->db_dirtycnt &&
3077	    db->db_level == 0 && db->db_user_immediate_evict)
3078		dbuf_evict_user(db);
3079
3080	if (holds == 0) {
3081		if (db->db_blkid == DMU_BONUS_BLKID) {
3082			dnode_t *dn;
3083			boolean_t evict_dbuf = db->db_pending_evict;
3084
3085			/*
3086			 * If the dnode moves here, we cannot cross this
3087			 * barrier until the move completes.
3088			 */
3089			DB_DNODE_ENTER(db);
3090
3091			dn = DB_DNODE(db);
3092			atomic_dec_32(&dn->dn_dbufs_count);
3093
3094			/*
3095			 * Decrementing the dbuf count means that the bonus
3096			 * buffer's dnode hold is no longer discounted in
3097			 * dnode_move(). The dnode cannot move until after
3098			 * the dnode_rele() below.
3099			 */
3100			DB_DNODE_EXIT(db);
3101
3102			/*
3103			 * Do not reference db after its lock is dropped.
3104			 * Another thread may evict it.
3105			 */
3106			mutex_exit(&db->db_mtx);
3107
3108			if (evict_dbuf)
3109				dnode_evict_bonus(dn);
3110
3111			dnode_rele(dn, db);
3112		} else if (db->db_buf == NULL) {
3113			/*
3114			 * This is a special case: we never associated this
3115			 * dbuf with any data allocated from the ARC.
3116			 */
3117			ASSERT(db->db_state == DB_UNCACHED ||
3118			    db->db_state == DB_NOFILL);
3119			dbuf_destroy(db);
3120		} else if (arc_released(db->db_buf)) {
3121			/*
3122			 * This dbuf has anonymous data associated with it.
3123			 */
3124			dbuf_destroy(db);
3125		} else {
3126			boolean_t do_arc_evict = B_FALSE;
3127			blkptr_t bp;
3128			spa_t *spa = dmu_objset_spa(db->db_objset);
3129
3130			if (!DBUF_IS_CACHEABLE(db) &&
3131			    db->db_blkptr != NULL &&
3132			    !BP_IS_HOLE(db->db_blkptr) &&
3133			    !BP_IS_EMBEDDED(db->db_blkptr)) {
3134				do_arc_evict = B_TRUE;
3135				bp = *db->db_blkptr;
3136			}
3137
3138			if (!DBUF_IS_CACHEABLE(db) ||
3139			    db->db_pending_evict) {
3140				dbuf_destroy(db);
3141			} else if (!multilist_link_active(&db->db_cache_link)) {
3142				ASSERT3U(db->db_caching_status, ==,
3143				    DB_NO_CACHE);
3144
3145				dbuf_cached_state_t dcs =
3146				    dbuf_include_in_metadata_cache(db) ?
3147				    DB_DBUF_METADATA_CACHE : DB_DBUF_CACHE;
3148				db->db_caching_status = dcs;
3149
3150				multilist_insert(dbuf_caches[dcs].cache, db);
3151				(void) zfs_refcount_add_many(
3152				    &dbuf_caches[dcs].size, db->db.db_size, db);
3153				mutex_exit(&db->db_mtx);
3154
3155				if (db->db_caching_status == DB_DBUF_CACHE &&
3156				    !evicting) {
3157					dbuf_evict_notify();
3158				}
3159			}
3160
3161			if (do_arc_evict)
3162				arc_freed(spa, &bp);
3163		}
3164	} else {
3165		mutex_exit(&db->db_mtx);
3166	}
3167
3168}
3169
3170#pragma weak dmu_buf_refcount = dbuf_refcount
3171uint64_t
3172dbuf_refcount(dmu_buf_impl_t *db)
3173{
3174	return (zfs_refcount_count(&db->db_holds));
3175}
3176
3177uint64_t
3178dmu_buf_user_refcount(dmu_buf_t *db_fake)
3179{
3180	uint64_t holds;
3181	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
3182
3183	mutex_enter(&db->db_mtx);
3184	ASSERT3U(zfs_refcount_count(&db->db_holds), >=, db->db_dirtycnt);
3185	holds = zfs_refcount_count(&db->db_holds) - db->db_dirtycnt;
3186	mutex_exit(&db->db_mtx);
3187
3188	return (holds);
3189}
3190
3191void *
3192dmu_buf_replace_user(dmu_buf_t *db_fake, dmu_buf_user_t *old_user,
3193    dmu_buf_user_t *new_user)
3194{
3195	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
3196
3197	mutex_enter(&db->db_mtx);
3198	dbuf_verify_user(db, DBVU_NOT_EVICTING);
3199	if (db->db_user == old_user)
3200		db->db_user = new_user;
3201	else
3202		old_user = db->db_user;
3203	dbuf_verify_user(db, DBVU_NOT_EVICTING);
3204	mutex_exit(&db->db_mtx);
3205
3206	return (old_user);
3207}
3208
3209void *
3210dmu_buf_set_user(dmu_buf_t *db_fake, dmu_buf_user_t *user)
3211{
3212	return (dmu_buf_replace_user(db_fake, NULL, user));
3213}
3214
3215void *
3216dmu_buf_set_user_ie(dmu_buf_t *db_fake, dmu_buf_user_t *user)
3217{
3218	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
3219
3220	db->db_user_immediate_evict = TRUE;
3221	return (dmu_buf_set_user(db_fake, user));
3222}
3223
3224void *
3225dmu_buf_remove_user(dmu_buf_t *db_fake, dmu_buf_user_t *user)
3226{
3227	return (dmu_buf_replace_user(db_fake, user, NULL));
3228}
3229
3230void *
3231dmu_buf_get_user(dmu_buf_t *db_fake)
3232{
3233	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
3234
3235	dbuf_verify_user(db, DBVU_NOT_EVICTING);
3236	return (db->db_user);
3237}
3238
3239void
3240dmu_buf_user_evict_wait()
3241{
3242	taskq_wait(dbu_evict_taskq);
3243}
3244
3245blkptr_t *
3246dmu_buf_get_blkptr(dmu_buf_t *db)
3247{
3248	dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db;
3249	return (dbi->db_blkptr);
3250}
3251
3252objset_t *
3253dmu_buf_get_objset(dmu_buf_t *db)
3254{
3255	dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db;
3256	return (dbi->db_objset);
3257}
3258
3259dnode_t *
3260dmu_buf_dnode_enter(dmu_buf_t *db)
3261{
3262	dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db;
3263	DB_DNODE_ENTER(dbi);
3264	return (DB_DNODE(dbi));
3265}
3266
3267void
3268dmu_buf_dnode_exit(dmu_buf_t *db)
3269{
3270	dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db;
3271	DB_DNODE_EXIT(dbi);
3272}
3273
3274static void
3275dbuf_check_blkptr(dnode_t *dn, dmu_buf_impl_t *db)
3276{
3277	/* ASSERT(dmu_tx_is_syncing(tx) */
3278	ASSERT(MUTEX_HELD(&db->db_mtx));
3279
3280	if (db->db_blkptr != NULL)
3281		return;
3282
3283	if (db->db_blkid == DMU_SPILL_BLKID) {
3284		db->db_blkptr = DN_SPILL_BLKPTR(dn->dn_phys);
3285		BP_ZERO(db->db_blkptr);
3286		return;
3287	}
3288	if (db->db_level == dn->dn_phys->dn_nlevels-1) {
3289		/*
3290		 * This buffer was allocated at a time when there was
3291		 * no available blkptrs from the dnode, or it was
3292		 * inappropriate to hook it in (i.e., nlevels mis-match).
3293		 */
3294		ASSERT(db->db_blkid < dn->dn_phys->dn_nblkptr);
3295		ASSERT(db->db_parent == NULL);
3296		db->db_parent = dn->dn_dbuf;
3297		db->db_blkptr = &dn->dn_phys->dn_blkptr[db->db_blkid];
3298		DBUF_VERIFY(db);
3299	} else {
3300		dmu_buf_impl_t *parent = db->db_parent;
3301		int epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
3302
3303		ASSERT(dn->dn_phys->dn_nlevels > 1);
3304		if (parent == NULL) {
3305			mutex_exit(&db->db_mtx);
3306			rw_enter(&dn->dn_struct_rwlock, RW_READER);
3307			parent = dbuf_hold_level(dn, db->db_level + 1,
3308			    db->db_blkid >> epbs, db);
3309			rw_exit(&dn->dn_struct_rwlock);
3310			mutex_enter(&db->db_mtx);
3311			db->db_parent = parent;
3312		}
3313		db->db_blkptr = (blkptr_t *)parent->db.db_data +
3314		    (db->db_blkid & ((1ULL << epbs) - 1));
3315		DBUF_VERIFY(db);
3316	}
3317}
3318
3319/*
3320 * When syncing out blocks of dnodes, adjust the block to deal with
3321 * encryption.  Normally, we make sure the block is decrypted before writing
3322 * it.  If we have crypt params, then we are writing a raw (encrypted) block,
3323 * from a raw receive.  In this case, set the ARC buf's crypt params so
3324 * that the BP will be filled with the correct byteorder, salt, iv, and mac.
3325 *
3326 * XXX we should handle decrypting the dnode block in dbuf_dirty().
3327 */
3328static void
3329dbuf_prepare_encrypted_dnode_leaf(dbuf_dirty_record_t *dr)
3330{
3331	int err;
3332	dmu_buf_impl_t *db = dr->dr_dbuf;
3333
3334	ASSERT(MUTEX_HELD(&db->db_mtx));
3335	ASSERT3U(db->db.db_object, ==, DMU_META_DNODE_OBJECT);
3336	ASSERT3U(db->db_level, ==, 0);
3337
3338	if (!db->db_objset->os_raw_receive && arc_is_encrypted(db->db_buf)) {
3339		zbookmark_phys_t zb;
3340
3341		/*
3342		 * Unfortunately, there is currently no mechanism for
3343		 * syncing context to handle decryption errors. An error
3344		 * here is only possible if an attacker maliciously
3345		 * changed a dnode block and updated the associated
3346		 * checksums going up the block tree.
3347		 */
3348		SET_BOOKMARK(&zb, dmu_objset_id(db->db_objset),
3349		    db->db.db_object, db->db_level, db->db_blkid);
3350		err = arc_untransform(db->db_buf, db->db_objset->os_spa,
3351		    &zb, B_TRUE);
3352		if (err)
3353			panic("Invalid dnode block MAC");
3354	} else if (dr->dt.dl.dr_has_raw_params) {
3355		(void) arc_release(dr->dt.dl.dr_data, db);
3356		arc_convert_to_raw(dr->dt.dl.dr_data,
3357		    dmu_objset_id(db->db_objset),
3358		    dr->dt.dl.dr_byteorder, DMU_OT_DNODE,
3359		    dr->dt.dl.dr_salt, dr->dt.dl.dr_iv, dr->dt.dl.dr_mac);
3360	}
3361}
3362
3363static void
3364dbuf_sync_indirect(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
3365{
3366	dmu_buf_impl_t *db = dr->dr_dbuf;
3367	dnode_t *dn;
3368	zio_t *zio;
3369
3370	ASSERT(dmu_tx_is_syncing(tx));
3371
3372	dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr);
3373
3374	mutex_enter(&db->db_mtx);
3375
3376	ASSERT(db->db_level > 0);
3377	DBUF_VERIFY(db);
3378
3379	/* Read the block if it hasn't been read yet. */
3380	if (db->db_buf == NULL) {
3381		mutex_exit(&db->db_mtx);
3382		(void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED);
3383		mutex_enter(&db->db_mtx);
3384	}
3385	ASSERT3U(db->db_state, ==, DB_CACHED);
3386	ASSERT(db->db_buf != NULL);
3387
3388	DB_DNODE_ENTER(db);
3389	dn = DB_DNODE(db);
3390	/* Indirect block size must match what the dnode thinks it is. */
3391	ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
3392	dbuf_check_blkptr(dn, db);
3393	DB_DNODE_EXIT(db);
3394
3395	/* Provide the pending dirty record to child dbufs */
3396	db->db_data_pending = dr;
3397
3398	mutex_exit(&db->db_mtx);
3399
3400	dbuf_write(dr, db->db_buf, tx);
3401
3402	zio = dr->dr_zio;
3403	mutex_enter(&dr->dt.di.dr_mtx);
3404	dbuf_sync_list(&dr->dt.di.dr_children, db->db_level - 1, tx);
3405	ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
3406	mutex_exit(&dr->dt.di.dr_mtx);
3407	zio_nowait(zio);
3408}
3409
3410static void
3411dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
3412{
3413	arc_buf_t **datap = &dr->dt.dl.dr_data;
3414	dmu_buf_impl_t *db = dr->dr_dbuf;
3415	dnode_t *dn;
3416	objset_t *os;
3417	uint64_t txg = tx->tx_txg;
3418
3419	ASSERT(dmu_tx_is_syncing(tx));
3420
3421	dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr);
3422
3423	mutex_enter(&db->db_mtx);
3424	/*
3425	 * To be synced, we must be dirtied.  But we
3426	 * might have been freed after the dirty.
3427	 */
3428	if (db->db_state == DB_UNCACHED) {
3429		/* This buffer has been freed since it was dirtied */
3430		ASSERT(db->db.db_data == NULL);
3431	} else if (db->db_state == DB_FILL) {
3432		/* This buffer was freed and is now being re-filled */
3433		ASSERT(db->db.db_data != dr->dt.dl.dr_data);
3434	} else {
3435		ASSERT(db->db_state == DB_CACHED || db->db_state == DB_NOFILL);
3436	}
3437	DBUF_VERIFY(db);
3438
3439	DB_DNODE_ENTER(db);
3440	dn = DB_DNODE(db);
3441
3442	if (db->db_blkid == DMU_SPILL_BLKID) {
3443		mutex_enter(&dn->dn_mtx);
3444		dn->dn_phys->dn_flags |= DNODE_FLAG_SPILL_BLKPTR;
3445		mutex_exit(&dn->dn_mtx);
3446	}
3447
3448	/*
3449	 * If this is a bonus buffer, simply copy the bonus data into the
3450	 * dnode.  It will be written out when the dnode is synced (and it
3451	 * will be synced, since it must have been dirty for dbuf_sync to
3452	 * be called).
3453	 */
3454	if (db->db_blkid == DMU_BONUS_BLKID) {
3455		dbuf_dirty_record_t **drp;
3456
3457		ASSERT(*datap != NULL);
3458		ASSERT0(db->db_level);
3459		ASSERT3U(DN_MAX_BONUS_LEN(dn->dn_phys), <=,
3460		    DN_SLOTS_TO_BONUSLEN(dn->dn_phys->dn_extra_slots + 1));
3461		bcopy(*datap, DN_BONUS(dn->dn_phys),
3462		    DN_MAX_BONUS_LEN(dn->dn_phys));
3463		DB_DNODE_EXIT(db);
3464
3465		if (*datap != db->db.db_data) {
3466			int slots = DB_DNODE(db)->dn_num_slots;
3467			int bonuslen = DN_SLOTS_TO_BONUSLEN(slots);
3468			zio_buf_free(*datap, bonuslen);
3469			arc_space_return(bonuslen, ARC_SPACE_BONUS);
3470		}
3471		db->db_data_pending = NULL;
3472		drp = &db->db_last_dirty;
3473		while (*drp != dr)
3474			drp = &(*drp)->dr_next;
3475		ASSERT(dr->dr_next == NULL);
3476		ASSERT(dr->dr_dbuf == db);
3477		*drp = dr->dr_next;
3478		kmem_free(dr, sizeof (dbuf_dirty_record_t));
3479		ASSERT(db->db_dirtycnt > 0);
3480		db->db_dirtycnt -= 1;
3481		dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg, B_FALSE);
3482		return;
3483	}
3484
3485	os = dn->dn_objset;
3486
3487	/*
3488	 * This function may have dropped the db_mtx lock allowing a dmu_sync
3489	 * operation to sneak in. As a result, we need to ensure that we
3490	 * don't check the dr_override_state until we have returned from
3491	 * dbuf_check_blkptr.
3492	 */
3493	dbuf_check_blkptr(dn, db);
3494
3495	/*
3496	 * If this buffer is in the middle of an immediate write,
3497	 * wait for the synchronous IO to complete.
3498	 */
3499	while (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC) {
3500		ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT);
3501		cv_wait(&db->db_changed, &db->db_mtx);
3502		ASSERT(dr->dt.dl.dr_override_state != DR_NOT_OVERRIDDEN);
3503	}
3504
3505	/*
3506	 * If this is a dnode block, ensure it is appropriately encrypted
3507	 * or decrypted, depending on what we are writing to it this txg.
3508	 */
3509	if (os->os_encrypted && dn->dn_object == DMU_META_DNODE_OBJECT)
3510		dbuf_prepare_encrypted_dnode_leaf(dr);
3511
3512	if (db->db_state != DB_NOFILL &&
3513	    dn->dn_object != DMU_META_DNODE_OBJECT &&
3514	    zfs_refcount_count(&db->db_holds) > 1 &&
3515	    dr->dt.dl.dr_override_state != DR_OVERRIDDEN &&
3516	    *datap == db->db_buf) {
3517		/*
3518		 * If this buffer is currently "in use" (i.e., there
3519		 * are active holds and db_data still references it),
3520		 * then make a copy before we start the write so that
3521		 * any modifications from the open txg will not leak
3522		 * into this write.
3523		 *
3524		 * NOTE: this copy does not need to be made for
3525		 * objects only modified in the syncing context (e.g.
3526		 * DNONE_DNODE blocks).
3527		 */
3528		int psize = arc_buf_size(*datap);
3529		int lsize = arc_buf_lsize(*datap);
3530		arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
3531		enum zio_compress compress_type = arc_get_compression(*datap);
3532
3533		if (arc_is_encrypted(*datap)) {
3534			boolean_t byteorder;
3535			uint8_t salt[ZIO_DATA_SALT_LEN];
3536			uint8_t iv[ZIO_DATA_IV_LEN];
3537			uint8_t mac[ZIO_DATA_MAC_LEN];
3538
3539			arc_get_raw_params(*datap, &byteorder, salt, iv, mac);
3540			*datap = arc_alloc_raw_buf(os->os_spa, db,
3541			    dmu_objset_id(os), byteorder, salt, iv, mac,
3542			    dn->dn_type, psize, lsize, compress_type);
3543		} else if (compress_type != ZIO_COMPRESS_OFF) {
3544			ASSERT3U(type, ==, ARC_BUFC_DATA);
3545			*datap = arc_alloc_compressed_buf(os->os_spa, db,
3546			    psize, lsize, compress_type);
3547		} else {
3548			*datap = arc_alloc_buf(os->os_spa, db, type, psize);
3549		}
3550		bcopy(db->db.db_data, (*datap)->b_data, psize);
3551	}
3552	db->db_data_pending = dr;
3553
3554	mutex_exit(&db->db_mtx);
3555
3556	dbuf_write(dr, *datap, tx);
3557
3558	ASSERT(!list_link_active(&dr->dr_dirty_node));
3559	if (dn->dn_object == DMU_META_DNODE_OBJECT) {
3560		list_insert_tail(&dn->dn_dirty_records[txg&TXG_MASK], dr);
3561		DB_DNODE_EXIT(db);
3562	} else {
3563		/*
3564		 * Although zio_nowait() does not "wait for an IO", it does
3565		 * initiate the IO. If this is an empty write it seems plausible
3566		 * that the IO could actually be completed before the nowait
3567		 * returns. We need to DB_DNODE_EXIT() first in case
3568		 * zio_nowait() invalidates the dbuf.
3569		 */
3570		DB_DNODE_EXIT(db);
3571		zio_nowait(dr->dr_zio);
3572	}
3573}
3574
3575void
3576dbuf_sync_list(list_t *list, int level, dmu_tx_t *tx)
3577{
3578	dbuf_dirty_record_t *dr;
3579
3580	while (dr = list_head(list)) {
3581		if (dr->dr_zio != NULL) {
3582			/*
3583			 * If we find an already initialized zio then we
3584			 * are processing the meta-dnode, and we have finished.
3585			 * The dbufs for all dnodes are put back on the list
3586			 * during processing, so that we can zio_wait()
3587			 * these IOs after initiating all child IOs.
3588			 */
3589			ASSERT3U(dr->dr_dbuf->db.db_object, ==,
3590			    DMU_META_DNODE_OBJECT);
3591			break;
3592		}
3593		if (dr->dr_dbuf->db_blkid != DMU_BONUS_BLKID &&
3594		    dr->dr_dbuf->db_blkid != DMU_SPILL_BLKID) {
3595			VERIFY3U(dr->dr_dbuf->db_level, ==, level);
3596		}
3597		list_remove(list, dr);
3598		if (dr->dr_dbuf->db_level > 0)
3599			dbuf_sync_indirect(dr, tx);
3600		else
3601			dbuf_sync_leaf(dr, tx);
3602	}
3603}
3604
3605/* ARGSUSED */
3606static void
3607dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
3608{
3609	dmu_buf_impl_t *db = vdb;
3610	dnode_t *dn;
3611	blkptr_t *bp = zio->io_bp;
3612	blkptr_t *bp_orig = &zio->io_bp_orig;
3613	spa_t *spa = zio->io_spa;
3614	int64_t delta;
3615	uint64_t fill = 0;
3616	int i;
3617
3618	ASSERT3P(db->db_blkptr, !=, NULL);
3619	ASSERT3P(&db->db_data_pending->dr_bp_copy, ==, bp);
3620
3621	DB_DNODE_ENTER(db);
3622	dn = DB_DNODE(db);
3623	delta = bp_get_dsize_sync(spa, bp) - bp_get_dsize_sync(spa, bp_orig);
3624	dnode_diduse_space(dn, delta - zio->io_prev_space_delta);
3625	zio->io_prev_space_delta = delta;
3626
3627	if (bp->blk_birth != 0) {
3628		ASSERT((db->db_blkid != DMU_SPILL_BLKID &&
3629		    BP_GET_TYPE(bp) == dn->dn_type) ||
3630		    (db->db_blkid == DMU_SPILL_BLKID &&
3631		    BP_GET_TYPE(bp) == dn->dn_bonustype) ||
3632		    BP_IS_EMBEDDED(bp));
3633		ASSERT(BP_GET_LEVEL(bp) == db->db_level);
3634	}
3635
3636	mutex_enter(&db->db_mtx);
3637
3638#ifdef ZFS_DEBUG
3639	if (db->db_blkid == DMU_SPILL_BLKID) {
3640		ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR);
3641		ASSERT(!(BP_IS_HOLE(bp)) &&
3642		    db->db_blkptr == DN_SPILL_BLKPTR(dn->dn_phys));
3643	}
3644#endif
3645
3646	if (db->db_level == 0) {
3647		mutex_enter(&dn->dn_mtx);
3648		if (db->db_blkid > dn->dn_phys->dn_maxblkid &&
3649		    db->db_blkid != DMU_SPILL_BLKID) {
3650			ASSERT0(db->db_objset->os_raw_receive);
3651			dn->dn_phys->dn_maxblkid = db->db_blkid;
3652		}
3653		mutex_exit(&dn->dn_mtx);
3654
3655		if (dn->dn_type == DMU_OT_DNODE) {
3656			i = 0;
3657			while (i < db->db.db_size) {
3658				dnode_phys_t *dnp =
3659				    (void *)(((char *)db->db.db_data) + i);
3660
3661				i += DNODE_MIN_SIZE;
3662				if (dnp->dn_type != DMU_OT_NONE) {
3663					fill++;
3664					i += dnp->dn_extra_slots *
3665					    DNODE_MIN_SIZE;
3666				}
3667			}
3668		} else {
3669			if (BP_IS_HOLE(bp)) {
3670				fill = 0;
3671			} else {
3672				fill = 1;
3673			}
3674		}
3675	} else {
3676		blkptr_t *ibp = db->db.db_data;
3677		ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
3678		for (i = db->db.db_size >> SPA_BLKPTRSHIFT; i > 0; i--, ibp++) {
3679			if (BP_IS_HOLE(ibp))
3680				continue;
3681			fill += BP_GET_FILL(ibp);
3682		}
3683	}
3684	DB_DNODE_EXIT(db);
3685
3686	if (!BP_IS_EMBEDDED(bp))
3687		BP_SET_FILL(bp, fill);
3688
3689	mutex_exit(&db->db_mtx);
3690
3691	rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
3692	*db->db_blkptr = *bp;
3693	rw_exit(&dn->dn_struct_rwlock);
3694}
3695
3696/* ARGSUSED */
3697/*
3698 * This function gets called just prior to running through the compression
3699 * stage of the zio pipeline. If we're an indirect block comprised of only
3700 * holes, then we want this indirect to be compressed away to a hole. In
3701 * order to do that we must zero out any information about the holes that
3702 * this indirect points to prior to before we try to compress it.
3703 */
3704static void
3705dbuf_write_children_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
3706{
3707	dmu_buf_impl_t *db = vdb;
3708	dnode_t *dn;
3709	blkptr_t *bp;
3710	unsigned int epbs, i;
3711
3712	ASSERT3U(db->db_level, >, 0);
3713	DB_DNODE_ENTER(db);
3714	dn = DB_DNODE(db);
3715	epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
3716	ASSERT3U(epbs, <, 31);
3717
3718	/* Determine if all our children are holes */
3719	for (i = 0, bp = db->db.db_data; i < 1 << epbs; i++, bp++) {
3720		if (!BP_IS_HOLE(bp))
3721			break;
3722	}
3723
3724	/*
3725	 * If all the children are holes, then zero them all out so that
3726	 * we may get compressed away.
3727	 */
3728	if (i == 1 << epbs) {
3729		/*
3730		 * We only found holes. Grab the rwlock to prevent
3731		 * anybody from reading the blocks we're about to
3732		 * zero out.
3733		 */
3734		rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
3735		bzero(db->db.db_data, db->db.db_size);
3736		rw_exit(&dn->dn_struct_rwlock);
3737	}
3738	DB_DNODE_EXIT(db);
3739}
3740
3741/*
3742 * The SPA will call this callback several times for each zio - once
3743 * for every physical child i/o (zio->io_phys_children times).  This
3744 * allows the DMU to monitor the progress of each logical i/o.  For example,
3745 * there may be 2 copies of an indirect block, or many fragments of a RAID-Z
3746 * block.  There may be a long delay before all copies/fragments are completed,
3747 * so this callback allows us to retire dirty space gradually, as the physical
3748 * i/os complete.
3749 */
3750/* ARGSUSED */
3751static void
3752dbuf_write_physdone(zio_t *zio, arc_buf_t *buf, void *arg)
3753{
3754	dmu_buf_impl_t *db = arg;
3755	objset_t *os = db->db_objset;
3756	dsl_pool_t *dp = dmu_objset_pool(os);
3757	dbuf_dirty_record_t *dr;
3758	int delta = 0;
3759
3760	dr = db->db_data_pending;
3761	ASSERT3U(dr->dr_txg, ==, zio->io_txg);
3762
3763	/*
3764	 * The callback will be called io_phys_children times.  Retire one
3765	 * portion of our dirty space each time we are called.  Any rounding
3766	 * error will be cleaned up by dsl_pool_sync()'s call to
3767	 * dsl_pool_undirty_space().
3768	 */
3769	delta = dr->dr_accounted / zio->io_phys_children;
3770	dsl_pool_undirty_space(dp, delta, zio->io_txg);
3771}
3772
3773/* ARGSUSED */
3774static void
3775dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
3776{
3777	dmu_buf_impl_t *db = vdb;
3778	blkptr_t *bp_orig = &zio->io_bp_orig;
3779	blkptr_t *bp = db->db_blkptr;
3780	objset_t *os = db->db_objset;
3781	dmu_tx_t *tx = os->os_synctx;
3782	dbuf_dirty_record_t **drp, *dr;
3783
3784	ASSERT0(zio->io_error);
3785	ASSERT(db->db_blkptr == bp);
3786
3787	/*
3788	 * For nopwrites and rewrites we ensure that the bp matches our
3789	 * original and bypass all the accounting.
3790	 */
3791	if (zio->io_flags & (ZIO_FLAG_IO_REWRITE | ZIO_FLAG_NOPWRITE)) {
3792		ASSERT(BP_EQUAL(bp, bp_orig));
3793	} else {
3794		dsl_dataset_t *ds = os->os_dsl_dataset;
3795		(void) dsl_dataset_block_kill(ds, bp_orig, tx, B_TRUE);
3796		dsl_dataset_block_born(ds, bp, tx);
3797	}
3798
3799	mutex_enter(&db->db_mtx);
3800
3801	DBUF_VERIFY(db);
3802
3803	drp = &db->db_last_dirty;
3804	while ((dr = *drp) != db->db_data_pending)
3805		drp = &dr->dr_next;
3806	ASSERT(!list_link_active(&dr->dr_dirty_node));
3807	ASSERT(dr->dr_dbuf == db);
3808	ASSERT(dr->dr_next == NULL);
3809	*drp = dr->dr_next;
3810
3811#ifdef ZFS_DEBUG
3812	if (db->db_blkid == DMU_SPILL_BLKID) {
3813		dnode_t *dn;
3814
3815		DB_DNODE_ENTER(db);
3816		dn = DB_DNODE(db);
3817		ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR);
3818		ASSERT(!(BP_IS_HOLE(db->db_blkptr)) &&
3819		    db->db_blkptr == DN_SPILL_BLKPTR(dn->dn_phys));
3820		DB_DNODE_EXIT(db);
3821	}
3822#endif
3823
3824	if (db->db_level == 0) {
3825		ASSERT(db->db_blkid != DMU_BONUS_BLKID);
3826		ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN);
3827		if (db->db_state != DB_NOFILL) {
3828			if (dr->dt.dl.dr_data != db->db_buf)
3829				arc_buf_destroy(dr->dt.dl.dr_data, db);
3830		}
3831	} else {
3832		dnode_t *dn;
3833
3834		DB_DNODE_ENTER(db);
3835		dn = DB_DNODE(db);
3836		ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
3837		ASSERT3U(db->db.db_size, ==, 1 << dn->dn_phys->dn_indblkshift);
3838		if (!BP_IS_HOLE(db->db_blkptr)) {
3839			int epbs =
3840			    dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
3841			ASSERT3U(db->db_blkid, <=,
3842			    dn->dn_phys->dn_maxblkid >> (db->db_level * epbs));
3843			ASSERT3U(BP_GET_LSIZE(db->db_blkptr), ==,
3844			    db->db.db_size);
3845		}
3846		DB_DNODE_EXIT(db);
3847		mutex_destroy(&dr->dt.di.dr_mtx);
3848		list_destroy(&dr->dt.di.dr_children);
3849	}
3850	kmem_free(dr, sizeof (dbuf_dirty_record_t));
3851
3852	cv_broadcast(&db->db_changed);
3853	ASSERT(db->db_dirtycnt > 0);
3854	db->db_dirtycnt -= 1;
3855	db->db_data_pending = NULL;
3856	dbuf_rele_and_unlock(db, (void *)(uintptr_t)tx->tx_txg, B_FALSE);
3857}
3858
3859static void
3860dbuf_write_nofill_ready(zio_t *zio)
3861{
3862	dbuf_write_ready(zio, NULL, zio->io_private);
3863}
3864
3865static void
3866dbuf_write_nofill_done(zio_t *zio)
3867{
3868	dbuf_write_done(zio, NULL, zio->io_private);
3869}
3870
3871static void
3872dbuf_write_override_ready(zio_t *zio)
3873{
3874	dbuf_dirty_record_t *dr = zio->io_private;
3875	dmu_buf_impl_t *db = dr->dr_dbuf;
3876
3877	dbuf_write_ready(zio, NULL, db);
3878}
3879
3880static void
3881dbuf_write_override_done(zio_t *zio)
3882{
3883	dbuf_dirty_record_t *dr = zio->io_private;
3884	dmu_buf_impl_t *db = dr->dr_dbuf;
3885	blkptr_t *obp = &dr->dt.dl.dr_overridden_by;
3886
3887	mutex_enter(&db->db_mtx);
3888	if (!BP_EQUAL(zio->io_bp, obp)) {
3889		if (!BP_IS_HOLE(obp))
3890			dsl_free(spa_get_dsl(zio->io_spa), zio->io_txg, obp);
3891		arc_release(dr->dt.dl.dr_data, db);
3892	}
3893	mutex_exit(&db->db_mtx);
3894	dbuf_write_done(zio, NULL, db);
3895
3896	if (zio->io_abd != NULL)
3897		abd_put(zio->io_abd);
3898}
3899
3900typedef struct dbuf_remap_impl_callback_arg {
3901	objset_t	*drica_os;
3902	uint64_t	drica_blk_birth;
3903	dmu_tx_t	*drica_tx;
3904} dbuf_remap_impl_callback_arg_t;
3905
3906static void
3907dbuf_remap_impl_callback(uint64_t vdev, uint64_t offset, uint64_t size,
3908    void *arg)
3909{
3910	dbuf_remap_impl_callback_arg_t *drica = arg;
3911	objset_t *os = drica->drica_os;
3912	spa_t *spa = dmu_objset_spa(os);
3913	dmu_tx_t *tx = drica->drica_tx;
3914
3915	ASSERT(dsl_pool_sync_context(spa_get_dsl(spa)));
3916
3917	if (os == spa_meta_objset(spa)) {
3918		spa_vdev_indirect_mark_obsolete(spa, vdev, offset, size, tx);
3919	} else {
3920		dsl_dataset_block_remapped(dmu_objset_ds(os), vdev, offset,
3921		    size, drica->drica_blk_birth, tx);
3922	}
3923}
3924
3925static void
3926dbuf_remap_impl(dnode_t *dn, blkptr_t *bp, dmu_tx_t *tx)
3927{
3928	blkptr_t bp_copy = *bp;
3929	spa_t *spa = dmu_objset_spa(dn->dn_objset);
3930	dbuf_remap_impl_callback_arg_t drica;
3931
3932	ASSERT(dsl_pool_sync_context(spa_get_dsl(spa)));
3933
3934	drica.drica_os = dn->dn_objset;
3935	drica.drica_blk_birth = bp->blk_birth;
3936	drica.drica_tx = tx;
3937	if (spa_remap_blkptr(spa, &bp_copy, dbuf_remap_impl_callback,
3938	    &drica)) {
3939		/*
3940		 * The struct_rwlock prevents dbuf_read_impl() from
3941		 * dereferencing the BP while we are changing it.  To
3942		 * avoid lock contention, only grab it when we are actually
3943		 * changing the BP.
3944		 */
3945		rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
3946		*bp = bp_copy;
3947		rw_exit(&dn->dn_struct_rwlock);
3948	}
3949}
3950
3951/*
3952 * Returns true if a dbuf_remap would modify the dbuf. We do this by attempting
3953 * to remap a copy of every bp in the dbuf.
3954 */
3955boolean_t
3956dbuf_can_remap(const dmu_buf_impl_t *db)
3957{
3958	spa_t *spa = dmu_objset_spa(db->db_objset);
3959	blkptr_t *bp = db->db.db_data;
3960	boolean_t ret = B_FALSE;
3961
3962	ASSERT3U(db->db_level, >, 0);
3963	ASSERT3S(db->db_state, ==, DB_CACHED);
3964
3965	ASSERT(spa_feature_is_active(spa, SPA_FEATURE_DEVICE_REMOVAL));
3966
3967	spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
3968	for (int i = 0; i < db->db.db_size >> SPA_BLKPTRSHIFT; i++) {
3969		blkptr_t bp_copy = bp[i];
3970		if (spa_remap_blkptr(spa, &bp_copy, NULL, NULL)) {
3971			ret = B_TRUE;
3972			break;
3973		}
3974	}
3975	spa_config_exit(spa, SCL_VDEV, FTAG);
3976
3977	return (ret);
3978}
3979
3980boolean_t
3981dnode_needs_remap(const dnode_t *dn)
3982{
3983	spa_t *spa = dmu_objset_spa(dn->dn_objset);
3984	boolean_t ret = B_FALSE;
3985
3986	if (dn->dn_phys->dn_nlevels == 0) {
3987		return (B_FALSE);
3988	}
3989
3990	ASSERT(spa_feature_is_active(spa, SPA_FEATURE_DEVICE_REMOVAL));
3991
3992	spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
3993	for (int j = 0; j < dn->dn_phys->dn_nblkptr; j++) {
3994		blkptr_t bp_copy = dn->dn_phys->dn_blkptr[j];
3995		if (spa_remap_blkptr(spa, &bp_copy, NULL, NULL)) {
3996			ret = B_TRUE;
3997			break;
3998		}
3999	}
4000	spa_config_exit(spa, SCL_VDEV, FTAG);
4001
4002	return (ret);
4003}
4004
4005/*
4006 * Remap any existing BP's to concrete vdevs, if possible.
4007 */
4008static void
4009dbuf_remap(dnode_t *dn, dmu_buf_impl_t *db, dmu_tx_t *tx)
4010{
4011	spa_t *spa = dmu_objset_spa(db->db_objset);
4012	ASSERT(dsl_pool_sync_context(spa_get_dsl(spa)));
4013
4014	if (!spa_feature_is_active(spa, SPA_FEATURE_DEVICE_REMOVAL))
4015		return;
4016
4017	if (db->db_level > 0) {
4018		blkptr_t *bp = db->db.db_data;
4019		for (int i = 0; i < db->db.db_size >> SPA_BLKPTRSHIFT; i++) {
4020			dbuf_remap_impl(dn, &bp[i], tx);
4021		}
4022	} else if (db->db.db_object == DMU_META_DNODE_OBJECT) {
4023		dnode_phys_t *dnp = db->db.db_data;
4024		ASSERT3U(db->db_dnode_handle->dnh_dnode->dn_type, ==,
4025		    DMU_OT_DNODE);
4026		for (int i = 0; i < db->db.db_size >> DNODE_SHIFT; i++) {
4027			for (int j = 0; j < dnp[i].dn_nblkptr; j++) {
4028				dbuf_remap_impl(dn, &dnp[i].dn_blkptr[j], tx);
4029			}
4030		}
4031	}
4032}
4033
4034
4035/* Issue I/O to commit a dirty buffer to disk. */
4036static void
4037dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
4038{
4039	dmu_buf_impl_t *db = dr->dr_dbuf;
4040	dnode_t *dn;
4041	objset_t *os;
4042	dmu_buf_impl_t *parent = db->db_parent;
4043	uint64_t txg = tx->tx_txg;
4044	zbookmark_phys_t zb;
4045	zio_prop_t zp;
4046	zio_t *zio;
4047	int wp_flag = 0;
4048
4049	ASSERT(dmu_tx_is_syncing(tx));
4050
4051	DB_DNODE_ENTER(db);
4052	dn = DB_DNODE(db);
4053	os = dn->dn_objset;
4054
4055	if (db->db_state != DB_NOFILL) {
4056		if (db->db_level > 0 || dn->dn_type == DMU_OT_DNODE) {
4057			/*
4058			 * Private object buffers are released here rather
4059			 * than in dbuf_dirty() since they are only modified
4060			 * in the syncing context and we don't want the
4061			 * overhead of making multiple copies of the data.
4062			 */
4063			if (BP_IS_HOLE(db->db_blkptr)) {
4064				arc_buf_thaw(data);
4065			} else {
4066				dbuf_release_bp(db);
4067			}
4068			dbuf_remap(dn, db, tx);
4069		}
4070	}
4071
4072	if (parent != dn->dn_dbuf) {
4073		/* Our parent is an indirect block. */
4074		/* We have a dirty parent that has been scheduled for write. */
4075		ASSERT(parent && parent->db_data_pending);
4076		/* Our parent's buffer is one level closer to the dnode. */
4077		ASSERT(db->db_level == parent->db_level-1);
4078		/*
4079		 * We're about to modify our parent's db_data by modifying
4080		 * our block pointer, so the parent must be released.
4081		 */
4082		ASSERT(arc_released(parent->db_buf));
4083		zio = parent->db_data_pending->dr_zio;
4084	} else {
4085		/* Our parent is the dnode itself. */
4086		ASSERT((db->db_level == dn->dn_phys->dn_nlevels-1 &&
4087		    db->db_blkid != DMU_SPILL_BLKID) ||
4088		    (db->db_blkid == DMU_SPILL_BLKID && db->db_level == 0));
4089		if (db->db_blkid != DMU_SPILL_BLKID)
4090			ASSERT3P(db->db_blkptr, ==,
4091			    &dn->dn_phys->dn_blkptr[db->db_blkid]);
4092		zio = dn->dn_zio;
4093	}
4094
4095	ASSERT(db->db_level == 0 || data == db->db_buf);
4096	ASSERT3U(db->db_blkptr->blk_birth, <=, txg);
4097	ASSERT(zio);
4098
4099	SET_BOOKMARK(&zb, os->os_dsl_dataset ?
4100	    os->os_dsl_dataset->ds_object : DMU_META_OBJSET,
4101	    db->db.db_object, db->db_level, db->db_blkid);
4102
4103	if (db->db_blkid == DMU_SPILL_BLKID)
4104		wp_flag = WP_SPILL;
4105	wp_flag |= (db->db_state == DB_NOFILL) ? WP_NOFILL : 0;
4106
4107	dmu_write_policy(os, dn, db->db_level, wp_flag, &zp);
4108
4109	DB_DNODE_EXIT(db);
4110
4111	/*
4112	 * We copy the blkptr now (rather than when we instantiate the dirty
4113	 * record), because its value can change between open context and
4114	 * syncing context. We do not need to hold dn_struct_rwlock to read
4115	 * db_blkptr because we are in syncing context.
4116	 */
4117	dr->dr_bp_copy = *db->db_blkptr;
4118
4119	if (db->db_level == 0 &&
4120	    dr->dt.dl.dr_override_state == DR_OVERRIDDEN) {
4121		/*
4122		 * The BP for this block has been provided by open context
4123		 * (by dmu_sync() or dmu_buf_write_embedded()).
4124		 */
4125		abd_t *contents = (data != NULL) ?
4126		    abd_get_from_buf(data->b_data, arc_buf_size(data)) : NULL;
4127
4128		dr->dr_zio = zio_write(zio, os->os_spa, txg, &dr->dr_bp_copy,
4129		    contents, db->db.db_size, db->db.db_size, &zp,
4130		    dbuf_write_override_ready, NULL, NULL,
4131		    dbuf_write_override_done,
4132		    dr, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
4133		mutex_enter(&db->db_mtx);
4134		dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
4135		zio_write_override(dr->dr_zio, &dr->dt.dl.dr_overridden_by,
4136		    dr->dt.dl.dr_copies, dr->dt.dl.dr_nopwrite);
4137		mutex_exit(&db->db_mtx);
4138	} else if (db->db_state == DB_NOFILL) {
4139		ASSERT(zp.zp_checksum == ZIO_CHECKSUM_OFF ||
4140		    zp.zp_checksum == ZIO_CHECKSUM_NOPARITY);
4141		dr->dr_zio = zio_write(zio, os->os_spa, txg,
4142		    &dr->dr_bp_copy, NULL, db->db.db_size, db->db.db_size, &zp,
4143		    dbuf_write_nofill_ready, NULL, NULL,
4144		    dbuf_write_nofill_done, db,
4145		    ZIO_PRIORITY_ASYNC_WRITE,
4146		    ZIO_FLAG_MUSTSUCCEED | ZIO_FLAG_NODATA, &zb);
4147	} else {
4148		ASSERT(arc_released(data));
4149
4150		/*
4151		 * For indirect blocks, we want to setup the children
4152		 * ready callback so that we can properly handle an indirect
4153		 * block that only contains holes.
4154		 */
4155		arc_write_done_func_t *children_ready_cb = NULL;
4156		if (db->db_level != 0)
4157			children_ready_cb = dbuf_write_children_ready;
4158
4159		dr->dr_zio = arc_write(zio, os->os_spa, txg,
4160		    &dr->dr_bp_copy, data, DBUF_IS_L2CACHEABLE(db),
4161		    &zp, dbuf_write_ready, children_ready_cb,
4162		    dbuf_write_physdone, dbuf_write_done, db,
4163		    ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
4164	}
4165}
4166