2fa9e406ahrens * CDDL HEADER START
3fa9e406ahrens *
4fa9e406ahrens * The contents of this file are subject to the terms of the
5f65e61cahrens * Common Development and Distribution License (the "License").
6f65e61cahrens * You may not use this file except in compliance with the License.
7fa9e406ahrens *
8fa9e406ahrens * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9fa9e406ahrens * or http://www.opensolaris.org/os/licensing.
10fa9e406ahrens * See the License for the specific language governing permissions
11fa9e406ahrens * and limitations under the License.
12fa9e406ahrens *
13fa9e406ahrens * When distributing Covered Code, include this CDDL HEADER in each
14fa9e406ahrens * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15fa9e406ahrens * If applicable, add the following below this CDDL HEADER, with the
16fa9e406ahrens * fields enclosed by brackets "[]" replaced with your own identifying
17fa9e406ahrens * information: Portions Copyright [yyyy] [name of copyright owner]
18fa9e406ahrens *
19fa9e406ahrens * CDDL HEADER END
20fa9e406ahrens */
2206e0070Mark Shellenbaum * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
233f2366cGordon Ross * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
24fa98e48Matthew Ahrens * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
25aad0257Saso Kiselkov * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
26810e43bBill Pijewski * Copyright (c) 2013, Joyent, Inc. All rights reserved.
27bc9014eJustin Gibbs * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
28c3d26abMatthew Ahrens * Copyright (c) 2014 Integros [integros.com]
29fa9e406ahrens */
31fa9e406ahrens#include <sys/zfs_context.h>
32fa9e406ahrens#include <sys/dmu.h>
332f3d878Matthew Ahrens#include <sys/dmu_send.h>
34fa9e406ahrens#include <sys/dmu_impl.h>
35fa9e406ahrens#include <sys/dbuf.h>
36fa9e406ahrens#include <sys/dmu_objset.h>
37fa9e406ahrens#include <sys/dsl_dataset.h>
38fa9e406ahrens#include <sys/dsl_dir.h>
39fa9e406ahrens#include <sys/dmu_tx.h>
40fa9e406ahrens#include <sys/spa.h>
41fa9e406ahrens#include <sys/zio.h>
42fa9e406ahrens#include <sys/dmu_zfetch.h>
430a586ceMark Shellenbaum#include <sys/sa.h>
440a586ceMark Shellenbaum#include <sys/sa_impl.h>
455d7b4d4Matthew Ahrens#include <sys/zfeature.h>
465d7b4d4Matthew Ahrens#include <sys/blkptr.h>
47bf16b11Matthew Ahrens#include <sys/range_tree.h>
48dcbf3bdGeorge Wilson#include <sys/callb.h>
49770499eDan Kimmel#include <sys/abd.h>
505cabbc6Prashanth Sreenivasa#include <sys/vdev.h>
513a2d8a1Paul Dagnelie#include <sys/cityhash.h>
52adb52d9Matthew Ahrens#include <sys/spa_impl.h>
53dcbf3bdGeorge Wilson
543b2aab1Matthew Ahrensstatic boolean_t dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
55088f389ahrensstatic void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx);
57bc9014eJustin Gibbs#ifndef __lint
58bc9014eJustin Gibbsextern inline void dmu_buf_init_user(dmu_buf_user_t *dbu,
5940510e8Josef 'Jeff' Sipek    dmu_buf_evict_func_t *evict_func_sync,
6040510e8Josef 'Jeff' Sipek    dmu_buf_evict_func_t *evict_func_async,
6140510e8Josef 'Jeff' Sipek    dmu_buf_t **clear_on_evict_dbufp);
62bc9014eJustin Gibbs#endif /* ! __lint */
63bc9014eJustin Gibbs
65fa9e406ahrens * Global data structures and functions for the dbuf cache.
66fa9e406ahrens */
67dcbf3bdGeorge Wilsonstatic kmem_cache_t *dbuf_kmem_cache;
68bc9014eJustin Gibbsstatic taskq_t *dbu_evict_taskq;
70dcbf3bdGeorge Wilsonstatic kthread_t *dbuf_cache_evict_thread;
71dcbf3bdGeorge Wilsonstatic kmutex_t dbuf_evict_lock;
72dcbf3bdGeorge Wilsonstatic kcondvar_t dbuf_evict_cv;
73dcbf3bdGeorge Wilsonstatic boolean_t dbuf_evict_thread_exit;
74dcbf3bdGeorge Wilson
75dcbf3bdGeorge Wilson/*
76adb52d9Matthew Ahrens * There are two dbuf caches; each dbuf can only be in one of them at a time.
77adb52d9Matthew Ahrens *
78adb52d9Matthew Ahrens * 1. Cache of metadata dbufs, to help make read-heavy administrative commands
79adb52d9Matthew Ahrens *    from /sbin/zfs run faster. The "metadata cache" specifically stores dbufs
80adb52d9Matthew Ahrens *    that represent the metadata that describes filesystems/snapshots/
81adb52d9Matthew Ahrens *    bookmarks/properties/etc. We only evict from this cache when we export a
82adb52d9Matthew Ahrens *    pool, to short-circuit as much I/O as possible for all administrative
83adb52d9Matthew Ahrens *    commands that need the metadata. There is no eviction policy for this
84adb52d9Matthew Ahrens *    cache, because we try to only include types in it which would occupy a
85adb52d9Matthew Ahrens *    very small amount of space per object but create a large impact on the
86adb52d9Matthew Ahrens *    performance of these commands. Instead, after it reaches a maximum size
87adb52d9Matthew Ahrens *    (which should only happen on very small memory systems with a very large
88adb52d9Matthew Ahrens *    number of filesystem objects), we stop taking new dbufs into the
89adb52d9Matthew Ahrens *    metadata cache, instead putting them in the normal dbuf cache.
90adb52d9Matthew Ahrens *
91adb52d9Matthew Ahrens * 2. LRU cache of dbufs. The "dbuf cache" maintains a list of dbufs that
92adb52d9Matthew Ahrens *    are not currently held but have been recently released. These dbufs
93adb52d9Matthew Ahrens *    are not eligible for arc eviction until they are aged out of the cache.
94adb52d9Matthew Ahrens *    Dbufs that are aged out of the cache will be immediately destroyed and
95adb52d9Matthew Ahrens *    become eligible for arc eviction.
96adb52d9Matthew Ahrens *
97adb52d9Matthew Ahrens * Dbufs are added to these caches once the last hold is released. If a dbuf is
98adb52d9Matthew Ahrens * later accessed and still exists in the dbuf cache, then it will be removed
99adb52d9Matthew Ahrens * from the cache and later re-added to the head of the cache.
100adb52d9Matthew Ahrens *
101adb52d9Matthew Ahrens * If a given dbuf meets the requirements for the metadata cache, it will go
102adb52d9Matthew Ahrens * there, otherwise it will be considered for the generic LRU dbuf cache. The
103adb52d9Matthew Ahrens * caches and the refcounts tracking their sizes are stored in an array indexed
104adb52d9Matthew Ahrens * by those caches' matching enum values (from dbuf_cached_state_t).
105dcbf3bdGeorge Wilson */
106adb52d9Matthew Ahrenstypedef struct dbuf_cache {
107adb52d9Matthew Ahrens	multilist_t *cache;
108e914aceTim Schumacher	zfs_refcount_t size;
109adb52d9Matthew Ahrens} dbuf_cache_t;
110adb52d9Matthew Ahrensdbuf_cache_t dbuf_caches[DB_CACHE_MAX];
111dcbf3bdGeorge Wilson
/*
 * Size limits for the caches.  A value of 0 means "not set"; dbuf_init()
 * then derives the limit from the ARC size using the shifts below.
 */
uint64_t dbuf_cache_max_bytes = 0;
uint64_t dbuf_metadata_cache_max_bytes = 0;
/* Set the default sizes of the caches to log2 fraction of arc size */
int dbuf_cache_shift = 5;
int dbuf_metadata_cache_shift = 6;
118dcbf3bdGeorge Wilson
/*
 * For diagnostic purposes, this is incremented whenever we can't add
 * something to the metadata cache because it's full, and instead put
 * the data in the regular dbuf cache.
 */
uint64_t dbuf_metadata_cache_overflow;
125adb52d9Matthew Ahrens
126adb52d9Matthew Ahrens/*
127adb52d9Matthew Ahrens * The LRU dbuf cache uses a three-stage eviction policy:
128dcbf3bdGeorge Wilson *	- A low water marker designates when the dbuf eviction thread
129dcbf3bdGeorge Wilson *	should stop evicting from the dbuf cache.
130dcbf3bdGeorge Wilson *	- When we reach the maximum size (aka mid water mark), we
131dcbf3bdGeorge Wilson *	signal the eviction thread to run.
132dcbf3bdGeorge Wilson *	- The high water mark indicates when the eviction thread
133dcbf3bdGeorge Wilson *	is unable to keep up with the incoming load and eviction must
134dcbf3bdGeorge Wilson *	happen in the context of the calling thread.
135dcbf3bdGeorge Wilson *
136dcbf3bdGeorge Wilson * The dbuf cache:
137dcbf3bdGeorge Wilson *                                                 (max size)
138dcbf3bdGeorge Wilson *                                      low water   mid water   hi water
139dcbf3bdGeorge Wilson * +----------------------------------------+----------+----------+
140dcbf3bdGeorge Wilson * |                                        |          |          |
141dcbf3bdGeorge Wilson * |                                        |          |          |
142dcbf3bdGeorge Wilson * |                                        |          |          |
143dcbf3bdGeorge Wilson * |                                        |          |          |
144dcbf3bdGeorge Wilson * +----------------------------------------+----------+----------+
145dcbf3bdGeorge Wilson *                                        stop        signal     evict
146dcbf3bdGeorge Wilson *                                      evicting     eviction   directly
147dcbf3bdGeorge Wilson *                                                    thread
148dcbf3bdGeorge Wilson *
149dcbf3bdGeorge Wilson * The high and low water marks indicate the operating range for the eviction
150dcbf3bdGeorge Wilson * thread. The low water mark is, by default, 90% of the total size of the
151dcbf3bdGeorge Wilson * cache and the high water mark is at 110% (both of these percentages can be
152dcbf3bdGeorge Wilson * changed by setting dbuf_cache_lowater_pct and dbuf_cache_hiwater_pct,
153dcbf3bdGeorge Wilson * respectively). The eviction thread will try to ensure that the cache remains
154dcbf3bdGeorge Wilson * within this range by waking up every second and checking if the cache is
155dcbf3bdGeorge Wilson * above the low water mark. The thread can also be woken up by callers adding
156dcbf3bdGeorge Wilson * elements into the cache if the cache is larger than the mid water (i.e max
157dcbf3bdGeorge Wilson * cache size). Once the eviction thread is woken up and eviction is required,
158dcbf3bdGeorge Wilson * it will continue evicting buffers until it's able to reduce the cache size
159dcbf3bdGeorge Wilson * to the low water mark. If the cache size continues to grow and hits the high
160eb63303Tom Caputi * water mark, then callers adding elements to the cache will begin to evict
161dcbf3bdGeorge Wilson * directly from the cache until the cache is no longer above the high water
162dcbf3bdGeorge Wilson * mark.
163dcbf3bdGeorge Wilson */
164dcbf3bdGeorge Wilson
165dcbf3bdGeorge Wilson/*
166dcbf3bdGeorge Wilson * The percentage above and below the maximum cache size.
167dcbf3bdGeorge Wilson */
168dcbf3bdGeorge Wilsonuint_t dbuf_cache_hiwater_pct = 10;
169dcbf3bdGeorge Wilsonuint_t dbuf_cache_lowater_pct = 10;
170dcbf3bdGeorge Wilson
171fa9e406ahrens/* ARGSUSED */
172fa9e406ahrensstatic int
173fa9e406ahrensdbuf_cons(void *vdb, void *unused, int kmflag)
175fa9e406ahrens	dmu_buf_impl_t *db = vdb;
176fa9e406ahrens	bzero(db, sizeof (dmu_buf_impl_t));
178fa9e406ahrens	mutex_init(&db->db_mtx, NULL, MUTEX_DEFAULT, NULL);
179fa9e406ahrens	cv_init(&db->db_changed, NULL, CV_DEFAULT, NULL);
180dcbf3bdGeorge Wilson	multilist_link_init(&db->db_cache_link);
181e914aceTim Schumacher	zfs_refcount_create(&db->db_holds);
1820f6d88aAlex Reece
183fa9e406ahrens	return (0);
186fa9e406ahrens/* ARGSUSED */
187fa9e406ahrensstatic void
188fa9e406ahrensdbuf_dest(void *vdb, void *unused)
190fa9e406ahrens	dmu_buf_impl_t *db = vdb;
191fa9e406ahrens	mutex_destroy(&db->db_mtx);
192fa9e406ahrens	cv_destroy(&db->db_changed);
193dcbf3bdGeorge Wilson	ASSERT(!multilist_link_active(&db->db_cache_link));
194e914aceTim Schumacher	zfs_refcount_destroy(&db->db_holds);
198fa9e406ahrens * dbuf hash table routines
199fa9e406ahrens */
200fa9e406ahrensstatic dbuf_hash_table_t dbuf_hash_table;
202fa9e406ahrensstatic uint64_t dbuf_hash_count;
/*
 * Hash the (objset, object, level, blkid) tuple that identifies a dbuf.
 *
 * We use Cityhash for this. It's fast, and has good hash properties without
 * requiring any large static buffers.
 */
static uint64_t
dbuf_hash(void *os, uint64_t obj, uint8_t lvl, uint64_t blkid)
{
	return (cityhash4((uintptr_t)os, obj, (uint64_t)lvl, blkid));
}
/*
 * True when dbuf is identified by exactly the given
 * (objset, object, level, blkid) tuple.
 */
#define	DBUF_EQUAL(dbuf, os, obj, level, blkid)		\
	((dbuf)->db.db_object == (obj) &&		\
	(dbuf)->db_objset == (os) &&			\
	(dbuf)->db_level == (level) &&			\
	(dbuf)->db_blkid == (blkid))
220fa9e406ahrensdmu_buf_impl_t *
221e57a022Justin T. Gibbsdbuf_find(objset_t *os, uint64_t obj, uint8_t level, uint64_t blkid)
223fa9e406ahrens	dbuf_hash_table_t *h = &dbuf_hash_table;
224dcbf3bdGeorge Wilson	uint64_t hv = dbuf_hash(os, obj, level, blkid);
225fa9e406ahrens	uint64_t idx = hv & h->hash_table_mask;
226fa9e406ahrens	dmu_buf_impl_t *db;
228fa9e406ahrens	mutex_enter(DBUF_HASH_MUTEX(h, idx));
229fa9e406ahrens	for (db = h->hash_table[idx]; db != NULL; db = db->db_hash_next) {
230fa9e406ahrens		if (DBUF_EQUAL(db, os, obj, level, blkid)) {
231fa9e406ahrens			mutex_enter(&db->db_mtx);
232ea8dc4beschrock			if (db->db_state != DB_EVICTING) {
233fa9e406ahrens				mutex_exit(DBUF_HASH_MUTEX(h, idx));
234fa9e406ahrens				return (db);
235fa9e406ahrens			}
236fa9e406ahrens			mutex_exit(&db->db_mtx);
237fa9e406ahrens		}
238fa9e406ahrens	}
239fa9e406ahrens	mutex_exit(DBUF_HASH_MUTEX(h, idx));
240fa9e406ahrens	return (NULL);
243e57a022Justin T. Gibbsstatic dmu_buf_impl_t *
244e57a022Justin T. Gibbsdbuf_find_bonus(objset_t *os, uint64_t object)
245e57a022Justin T. Gibbs{
246e57a022Justin T. Gibbs	dnode_t *dn;
247e57a022Justin T. Gibbs	dmu_buf_impl_t *db = NULL;
248e57a022Justin T. Gibbs
249e57a022Justin T. Gibbs	if (dnode_hold(os, object, FTAG, &dn) == 0) {
250e57a022Justin T. Gibbs		rw_enter(&dn->dn_struct_rwlock, RW_READER);
251e57a022Justin T. Gibbs		if (dn->dn_bonus != NULL) {
252e57a022Justin T. Gibbs			db = dn->dn_bonus;
253e57a022Justin T. Gibbs			mutex_enter(&db->db_mtx);
254e57a022Justin T. Gibbs		}
255e57a022Justin T. Gibbs		rw_exit(&dn->dn_struct_rwlock);
256e57a022Justin T. Gibbs		dnode_rele(dn, FTAG);
257e57a022Justin T. Gibbs	}
258e57a022Justin T. Gibbs	return (db);
259e57a022Justin T. Gibbs}
260e57a022Justin T. Gibbs
262fa9e406ahrens * Insert an entry into the hash table.  If there is already an element
263fa9e406ahrens * equal to elem in the hash table, then the already existing element
264fa9e406ahrens * will be returned and the new element will not be inserted.
265fa9e406ahrens * Otherwise returns NULL.
266fa9e406ahrens */
267fa9e406ahrensstatic dmu_buf_impl_t *
268fa9e406ahrensdbuf_hash_insert(dmu_buf_impl_t *db)
270fa9e406ahrens	dbuf_hash_table_t *h = &dbuf_hash_table;
271503ad85Matthew Ahrens	objset_t *os = db->db_objset;
272fa9e406ahrens	uint64_t obj = db->db.db_object;
273fa9e406ahrens	int level = db->db_level;
274fa9e406ahrens	uint64_t blkid = db->db_blkid;
275dcbf3bdGeorge Wilson	uint64_t hv = dbuf_hash(os, obj, level, blkid);
276fa9e406ahrens	uint64_t idx = hv & h->hash_table_mask;
277fa9e406ahrens	dmu_buf_impl_t *dbf;
279fa9e406ahrens	mutex_enter(DBUF_HASH_MUTEX(h, idx));
280fa9e406ahrens	for (dbf = h->hash_table[idx]; dbf != NULL; dbf = dbf->db_hash_next) {
281fa9e406ahrens		if (DBUF_EQUAL(dbf, os, obj, level, blkid)) {
282fa9e406ahrens			mutex_enter(&dbf->db_mtx);
283ea8dc4beschrock			if (dbf->db_state != DB_EVICTING) {
284fa9e406ahrens				mutex_exit(DBUF_HASH_MUTEX(h, idx));
285fa9e406ahrens				return (dbf);
286fa9e406ahrens			}
287fa9e406ahrens			mutex_exit(&dbf->db_mtx);
288fa9e406ahrens		}
289fa9e406ahrens	}
291fa9e406ahrens	mutex_enter(&db->db_mtx);
292fa9e406ahrens	db->db_hash_next = h->hash_table[idx];
293fa9e406ahrens	h->hash_table[idx] = db;
294fa9e406ahrens	mutex_exit(DBUF_HASH_MUTEX(h, idx));
2951a5e258Josef 'Jeff' Sipek	atomic_inc_64(&dbuf_hash_count);
297fa9e406ahrens	return (NULL);
301bbfa8eaMatthew Ahrens * Remove an entry from the hash table.  It must be in the EVICTING state.
302fa9e406ahrens */
303fa9e406ahrensstatic void
304fa9e406ahrensdbuf_hash_remove(dmu_buf_impl_t *db)
306fa9e406ahrens	dbuf_hash_table_t *h = &dbuf_hash_table;
307dcbf3bdGeorge Wilson	uint64_t hv = dbuf_hash(db->db_objset, db->db.db_object,
308fa9e406ahrens	    db->db_level, db->db_blkid);
309fa9e406ahrens	uint64_t idx = hv & h->hash_table_mask;
310fa9e406ahrens	dmu_buf_impl_t *dbf, **dbp;
312fa9e406ahrens	/*
313eb63303Tom Caputi	 * We mustn't hold db_mtx to maintain lock ordering:
314fa9e406ahrens	 * DBUF_HASH_MUTEX > db_mtx.
315fa9e406ahrens	 */
316e914aceTim Schumacher	ASSERT(zfs_refcount_is_zero(&db->db_holds));
317ea8dc4beschrock	ASSERT(db->db_state == DB_EVICTING);
318fa9e406ahrens	ASSERT(!MUTEX_HELD(&db->db_mtx));
320fa9e406ahrens	mutex_enter(DBUF_HASH_MUTEX(h, idx));
321fa9e406ahrens	dbp = &h->hash_table[idx];
322fa9e406ahrens	while ((dbf = *dbp) != db) {
323fa9e406ahrens		dbp = &dbf->db_hash_next;
324fa9e406ahrens		ASSERT(dbf != NULL);
325fa9e406ahrens	}
326fa9e406ahrens	*dbp = db->db_hash_next;
327fa9e406ahrens	db->db_hash_next = NULL;
328fa9e406ahrens	mutex_exit(DBUF_HASH_MUTEX(h, idx));
3291a5e258Josef 'Jeff' Sipek	atomic_dec_64(&dbuf_hash_count);
/* Selects which hold-count invariant dbuf_verify_user() checks. */
typedef enum {
	DBVU_EVICTING,
	DBVU_NOT_EVICTING
} dbvu_verify_type_t;
336bc9014eJustin Gibbs
337bc9014eJustin Gibbsstatic void
338bc9014eJustin Gibbsdbuf_verify_user(dmu_buf_impl_t *db, dbvu_verify_type_t verify_type)
339bc9014eJustin Gibbs{
340bc9014eJustin Gibbs#ifdef ZFS_DEBUG
341bc9014eJustin Gibbs	int64_t holds;
342bc9014eJustin Gibbs
343bc9014eJustin Gibbs	if (db->db_user == NULL)
344bc9014eJustin Gibbs		return;
345bc9014eJustin Gibbs
346bc9014eJustin Gibbs	/* Only data blocks support the attachment of user data. */
347bc9014eJustin Gibbs	ASSERT(db->db_level == 0);
348bc9014eJustin Gibbs
349bc9014eJustin Gibbs	/* Clients must resolve a dbuf before attaching user data. */
350bc9014eJustin Gibbs	ASSERT(db->db.db_data != NULL);
351bc9014eJustin Gibbs	ASSERT3U(db->db_state, ==, DB_CACHED);
352bc9014eJustin Gibbs
353e914aceTim Schumacher	holds = zfs_refcount_count(&db->db_holds);
354bc9014eJustin Gibbs	if (verify_type == DBVU_EVICTING) {
355bc9014eJustin Gibbs		/*
356bc9014eJustin Gibbs		 * Immediate eviction occurs when holds == dirtycnt.
357bc9014eJustin Gibbs		 * For normal eviction buffers, holds is zero on
358bc9014eJustin Gibbs		 * eviction, except when dbuf_fix_old_data() calls
359bc9014eJustin Gibbs		 * dbuf_clear_data().  However, the hold count can grow
360bc9014eJustin Gibbs		 * during eviction even though db_mtx is held (see
361bc9014eJustin Gibbs		 * dmu_bonus_hold() for an example), so we can only
362bc9014eJustin Gibbs		 * test the generic invariant that holds >= dirtycnt.
363bc9014eJustin Gibbs		 */
364bc9014eJustin Gibbs		ASSERT3U(holds, >=, db->db_dirtycnt);
365bc9014eJustin Gibbs	} else {
366d205810Justin T. Gibbs		if (db->db_user_immediate_evict == TRUE)
367bc9014eJustin Gibbs			ASSERT3U(holds, >=, db->db_dirtycnt);
368bc9014eJustin Gibbs		else
369bc9014eJustin Gibbs			ASSERT3U(holds, >, 0);
370bc9014eJustin Gibbs	}
371bc9014eJustin Gibbs#endif
372bc9014eJustin Gibbs}
373bc9014eJustin Gibbs
374fa9e406ahrensstatic void
375fa9e406ahrensdbuf_evict_user(dmu_buf_impl_t *db)
377bc9014eJustin Gibbs	dmu_buf_user_t *dbu = db->db_user;
378bc9014eJustin Gibbs
379fa9e406ahrens	ASSERT(MUTEX_HELD(&db->db_mtx));
381bc9014eJustin Gibbs	if (dbu == NULL)
382fa9e406ahrens		return;
384bc9014eJustin Gibbs	dbuf_verify_user(db, DBVU_EVICTING);
385bc9014eJustin Gibbs	db->db_user = NULL;
386bc9014eJustin Gibbs
387bc9014eJustin Gibbs#ifdef ZFS_DEBUG
388bc9014eJustin Gibbs	if (dbu->dbu_clear_on_evict_dbufp != NULL)
389bc9014eJustin Gibbs		*dbu->dbu_clear_on_evict_dbufp = NULL;
390bc9014eJustin Gibbs#endif
391bc9014eJustin Gibbs
392bc9014eJustin Gibbs	/*
39340510e8Josef 'Jeff' Sipek	 * There are two eviction callbacks - one that we call synchronously
39440510e8Josef 'Jeff' Sipek	 * and one that we invoke via a taskq.  The async one is useful for
39540510e8Josef 'Jeff' Sipek	 * avoiding lock order reversals and limiting stack depth.
39640510e8Josef 'Jeff' Sipek	 *
39740510e8Josef 'Jeff' Sipek	 * Note that if we have a sync callback but no async callback,
39840510e8Josef 'Jeff' Sipek	 * it's likely that the sync callback will free the structure
39940510e8Josef 'Jeff' Sipek	 * containing the dbu.  In that case we need to take care to not
40040510e8Josef 'Jeff' Sipek	 * dereference dbu after calling the sync evict func.
401bc9014eJustin Gibbs	 */
40240510e8Josef 'Jeff' Sipek	boolean_t has_async = (dbu->dbu_evict_func_async != NULL);
40340510e8Josef 'Jeff' Sipek
40440510e8Josef 'Jeff' Sipek	if (dbu->dbu_evict_func_sync != NULL)
40540510e8Josef 'Jeff' Sipek		dbu->dbu_evict_func_sync(dbu);
40640510e8Josef 'Jeff' Sipek
40740510e8Josef 'Jeff' Sipek	if (has_async) {
40840510e8Josef 'Jeff' Sipek		taskq_dispatch_ent(dbu_evict_taskq, dbu->dbu_evict_func_async,
40940510e8Josef 'Jeff' Sipek		    dbu, 0, &dbu->dbu_tqent);
41040510e8Josef 'Jeff' Sipek	}
413744947dTom Ericksonboolean_t
414744947dTom Ericksondbuf_is_metadata(dmu_buf_impl_t *db)
415744947dTom Erickson{
416eb63303Tom Caputi	if (db->db_level > 0 || db->db_blkid == DMU_SPILL_BLKID) {
417744947dTom Erickson		return (B_TRUE);
418744947dTom Erickson	} else {
419744947dTom Erickson		boolean_t is_metadata;
420744947dTom Erickson
421744947dTom Erickson		DB_DNODE_ENTER(db);
422ad135b5Christopher Siden		is_metadata = DMU_OT_IS_METADATA(DB_DNODE(db)->dn_type);
423744947dTom Erickson		DB_DNODE_EXIT(db);
424744947dTom Erickson
425744947dTom Erickson		return (is_metadata);
426744947dTom Erickson	}
427744947dTom Erickson}
428744947dTom Erickson
429dcbf3bdGeorge Wilson/*
430adb52d9Matthew Ahrens * This returns whether this dbuf should be stored in the metadata cache, which
431adb52d9Matthew Ahrens * is based on whether it's from one of the dnode types that store data related
432adb52d9Matthew Ahrens * to traversing dataset hierarchies.
433adb52d9Matthew Ahrens */
434adb52d9Matthew Ahrensstatic boolean_t
435adb52d9Matthew Ahrensdbuf_include_in_metadata_cache(dmu_buf_impl_t *db)
436adb52d9Matthew Ahrens{
437adb52d9Matthew Ahrens	DB_DNODE_ENTER(db);
438adb52d9Matthew Ahrens	dmu_object_type_t type = DB_DNODE(db)->dn_type;
439adb52d9Matthew Ahrens	DB_DNODE_EXIT(db);
440adb52d9Matthew Ahrens
441adb52d9Matthew Ahrens	/* Check if this dbuf is one of the types we care about */
442adb52d9Matthew Ahrens	if (DMU_OT_IS_METADATA_CACHED(type)) {
443adb52d9Matthew Ahrens		/* If we hit this, then we set something up wrong in dmu_ot */
444adb52d9Matthew Ahrens		ASSERT(DMU_OT_IS_METADATA(type));
445adb52d9Matthew Ahrens
446adb52d9Matthew Ahrens		/*
447adb52d9Matthew Ahrens		 * Sanity check for small-memory systems: don't allocate too
448adb52d9Matthew Ahrens		 * much memory for this purpose.
449adb52d9Matthew Ahrens		 */
450e914aceTim Schumacher		if (zfs_refcount_count(
451e914aceTim Schumacher		    &dbuf_caches[DB_DBUF_METADATA_CACHE].size) >
452adb52d9Matthew Ahrens		    dbuf_metadata_cache_max_bytes) {
453adb52d9Matthew Ahrens			dbuf_metadata_cache_overflow++;
454adb52d9Matthew Ahrens			DTRACE_PROBE1(dbuf__metadata__cache__overflow,
455adb52d9Matthew Ahrens			    dmu_buf_impl_t *, db);
456adb52d9Matthew Ahrens			return (B_FALSE);
457adb52d9Matthew Ahrens		}
458adb52d9Matthew Ahrens
459adb52d9Matthew Ahrens		return (B_TRUE);
460adb52d9Matthew Ahrens	}
461adb52d9Matthew Ahrens
462adb52d9Matthew Ahrens	return (B_FALSE);
463adb52d9Matthew Ahrens}
464adb52d9Matthew Ahrens
465adb52d9Matthew Ahrens/*
466dcbf3bdGeorge Wilson * This function *must* return indices evenly distributed between all
467dcbf3bdGeorge Wilson * sublists of the multilist. This is needed due to how the dbuf eviction
468dcbf3bdGeorge Wilson * code is laid out; dbuf_evict_thread() assumes dbufs are evenly
469dcbf3bdGeorge Wilson * distributed between all sublists and uses this assumption when
470dcbf3bdGeorge Wilson * deciding which sublist to evict from and how much to evict from it.
471dcbf3bdGeorge Wilson */
472dcbf3bdGeorge Wilsonunsigned int
473dcbf3bdGeorge Wilsondbuf_cache_multilist_index_func(multilist_t *ml, void *obj)
475dcbf3bdGeorge Wilson	dmu_buf_impl_t *db = obj;
476dcbf3bdGeorge Wilson
477dcbf3bdGeorge Wilson	/*
478dcbf3bdGeorge Wilson	 * The assumption here, is the hash value for a given
479dcbf3bdGeorge Wilson	 * dmu_buf_impl_t will remain constant throughout it's lifetime
480dcbf3bdGeorge Wilson	 * (i.e. it's objset, object, level and blkid fields don't change).
481dcbf3bdGeorge Wilson	 * Thus, we don't need to store the dbuf's sublist index
482dcbf3bdGeorge Wilson	 * on insertion, as this index can be recalculated on removal.
483dcbf3bdGeorge Wilson	 *
484dcbf3bdGeorge Wilson	 * Also, the low order bits of the hash value are thought to be
485dcbf3bdGeorge Wilson	 * distributed evenly. Otherwise, in the case that the multilist
486dcbf3bdGeorge Wilson	 * has a power of two number of sublists, each sublists' usage
487dcbf3bdGeorge Wilson	 * would not be evenly distributed.
488dcbf3bdGeorge Wilson	 */
489dcbf3bdGeorge Wilson	return (dbuf_hash(db->db_objset, db->db.db_object,
490dcbf3bdGeorge Wilson	    db->db_level, db->db_blkid) %
491dcbf3bdGeorge Wilson	    multilist_get_num_sublists(ml));
492dcbf3bdGeorge Wilson}
493dcbf3bdGeorge Wilson
494dcbf3bdGeorge Wilsonstatic inline boolean_t
495dcbf3bdGeorge Wilsondbuf_cache_above_hiwater(void)
496dcbf3bdGeorge Wilson{
497dcbf3bdGeorge Wilson	uint64_t dbuf_cache_hiwater_bytes =
498dcbf3bdGeorge Wilson	    (dbuf_cache_max_bytes * dbuf_cache_hiwater_pct) / 100;
499dcbf3bdGeorge Wilson
500e914aceTim Schumacher	return (zfs_refcount_count(&dbuf_caches[DB_DBUF_CACHE].size) >
501dcbf3bdGeorge Wilson	    dbuf_cache_max_bytes + dbuf_cache_hiwater_bytes);
502dcbf3bdGeorge Wilson}
503dcbf3bdGeorge Wilson
504dcbf3bdGeorge Wilsonstatic inline boolean_t
505dcbf3bdGeorge Wilsondbuf_cache_above_lowater(void)
506dcbf3bdGeorge Wilson{
507dcbf3bdGeorge Wilson	uint64_t dbuf_cache_lowater_bytes =
508dcbf3bdGeorge Wilson	    (dbuf_cache_max_bytes * dbuf_cache_lowater_pct) / 100;
509dcbf3bdGeorge Wilson
510e914aceTim Schumacher	return (zfs_refcount_count(&dbuf_caches[DB_DBUF_CACHE].size) >
511dcbf3bdGeorge Wilson	    dbuf_cache_max_bytes - dbuf_cache_lowater_bytes);
512dcbf3bdGeorge Wilson}
513dcbf3bdGeorge Wilson
514dcbf3bdGeorge Wilson/*
515dcbf3bdGeorge Wilson * Evict the oldest eligible dbuf from the dbuf cache.
516dcbf3bdGeorge Wilson */
517dcbf3bdGeorge Wilsonstatic void
518dcbf3bdGeorge Wilsondbuf_evict_one(void)
519dcbf3bdGeorge Wilson{
520adb52d9Matthew Ahrens	int idx = multilist_get_random_index(dbuf_caches[DB_DBUF_CACHE].cache);
521adb52d9Matthew Ahrens	multilist_sublist_t *mls = multilist_sublist_lock(
522adb52d9Matthew Ahrens	    dbuf_caches[DB_DBUF_CACHE].cache, idx);
523dcbf3bdGeorge Wilson
524dcbf3bdGeorge Wilson	ASSERT(!MUTEX_HELD(&dbuf_evict_lock));
525dcbf3bdGeorge Wilson
526dcbf3bdGeorge Wilson	dmu_buf_impl_t *db = multilist_sublist_tail(mls);
527dcbf3bdGeorge Wilson	while (db != NULL && mutex_tryenter(&db->db_mtx) == 0) {
528dcbf3bdGeorge Wilson		db = multilist_sublist_prev(mls, db);
529dcbf3bdGeorge Wilson	}
530dcbf3bdGeorge Wilson
531dcbf3bdGeorge Wilson	DTRACE_PROBE2(dbuf__evict__one, dmu_buf_impl_t *, db,
532dcbf3bdGeorge Wilson	    multilist_sublist_t *, mls);
533dcbf3bdGeorge Wilson
534dcbf3bdGeorge Wilson	if (db != NULL) {
535dcbf3bdGeorge Wilson		multilist_sublist_remove(mls, db);
536dcbf3bdGeorge Wilson		multilist_sublist_unlock(mls);
537e914aceTim Schumacher		(void) zfs_refcount_remove_many(
538e914aceTim Schumacher		    &dbuf_caches[DB_DBUF_CACHE].size,
539dcbf3bdGeorge Wilson		    db->db.db_size, db);
540adb52d9Matthew Ahrens		ASSERT3U(db->db_caching_status, ==, DB_DBUF_CACHE);
541adb52d9Matthew Ahrens		db->db_caching_status = DB_NO_CACHE;
542dcbf3bdGeorge Wilson		dbuf_destroy(db);
543dcbf3bdGeorge Wilson	} else {
544dcbf3bdGeorge Wilson		multilist_sublist_unlock(mls);
545dcbf3bdGeorge Wilson	}
546dcbf3bdGeorge Wilson}
547dcbf3bdGeorge Wilson
548dcbf3bdGeorge Wilson/*
549dcbf3bdGeorge Wilson * The dbuf evict thread is responsible for aging out dbufs from the
550dcbf3bdGeorge Wilson * cache. Once the cache has reached it's maximum size, dbufs are removed
551dcbf3bdGeorge Wilson * and destroyed. The eviction thread will continue running until the size
552dcbf3bdGeorge Wilson * of the dbuf cache is at or below the maximum size. Once the dbuf is aged
553dcbf3bdGeorge Wilson * out of the cache it is destroyed and becomes eligible for arc eviction.
554dcbf3bdGeorge Wilson */
5553f7978dAlan Somers/* ARGSUSED */
556dcbf3bdGeorge Wilsonstatic void
5573f7978dAlan Somersdbuf_evict_thread(void *unused)
558dcbf3bdGeorge Wilson{
559dcbf3bdGeorge Wilson	callb_cpr_t cpr;
560dcbf3bdGeorge Wilson
561dcbf3bdGeorge Wilson	CALLB_CPR_INIT(&cpr, &dbuf_evict_lock, callb_generic_cpr, FTAG);
562dcbf3bdGeorge Wilson
563dcbf3bdGeorge Wilson	mutex_enter(&dbuf_evict_lock);
564dcbf3bdGeorge Wilson	while (!dbuf_evict_thread_exit) {
565dcbf3bdGeorge Wilson		while (!dbuf_cache_above_lowater() && !dbuf_evict_thread_exit) {
566dcbf3bdGeorge Wilson			CALLB_CPR_SAFE_BEGIN(&cpr);
567dcbf3bdGeorge Wilson			(void) cv_timedwait_hires(&dbuf_evict_cv,
568dcbf3bdGeorge Wilson			    &dbuf_evict_lock, SEC2NSEC(1), MSEC2NSEC(1), 0);
569dcbf3bdGeorge Wilson			CALLB_CPR_SAFE_END(&cpr, &dbuf_evict_lock);
570dcbf3bdGeorge Wilson		}
571dcbf3bdGeorge Wilson		mutex_exit(&dbuf_evict_lock);
572dcbf3bdGeorge Wilson
573dcbf3bdGeorge Wilson		/*
574dcbf3bdGeorge Wilson		 * Keep evicting as long as we're above the low water mark
575dcbf3bdGeorge Wilson		 * for the cache. We do this without holding the locks to
576dcbf3bdGeorge Wilson		 * minimize lock contention.
577dcbf3bdGeorge Wilson		 */
578dcbf3bdGeorge Wilson		while (dbuf_cache_above_lowater() && !dbuf_evict_thread_exit) {
579dcbf3bdGeorge Wilson			dbuf_evict_one();
580dcbf3bdGeorge Wilson		}
581dcbf3bdGeorge Wilson
582dcbf3bdGeorge Wilson		mutex_enter(&dbuf_evict_lock);
583dcbf3bdGeorge Wilson	}
585dcbf3bdGeorge Wilson	dbuf_evict_thread_exit = B_FALSE;
586dcbf3bdGeorge Wilson	cv_broadcast(&dbuf_evict_cv);
587dcbf3bdGeorge Wilson	CALLB_CPR_EXIT(&cpr);	/* drops dbuf_evict_lock */
588dcbf3bdGeorge Wilson	thread_exit();
589dcbf3bdGeorge Wilson}
590dcbf3bdGeorge Wilson
591dcbf3bdGeorge Wilson/*
592dcbf3bdGeorge Wilson * Wake up the dbuf eviction thread if the dbuf cache is at its max size.
593dcbf3bdGeorge Wilson * If the dbuf cache is at its high water mark, then evict a dbuf from the
594dcbf3bdGeorge Wilson * dbuf cache using the callers context.
595dcbf3bdGeorge Wilson */
596dcbf3bdGeorge Wilsonstatic void
597dcbf3bdGeorge Wilsondbuf_evict_notify(void)
598dcbf3bdGeorge Wilson{
599dbfd9f9Matthew Ahrens	/*
600dbfd9f9Matthew Ahrens	 * We check if we should evict without holding the dbuf_evict_lock,
601dbfd9f9Matthew Ahrens	 * because it's OK to occasionally make the wrong decision here,
602dbfd9f9Matthew Ahrens	 * and grabbing the lock results in massive lock contention.
603dbfd9f9Matthew Ahrens	 */
604e914aceTim Schumacher	if (zfs_refcount_count(&dbuf_caches[DB_DBUF_CACHE].size) >
605adb52d9Matthew Ahrens	    dbuf_cache_max_bytes) {
606dbfd9f9Matthew Ahrens		if (dbuf_cache_above_hiwater())
607dcbf3bdGeorge Wilson			dbuf_evict_one();
608dbfd9f9Matthew Ahrens		cv_signal(&dbuf_evict_cv);
609dcbf3bdGeorge Wilson	}
615ea8dc4beschrock	uint64_t hsize = 1ULL << 16;
616fa9e406ahrens	dbuf_hash_table_t *h = &dbuf_hash_table;
617fa9e406ahrens	int i;
619fa9e406ahrens	/*
620fa9e406ahrens	 * The hash table is big enough to fill all of physical memory
621ea8dc4beschrock	 * with an average 4K block size.  The table will take up
622ea8dc4beschrock	 * totalmem*sizeof(void*)/4K (i.e. 2MB/GB with 8-byte pointers).
623fa9e406ahrens	 */
624ea8dc4beschrock	while (hsize * 4096 < physmem * PAGESIZE)
625fa9e406ahrens		hsize <<= 1;
628fa9e406ahrens	h->hash_table_mask = hsize - 1;
629ea8dc4beschrock	h->hash_table = kmem_zalloc(hsize * sizeof (void *), KM_NOSLEEP);
630ea8dc4beschrock	if (h->hash_table == NULL) {
631ea8dc4beschrock		/* XXX - we should really return an error instead of assert */
632ea8dc4beschrock		ASSERT(hsize > (1ULL << 10));
633ea8dc4beschrock		hsize >>= 1;
634ea8dc4beschrock		goto retry;
635ea8dc4beschrock	}
637dcbf3bdGeorge Wilson	dbuf_kmem_cache = kmem_cache_create("dmu_buf_impl_t",
638fa9e406ahrens	    sizeof (dmu_buf_impl_t),
639fa9e406ahrens	    0, dbuf_cons, dbuf_dest, NULL, NULL, NULL, 0);
641fa9e406ahrens	for (i = 0; i < DBUF_MUTEXES; i++)
642fa9e406ahrens		mutex_init(&h->hash_mutexes[i], NULL, MUTEX_DEFAULT, NULL);
643bc9014eJustin Gibbs
644bc9014eJustin Gibbs	/*
645adb52d9Matthew Ahrens	 * Set up the parameters for the dbuf caches. We set the sizes of the
646adb52d9Matthew Ahrens	 * dbuf cache and the metadata cache to 1/32nd and 1/16th (default)
647adb52d9Matthew Ahrens	 * of the size of the ARC, respectively. If the values are set in
648adb52d9Matthew Ahrens	 * /etc/system and they're less than the size of the ARC, then we
649adb52d9Matthew Ahrens	 * honor those values.
650dcbf3bdGeorge Wilson	 */
651268bbb2George Wilson	if (dbuf_cache_max_bytes == 0 ||
652268bbb2George Wilson	    dbuf_cache_max_bytes >= arc_max_bytes())  {
653268bbb2George Wilson		dbuf_cache_max_bytes = arc_max_bytes() >> dbuf_cache_shift;
654268bbb2George Wilson	}
655adb52d9Matthew Ahrens	if (dbuf_metadata_cache_max_bytes == 0 ||
656adb52d9Matthew Ahrens	    dbuf_metadata_cache_max_bytes >= arc_max_bytes()) {
657adb52d9Matthew Ahrens		dbuf_metadata_cache_max_bytes =
658adb52d9Matthew Ahrens		    arc_max_bytes() >> dbuf_metadata_cache_shift;
659adb52d9Matthew Ahrens	}
660dcbf3bdGeorge Wilson
661dcbf3bdGeorge Wilson	/*
662bc9014eJustin Gibbs	 * All entries are queued via taskq_dispatch_ent(), so min/maxalloc
663bc9014eJustin Gibbs	 * configuration is not required.
664bc9014eJustin Gibbs	 */
665bc9014eJustin Gibbs	dbu_evict_taskq = taskq_create("dbu_evict", 1, minclsyspri, 0, 0, 0);
666dcbf3bdGeorge Wilson
667adb52d9Matthew Ahrens	for (dbuf_cached_state_t dcs = 0; dcs < DB_CACHE_MAX; dcs++) {
668adb52d9Matthew Ahrens		dbuf_caches[dcs].cache =
669adb52d9Matthew Ahrens		    multilist_create(sizeof (dmu_buf_impl_t),
670adb52d9Matthew Ahrens		    offsetof(dmu_buf_impl_t, db_cache_link),
671adb52d9Matthew Ahrens		    dbuf_cache_multilist_index_func);
672e914aceTim Schumacher		zfs_refcount_create(&dbuf_caches[dcs].size);
673adb52d9Matthew Ahrens	}
674dcbf3bdGeorge Wilson
675dcbf3bdGeorge Wilson	dbuf_evict_thread_exit = B_FALSE;
676dcbf3bdGeorge Wilson	mutex_init(&dbuf_evict_lock, NULL, MUTEX_DEFAULT, NULL);
677dcbf3bdGeorge Wilson	cv_init(&dbuf_evict_cv, NULL, CV_DEFAULT, NULL);
678dcbf3bdGeorge Wilson	dbuf_cache_evict_thread = thread_create(NULL, 0, dbuf_evict_thread,
679dcbf3bdGeorge Wilson	    NULL, 0, &p0, TS_RUN, minclsyspri);
685fa9e406ahrens	dbuf_hash_table_t *h = &dbuf_hash_table;
686fa9e406ahrens	int i;
688fa9e406ahrens	for (i = 0; i < DBUF_MUTEXES; i++)
689fa9e406ahrens		mutex_destroy(&h->hash_mutexes[i]);
690fa9e406ahrens	kmem_free(h->hash_table, (h->hash_table_mask + 1) * sizeof (void *));
691dcbf3bdGeorge Wilson	kmem_cache_destroy(dbuf_kmem_cache);
692bc9014eJustin Gibbs	taskq_destroy(dbu_evict_taskq);
693dcbf3bdGeorge Wilson
694dcbf3bdGeorge Wilson	mutex_enter(&dbuf_evict_lock);
695dcbf3bdGeorge Wilson	dbuf_evict_thread_exit = B_TRUE;
696dcbf3bdGeorge Wilson	while (dbuf_evict_thread_exit) {
697dcbf3bdGeorge Wilson		cv_signal(&dbuf_evict_cv);
698dcbf3bdGeorge Wilson		cv_wait(&dbuf_evict_cv, &dbuf_evict_lock);
699dcbf3bdGeorge Wilson	}
700dcbf3bdGeorge Wilson	mutex_exit(&dbuf_evict_lock);
701dcbf3bdGeorge Wilson
702dcbf3bdGeorge Wilson	mutex_destroy(&dbuf_evict_lock);
703dcbf3bdGeorge Wilson	cv_destroy(&dbuf_evict_cv);
704dcbf3bdGeorge Wilson
705adb52d9Matthew Ahrens	for (dbuf_cached_state_t dcs = 0; dcs < DB_CACHE_MAX; dcs++) {
706e914aceTim Schumacher		zfs_refcount_destroy(&dbuf_caches[dcs].size);
707adb52d9Matthew Ahrens		multilist_destroy(dbuf_caches[dcs].cache);
708adb52d9Matthew Ahrens	}
712fa9e406ahrens * Other stuff.
713fa9e406ahrens */
7159c9dc39ek#ifdef ZFS_DEBUG
716fa9e406ahrensstatic void
717fa9e406ahrensdbuf_verify(dmu_buf_impl_t *db)
719744947dTom Erickson	dnode_t *dn;
720b24ab67Jeff Bonwick	dbuf_dirty_record_t *dr;
722fa9e406ahrens	ASSERT(MUTEX_HELD(&db->db_mtx));
724fa9e406ahrens	if (!(zfs_flags & ZFS_DEBUG_DBUF_VERIFY))
725fa9e406ahrens		return;
727fa9e406ahrens	ASSERT(db->db_objset != NULL);
728744947dTom Erickson	DB_DNODE_ENTER(db);
729744947dTom Erickson	dn = DB_DNODE(db);
730fa9e406ahrens	if (dn == NULL) {
731fa9e406ahrens		ASSERT(db->db_parent == NULL);
732fa9e406ahrens		ASSERT(db->db_blkptr == NULL);
733fa9e406ahrens	} else {
734fa9e406ahrens		ASSERT3U(db->db.db_object, ==, dn->dn_object);
735fa9e406ahrens		ASSERT3P(db->db_objset, ==, dn->dn_objset);
736fa9e406ahrens		ASSERT3U(db->db_level, <, dn->dn_nlevels);
737744947dTom Erickson		ASSERT(db->db_blkid == DMU_BONUS_BLKID ||
738744947dTom Erickson		    db->db_blkid == DMU_SPILL_BLKID ||
7390f6d88aAlex Reece		    !avl_is_empty(&dn->dn_dbufs));
740fa9e406ahrens	}
7410a586ceMark Shellenbaum	if (db->db_blkid == DMU_BONUS_BLKID) {
742fa9e406ahrens		ASSERT(dn != NULL);
7431934e92maybee		ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
7440a586ceMark Shellenbaum		ASSERT3U(db->db.db_offset, ==, DMU_BONUS_BLKID);
7450a586ceMark Shellenbaum	} else if (db->db_blkid == DMU_SPILL_BLKID) {
7460a586ceMark Shellenbaum		ASSERT(dn != NULL);
747fb09f5aMadhav Suresh		ASSERT0(db->db.db_offset);
748fa9e406ahrens	} else {
749fa9e406ahrens		ASSERT3U(db->db.db_offset, ==, db->db_blkid * db->db.db_size);
750fa9e406ahrens	}
752b24ab67Jeff Bonwick	for (dr = db->db_data_pending; dr != NULL; dr = dr->dr_next)
753b24ab67Jeff Bonwick		ASSERT(dr->dr_dbuf == db);
754b24ab67Jeff Bonwick
755b24ab67Jeff Bonwick	for (dr = db->db_last_dirty; dr != NULL; dr = dr->dr_next)
756b24ab67Jeff Bonwick		ASSERT(dr->dr_dbuf == db);
757b24ab67Jeff Bonwick
75888b7b0fMatthew Ahrens	/*
75988b7b0fMatthew Ahrens	 * We can't assert that db_size matches dn_datablksz because it
76088b7b0fMatthew Ahrens	 * can be momentarily different when another thread is doing
76188b7b0fMatthew Ahrens	 * dnode_set_blksz().
76288b7b0fMatthew Ahrens	 */
76388b7b0fMatthew Ahrens	if (db->db_level == 0 && db->db.db_object == DMU_META_DNODE_OBJECT) {
764b24ab67Jeff Bonwick		dr = db->db_data_pending;
76588b7b0fMatthew Ahrens		/*
76688b7b0fMatthew Ahrens		 * It should only be modified in syncing context, so
76788b7b0fMatthew Ahrens		 * make sure we only have one copy of the data.
76888b7b0fMatthew Ahrens		 */
76988b7b0fMatthew Ahrens		ASSERT(dr == NULL || dr->dt.dl.dr_data == db->db_buf);
770fa9e406ahrens	}
772fa9e406ahrens	/* verify db->db_blkptr */
773fa9e406ahrens	if (db->db_blkptr) {
774fa9e406ahrens		if (db->db_parent == dn->dn_dbuf) {
775fa9e406ahrens			/* db is pointed to by the dnode */
776fa9e406ahrens			/* ASSERT3U(db->db_blkid, <, dn->dn_nblkptr); */
7771484342Matthew Ahrens			if (DMU_OBJECT_IS_SPECIAL(db->db.db_object))
778fa9e406ahrens				ASSERT(db->db_parent == NULL);
779fa9e406ahrens			else
780fa9e406ahrens				ASSERT(db->db_parent != NULL);
7810a586ceMark Shellenbaum			if (db->db_blkid != DMU_SPILL_BLKID)
7820a586ceMark Shellenbaum				ASSERT3P(db->db_blkptr, ==,
7830a586ceMark Shellenbaum				    &dn->dn_phys->dn_blkptr[db->db_blkid]);
784fa9e406ahrens		} else {
785fa9e406ahrens			/* db is pointed to by an indirect block */
786fa9e406ahrens			int epb = db->db_parent->db.db_size >> SPA_BLKPTRSHIFT;
787fa9e406ahrens			ASSERT3U(db->db_parent->db_level, ==, db->db_level+1);