xref: /illumos-gate/usr/src/uts/common/fs/zfs/dnode.c (revision f67950b21e185934ccabe311516f4dcbdb00ef79)
1fa9e4066Sahrens /*
2fa9e4066Sahrens  * CDDL HEADER START
3fa9e4066Sahrens  *
4fa9e4066Sahrens  * The contents of this file are subject to the terms of the
5f65e61c0Sahrens  * Common Development and Distribution License (the "License").
6f65e61c0Sahrens  * You may not use this file except in compliance with the License.
7fa9e4066Sahrens  *
8fa9e4066Sahrens  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9fa9e4066Sahrens  * or http://www.opensolaris.org/os/licensing.
10fa9e4066Sahrens  * See the License for the specific language governing permissions
11fa9e4066Sahrens  * and limitations under the License.
12fa9e4066Sahrens  *
13fa9e4066Sahrens  * When distributing Covered Code, include this CDDL HEADER in each
14fa9e4066Sahrens  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15fa9e4066Sahrens  * If applicable, add the following below this CDDL HEADER, with the
16fa9e4066Sahrens  * fields enclosed by brackets "[]" replaced with your own identifying
17fa9e4066Sahrens  * information: Portions Copyright [yyyy] [name of copyright owner]
18fa9e4066Sahrens  *
19fa9e4066Sahrens  * CDDL HEADER END
20fa9e4066Sahrens  */
21fa9e4066Sahrens /*
2206e0070dSMark Shellenbaum  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
2394c2d0ebSMatthew Ahrens  * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
24bc9014e6SJustin Gibbs  * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
25c3d26abcSMatthew Ahrens  * Copyright (c) 2014 Integros [integros.com]
26f06dce2cSAndrew Stormont  * Copyright 2017 RackTop Systems.
27fa9e4066Sahrens  */
28fa9e4066Sahrens 
29fa9e4066Sahrens #include <sys/zfs_context.h>
30fa9e4066Sahrens #include <sys/dbuf.h>
31fa9e4066Sahrens #include <sys/dnode.h>
32fa9e4066Sahrens #include <sys/dmu.h>
33fa9e4066Sahrens #include <sys/dmu_impl.h>
34fa9e4066Sahrens #include <sys/dmu_tx.h>
35fa9e4066Sahrens #include <sys/dmu_objset.h>
36fa9e4066Sahrens #include <sys/dsl_dir.h>
37fa9e4066Sahrens #include <sys/dsl_dataset.h>
38fa9e4066Sahrens #include <sys/spa.h>
39fa9e4066Sahrens #include <sys/zio.h>
40fa9e4066Sahrens #include <sys/dmu_zfetch.h>
41bf16b11eSMatthew Ahrens #include <sys/range_tree.h>
42*f67950b2SNasf-Fan #include <sys/zfs_project.h>
43fa9e4066Sahrens 
/*
 * Named counters exported through the "dnodestats" kstat (installed by
 * dnode_init()).  The order of entries here must match the fields of
 * dnode_stats_t, which is declared in the dnode header.
 */
dnode_stats_t dnode_stats = {
	{ "dnode_hold_dbuf_hold",		KSTAT_DATA_UINT64 },
	{ "dnode_hold_dbuf_read",		KSTAT_DATA_UINT64 },
	{ "dnode_hold_alloc_hits",		KSTAT_DATA_UINT64 },
	{ "dnode_hold_alloc_misses",		KSTAT_DATA_UINT64 },
	{ "dnode_hold_alloc_interior",		KSTAT_DATA_UINT64 },
	{ "dnode_hold_alloc_lock_retry",	KSTAT_DATA_UINT64 },
	{ "dnode_hold_alloc_lock_misses",	KSTAT_DATA_UINT64 },
	{ "dnode_hold_alloc_type_none",		KSTAT_DATA_UINT64 },
	{ "dnode_hold_free_hits",		KSTAT_DATA_UINT64 },
	{ "dnode_hold_free_misses",		KSTAT_DATA_UINT64 },
	{ "dnode_hold_free_lock_misses",	KSTAT_DATA_UINT64 },
	{ "dnode_hold_free_lock_retry",		KSTAT_DATA_UINT64 },
	{ "dnode_hold_free_overflow",		KSTAT_DATA_UINT64 },
	{ "dnode_hold_free_refcount",		KSTAT_DATA_UINT64 },
	{ "dnode_hold_free_txg",		KSTAT_DATA_UINT64 },
	{ "dnode_free_interior_lock_retry",	KSTAT_DATA_UINT64 },
	{ "dnode_allocate",			KSTAT_DATA_UINT64 },
	{ "dnode_reallocate",			KSTAT_DATA_UINT64 },
	{ "dnode_buf_evict",			KSTAT_DATA_UINT64 },
	{ "dnode_alloc_next_chunk",		KSTAT_DATA_UINT64 },
	{ "dnode_alloc_race",			KSTAT_DATA_UINT64 },
	{ "dnode_alloc_next_block",		KSTAT_DATA_UINT64 },
	{ "dnode_move_invalid",			KSTAT_DATA_UINT64 },
	{ "dnode_move_recheck1",		KSTAT_DATA_UINT64 },
	{ "dnode_move_recheck2",		KSTAT_DATA_UINT64 },
	{ "dnode_move_special",			KSTAT_DATA_UINT64 },
	{ "dnode_move_handle",			KSTAT_DATA_UINT64 },
	{ "dnode_move_rwlock",			KSTAT_DATA_UINT64 },
	{ "dnode_move_active",			KSTAT_DATA_UINT64 },
};

/* kstat handle for dnode_stats; created in dnode_init(), deleted in dnode_fini() */
static kstat_t *dnode_ksp;
/* kmem cache from which all dnode_t's are allocated */
static kmem_cache_t *dnode_cache;

/* all-zero template, compared against dn_phys in dnode_allocate() */
static dnode_phys_t dnode_phys_zero;

/* default data and indirect block shifts for newly allocated objects */
int zfs_default_bs = SPA_MINBLOCKSHIFT;
int zfs_default_ibs = DN_MAX_INDBLKSHIFT;

#ifdef	_KERNEL
/* kmem cache relocation callback, registered in dnode_init() */
static kmem_cbrc_t dnode_move(void *, void *, size_t, void *);
#endif	/* _KERNEL */
87744947dcSTom Erickson 
880f6d88adSAlex Reece static int
890f6d88adSAlex Reece dbuf_compare(const void *x1, const void *x2)
900f6d88adSAlex Reece {
910f6d88adSAlex Reece 	const dmu_buf_impl_t *d1 = x1;
920f6d88adSAlex Reece 	const dmu_buf_impl_t *d2 = x2;
930f6d88adSAlex Reece 
94c4ab0d3fSGvozden Neskovic 	int cmp = AVL_CMP(d1->db_level, d2->db_level);
95c4ab0d3fSGvozden Neskovic 	if (likely(cmp))
96c4ab0d3fSGvozden Neskovic 		return (cmp);
970f6d88adSAlex Reece 
98c4ab0d3fSGvozden Neskovic 	cmp = AVL_CMP(d1->db_blkid, d2->db_blkid);
99c4ab0d3fSGvozden Neskovic 	if (likely(cmp))
100c4ab0d3fSGvozden Neskovic 		return (cmp);
1010f6d88adSAlex Reece 
102a846f19dSAlex Reece 	if (d1->db_state == DB_SEARCH) {
103a846f19dSAlex Reece 		ASSERT3S(d2->db_state, !=, DB_SEARCH);
1040f6d88adSAlex Reece 		return (-1);
105a846f19dSAlex Reece 	} else if (d2->db_state == DB_SEARCH) {
106a846f19dSAlex Reece 		ASSERT3S(d1->db_state, !=, DB_SEARCH);
10786bb58aeSAlex Reece 		return (1);
10886bb58aeSAlex Reece 	}
10986bb58aeSAlex Reece 
110c4ab0d3fSGvozden Neskovic 	return (AVL_PCMP(d1, d2));
1110f6d88adSAlex Reece }
1120f6d88adSAlex Reece 
/*
 * kmem cache constructor: bring a dnode_t to its pristine "free" state.
 * dnode_dest() asserts this exact state on the way back to the cache, so
 * every field initialized here must be restored before a dnode is freed.
 */
/* ARGSUSED */
static int
dnode_cons(void *arg, void *unused, int kmflag)
{
	dnode_t *dn = arg;
	int i;

	rw_init(&dn->dn_struct_rwlock, NULL, RW_DEFAULT, NULL);
	mutex_init(&dn->dn_mtx, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&dn->dn_dbufs_mtx, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&dn->dn_notxholds, NULL, CV_DEFAULT, NULL);

	/*
	 * Every dbuf has a reference, and dropping a tracked reference is
	 * O(number of references), so don't track dn_holds.
	 */
	zfs_refcount_create_untracked(&dn->dn_holds);
	zfs_refcount_create(&dn->dn_tx_holds);
	list_link_init(&dn->dn_link);

	/* Clear all of the per-txg pending-change arrays. */
	bzero(&dn->dn_next_nblkptr[0], sizeof (dn->dn_next_nblkptr));
	bzero(&dn->dn_next_nlevels[0], sizeof (dn->dn_next_nlevels));
	bzero(&dn->dn_next_indblkshift[0], sizeof (dn->dn_next_indblkshift));
	bzero(&dn->dn_next_bonustype[0], sizeof (dn->dn_next_bonustype));
	bzero(&dn->dn_rm_spillblk[0], sizeof (dn->dn_rm_spillblk));
	bzero(&dn->dn_next_bonuslen[0], sizeof (dn->dn_next_bonuslen));
	bzero(&dn->dn_next_blksz[0], sizeof (dn->dn_next_blksz));
	bzero(&dn->dn_next_maxblkid[0], sizeof (dn->dn_next_maxblkid));

	for (i = 0; i < TXG_SIZE; i++) {
		multilist_link_init(&dn->dn_dirty_link[i]);
		dn->dn_free_ranges[i] = NULL;
		list_create(&dn->dn_dirty_records[i],
		    sizeof (dbuf_dirty_record_t),
		    offsetof(dbuf_dirty_record_t, dr_dirty_node));
	}

	dn->dn_allocated_txg = 0;
	dn->dn_free_txg = 0;
	dn->dn_assigned_txg = 0;
	dn->dn_dirty_txg = 0;
	dn->dn_dirtyctx = 0;
	dn->dn_dirtyctx_firstset = NULL;
	dn->dn_bonus = NULL;
	dn->dn_have_spill = B_FALSE;
	dn->dn_zio = NULL;
	dn->dn_oldused = 0;
	dn->dn_oldflags = 0;
	dn->dn_olduid = 0;
	dn->dn_oldgid = 0;
	dn->dn_oldprojid = ZFS_DEFAULT_PROJID;
	dn->dn_newuid = 0;
	dn->dn_newgid = 0;
	dn->dn_newprojid = ZFS_DEFAULT_PROJID;
	dn->dn_id_flags = 0;

	dn->dn_dbufs_count = 0;
	/* The per-dnode dbuf tree is ordered by dbuf_compare(). */
	avl_create(&dn->dn_dbufs, dbuf_compare, sizeof (dmu_buf_impl_t),
	    offsetof(dmu_buf_impl_t, db_link));

	dn->dn_moved = 0;
	return (0);
}
176fa9e4066Sahrens 
/*
 * kmem cache destructor: tear down the locks and lists created by
 * dnode_cons(), and assert that every other field was returned to its
 * constructed (zero/NULL) state before the dnode was freed.
 */
/* ARGSUSED */
static void
dnode_dest(void *arg, void *unused)
{
	int i;
	dnode_t *dn = arg;

	rw_destroy(&dn->dn_struct_rwlock);
	mutex_destroy(&dn->dn_mtx);
	mutex_destroy(&dn->dn_dbufs_mtx);
	cv_destroy(&dn->dn_notxholds);
	zfs_refcount_destroy(&dn->dn_holds);
	zfs_refcount_destroy(&dn->dn_tx_holds);
	ASSERT(!list_link_active(&dn->dn_link));

	for (i = 0; i < TXG_SIZE; i++) {
		/* No dirty state may remain pending for any txg. */
		ASSERT(!multilist_link_active(&dn->dn_dirty_link[i]));
		ASSERT3P(dn->dn_free_ranges[i], ==, NULL);
		list_destroy(&dn->dn_dirty_records[i]);
		ASSERT0(dn->dn_next_nblkptr[i]);
		ASSERT0(dn->dn_next_nlevels[i]);
		ASSERT0(dn->dn_next_indblkshift[i]);
		ASSERT0(dn->dn_next_bonustype[i]);
		ASSERT0(dn->dn_rm_spillblk[i]);
		ASSERT0(dn->dn_next_bonuslen[i]);
		ASSERT0(dn->dn_next_blksz[i]);
		ASSERT0(dn->dn_next_maxblkid[i]);
	}

	ASSERT0(dn->dn_allocated_txg);
	ASSERT0(dn->dn_free_txg);
	ASSERT0(dn->dn_assigned_txg);
	ASSERT0(dn->dn_dirty_txg);
	ASSERT0(dn->dn_dirtyctx);
	ASSERT3P(dn->dn_dirtyctx_firstset, ==, NULL);
	ASSERT3P(dn->dn_bonus, ==, NULL);
	ASSERT(!dn->dn_have_spill);
	ASSERT3P(dn->dn_zio, ==, NULL);
	ASSERT0(dn->dn_oldused);
	ASSERT0(dn->dn_oldflags);
	ASSERT0(dn->dn_olduid);
	ASSERT0(dn->dn_oldgid);
	ASSERT0(dn->dn_oldprojid);
	ASSERT0(dn->dn_newuid);
	ASSERT0(dn->dn_newgid);
	ASSERT0(dn->dn_newprojid);
	ASSERT0(dn->dn_id_flags);

	ASSERT0(dn->dn_dbufs_count);
	avl_destroy(&dn->dn_dbufs);
}
228fa9e4066Sahrens 
/*
 * One-time module initialization: create the dnode kmem cache and, in the
 * kernel, register the dnode_move() relocation callback and publish the
 * dnode statistics kstat.
 */
void
dnode_init(void)
{
	ASSERT(dnode_cache == NULL);
	dnode_cache = kmem_cache_create("dnode_t",
	    sizeof (dnode_t),
	    0, dnode_cons, dnode_dest, NULL, NULL, NULL, 0);
#ifdef	_KERNEL
	kmem_cache_set_move(dnode_cache, dnode_move);

	/* kstat creation can fail; stats simply go unpublished in that case */
	dnode_ksp = kstat_create("zfs", 0, "dnodestats", "misc",
	    KSTAT_TYPE_NAMED, sizeof (dnode_stats) / sizeof (kstat_named_t),
	    KSTAT_FLAG_VIRTUAL);
	if (dnode_ksp != NULL) {
		dnode_ksp->ks_data = &dnode_stats;
		kstat_install(dnode_ksp);
	}
#endif	/* _KERNEL */
}
248fa9e4066Sahrens 
249fa9e4066Sahrens void
250fa9e4066Sahrens dnode_fini(void)
251fa9e4066Sahrens {
25254811da5SToomas Soome 	if (dnode_ksp != NULL) {
25354811da5SToomas Soome 		kstat_delete(dnode_ksp);
25454811da5SToomas Soome 		dnode_ksp = NULL;
25554811da5SToomas Soome 	}
25654811da5SToomas Soome 
257fa9e4066Sahrens 	kmem_cache_destroy(dnode_cache);
258744947dcSTom Erickson 	dnode_cache = NULL;
259fa9e4066Sahrens }
260fa9e4066Sahrens 
261fa9e4066Sahrens 
#ifdef ZFS_DEBUG
/*
 * Debug-only consistency check of an in-core dnode against its on-disk
 * image.  The cheap checks always run; the full set runs only when the
 * ZFS_DEBUG_DNODE_VERIFY flag is set, taking dn_struct_rwlock as reader
 * if the caller does not already hold it as writer.
 */
void
dnode_verify(dnode_t *dn)
{
	int drop_struct_lock = FALSE;

	ASSERT(dn->dn_phys);
	ASSERT(dn->dn_objset);
	ASSERT(dn->dn_handle->dnh_dnode == dn);

	ASSERT(DMU_OT_IS_VALID(dn->dn_phys->dn_type));

	if (!(zfs_flags & ZFS_DEBUG_DNODE_VERIFY))
		return;

	if (!RW_WRITE_HELD(&dn->dn_struct_rwlock)) {
		rw_enter(&dn->dn_struct_rwlock, RW_READER);
		drop_struct_lock = TRUE;
	}
	/* Only allocated (or in-flight) dnodes have meaningful geometry. */
	if (dn->dn_phys->dn_type != DMU_OT_NONE || dn->dn_allocated_txg != 0) {
		int i;
		int max_bonuslen = DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots);
		ASSERT3U(dn->dn_indblkshift, >=, 0);
		ASSERT3U(dn->dn_indblkshift, <=, SPA_MAXBLOCKSHIFT);
		if (dn->dn_datablkshift) {
			ASSERT3U(dn->dn_datablkshift, >=, SPA_MINBLOCKSHIFT);
			ASSERT3U(dn->dn_datablkshift, <=, SPA_MAXBLOCKSHIFT);
			ASSERT3U(1<<dn->dn_datablkshift, ==, dn->dn_datablksz);
		}
		ASSERT3U(dn->dn_nlevels, <=, 30);
		ASSERT(DMU_OT_IS_VALID(dn->dn_type));
		ASSERT3U(dn->dn_nblkptr, >=, 1);
		ASSERT3U(dn->dn_nblkptr, <=, DN_MAX_NBLKPTR);
		ASSERT3U(dn->dn_bonuslen, <=, max_bonuslen);
		ASSERT3U(dn->dn_datablksz, ==,
		    dn->dn_datablkszsec << SPA_MINBLOCKSHIFT);
		/* a non-power-of-2 block size implies a zero block shift */
		ASSERT3U(ISP2(dn->dn_datablksz), ==, dn->dn_datablkshift != 0);
		/* block pointers and bonus buffer must fit in the dnode slots */
		ASSERT3U((dn->dn_nblkptr - 1) * sizeof (blkptr_t) +
		    dn->dn_bonuslen, <=, max_bonuslen);
		for (i = 0; i < TXG_SIZE; i++) {
			ASSERT3U(dn->dn_next_nlevels[i], <=, dn->dn_nlevels);
		}
	}
	if (dn->dn_phys->dn_type != DMU_OT_NONE)
		ASSERT3U(dn->dn_phys->dn_nlevels, <=, dn->dn_nlevels);
	ASSERT(DMU_OBJECT_IS_SPECIAL(dn->dn_object) || dn->dn_dbuf != NULL);
	if (dn->dn_dbuf != NULL) {
		/* dn_phys must point at this object's slot within its dbuf */
		ASSERT3P(dn->dn_phys, ==,
		    (dnode_phys_t *)dn->dn_dbuf->db.db_data +
		    (dn->dn_object % (dn->dn_dbuf->db.db_size >> DNODE_SHIFT)));
	}
	if (drop_struct_lock)
		rw_exit(&dn->dn_struct_rwlock);
}
#endif
317fa9e4066Sahrens 
/*
 * Byteswap a single on-disk dnode in place: the fixed fields, the array
 * of block pointers, the bonus buffer (using its type's swap routine),
 * and the spill block pointer if present.  An unallocated dnode is simply
 * zeroed.
 */
void
dnode_byteswap(dnode_phys_t *dnp)
{
	uint64_t *buf64 = (void*)&dnp->dn_blkptr;
	int i;

	if (dnp->dn_type == DMU_OT_NONE) {
		bzero(dnp, sizeof (dnode_phys_t));
		return;
	}

	dnp->dn_datablkszsec = BSWAP_16(dnp->dn_datablkszsec);
	dnp->dn_bonuslen = BSWAP_16(dnp->dn_bonuslen);
	dnp->dn_extra_slots = BSWAP_8(dnp->dn_extra_slots);
	dnp->dn_maxblkid = BSWAP_64(dnp->dn_maxblkid);
	dnp->dn_used = BSWAP_64(dnp->dn_used);

	/*
	 * dn_nblkptr is only one byte, so it's OK to read it in either
	 * byte order.  We can't read dn_bonuslen.
	 */
	ASSERT(dnp->dn_indblkshift <= SPA_MAXBLOCKSHIFT);
	ASSERT(dnp->dn_nblkptr <= DN_MAX_NBLKPTR);
	/* swap the block pointer array one 64-bit word at a time */
	for (i = 0; i < dnp->dn_nblkptr * sizeof (blkptr_t)/8; i++)
		buf64[i] = BSWAP_64(buf64[i]);

	/*
	 * OK to check dn_bonuslen for zero, because it won't matter if
	 * we have the wrong byte order.  This is necessary because the
	 * dnode dnode is smaller than a regular dnode.
	 */
	if (dnp->dn_bonuslen != 0) {
		/*
		 * Note that the bonus length calculated here may be
		 * longer than the actual bonus buffer.  This is because
		 * we always put the bonus buffer after the last block
		 * pointer (instead of packing it against the end of the
		 * dnode buffer).
		 */
		int off = (dnp->dn_nblkptr-1) * sizeof (blkptr_t);
		int slots = dnp->dn_extra_slots + 1;
		size_t len = DN_SLOTS_TO_BONUSLEN(slots) - off;
		ASSERT(DMU_OT_IS_VALID(dnp->dn_bonustype));
		/* dispatch to the bonus type's own byteswap routine */
		dmu_object_byteswap_t byteswap =
		    DMU_OT_BYTESWAP(dnp->dn_bonustype);
		dmu_ot_byteswap[byteswap].ob_func(dnp->dn_bonus + off, len);
	}

	/* Swap SPILL block if we have one */
	if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR)
		byteswap_uint64_array(DN_SPILL_BLKPTR(dnp), sizeof (blkptr_t));

}
371fa9e4066Sahrens 
372fa9e4066Sahrens void
373fa9e4066Sahrens dnode_buf_byteswap(void *vbuf, size_t size)
374fa9e4066Sahrens {
37554811da5SToomas Soome 	int i = 0;
376fa9e4066Sahrens 
377fa9e4066Sahrens 	ASSERT3U(sizeof (dnode_phys_t), ==, (1<<DNODE_SHIFT));
378fa9e4066Sahrens 	ASSERT((size & (sizeof (dnode_phys_t)-1)) == 0);
379fa9e4066Sahrens 
38054811da5SToomas Soome 	while (i < size) {
38154811da5SToomas Soome 		dnode_phys_t *dnp = (void *)(((char *)vbuf) + i);
38254811da5SToomas Soome 		dnode_byteswap(dnp);
38354811da5SToomas Soome 
38454811da5SToomas Soome 		i += DNODE_MIN_SIZE;
38554811da5SToomas Soome 		if (dnp->dn_type != DMU_OT_NONE)
38654811da5SToomas Soome 			i += dnp->dn_extra_slots * DNODE_MIN_SIZE;
387fa9e4066Sahrens 	}
388fa9e4066Sahrens }
389fa9e4066Sahrens 
3901934e92fSmaybee void
3911934e92fSmaybee dnode_setbonuslen(dnode_t *dn, int newsize, dmu_tx_t *tx)
3921934e92fSmaybee {
393e914ace2STim Schumacher 	ASSERT3U(zfs_refcount_count(&dn->dn_holds), >=, 1);
3941934e92fSmaybee 
3951934e92fSmaybee 	dnode_setdirty(dn, tx);
3961934e92fSmaybee 	rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
39754811da5SToomas Soome 	ASSERT3U(newsize, <=, DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots) -
3981934e92fSmaybee 	    (dn->dn_nblkptr-1) * sizeof (blkptr_t));
3991934e92fSmaybee 	dn->dn_bonuslen = newsize;
4001934e92fSmaybee 	if (newsize == 0)
4011934e92fSmaybee 		dn->dn_next_bonuslen[tx->tx_txg & TXG_MASK] = DN_ZERO_BONUSLEN;
4021934e92fSmaybee 	else
4031934e92fSmaybee 		dn->dn_next_bonuslen[tx->tx_txg & TXG_MASK] = dn->dn_bonuslen;
4041934e92fSmaybee 	rw_exit(&dn->dn_struct_rwlock);
4051934e92fSmaybee }
4061934e92fSmaybee 
4070a586ceaSMark Shellenbaum void
4080a586ceaSMark Shellenbaum dnode_setbonus_type(dnode_t *dn, dmu_object_type_t newtype, dmu_tx_t *tx)
4090a586ceaSMark Shellenbaum {
410e914ace2STim Schumacher 	ASSERT3U(zfs_refcount_count(&dn->dn_holds), >=, 1);
4110a586ceaSMark Shellenbaum 	dnode_setdirty(dn, tx);
4120a586ceaSMark Shellenbaum 	rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
4130a586ceaSMark Shellenbaum 	dn->dn_bonustype = newtype;
4140a586ceaSMark Shellenbaum 	dn->dn_next_bonustype[tx->tx_txg & TXG_MASK] = dn->dn_bonustype;
4150a586ceaSMark Shellenbaum 	rw_exit(&dn->dn_struct_rwlock);
4160a586ceaSMark Shellenbaum }
4170a586ceaSMark Shellenbaum 
4180a586ceaSMark Shellenbaum void
4190a586ceaSMark Shellenbaum dnode_rm_spill(dnode_t *dn, dmu_tx_t *tx)
4200a586ceaSMark Shellenbaum {
421e914ace2STim Schumacher 	ASSERT3U(zfs_refcount_count(&dn->dn_holds), >=, 1);
42206e0070dSMark Shellenbaum 	ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));
4230a586ceaSMark Shellenbaum 	dnode_setdirty(dn, tx);
4240a586ceaSMark Shellenbaum 	dn->dn_rm_spillblk[tx->tx_txg&TXG_MASK] = DN_KILL_SPILLBLK;
4250a586ceaSMark Shellenbaum 	dn->dn_have_spill = B_FALSE;
4260a586ceaSMark Shellenbaum }
4270a586ceaSMark Shellenbaum 
428fa9e4066Sahrens static void
429fa9e4066Sahrens dnode_setdblksz(dnode_t *dn, int size)
430fa9e4066Sahrens {
431fb09f5aaSMadhav Suresh 	ASSERT0(P2PHASE(size, SPA_MINBLOCKSIZE));
432fa9e4066Sahrens 	ASSERT3U(size, <=, SPA_MAXBLOCKSIZE);
433fa9e4066Sahrens 	ASSERT3U(size, >=, SPA_MINBLOCKSIZE);
434fa9e4066Sahrens 	ASSERT3U(size >> SPA_MINBLOCKSHIFT, <,
435fa9e4066Sahrens 	    1<<(sizeof (dn->dn_phys->dn_datablkszsec) * 8));
436fa9e4066Sahrens 	dn->dn_datablksz = size;
437fa9e4066Sahrens 	dn->dn_datablkszsec = size >> SPA_MINBLOCKSHIFT;
438bf16b11eSMatthew Ahrens 	dn->dn_datablkshift = ISP2(size) ? highbit64(size - 1) : 0;
439fa9e4066Sahrens }
440fa9e4066Sahrens 
/*
 * Create an in-core dnode for the on-disk dnode image dnp, which lives in
 * dbuf db (NULL for special objects), and bind it to the given handle.
 * The caller must hold the handle's zrlock.  Statement order here matters:
 * dn_objset is assigned last, under os_lock and after membar_producer(),
 * because a valid dn_objset makes the dnode eligible for dnode_move().
 */
static dnode_t *
dnode_create(objset_t *os, dnode_phys_t *dnp, dmu_buf_impl_t *db,
    uint64_t object, dnode_handle_t *dnh)
{
	dnode_t *dn;

	dn = kmem_cache_alloc(dnode_cache, KM_SLEEP);
#ifdef _KERNEL
	ASSERT(!POINTER_IS_VALID(dn->dn_objset));
#endif /* _KERNEL */
	dn->dn_moved = 0;

	/*
	 * Defer setting dn_objset until the dnode is ready to be a candidate
	 * for the dnode_move() callback.
	 */
	dn->dn_object = object;
	dn->dn_dbuf = db;
	dn->dn_handle = dnh;
	dn->dn_phys = dnp;

	/* Copy the cached geometry out of the on-disk dnode. */
	if (dnp->dn_datablkszsec) {
		dnode_setdblksz(dn, dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT);
	} else {
		dn->dn_datablksz = 0;
		dn->dn_datablkszsec = 0;
		dn->dn_datablkshift = 0;
	}
	dn->dn_indblkshift = dnp->dn_indblkshift;
	dn->dn_nlevels = dnp->dn_nlevels;
	dn->dn_type = dnp->dn_type;
	dn->dn_nblkptr = dnp->dn_nblkptr;
	dn->dn_checksum = dnp->dn_checksum;
	dn->dn_compress = dnp->dn_compress;
	dn->dn_bonustype = dnp->dn_bonustype;
	dn->dn_bonuslen = dnp->dn_bonuslen;
	dn->dn_num_slots = dnp->dn_extra_slots + 1;
	dn->dn_maxblkid = dnp->dn_maxblkid;
	dn->dn_have_spill = ((dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) != 0);
	dn->dn_id_flags = 0;

	dmu_zfetch_init(&dn->dn_zfetch, dn);

	ASSERT(DMU_OT_IS_VALID(dn->dn_phys->dn_type));
	ASSERT(zrl_is_locked(&dnh->dnh_zrlock));
	ASSERT(!DN_SLOT_IS_PTR(dnh->dnh_dnode));

	mutex_enter(&os->os_lock);

	/*
	 * Exclude special dnodes from os_dnodes so an empty os_dnodes
	 * signifies that the special dnodes have no references from
	 * their children (the entries in os_dnodes).  This allows
	 * dnode_destroy() to easily determine if the last child has
	 * been removed and then complete eviction of the objset.
	 */
	if (!DMU_OBJECT_IS_SPECIAL(object))
		list_insert_head(&os->os_dnodes, dn);
	membar_producer();

	/*
	 * Everything else must be valid before assigning dn_objset
	 * makes the dnode eligible for dnode_move().
	 */
	dn->dn_objset = os;

	dnh->dnh_dnode = dn;
	mutex_exit(&os->os_lock);

	arc_space_consume(sizeof (dnode_t), ARC_SPACE_OTHER);

	return (dn);
}
514fa9e4066Sahrens 
/*
 * Caller must be holding the dnode handle, which is released upon return.
 *
 * Unlink the dnode from its objset, reset every field that dnode_dest()
 * will assert on, and free the dnode back to the cache.  If this was the
 * last entry in os_dnodes and the objset is being evicted, finish the
 * objset eviction as well.
 */
static void
dnode_destroy(dnode_t *dn)
{
	objset_t *os = dn->dn_objset;
	boolean_t complete_os_eviction = B_FALSE;

	ASSERT((dn->dn_id_flags & DN_ID_NEW_EXIST) == 0);

	mutex_enter(&os->os_lock);
	/* invalidating dn_objset takes the dnode out of dnode_move()'s view */
	POINTER_INVALIDATE(&dn->dn_objset);
	if (!DMU_OBJECT_IS_SPECIAL(dn->dn_object)) {
		list_remove(&os->os_dnodes, dn);
		complete_os_eviction =
		    list_is_empty(&os->os_dnodes) &&
		    list_link_active(&os->os_evicting_node);
	}
	mutex_exit(&os->os_lock);

	/* the dnode can no longer move, so we can release the handle */
	if (!zrl_is_locked(&dn->dn_handle->dnh_zrlock))
		zrl_remove(&dn->dn_handle->dnh_zrlock);

	dn->dn_allocated_txg = 0;
	dn->dn_free_txg = 0;
	dn->dn_assigned_txg = 0;
	dn->dn_dirty_txg = 0;

	dn->dn_dirtyctx = 0;
	if (dn->dn_dirtyctx_firstset != NULL) {
		kmem_free(dn->dn_dirtyctx_firstset, 1);
		dn->dn_dirtyctx_firstset = NULL;
	}
	if (dn->dn_bonus != NULL) {
		mutex_enter(&dn->dn_bonus->db_mtx);
		dbuf_destroy(dn->dn_bonus);
		dn->dn_bonus = NULL;
	}
	dn->dn_zio = NULL;

	/* restore the state that dnode_dest() expects */
	dn->dn_have_spill = B_FALSE;
	dn->dn_oldused = 0;
	dn->dn_oldflags = 0;
	dn->dn_olduid = 0;
	dn->dn_oldgid = 0;
	dn->dn_oldprojid = ZFS_DEFAULT_PROJID;
	dn->dn_newuid = 0;
	dn->dn_newgid = 0;
	dn->dn_newprojid = ZFS_DEFAULT_PROJID;
	dn->dn_id_flags = 0;

	dmu_zfetch_fini(&dn->dn_zfetch);
	kmem_cache_free(dnode_cache, dn);
	arc_space_return(sizeof (dnode_t), ARC_SPACE_OTHER);

	if (complete_os_eviction)
		dmu_objset_evict_done(os);
}
575fa9e4066Sahrens 
576fa9e4066Sahrens void
577fa9e4066Sahrens dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs,
57854811da5SToomas Soome     dmu_object_type_t bonustype, int bonuslen, int dn_slots, dmu_tx_t *tx)
579fa9e4066Sahrens {
580fa9e4066Sahrens 	int i;
581fa9e4066Sahrens 
58254811da5SToomas Soome 	ASSERT3U(dn_slots, >, 0);
58354811da5SToomas Soome 	ASSERT3U(dn_slots << DNODE_SHIFT, <=,
58454811da5SToomas Soome 	    spa_maxdnodesize(dmu_objset_spa(dn->dn_objset)));
585b5152584SMatthew Ahrens 	ASSERT3U(blocksize, <=,
586b5152584SMatthew Ahrens 	    spa_maxblocksize(dmu_objset_spa(dn->dn_objset)));
587fa9e4066Sahrens 	if (blocksize == 0)
588fa9e4066Sahrens 		blocksize = 1 << zfs_default_bs;
5893b83abddSahrens 	else
5903b83abddSahrens 		blocksize = P2ROUNDUP(blocksize, SPA_MINBLOCKSIZE);
591fa9e4066Sahrens 
592fa9e4066Sahrens 	if (ibs == 0)
593fa9e4066Sahrens 		ibs = zfs_default_ibs;
594fa9e4066Sahrens 
595fa9e4066Sahrens 	ibs = MIN(MAX(ibs, DN_MIN_INDBLKSHIFT), DN_MAX_INDBLKSHIFT);
596fa9e4066Sahrens 
59754811da5SToomas Soome 	dprintf("os=%p obj=%" PRIu64 " txg=%" PRIu64
59854811da5SToomas Soome 	    " blocksize=%d ibs=%d dn_slots=%d\n",
59954811da5SToomas Soome 	    dn->dn_objset, dn->dn_object, tx->tx_txg, blocksize, ibs, dn_slots);
60054811da5SToomas Soome 	DNODE_STAT_BUMP(dnode_allocate);
601fa9e4066Sahrens 
602fa9e4066Sahrens 	ASSERT(dn->dn_type == DMU_OT_NONE);
603fa9e4066Sahrens 	ASSERT(bcmp(dn->dn_phys, &dnode_phys_zero, sizeof (dnode_phys_t)) == 0);
604fa9e4066Sahrens 	ASSERT(dn->dn_phys->dn_type == DMU_OT_NONE);
605fa9e4066Sahrens 	ASSERT(ot != DMU_OT_NONE);
606ad135b5dSChristopher Siden 	ASSERT(DMU_OT_IS_VALID(ot));
607fa9e4066Sahrens 	ASSERT((bonustype == DMU_OT_NONE && bonuslen == 0) ||
6080a586ceaSMark Shellenbaum 	    (bonustype == DMU_OT_SA && bonuslen == 0) ||
609fa9e4066Sahrens 	    (bonustype != DMU_OT_NONE && bonuslen != 0));
610ad135b5dSChristopher Siden 	ASSERT(DMU_OT_IS_VALID(bonustype));
61154811da5SToomas Soome 	ASSERT3U(bonuslen, <=, DN_SLOTS_TO_BONUSLEN(dn_slots));
612fa9e4066Sahrens 	ASSERT(dn->dn_type == DMU_OT_NONE);
613fb09f5aaSMadhav Suresh 	ASSERT0(dn->dn_maxblkid);
614fb09f5aaSMadhav Suresh 	ASSERT0(dn->dn_allocated_txg);
615aa02ea01STom Caputi 	ASSERT0(dn->dn_dirty_txg);
616fb09f5aaSMadhav Suresh 	ASSERT0(dn->dn_assigned_txg);
617e914ace2STim Schumacher 	ASSERT(zfs_refcount_is_zero(&dn->dn_tx_holds));
618e914ace2STim Schumacher 	ASSERT3U(zfs_refcount_count(&dn->dn_holds), <=, 1);
6190f6d88adSAlex Reece 	ASSERT(avl_is_empty(&dn->dn_dbufs));
620fa9e4066Sahrens 
621fa9e4066Sahrens 	for (i = 0; i < TXG_SIZE; i++) {
622fb09f5aaSMadhav Suresh 		ASSERT0(dn->dn_next_nblkptr[i]);
623fb09f5aaSMadhav Suresh 		ASSERT0(dn->dn_next_nlevels[i]);
624fb09f5aaSMadhav Suresh 		ASSERT0(dn->dn_next_indblkshift[i]);
625fb09f5aaSMadhav Suresh 		ASSERT0(dn->dn_next_bonuslen[i]);
626fb09f5aaSMadhav Suresh 		ASSERT0(dn->dn_next_bonustype[i]);
627fb09f5aaSMadhav Suresh 		ASSERT0(dn->dn_rm_spillblk[i]);
628fb09f5aaSMadhav Suresh 		ASSERT0(dn->dn_next_blksz[i]);
629eb633035STom Caputi 		ASSERT0(dn->dn_next_maxblkid[i]);
630aa02ea01STom Caputi 		ASSERT(!multilist_link_active(&dn->dn_dirty_link[i]));
631c717a561Smaybee 		ASSERT3P(list_head(&dn->dn_dirty_records[i]), ==, NULL);
632bf16b11eSMatthew Ahrens 		ASSERT3P(dn->dn_free_ranges[i], ==, NULL);
633fa9e4066Sahrens 	}
634fa9e4066Sahrens 
635fa9e4066Sahrens 	dn->dn_type = ot;
636fa9e4066Sahrens 	dnode_setdblksz(dn, blocksize);
637fa9e4066Sahrens 	dn->dn_indblkshift = ibs;
638fa9e4066Sahrens 	dn->dn_nlevels = 1;
63954811da5SToomas Soome 	dn->dn_num_slots = dn_slots;
6400a586ceaSMark Shellenbaum 	if (bonustype == DMU_OT_SA) /* Maximize bonus space for SA */
6410a586ceaSMark Shellenbaum 		dn->dn_nblkptr = 1;
64254811da5SToomas Soome 	else {
64354811da5SToomas Soome 		dn->dn_nblkptr = MIN(DN_MAX_NBLKPTR,
64454811da5SToomas Soome 		    1 + ((DN_SLOTS_TO_BONUSLEN(dn_slots) - bonuslen) >>
64554811da5SToomas Soome 		    SPA_BLKPTRSHIFT));
64654811da5SToomas Soome 	}
64754811da5SToomas Soome 
648fa9e4066Sahrens 	dn->dn_bonustype = bonustype;
649fa9e4066Sahrens 	dn->dn_bonuslen = bonuslen;
650fa9e4066Sahrens 	dn->dn_checksum = ZIO_CHECKSUM_INHERIT;
651fa9e4066Sahrens 	dn->dn_compress = ZIO_COMPRESS_INHERIT;
652fa9e4066Sahrens 	dn->dn_dirtyctx = 0;
653fa9e4066Sahrens 
654fa9e4066Sahrens 	dn->dn_free_txg = 0;
655fa9e4066Sahrens 	if (dn->dn_dirtyctx_firstset) {
656fa9e4066Sahrens 		kmem_free(dn->dn_dirtyctx_firstset, 1);
657fa9e4066Sahrens 		dn->dn_dirtyctx_firstset = NULL;
658fa9e4066Sahrens 	}
659fa9e4066Sahrens 
660fa9e4066Sahrens 	dn->dn_allocated_txg = tx->tx_txg;
6610a586ceaSMark Shellenbaum 	dn->dn_id_flags = 0;
662f676ed34Sahrens 
663fa9e4066Sahrens 	dnode_setdirty(dn, tx);
664f676ed34Sahrens 	dn->dn_next_indblkshift[tx->tx_txg & TXG_MASK] = ibs;
6651934e92fSmaybee 	dn->dn_next_bonuslen[tx->tx_txg & TXG_MASK] = dn->dn_bonuslen;
6660a586ceaSMark Shellenbaum 	dn->dn_next_bonustype[tx->tx_txg & TXG_MASK] = dn->dn_bonustype;
667f676ed34Sahrens 	dn->dn_next_blksz[tx->tx_txg & TXG_MASK] = dn->dn_datablksz;
668fa9e4066Sahrens }
669fa9e4066Sahrens 
670fa9e4066Sahrens void
671fa9e4066Sahrens dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize,
672eb633035STom Caputi     dmu_object_type_t bonustype, int bonuslen, int dn_slots,
673eb633035STom Caputi     boolean_t keep_spill, dmu_tx_t *tx)
674fa9e4066Sahrens {
6752bf405a2SMark Maybee 	int nblkptr;
676c543ec06Sahrens 
677fa9e4066Sahrens 	ASSERT3U(blocksize, >=, SPA_MINBLOCKSIZE);
678b5152584SMatthew Ahrens 	ASSERT3U(blocksize, <=,
679b5152584SMatthew Ahrens 	    spa_maxblocksize(dmu_objset_spa(dn->dn_objset)));
680fb09f5aaSMadhav Suresh 	ASSERT0(blocksize % SPA_MINBLOCKSIZE);
681ea8dc4b6Seschrock 	ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT || dmu_tx_private_ok(tx));
682fa9e4066Sahrens 	ASSERT(tx->tx_txg != 0);
683fa9e4066Sahrens 	ASSERT((bonustype == DMU_OT_NONE && bonuslen == 0) ||
68406e0070dSMark Shellenbaum 	    (bonustype != DMU_OT_NONE && bonuslen != 0) ||
68506e0070dSMark Shellenbaum 	    (bonustype == DMU_OT_SA && bonuslen == 0));
686ad135b5dSChristopher Siden 	ASSERT(DMU_OT_IS_VALID(bonustype));
68754811da5SToomas Soome 	ASSERT3U(bonuslen, <=,
68854811da5SToomas Soome 	    DN_BONUS_SIZE(spa_maxdnodesize(dmu_objset_spa(dn->dn_objset))));
689946342a2SFabian Grünbichler 	ASSERT3U(bonuslen, <=, DN_BONUS_SIZE(dn_slots << DNODE_SHIFT));
69054811da5SToomas Soome 
69154811da5SToomas Soome 	dnode_free_interior_slots(dn);
69254811da5SToomas Soome 	DNODE_STAT_BUMP(dnode_reallocate);
693c543ec06Sahrens 
694ea8dc4b6Seschrock 	/* clean up any unreferenced dbufs */
6951934e92fSmaybee 	dnode_evict_dbufs(dn);
696da03de99SMark Maybee 
69728d97a71SMark Shellenbaum 	dn->dn_id_flags = 0;
69828d97a71SMark Shellenbaum 
699fa9e4066Sahrens 	rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
700fa9e4066Sahrens 	dnode_setdirty(dn, tx);
7012bf405a2SMark Maybee 	if (dn->dn_datablksz != blocksize) {
7022bf405a2SMark Maybee 		/* change blocksize */
7032bf405a2SMark Maybee 		ASSERT(dn->dn_maxblkid == 0 &&
7042bf405a2SMark Maybee 		    (BP_IS_HOLE(&dn->dn_phys->dn_blkptr[0]) ||
7052bf405a2SMark Maybee 		    dnode_block_freed(dn, 0)));
7062bf405a2SMark Maybee 		dnode_setdblksz(dn, blocksize);
7072bf405a2SMark Maybee 		dn->dn_next_blksz[tx->tx_txg&TXG_MASK] = blocksize;
7082bf405a2SMark Maybee 	}
7092bf405a2SMark Maybee 	if (dn->dn_bonuslen != bonuslen)
7102bf405a2SMark Maybee 		dn->dn_next_bonuslen[tx->tx_txg&TXG_MASK] = bonuslen;
71106e0070dSMark Shellenbaum 
71206e0070dSMark Shellenbaum 	if (bonustype == DMU_OT_SA) /* Maximize bonus space for SA */
71306e0070dSMark Shellenbaum 		nblkptr = 1;
71406e0070dSMark Shellenbaum 	else
71554811da5SToomas Soome 		nblkptr = MIN(DN_MAX_NBLKPTR,
71654811da5SToomas Soome 		    1 + ((DN_SLOTS_TO_BONUSLEN(dn_slots) - bonuslen) >>
71754811da5SToomas Soome 		    SPA_BLKPTRSHIFT));
7180a586ceaSMark Shellenbaum 	if (dn->dn_bonustype != bonustype)
7190a586ceaSMark Shellenbaum 		dn->dn_next_bonustype[tx->tx_txg&TXG_MASK] = bonustype;
720da03de99SMark Maybee 	if (dn->dn_nblkptr != nblkptr)
721da03de99SMark Maybee 		dn->dn_next_nblkptr[tx->tx_txg&TXG_MASK] = nblkptr;
722eb633035STom Caputi 	if (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR && !keep_spill) {
72306e0070dSMark Shellenbaum 		dbuf_rm_spill(dn, tx);
72406e0070dSMark Shellenbaum 		dnode_rm_spill(dn, tx);
7250a586ceaSMark Shellenbaum 	}
726fa9e4066Sahrens 	rw_exit(&dn->dn_struct_rwlock);
727fa9e4066Sahrens 
728fa9e4066Sahrens 	/* change type */
729fa9e4066Sahrens 	dn->dn_type = ot;
730fa9e4066Sahrens 
731fa9e4066Sahrens 	/* change bonus size and type */
732fa9e4066Sahrens 	mutex_enter(&dn->dn_mtx);
733fa9e4066Sahrens 	dn->dn_bonustype = bonustype;
734fa9e4066Sahrens 	dn->dn_bonuslen = bonuslen;
73554811da5SToomas Soome 	dn->dn_num_slots = dn_slots;
736da03de99SMark Maybee 	dn->dn_nblkptr = nblkptr;
737fa9e4066Sahrens 	dn->dn_checksum = ZIO_CHECKSUM_INHERIT;
738fa9e4066Sahrens 	dn->dn_compress = ZIO_COMPRESS_INHERIT;
739fa9e4066Sahrens 	ASSERT3U(dn->dn_nblkptr, <=, DN_MAX_NBLKPTR);
740fa9e4066Sahrens 
741da03de99SMark Maybee 	/* fix up the bonus db_size */
742da03de99SMark Maybee 	if (dn->dn_bonus) {
7431934e92fSmaybee 		dn->dn_bonus->db.db_size =
74454811da5SToomas Soome 		    DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots) -
74554811da5SToomas Soome 		    (dn->dn_nblkptr - 1) * sizeof (blkptr_t);
7461934e92fSmaybee 		ASSERT(dn->dn_bonuslen <= dn->dn_bonus->db.db_size);
7471934e92fSmaybee 	}
748432f72fdSahrens 
749fa9e4066Sahrens 	dn->dn_allocated_txg = tx->tx_txg;
750fa9e4066Sahrens 	mutex_exit(&dn->dn_mtx);
751fa9e4066Sahrens }
752fa9e4066Sahrens 
753f06dce2cSAndrew Stormont #ifdef	_KERNEL
754744947dcSTom Erickson static void
755744947dcSTom Erickson dnode_move_impl(dnode_t *odn, dnode_t *ndn)
756744947dcSTom Erickson {
757744947dcSTom Erickson 	int i;
758744947dcSTom Erickson 
759744947dcSTom Erickson 	ASSERT(!RW_LOCK_HELD(&odn->dn_struct_rwlock));
760744947dcSTom Erickson 	ASSERT(MUTEX_NOT_HELD(&odn->dn_mtx));
761744947dcSTom Erickson 	ASSERT(MUTEX_NOT_HELD(&odn->dn_dbufs_mtx));
762744947dcSTom Erickson 	ASSERT(!RW_LOCK_HELD(&odn->dn_zfetch.zf_rwlock));
763744947dcSTom Erickson 
764744947dcSTom Erickson 	/* Copy fields. */
765744947dcSTom Erickson 	ndn->dn_objset = odn->dn_objset;
766744947dcSTom Erickson 	ndn->dn_object = odn->dn_object;
767744947dcSTom Erickson 	ndn->dn_dbuf = odn->dn_dbuf;
768744947dcSTom Erickson 	ndn->dn_handle = odn->dn_handle;
769744947dcSTom Erickson 	ndn->dn_phys = odn->dn_phys;
770744947dcSTom Erickson 	ndn->dn_type = odn->dn_type;
771744947dcSTom Erickson 	ndn->dn_bonuslen = odn->dn_bonuslen;
772744947dcSTom Erickson 	ndn->dn_bonustype = odn->dn_bonustype;
773744947dcSTom Erickson 	ndn->dn_nblkptr = odn->dn_nblkptr;
774744947dcSTom Erickson 	ndn->dn_checksum = odn->dn_checksum;
775744947dcSTom Erickson 	ndn->dn_compress = odn->dn_compress;
776744947dcSTom Erickson 	ndn->dn_nlevels = odn->dn_nlevels;
777744947dcSTom Erickson 	ndn->dn_indblkshift = odn->dn_indblkshift;
778744947dcSTom Erickson 	ndn->dn_datablkshift = odn->dn_datablkshift;
779744947dcSTom Erickson 	ndn->dn_datablkszsec = odn->dn_datablkszsec;
780744947dcSTom Erickson 	ndn->dn_datablksz = odn->dn_datablksz;
781744947dcSTom Erickson 	ndn->dn_maxblkid = odn->dn_maxblkid;
78254811da5SToomas Soome 	ndn->dn_num_slots = odn->dn_num_slots;
783c7fbe46dSMatthew Ahrens 	bcopy(&odn->dn_next_type[0], &ndn->dn_next_type[0],
784c7fbe46dSMatthew Ahrens 	    sizeof (odn->dn_next_type));
785744947dcSTom Erickson 	bcopy(&odn->dn_next_nblkptr[0], &ndn->dn_next_nblkptr[0],
786744947dcSTom Erickson 	    sizeof (odn->dn_next_nblkptr));
787744947dcSTom Erickson 	bcopy(&odn->dn_next_nlevels[0], &ndn->dn_next_nlevels[0],
788744947dcSTom Erickson 	    sizeof (odn->dn_next_nlevels));
789744947dcSTom Erickson 	bcopy(&odn->dn_next_indblkshift[0], &ndn->dn_next_indblkshift[0],
790744947dcSTom Erickson 	    sizeof (odn->dn_next_indblkshift));
791744947dcSTom Erickson 	bcopy(&odn->dn_next_bonustype[0], &ndn->dn_next_bonustype[0],
792744947dcSTom Erickson 	    sizeof (odn->dn_next_bonustype));
793744947dcSTom Erickson 	bcopy(&odn->dn_rm_spillblk[0], &ndn->dn_rm_spillblk[0],
794744947dcSTom Erickson 	    sizeof (odn->dn_rm_spillblk));
795744947dcSTom Erickson 	bcopy(&odn->dn_next_bonuslen[0], &ndn->dn_next_bonuslen[0],
796744947dcSTom Erickson 	    sizeof (odn->dn_next_bonuslen));
797744947dcSTom Erickson 	bcopy(&odn->dn_next_blksz[0], &ndn->dn_next_blksz[0],
798744947dcSTom Erickson 	    sizeof (odn->dn_next_blksz));
799eb633035STom Caputi 	bcopy(&odn->dn_next_maxblkid[0], &ndn->dn_next_maxblkid[0],
800eb633035STom Caputi 	    sizeof (odn->dn_next_maxblkid));
801744947dcSTom Erickson 	for (i = 0; i < TXG_SIZE; i++) {
802744947dcSTom Erickson 		list_move_tail(&ndn->dn_dirty_records[i],
803744947dcSTom Erickson 		    &odn->dn_dirty_records[i]);
804744947dcSTom Erickson 	}
805bf16b11eSMatthew Ahrens 	bcopy(&odn->dn_free_ranges[0], &ndn->dn_free_ranges[0],
806bf16b11eSMatthew Ahrens 	    sizeof (odn->dn_free_ranges));
807744947dcSTom Erickson 	ndn->dn_allocated_txg = odn->dn_allocated_txg;
808744947dcSTom Erickson 	ndn->dn_free_txg = odn->dn_free_txg;
809744947dcSTom Erickson 	ndn->dn_assigned_txg = odn->dn_assigned_txg;
810aa02ea01STom Caputi 	ndn->dn_dirty_txg = odn->dn_dirty_txg;
811744947dcSTom Erickson 	ndn->dn_dirtyctx = odn->dn_dirtyctx;
812744947dcSTom Erickson 	ndn->dn_dirtyctx_firstset = odn->dn_dirtyctx_firstset;
813e914ace2STim Schumacher 	ASSERT(zfs_refcount_count(&odn->dn_tx_holds) == 0);
814e914ace2STim Schumacher 	zfs_refcount_transfer(&ndn->dn_holds, &odn->dn_holds);
8150f6d88adSAlex Reece 	ASSERT(avl_is_empty(&ndn->dn_dbufs));
8160f6d88adSAlex Reece 	avl_swap(&ndn->dn_dbufs, &odn->dn_dbufs);
817744947dcSTom Erickson 	ndn->dn_dbufs_count = odn->dn_dbufs_count;
818744947dcSTom Erickson 	ndn->dn_bonus = odn->dn_bonus;
819744947dcSTom Erickson 	ndn->dn_have_spill = odn->dn_have_spill;
820744947dcSTom Erickson 	ndn->dn_zio = odn->dn_zio;
821744947dcSTom Erickson 	ndn->dn_oldused = odn->dn_oldused;
822744947dcSTom Erickson 	ndn->dn_oldflags = odn->dn_oldflags;
823744947dcSTom Erickson 	ndn->dn_olduid = odn->dn_olduid;
824744947dcSTom Erickson 	ndn->dn_oldgid = odn->dn_oldgid;
825*f67950b2SNasf-Fan 	ndn->dn_oldprojid = odn->dn_oldprojid;
826744947dcSTom Erickson 	ndn->dn_newuid = odn->dn_newuid;
827744947dcSTom Erickson 	ndn->dn_newgid = odn->dn_newgid;
828*f67950b2SNasf-Fan 	ndn->dn_newprojid = odn->dn_newprojid;
829744947dcSTom Erickson 	ndn->dn_id_flags = odn->dn_id_flags;
830744947dcSTom Erickson 	dmu_zfetch_init(&ndn->dn_zfetch, NULL);
831744947dcSTom Erickson 	list_move_tail(&ndn->dn_zfetch.zf_stream, &odn->dn_zfetch.zf_stream);
832744947dcSTom Erickson 	ndn->dn_zfetch.zf_dnode = odn->dn_zfetch.zf_dnode;
833744947dcSTom Erickson 
834744947dcSTom Erickson 	/*
835744947dcSTom Erickson 	 * Update back pointers. Updating the handle fixes the back pointer of
836744947dcSTom Erickson 	 * every descendant dbuf as well as the bonus dbuf.
837744947dcSTom Erickson 	 */
838744947dcSTom Erickson 	ASSERT(ndn->dn_handle->dnh_dnode == odn);
839744947dcSTom Erickson 	ndn->dn_handle->dnh_dnode = ndn;
840744947dcSTom Erickson 	if (ndn->dn_zfetch.zf_dnode == odn) {
841744947dcSTom Erickson 		ndn->dn_zfetch.zf_dnode = ndn;
842744947dcSTom Erickson 	}
843744947dcSTom Erickson 
844744947dcSTom Erickson 	/*
845744947dcSTom Erickson 	 * Invalidate the original dnode by clearing all of its back pointers.
846744947dcSTom Erickson 	 */
847744947dcSTom Erickson 	odn->dn_dbuf = NULL;
848744947dcSTom Erickson 	odn->dn_handle = NULL;
8490f6d88adSAlex Reece 	avl_create(&odn->dn_dbufs, dbuf_compare, sizeof (dmu_buf_impl_t),
850744947dcSTom Erickson 	    offsetof(dmu_buf_impl_t, db_link));
851744947dcSTom Erickson 	odn->dn_dbufs_count = 0;
852744947dcSTom Erickson 	odn->dn_bonus = NULL;
853744947dcSTom Erickson 	odn->dn_zfetch.zf_dnode = NULL;
854744947dcSTom Erickson 
855744947dcSTom Erickson 	/*
856744947dcSTom Erickson 	 * Set the low bit of the objset pointer to ensure that dnode_move()
857744947dcSTom Erickson 	 * recognizes the dnode as invalid in any subsequent callback.
858744947dcSTom Erickson 	 */
859744947dcSTom Erickson 	POINTER_INVALIDATE(&odn->dn_objset);
860744947dcSTom Erickson 
861744947dcSTom Erickson 	/*
862744947dcSTom Erickson 	 * Satisfy the destructor.
863744947dcSTom Erickson 	 */
864744947dcSTom Erickson 	for (i = 0; i < TXG_SIZE; i++) {
865744947dcSTom Erickson 		list_create(&odn->dn_dirty_records[i],
866744947dcSTom Erickson 		    sizeof (dbuf_dirty_record_t),
867744947dcSTom Erickson 		    offsetof(dbuf_dirty_record_t, dr_dirty_node));
868bf16b11eSMatthew Ahrens 		odn->dn_free_ranges[i] = NULL;
869744947dcSTom Erickson 		odn->dn_next_nlevels[i] = 0;
870744947dcSTom Erickson 		odn->dn_next_indblkshift[i] = 0;
871744947dcSTom Erickson 		odn->dn_next_bonustype[i] = 0;
872744947dcSTom Erickson 		odn->dn_rm_spillblk[i] = 0;
873744947dcSTom Erickson 		odn->dn_next_bonuslen[i] = 0;
874744947dcSTom Erickson 		odn->dn_next_blksz[i] = 0;
875744947dcSTom Erickson 	}
876744947dcSTom Erickson 	odn->dn_allocated_txg = 0;
877744947dcSTom Erickson 	odn->dn_free_txg = 0;
878744947dcSTom Erickson 	odn->dn_assigned_txg = 0;
879aa02ea01STom Caputi 	odn->dn_dirty_txg = 0;
880744947dcSTom Erickson 	odn->dn_dirtyctx = 0;
881744947dcSTom Erickson 	odn->dn_dirtyctx_firstset = NULL;
882744947dcSTom Erickson 	odn->dn_have_spill = B_FALSE;
883744947dcSTom Erickson 	odn->dn_zio = NULL;
884744947dcSTom Erickson 	odn->dn_oldused = 0;
885744947dcSTom Erickson 	odn->dn_oldflags = 0;
886744947dcSTom Erickson 	odn->dn_olduid = 0;
887744947dcSTom Erickson 	odn->dn_oldgid = 0;
888*f67950b2SNasf-Fan 	odn->dn_oldprojid = ZFS_DEFAULT_PROJID;
889744947dcSTom Erickson 	odn->dn_newuid = 0;
890744947dcSTom Erickson 	odn->dn_newgid = 0;
891*f67950b2SNasf-Fan 	odn->dn_newprojid = ZFS_DEFAULT_PROJID;
892744947dcSTom Erickson 	odn->dn_id_flags = 0;
893744947dcSTom Erickson 
894744947dcSTom Erickson 	/*
895744947dcSTom Erickson 	 * Mark the dnode.
896744947dcSTom Erickson 	 */
897744947dcSTom Erickson 	ndn->dn_moved = 1;
898744947dcSTom Erickson 	odn->dn_moved = (uint8_t)-1;
899744947dcSTom Erickson }
900744947dcSTom Erickson 
901744947dcSTom Erickson /*ARGSUSED*/
902744947dcSTom Erickson static kmem_cbrc_t
903744947dcSTom Erickson dnode_move(void *buf, void *newbuf, size_t size, void *arg)
904744947dcSTom Erickson {
905744947dcSTom Erickson 	dnode_t *odn = buf, *ndn = newbuf;
906744947dcSTom Erickson 	objset_t *os;
907744947dcSTom Erickson 	int64_t refcount;
908744947dcSTom Erickson 	uint32_t dbufs;
909744947dcSTom Erickson 
910744947dcSTom Erickson 	/*
911744947dcSTom Erickson 	 * The dnode is on the objset's list of known dnodes if the objset
912744947dcSTom Erickson 	 * pointer is valid. We set the low bit of the objset pointer when
913744947dcSTom Erickson 	 * freeing the dnode to invalidate it, and the memory patterns written
914744947dcSTom Erickson 	 * by kmem (baddcafe and deadbeef) set at least one of the two low bits.
915744947dcSTom Erickson 	 * A newly created dnode sets the objset pointer last of all to indicate
916744947dcSTom Erickson 	 * that the dnode is known and in a valid state to be moved by this
917744947dcSTom Erickson 	 * function.
918744947dcSTom Erickson 	 */
919744947dcSTom Erickson 	os = odn->dn_objset;
920744947dcSTom Erickson 	if (!POINTER_IS_VALID(os)) {
92154811da5SToomas Soome 		DNODE_STAT_BUMP(dnode_move_invalid);
922744947dcSTom Erickson 		return (KMEM_CBRC_DONT_KNOW);
923744947dcSTom Erickson 	}
924744947dcSTom Erickson 
925744947dcSTom Erickson 	/*
926744947dcSTom Erickson 	 * Ensure that the objset does not go away during the move.
927744947dcSTom Erickson 	 */
928744947dcSTom Erickson 	rw_enter(&os_lock, RW_WRITER);
929744947dcSTom Erickson 	if (os != odn->dn_objset) {
930744947dcSTom Erickson 		rw_exit(&os_lock);
93154811da5SToomas Soome 		DNODE_STAT_BUMP(dnode_move_recheck1);
932744947dcSTom Erickson 		return (KMEM_CBRC_DONT_KNOW);
933744947dcSTom Erickson 	}
934744947dcSTom Erickson 
935744947dcSTom Erickson 	/*
936744947dcSTom Erickson 	 * If the dnode is still valid, then so is the objset. We know that no
937744947dcSTom Erickson 	 * valid objset can be freed while we hold os_lock, so we can safely
938744947dcSTom Erickson 	 * ensure that the objset remains in use.
939744947dcSTom Erickson 	 */
940744947dcSTom Erickson 	mutex_enter(&os->os_lock);
941744947dcSTom Erickson 
942744947dcSTom Erickson 	/*
943744947dcSTom Erickson 	 * Recheck the objset pointer in case the dnode was removed just before
944744947dcSTom Erickson 	 * acquiring the lock.
945744947dcSTom Erickson 	 */
946744947dcSTom Erickson 	if (os != odn->dn_objset) {
947744947dcSTom Erickson 		mutex_exit(&os->os_lock);
948744947dcSTom Erickson 		rw_exit(&os_lock);
94954811da5SToomas Soome 		DNODE_STAT_BUMP(dnode_move_recheck2);
950744947dcSTom Erickson 		return (KMEM_CBRC_DONT_KNOW);
951744947dcSTom Erickson 	}
952744947dcSTom Erickson 
953744947dcSTom Erickson 	/*
954744947dcSTom Erickson 	 * At this point we know that as long as we hold os->os_lock, the dnode
955744947dcSTom Erickson 	 * cannot be freed and fields within the dnode can be safely accessed.
956744947dcSTom Erickson 	 * The objset listing this dnode cannot go away as long as this dnode is
957744947dcSTom Erickson 	 * on its list.
958744947dcSTom Erickson 	 */
959744947dcSTom Erickson 	rw_exit(&os_lock);
960744947dcSTom Erickson 	if (DMU_OBJECT_IS_SPECIAL(odn->dn_object)) {
961744947dcSTom Erickson 		mutex_exit(&os->os_lock);
96254811da5SToomas Soome 		DNODE_STAT_BUMP(dnode_move_special);
963744947dcSTom Erickson 		return (KMEM_CBRC_NO);
964744947dcSTom Erickson 	}
965744947dcSTom Erickson 	ASSERT(odn->dn_dbuf != NULL); /* only "special" dnodes have no parent */
966744947dcSTom Erickson 
967744947dcSTom Erickson 	/*
968744947dcSTom Erickson 	 * Lock the dnode handle to prevent the dnode from obtaining any new
969744947dcSTom Erickson 	 * holds. This also prevents the descendant dbufs and the bonus dbuf
970744947dcSTom Erickson 	 * from accessing the dnode, so that we can discount their holds. The
971744947dcSTom Erickson 	 * handle is safe to access because we know that while the dnode cannot
972744947dcSTom Erickson 	 * go away, neither can its handle. Once we hold dnh_zrlock, we can
973744947dcSTom Erickson 	 * safely move any dnode referenced only by dbufs.
974744947dcSTom Erickson 	 */
975744947dcSTom Erickson 	if (!zrl_tryenter(&odn->dn_handle->dnh_zrlock)) {
976744947dcSTom Erickson 		mutex_exit(&os->os_lock);
97754811da5SToomas Soome 		DNODE_STAT_BUMP(dnode_move_handle);
978744947dcSTom Erickson 		return (KMEM_CBRC_LATER);
979744947dcSTom Erickson 	}
980744947dcSTom Erickson 
981744947dcSTom Erickson 	/*
982744947dcSTom Erickson 	 * Ensure a consistent view of the dnode's holds and the dnode's dbufs.
983744947dcSTom Erickson 	 * We need to guarantee that there is a hold for every dbuf in order to
984744947dcSTom Erickson 	 * determine whether the dnode is actively referenced. Falsely matching
985744947dcSTom Erickson 	 * a dbuf to an active hold would lead to an unsafe move. It's possible
986744947dcSTom Erickson 	 * that a thread already having an active dnode hold is about to add a
987744947dcSTom Erickson 	 * dbuf, and we can't compare hold and dbuf counts while the add is in
988744947dcSTom Erickson 	 * progress.
989744947dcSTom Erickson 	 */
990744947dcSTom Erickson 	if (!rw_tryenter(&odn->dn_struct_rwlock, RW_WRITER)) {
991744947dcSTom Erickson 		zrl_exit(&odn->dn_handle->dnh_zrlock);
992744947dcSTom Erickson 		mutex_exit(&os->os_lock);
99354811da5SToomas Soome 		DNODE_STAT_BUMP(dnode_move_rwlock);
994744947dcSTom Erickson 		return (KMEM_CBRC_LATER);
995744947dcSTom Erickson 	}
996744947dcSTom Erickson 
997744947dcSTom Erickson 	/*
998744947dcSTom Erickson 	 * A dbuf may be removed (evicted) without an active dnode hold. In that
999744947dcSTom Erickson 	 * case, the dbuf count is decremented under the handle lock before the
1000744947dcSTom Erickson 	 * dbuf's hold is released. This order ensures that if we count the hold
1001744947dcSTom Erickson 	 * after the dbuf is removed but before its hold is released, we will
1002744947dcSTom Erickson 	 * treat the unmatched hold as active and exit safely. If we count the
1003744947dcSTom Erickson 	 * hold before the dbuf is removed, the hold is discounted, and the
1004744947dcSTom Erickson 	 * removal is blocked until the move completes.
1005744947dcSTom Erickson 	 */
1006e914ace2STim Schumacher 	refcount = zfs_refcount_count(&odn->dn_holds);
1007744947dcSTom Erickson 	ASSERT(refcount >= 0);
1008744947dcSTom Erickson 	dbufs = odn->dn_dbufs_count;
1009744947dcSTom Erickson 
1010744947dcSTom Erickson 	/* We can't have more dbufs than dnode holds. */
1011744947dcSTom Erickson 	ASSERT3U(dbufs, <=, refcount);
1012744947dcSTom Erickson 	DTRACE_PROBE3(dnode__move, dnode_t *, odn, int64_t, refcount,
1013744947dcSTom Erickson 	    uint32_t, dbufs);
1014744947dcSTom Erickson 
1015744947dcSTom Erickson 	if (refcount > dbufs) {
1016744947dcSTom Erickson 		rw_exit(&odn->dn_struct_rwlock);
1017744947dcSTom Erickson 		zrl_exit(&odn->dn_handle->dnh_zrlock);
1018744947dcSTom Erickson 		mutex_exit(&os->os_lock);
101954811da5SToomas Soome 		DNODE_STAT_BUMP(dnode_move_active);
1020744947dcSTom Erickson 		return (KMEM_CBRC_LATER);
1021744947dcSTom Erickson 	}
1022744947dcSTom Erickson 
1023744947dcSTom Erickson 	rw_exit(&odn->dn_struct_rwlock);
1024744947dcSTom Erickson 
1025744947dcSTom Erickson 	/*
1026744947dcSTom Erickson 	 * At this point we know that anyone with a hold on the dnode is not
1027744947dcSTom Erickson 	 * actively referencing it. The dnode is known and in a valid state to
1028744947dcSTom Erickson 	 * move. We're holding the locks needed to execute the critical section.
1029744947dcSTom Erickson 	 */
1030744947dcSTom Erickson 	dnode_move_impl(odn, ndn);
1031744947dcSTom Erickson 
1032744947dcSTom Erickson 	list_link_replace(&odn->dn_link, &ndn->dn_link);
1033744947dcSTom Erickson 	/* If the dnode was safe to move, the refcount cannot have changed. */
1034e914ace2STim Schumacher 	ASSERT(refcount == zfs_refcount_count(&ndn->dn_holds));
1035744947dcSTom Erickson 	ASSERT(dbufs == ndn->dn_dbufs_count);
1036744947dcSTom Erickson 	zrl_exit(&ndn->dn_handle->dnh_zrlock); /* handle has moved */
1037744947dcSTom Erickson 	mutex_exit(&os->os_lock);
1038744947dcSTom Erickson 
1039744947dcSTom Erickson 	return (KMEM_CBRC_YES);
1040744947dcSTom Erickson }
1041744947dcSTom Erickson #endif	/* _KERNEL */
1042744947dcSTom Erickson 
104354811da5SToomas Soome static void
104454811da5SToomas Soome dnode_slots_hold(dnode_children_t *children, int idx, int slots)
104554811da5SToomas Soome {
104654811da5SToomas Soome 	ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK);
104754811da5SToomas Soome 
104854811da5SToomas Soome 	for (int i = idx; i < idx + slots; i++) {
104954811da5SToomas Soome 		dnode_handle_t *dnh = &children->dnc_children[i];
105054811da5SToomas Soome 		zrl_add(&dnh->dnh_zrlock);
105154811da5SToomas Soome 	}
105254811da5SToomas Soome }
105354811da5SToomas Soome 
105454811da5SToomas Soome static void
105554811da5SToomas Soome dnode_slots_rele(dnode_children_t *children, int idx, int slots)
105654811da5SToomas Soome {
105754811da5SToomas Soome 	ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK);
105854811da5SToomas Soome 
105954811da5SToomas Soome 	for (int i = idx; i < idx + slots; i++) {
106054811da5SToomas Soome 		dnode_handle_t *dnh = &children->dnc_children[i];
106154811da5SToomas Soome 
106254811da5SToomas Soome 		if (zrl_is_locked(&dnh->dnh_zrlock))
106354811da5SToomas Soome 			zrl_exit(&dnh->dnh_zrlock);
106454811da5SToomas Soome 		else
106554811da5SToomas Soome 			zrl_remove(&dnh->dnh_zrlock);
106654811da5SToomas Soome 	}
106754811da5SToomas Soome }
106854811da5SToomas Soome 
106954811da5SToomas Soome static int
107054811da5SToomas Soome dnode_slots_tryenter(dnode_children_t *children, int idx, int slots)
107154811da5SToomas Soome {
107254811da5SToomas Soome 	ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK);
107354811da5SToomas Soome 
107454811da5SToomas Soome 	for (int i = idx; i < idx + slots; i++) {
107554811da5SToomas Soome 		dnode_handle_t *dnh = &children->dnc_children[i];
107654811da5SToomas Soome 
107754811da5SToomas Soome 		if (!zrl_tryenter(&dnh->dnh_zrlock)) {
107854811da5SToomas Soome 			for (int j = idx; j < i; j++) {
107954811da5SToomas Soome 				dnh = &children->dnc_children[j];
108054811da5SToomas Soome 				zrl_exit(&dnh->dnh_zrlock);
108154811da5SToomas Soome 			}
108254811da5SToomas Soome 
108354811da5SToomas Soome 			return (0);
108454811da5SToomas Soome 		}
108554811da5SToomas Soome 	}
108654811da5SToomas Soome 
108754811da5SToomas Soome 	return (1);
108854811da5SToomas Soome }
108954811da5SToomas Soome 
109054811da5SToomas Soome static void
109154811da5SToomas Soome dnode_set_slots(dnode_children_t *children, int idx, int slots, void *ptr)
109254811da5SToomas Soome {
109354811da5SToomas Soome 	ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK);
109454811da5SToomas Soome 
109554811da5SToomas Soome 	for (int i = idx; i < idx + slots; i++) {
109654811da5SToomas Soome 		dnode_handle_t *dnh = &children->dnc_children[i];
109754811da5SToomas Soome 		dnh->dnh_dnode = ptr;
109854811da5SToomas Soome 	}
109954811da5SToomas Soome }
110054811da5SToomas Soome 
110154811da5SToomas Soome static boolean_t
110254811da5SToomas Soome dnode_check_slots_free(dnode_children_t *children, int idx, int slots)
110354811da5SToomas Soome {
110454811da5SToomas Soome 	ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK);
110554811da5SToomas Soome 
1106aa02ea01STom Caputi 	/*
1107aa02ea01STom Caputi 	 * If all dnode slots are either already free or
1108aa02ea01STom Caputi 	 * evictable return B_TRUE.
1109aa02ea01STom Caputi 	 */
111054811da5SToomas Soome 	for (int i = idx; i < idx + slots; i++) {
111154811da5SToomas Soome 		dnode_handle_t *dnh = &children->dnc_children[i];
111254811da5SToomas Soome 		dnode_t *dn = dnh->dnh_dnode;
111354811da5SToomas Soome 
111454811da5SToomas Soome 		if (dn == DN_SLOT_FREE) {
111554811da5SToomas Soome 			continue;
111654811da5SToomas Soome 		} else if (DN_SLOT_IS_PTR(dn)) {
111754811da5SToomas Soome 			mutex_enter(&dn->dn_mtx);
1118aa02ea01STom Caputi 			boolean_t can_free = (dn->dn_type == DMU_OT_NONE &&
1119aa02ea01STom Caputi 			    zfs_refcount_is_zero(&dn->dn_holds) &&
1120aa02ea01STom Caputi 			    !DNODE_IS_DIRTY(dn));
112154811da5SToomas Soome 			mutex_exit(&dn->dn_mtx);
112254811da5SToomas Soome 
1123aa02ea01STom Caputi 			if (!can_free)
112454811da5SToomas Soome 				return (B_FALSE);
1125aa02ea01STom Caputi 			else
1126aa02ea01STom Caputi 				continue;
112754811da5SToomas Soome 		} else {
112854811da5SToomas Soome 			return (B_FALSE);
112954811da5SToomas Soome 		}
113054811da5SToomas Soome 	}
113154811da5SToomas Soome 
113254811da5SToomas Soome 	return (B_TRUE);
113354811da5SToomas Soome }
113454811da5SToomas Soome 
113554811da5SToomas Soome static void
113654811da5SToomas Soome dnode_reclaim_slots(dnode_children_t *children, int idx, int slots)
113754811da5SToomas Soome {
113854811da5SToomas Soome 	ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK);
113954811da5SToomas Soome 
114054811da5SToomas Soome 	for (int i = idx; i < idx + slots; i++) {
114154811da5SToomas Soome 		dnode_handle_t *dnh = &children->dnc_children[i];
114254811da5SToomas Soome 
114354811da5SToomas Soome 		ASSERT(zrl_is_locked(&dnh->dnh_zrlock));
114454811da5SToomas Soome 
114554811da5SToomas Soome 		if (DN_SLOT_IS_PTR(dnh->dnh_dnode)) {
114654811da5SToomas Soome 			ASSERT3S(dnh->dnh_dnode->dn_type, ==, DMU_OT_NONE);
114754811da5SToomas Soome 			dnode_destroy(dnh->dnh_dnode);
114854811da5SToomas Soome 			dnh->dnh_dnode = DN_SLOT_FREE;
114954811da5SToomas Soome 		}
115054811da5SToomas Soome 	}
115154811da5SToomas Soome }
115254811da5SToomas Soome 
115354811da5SToomas Soome void
115454811da5SToomas Soome dnode_free_interior_slots(dnode_t *dn)
115554811da5SToomas Soome {
115654811da5SToomas Soome 	dnode_children_t *children = dmu_buf_get_user(&dn->dn_dbuf->db);
115754811da5SToomas Soome 	int epb = dn->dn_dbuf->db.db_size >> DNODE_SHIFT;
115854811da5SToomas Soome 	int idx = (dn->dn_object & (epb - 1)) + 1;
115954811da5SToomas Soome 	int slots = dn->dn_num_slots - 1;
116054811da5SToomas Soome 
116154811da5SToomas Soome 	if (slots == 0)
116254811da5SToomas Soome 		return;
116354811da5SToomas Soome 
116454811da5SToomas Soome 	ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK);
116554811da5SToomas Soome 
116654811da5SToomas Soome 	while (!dnode_slots_tryenter(children, idx, slots))
116754811da5SToomas Soome 		DNODE_STAT_BUMP(dnode_free_interior_lock_retry);
116854811da5SToomas Soome 
116954811da5SToomas Soome 	dnode_set_slots(children, idx, slots, DN_SLOT_FREE);
117054811da5SToomas Soome 	dnode_slots_rele(children, idx, slots);
117154811da5SToomas Soome }
117254811da5SToomas Soome 
1173fa9e4066Sahrens void
1174744947dcSTom Erickson dnode_special_close(dnode_handle_t *dnh)
1175fa9e4066Sahrens {
1176744947dcSTom Erickson 	dnode_t *dn = dnh->dnh_dnode;
1177744947dcSTom Erickson 
1178ea8dc4b6Seschrock 	/*
1179ea8dc4b6Seschrock 	 * Wait for final references to the dnode to clear.  This can
118054811da5SToomas Soome 	 * only happen if the arc is asynchronously evicting state that
1181ea8dc4b6Seschrock 	 * has a hold on this dnode while we are trying to evict this
1182ea8dc4b6Seschrock 	 * dnode.
1183ea8dc4b6Seschrock 	 */
1184e914ace2STim Schumacher 	while (zfs_refcount_count(&dn->dn_holds) > 0)
1185ea8dc4b6Seschrock 		delay(1);
1186bc9014e6SJustin Gibbs 	ASSERT(dn->dn_dbuf == NULL ||
1187bc9014e6SJustin Gibbs 	    dmu_buf_get_user(&dn->dn_dbuf->db) == NULL);
1188744947dcSTom Erickson 	zrl_add(&dnh->dnh_zrlock);
1189744947dcSTom Erickson 	dnode_destroy(dn); /* implicit zrl_remove() */
1190744947dcSTom Erickson 	zrl_destroy(&dnh->dnh_zrlock);
1191744947dcSTom Erickson 	dnh->dnh_dnode = NULL;
1192fa9e4066Sahrens }
1193fa9e4066Sahrens 
1194bc9014e6SJustin Gibbs void
1195744947dcSTom Erickson dnode_special_open(objset_t *os, dnode_phys_t *dnp, uint64_t object,
1196744947dcSTom Erickson     dnode_handle_t *dnh)
1197fa9e4066Sahrens {
1198bc9014e6SJustin Gibbs 	dnode_t *dn;
1199bc9014e6SJustin Gibbs 
1200744947dcSTom Erickson 	zrl_init(&dnh->dnh_zrlock);
120154811da5SToomas Soome 	zrl_tryenter(&dnh->dnh_zrlock);
120254811da5SToomas Soome 
120354811da5SToomas Soome 	dn = dnode_create(os, dnp, NULL, object, dnh);
12049c9dc39aSek 	DNODE_VERIFY(dn);
120554811da5SToomas Soome 
120654811da5SToomas Soome 	zrl_exit(&dnh->dnh_zrlock);
1207fa9e4066Sahrens }
1208fa9e4066Sahrens 
/*
 * Asynchronous dbuf-user eviction callback for a dnode block's child
 * array (registered via dmu_buf_init_user() in dnode_hold_impl()):
 * destroy each instantiated child dnode, then free the
 * dnode_children_t itself.
 */
static void
dnode_buf_evict_async(void *dbu)
{
	dnode_children_t *dnc = dbu;

	DNODE_STAT_BUMP(dnode_buf_evict);

	for (int i = 0; i < dnc->dnc_count; i++) {
		dnode_handle_t *dnh = &dnc->dnc_children[i];
		dnode_t *dn;

		/*
		 * The dnode handle lock guards against the dnode moving to
		 * another valid address, so there is no need here to guard
		 * against changes to or from NULL.
		 */
		if (!DN_SLOT_IS_PTR(dnh->dnh_dnode)) {
			/* Slot never had an in-core dnode; just reset it. */
			zrl_destroy(&dnh->dnh_zrlock);
			dnh->dnh_dnode = DN_SLOT_UNINIT;
			continue;
		}

		zrl_add(&dnh->dnh_zrlock);
		dn = dnh->dnh_dnode;
		/*
		 * If there are holds on this dnode, then there should
		 * be holds on the dnode's containing dbuf as well; thus
		 * it wouldn't be eligible for eviction and this function
		 * would not have been called.
		 */
		ASSERT(zfs_refcount_is_zero(&dn->dn_holds));
		ASSERT(zfs_refcount_is_zero(&dn->dn_tx_holds));

		dnode_destroy(dn); /* implicit zrl_remove() for first slot */
		zrl_destroy(&dnh->dnh_zrlock);
		dnh->dnh_dnode = DN_SLOT_UNINIT;
	}
	kmem_free(dnc, sizeof (dnode_children_t) +
	    dnc->dnc_count * sizeof (dnode_handle_t));
}
1249fa9e4066Sahrens 
1250fa9e4066Sahrens /*
125154811da5SToomas Soome  * When the DNODE_MUST_BE_FREE flag is set, the "slots" parameter is used
125254811da5SToomas Soome  * to ensure the hole at the specified object offset is large enough to
125354811da5SToomas Soome  * hold the dnode being created. The slots parameter is also used to ensure
125454811da5SToomas Soome  * a dnode does not span multiple dnode blocks. In both of these cases, if
125554811da5SToomas Soome  * a failure occurs, ENOSPC is returned. Keep in mind, these failure cases
125654811da5SToomas Soome  * are only possible when using DNODE_MUST_BE_FREE.
125754811da5SToomas Soome  *
125854811da5SToomas Soome  * If the DNODE_MUST_BE_ALLOCATED flag is set, "slots" must be 0.
125954811da5SToomas Soome  * dnode_hold_impl() will check if the requested dnode is already consumed
 * as an extra dnode slot by a large dnode, in which case it returns
126154811da5SToomas Soome  * ENOENT.
126254811da5SToomas Soome  *
1263ea8dc4b6Seschrock  * errors:
126454811da5SToomas Soome  * EINVAL - invalid object number or flags.
126554811da5SToomas Soome  * ENOSPC - hole too small to fulfill "slots" request (DNODE_MUST_BE_FREE)
126654811da5SToomas Soome  * EEXIST - Refers to an allocated dnode (DNODE_MUST_BE_FREE)
126754811da5SToomas Soome  *        - Refers to a freeing dnode (DNODE_MUST_BE_FREE)
126854811da5SToomas Soome  *        - Refers to an interior dnode slot (DNODE_MUST_BE_ALLOCATED)
126954811da5SToomas Soome  * ENOENT - The requested dnode is not allocated (DNODE_MUST_BE_ALLOCATED)
127054811da5SToomas Soome  *        - The requested dnode is being freed (DNODE_MUST_BE_ALLOCATED)
 * EIO    - i/o error when reading the meta dnode dbuf.
1272ea8dc4b6Seschrock  * succeeds even for free dnodes.
1273fa9e4066Sahrens  */
int
dnode_hold_impl(objset_t *os, uint64_t object, int flag, int slots,
    void *tag, dnode_t **dnp)
{
	int epb, idx, err;
	int drop_struct_lock = FALSE;
	int type;
	uint64_t blk;
	dnode_t *mdn, *dn;
	dmu_buf_impl_t *db;
	dnode_children_t *dnc;
	dnode_phys_t *dn_block;
	dnode_handle_t *dnh;

	/* Per the contract above: ALLOCATED => slots == 0, FREE => slots > 0 */
	ASSERT(!(flag & DNODE_MUST_BE_ALLOCATED) || (slots == 0));
	ASSERT(!(flag & DNODE_MUST_BE_FREE) || (slots > 0));

	/*
	 * If you are holding the spa config lock as writer, you shouldn't
	 * be asking the DMU to do *anything* unless it's the root pool
	 * which may require us to read from the root filesystem while
	 * holding some (not all) of the locks as writer.
	 */
	ASSERT(spa_config_held(os->os_spa, SCL_ALL, RW_WRITER) == 0 ||
	    (spa_is_root(os->os_spa) &&
	    spa_config_held(os->os_spa, SCL_STATE, RW_WRITER)));

	ASSERT((flag & DNODE_MUST_BE_ALLOCATED) || (flag & DNODE_MUST_BE_FREE));

	/*
	 * The user/group/project accounting objects are "special" dnodes
	 * embedded in the objset; they are held directly, without going
	 * through the meta dnode.
	 */
	if (object == DMU_USERUSED_OBJECT || object == DMU_GROUPUSED_OBJECT ||
	    object == DMU_PROJECTUSED_OBJECT) {
		if (object == DMU_USERUSED_OBJECT)
			dn = DMU_USERUSED_DNODE(os);
		else if (object == DMU_GROUPUSED_OBJECT)
			dn = DMU_GROUPUSED_DNODE(os);
		else
			dn = DMU_PROJECTUSED_DNODE(os);
		if (dn == NULL)
			return (SET_ERROR(ENOENT));
		type = dn->dn_type;
		if ((flag & DNODE_MUST_BE_ALLOCATED) && type == DMU_OT_NONE)
			return (SET_ERROR(ENOENT));
		if ((flag & DNODE_MUST_BE_FREE) && type != DMU_OT_NONE)
			return (SET_ERROR(EEXIST));
		DNODE_VERIFY(dn);
		(void) zfs_refcount_add(&dn->dn_holds, tag);
		*dnp = dn;
		return (0);
	}

	if (object == 0 || object >= DN_MAX_OBJECT)
		return (SET_ERROR(EINVAL));

	mdn = DMU_META_DNODE(os);
	ASSERT(mdn->dn_object == DMU_META_DNODE_OBJECT);

	DNODE_VERIFY(mdn);

	if (!RW_WRITE_HELD(&mdn->dn_struct_rwlock)) {
		rw_enter(&mdn->dn_struct_rwlock, RW_READER);
		drop_struct_lock = TRUE;
	}

	/* Find and read the meta dnode block containing this object. */
	blk = dbuf_whichblock(mdn, 0, object * sizeof (dnode_phys_t));

	db = dbuf_hold(mdn, blk, FTAG);
	if (drop_struct_lock)
		rw_exit(&mdn->dn_struct_rwlock);
	if (db == NULL) {
		DNODE_STAT_BUMP(dnode_hold_dbuf_hold);
		return (SET_ERROR(EIO));
	}
	/*
	 * We do not need to decrypt to read the dnode so it doesn't matter
	 * if we get the encrypted or decrypted version.
	 */
	err = dbuf_read(db, NULL, DB_RF_CANFAIL | DB_RF_NO_DECRYPT);
	if (err) {
		DNODE_STAT_BUMP(dnode_hold_dbuf_read);
		dbuf_rele(db, FTAG);
		return (err);
	}

	ASSERT3U(db->db.db_size, >=, 1<<DNODE_SHIFT);
	/* epb: dnode slots per block; idx: slot of "object" within it */
	epb = db->db.db_size >> DNODE_SHIFT;

	idx = object & (epb - 1);
	dn_block = (dnode_phys_t *)db->db.db_data;

	ASSERT(DB_DNODE(db)->dn_type == DMU_OT_DNODE);
	dnc = dmu_buf_get_user(&db->db);
	dnh = NULL;
	if (dnc == NULL) {
		/* First hold of this block: build the child handle array. */
		dnode_children_t *winner;
		int skip = 0;

		dnc = kmem_zalloc(sizeof (dnode_children_t) +
		    epb * sizeof (dnode_handle_t), KM_SLEEP);
		dnc->dnc_count = epb;
		dnh = &dnc->dnc_children[0];

		/* Initialize dnode slot status from dnode_phys_t */
		for (int i = 0; i < epb; i++) {
			zrl_init(&dnh[i].dnh_zrlock);

			if (skip) {
				/* interior slot of a multi-slot dnode */
				skip--;
				continue;
			}

			if (dn_block[i].dn_type != DMU_OT_NONE) {
				int interior = dn_block[i].dn_extra_slots;

				dnode_set_slots(dnc, i, 1, DN_SLOT_ALLOCATED);
				dnode_set_slots(dnc, i + 1, interior,
				    DN_SLOT_INTERIOR);
				skip = interior;
			} else {
				dnh[i].dnh_dnode = DN_SLOT_FREE;
				skip = 0;
			}
		}

		dmu_buf_init_user(&dnc->dnc_dbu, NULL,
		    dnode_buf_evict_async, NULL);
		winner = dmu_buf_set_user(&db->db, &dnc->dnc_dbu);
		if (winner != NULL) {
			/* Lost the race to attach the array; use the winner's. */

			for (int i = 0; i < epb; i++)
				zrl_destroy(&dnh[i].dnh_zrlock);

			kmem_free(dnc, sizeof (dnode_children_t) +
			    epb * sizeof (dnode_handle_t));
			dnc = winner;
		}
	}

	ASSERT(dnc->dnc_count == epb);
	dn = DN_SLOT_UNINIT;

	if (flag & DNODE_MUST_BE_ALLOCATED) {
		slots = 1;

		/* Retry until we observe or create the in-core dnode. */
		while (dn == DN_SLOT_UNINIT) {
			dnode_slots_hold(dnc, idx, slots);
			dnh = &dnc->dnc_children[idx];

			if (DN_SLOT_IS_PTR(dnh->dnh_dnode)) {
				dn = dnh->dnh_dnode;
				break;
			} else if (dnh->dnh_dnode == DN_SLOT_INTERIOR) {
				DNODE_STAT_BUMP(dnode_hold_alloc_interior);
				dnode_slots_rele(dnc, idx, slots);
				dbuf_rele(db, FTAG);
				return (SET_ERROR(EEXIST));
			} else if (dnh->dnh_dnode != DN_SLOT_ALLOCATED) {
				DNODE_STAT_BUMP(dnode_hold_alloc_misses);
				dnode_slots_rele(dnc, idx, slots);
				dbuf_rele(db, FTAG);
				return (SET_ERROR(ENOENT));
			}

			/* Upgrade from the shared hold to the slot lock. */
			dnode_slots_rele(dnc, idx, slots);
			if (!dnode_slots_tryenter(dnc, idx, slots)) {
				DNODE_STAT_BUMP(dnode_hold_alloc_lock_retry);
				continue;
			}

			/*
			 * Someone else won the race and called dnode_create()
			 * after we checked DN_SLOT_IS_PTR() above but before
			 * we acquired the lock.
			 */
			if (DN_SLOT_IS_PTR(dnh->dnh_dnode)) {
				DNODE_STAT_BUMP(dnode_hold_alloc_lock_misses);
				dn = dnh->dnh_dnode;
			} else {
				dn = dnode_create(os, dn_block + idx, db,
				    object, dnh);
			}
		}

		/* dn_mtx stays held through the dn_free_txg check below. */
		mutex_enter(&dn->dn_mtx);
		if (dn->dn_type == DMU_OT_NONE || dn->dn_free_txg != 0) {
			DNODE_STAT_BUMP(dnode_hold_alloc_type_none);
			mutex_exit(&dn->dn_mtx);
			dnode_slots_rele(dnc, idx, slots);
			dbuf_rele(db, FTAG);
			return (SET_ERROR(ENOENT));
		}

		DNODE_STAT_BUMP(dnode_hold_alloc_hits);
	} else if (flag & DNODE_MUST_BE_FREE) {

		/* A multi-slot dnode may not span dnode blocks. */
		if (idx + slots - 1 >= DNODES_PER_BLOCK) {
			DNODE_STAT_BUMP(dnode_hold_free_overflow);
			dbuf_rele(db, FTAG);
			return (SET_ERROR(ENOSPC));
		}

		while (dn == DN_SLOT_UNINIT) {
			dnode_slots_hold(dnc, idx, slots);

			if (!dnode_check_slots_free(dnc, idx, slots)) {
				DNODE_STAT_BUMP(dnode_hold_free_misses);
				dnode_slots_rele(dnc, idx, slots);
				dbuf_rele(db, FTAG);
				return (SET_ERROR(ENOSPC));
			}

			dnode_slots_rele(dnc, idx, slots);
			if (!dnode_slots_tryenter(dnc, idx, slots)) {
				DNODE_STAT_BUMP(dnode_hold_free_lock_retry);
				continue;
			}

			/* Re-check under the slot lock: freeness may have raced. */
			if (!dnode_check_slots_free(dnc, idx, slots)) {
				DNODE_STAT_BUMP(dnode_hold_free_lock_misses);
				dnode_slots_rele(dnc, idx, slots);
				dbuf_rele(db, FTAG);
				return (SET_ERROR(ENOSPC));
			}

			/*
			 * Allocated but otherwise free dnodes which would
			 * be in the interior of a multi-slot dnodes need
			 * to be freed.  Single slot dnodes can be safely
			 * re-purposed as a performance optimization.
			 */
			if (slots > 1)
				dnode_reclaim_slots(dnc, idx + 1, slots - 1);

			dnh = &dnc->dnc_children[idx];
			if (DN_SLOT_IS_PTR(dnh->dnh_dnode)) {
				dn = dnh->dnh_dnode;
			} else {
				dn = dnode_create(os, dn_block + idx, db,
				    object, dnh);
			}
		}

		mutex_enter(&dn->dn_mtx);
		if (!zfs_refcount_is_zero(&dn->dn_holds) || dn->dn_free_txg) {
			DNODE_STAT_BUMP(dnode_hold_free_refcount);
			mutex_exit(&dn->dn_mtx);
			dnode_slots_rele(dnc, idx, slots);
			dbuf_rele(db, FTAG);
			return (SET_ERROR(EEXIST));
		}

		/* Reserve the trailing slots for this soon-to-be-large dnode. */
		dnode_set_slots(dnc, idx + 1, slots - 1, DN_SLOT_INTERIOR);
		DNODE_STAT_BUMP(dnode_hold_free_hits);
	} else {
		dbuf_rele(db, FTAG);
		return (SET_ERROR(EINVAL));
	}

	if (dn->dn_free_txg) {
		DNODE_STAT_BUMP(dnode_hold_free_txg);
		type = dn->dn_type;
		mutex_exit(&dn->dn_mtx);
		dnode_slots_rele(dnc, idx, slots);
		dbuf_rele(db, FTAG);
		return (SET_ERROR((flag & DNODE_MUST_BE_ALLOCATED) ?
		    ENOENT : EEXIST));
	}

	/* First hold on the dnode also takes a hold on the dnode block. */
	if (zfs_refcount_add(&dn->dn_holds, tag) == 1)
		dbuf_add_ref(db, dnh);

	mutex_exit(&dn->dn_mtx);

	/* Now we can rely on the hold to prevent the dnode from moving. */
	dnode_slots_rele(dnc, idx, slots);

	DNODE_VERIFY(dn);
	ASSERT3P(dn->dn_dbuf, ==, db);
	ASSERT3U(dn->dn_object, ==, object);
	dbuf_rele(db, FTAG);

	*dnp = dn;
	return (0);
}
1557fa9e4066Sahrens 
1558fa9e4066Sahrens /*
1559fa9e4066Sahrens  * Return held dnode if the object is allocated, NULL if not.
1560fa9e4066Sahrens  */
1561ea8dc4b6Seschrock int
1562503ad85cSMatthew Ahrens dnode_hold(objset_t *os, uint64_t object, void *tag, dnode_t **dnp)
1563fa9e4066Sahrens {
156454811da5SToomas Soome 	return (dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, 0, tag,
156554811da5SToomas Soome 	    dnp));
1566fa9e4066Sahrens }
1567fa9e4066Sahrens 
15681934e92fSmaybee /*
15691934e92fSmaybee  * Can only add a reference if there is already at least one
15701934e92fSmaybee  * reference on the dnode.  Returns FALSE if unable to add a
15711934e92fSmaybee  * new reference.
15721934e92fSmaybee  */
15731934e92fSmaybee boolean_t
1574ea8dc4b6Seschrock dnode_add_ref(dnode_t *dn, void *tag)
1575fa9e4066Sahrens {
15761934e92fSmaybee 	mutex_enter(&dn->dn_mtx);
1577e914ace2STim Schumacher 	if (zfs_refcount_is_zero(&dn->dn_holds)) {
15781934e92fSmaybee 		mutex_exit(&dn->dn_mtx);
15791934e92fSmaybee 		return (FALSE);
15801934e92fSmaybee 	}
1581e914ace2STim Schumacher 	VERIFY(1 < zfs_refcount_add(&dn->dn_holds, tag));
15821934e92fSmaybee 	mutex_exit(&dn->dn_mtx);
15831934e92fSmaybee 	return (TRUE);
1584fa9e4066Sahrens }
1585fa9e4066Sahrens 
/* Release the hold identified by "tag", taken via dnode_hold(). */
void
dnode_rele(dnode_t *dn, void *tag)
{
	mutex_enter(&dn->dn_mtx);
	/* dn_mtx is dropped by dnode_rele_and_unlock(). */
	dnode_rele_and_unlock(dn, tag, B_FALSE);
}
1592cd485b49SJustin T. Gibbs 
/*
 * Release the hold identified by "tag".  The caller must have entered
 * dn->dn_mtx, which is exited here.  "evicting" is forwarded to
 * dbuf_rele_and_unlock() when the last hold on the dnode is dropped.
 */
void
dnode_rele_and_unlock(dnode_t *dn, void *tag, boolean_t evicting)
{
	uint64_t refs;
	/* Get while the hold prevents the dnode from moving. */
	dmu_buf_impl_t *db = dn->dn_dbuf;
	dnode_handle_t *dnh = dn->dn_handle;

	refs = zfs_refcount_remove(&dn->dn_holds, tag);
	mutex_exit(&dn->dn_mtx);

	/*
	 * It's unsafe to release the last hold on a dnode by dnode_rele() or
	 * indirectly by dbuf_rele() while relying on the dnode handle to
	 * prevent the dnode from moving, since releasing the last hold could
	 * result in the dnode's parent dbuf evicting its dnode handles. For
	 * that reason anyone calling dnode_rele() or dbuf_rele() without some
	 * other direct or indirect hold on the dnode must first drop the dnode
	 * handle.
	 */
	ASSERT(refs > 0 || dnh->dnh_zrlock.zr_owner != curthread);

	/* NOTE: the DNODE_DNODE does not have a dn_dbuf */
	if (refs == 0 && db != NULL) {
		/*
		 * Another thread could add a hold to the dnode handle in
		 * dnode_hold_impl() while holding the parent dbuf. Since the
		 * hold on the parent dbuf prevents the handle from being
		 * destroyed, the hold on the handle is OK. We can't yet assert
		 * that the handle has zero references, but that will be
		 * asserted anyway when the handle gets destroyed.
		 */
		mutex_enter(&db->db_mtx);
		dbuf_rele_and_unlock(db, dnh, evicting);
	}
}
1629fa9e4066Sahrens 
/*
 * Mark the dnode dirty in tx's txg: link it onto the objset's dirty
 * dnode list, take a "dirty hold", dirty its containing dbuf, and
 * dirty the owning dataset.  Idempotent within a txg.
 */
void
dnode_setdirty(dnode_t *dn, dmu_tx_t *tx)
{
	objset_t *os = dn->dn_objset;
	uint64_t txg = tx->tx_txg;

	/* Special objects are tracked via the dataset only. */
	if (DMU_OBJECT_IS_SPECIAL(dn->dn_object)) {
		dsl_dataset_dirty(os->os_dsl_dataset, tx);
		return;
	}

	DNODE_VERIFY(dn);

#ifdef ZFS_DEBUG
	mutex_enter(&dn->dn_mtx);
	ASSERT(dn->dn_phys->dn_type || dn->dn_allocated_txg);
	ASSERT(dn->dn_free_txg == 0 || dn->dn_free_txg >= txg);
	mutex_exit(&dn->dn_mtx);
#endif

	/*
	 * Determine old uid/gid when necessary
	 */
	dmu_objset_userquota_get_ids(dn, B_TRUE, tx);

	multilist_t *dirtylist = os->os_dirty_dnodes[txg & TXG_MASK];
	multilist_sublist_t *mls = multilist_sublist_lock_obj(dirtylist, dn);

	/*
	 * If we are already marked dirty, we're done.
	 */
	if (multilist_link_active(&dn->dn_dirty_link[txg & TXG_MASK])) {
		multilist_sublist_unlock(mls);
		return;
	}

	ASSERT(!zfs_refcount_is_zero(&dn->dn_holds) ||
	    !avl_is_empty(&dn->dn_dbufs));
	ASSERT(dn->dn_datablksz != 0);
	ASSERT0(dn->dn_next_bonuslen[txg&TXG_MASK]);
	ASSERT0(dn->dn_next_blksz[txg&TXG_MASK]);
	ASSERT0(dn->dn_next_bonustype[txg&TXG_MASK]);

	dprintf_ds(os->os_dsl_dataset, "obj=%llu txg=%llu\n",
	    dn->dn_object, txg);

	multilist_sublist_insert_head(mls, dn);

	multilist_sublist_unlock(mls);

	/*
	 * The dnode maintains a hold on its containing dbuf as
	 * long as there are holds on it.  Each instantiated child
	 * dbuf maintains a hold on the dnode.  When the last child
	 * drops its hold, the dnode will drop its hold on the
	 * containing dbuf. We add a "dirty hold" here so that the
	 * dnode will hang around after we finish processing its
	 * children.
	 */
	VERIFY(dnode_add_ref(dn, (void *)(uintptr_t)tx->tx_txg));

	(void) dbuf_dirty(dn->dn_dbuf, tx);

	dsl_dataset_dirty(os->os_dsl_dataset, tx);
}
1695fa9e4066Sahrens 
1696fa9e4066Sahrens void
1697fa9e4066Sahrens dnode_free(dnode_t *dn, dmu_tx_t *tx)
1698fa9e4066Sahrens {
1699fa9e4066Sahrens 	mutex_enter(&dn->dn_mtx);
1700fa9e4066Sahrens 	if (dn->dn_type == DMU_OT_NONE || dn->dn_free_txg) {
1701fa9e4066Sahrens 		mutex_exit(&dn->dn_mtx);
1702fa9e4066Sahrens 		return;
1703fa9e4066Sahrens 	}
1704fa9e4066Sahrens 	dn->dn_free_txg = tx->tx_txg;
1705fa9e4066Sahrens 	mutex_exit(&dn->dn_mtx);
1706fa9e4066Sahrens 
170794c2d0ebSMatthew Ahrens 	dnode_setdirty(dn, tx);
1708fa9e4066Sahrens }
1709fa9e4066Sahrens 
1710fa9e4066Sahrens /*
1711fa9e4066Sahrens  * Try to change the block size for the indicated dnode.  This can only
1712fa9e4066Sahrens  * succeed if there are no blocks allocated or dirty beyond first block
1713fa9e4066Sahrens  */
int
dnode_set_blksz(dnode_t *dn, uint64_t size, int ibs, dmu_tx_t *tx)
{
	dmu_buf_impl_t *db;
	int err;

	ASSERT3U(size, <=, spa_maxblocksize(dmu_objset_spa(dn->dn_objset)));
	/* size == 0 means "use the minimum"; otherwise round up. */
	if (size == 0)
		size = SPA_MINBLOCKSIZE;
	else
		size = P2ROUNDUP(size, SPA_MINBLOCKSIZE);

	/* ibs == 0 hereafter means "no indirect-shift change requested". */
	if (ibs == dn->dn_indblkshift)
		ibs = 0;

	if (size >> SPA_MINBLOCKSHIFT == dn->dn_datablkszsec && ibs == 0)
		return (0);

	rw_enter(&dn->dn_struct_rwlock, RW_WRITER);

	/* Check for any allocated blocks beyond the first */
	if (dn->dn_maxblkid != 0)
		goto fail;

	/* Likewise, no in-core dbufs beyond block 0 (bonus/spill excepted). */
	mutex_enter(&dn->dn_dbufs_mtx);
	for (db = avl_first(&dn->dn_dbufs); db != NULL;
	    db = AVL_NEXT(&dn->dn_dbufs, db)) {
		if (db->db_blkid != 0 && db->db_blkid != DMU_BONUS_BLKID &&
		    db->db_blkid != DMU_SPILL_BLKID) {
			mutex_exit(&dn->dn_dbufs_mtx);
			goto fail;
		}
	}
	mutex_exit(&dn->dn_dbufs_mtx);

	/* Changing the indirect block shift requires a single-level dnode. */
	if (ibs && dn->dn_nlevels != 1)
		goto fail;

	/* resize the old block */
	err = dbuf_hold_impl(dn, 0, 0, TRUE, FALSE, FTAG, &db);
	if (err == 0)
		dbuf_new_size(db, size, tx);
	else if (err != ENOENT)
		goto fail;
	/* NOTE(review): assumes dbuf_hold_impl() sets db = NULL on ENOENT,
	 * since db is tested below -- confirm against dbuf_hold_impl(). */

	dnode_setdblksz(dn, size);
	dnode_setdirty(dn, tx);
	dn->dn_next_blksz[tx->tx_txg&TXG_MASK] = size;
	if (ibs) {
		dn->dn_indblkshift = ibs;
		dn->dn_next_indblkshift[tx->tx_txg&TXG_MASK] = ibs;
	}
	/* rele after we have fixed the blocksize in the dnode */
	if (db)
		dbuf_rele(db, FTAG);

	rw_exit(&dn->dn_struct_rwlock);
	return (0);

fail:
	rw_exit(&dn->dn_struct_rwlock);
	return (SET_ERROR(ENOTSUP));
}
1777fa9e4066Sahrens 
/*
 * Raise the dnode's indirection depth to new_nlevels: dirty the new
 * top-level indirect block and migrate the txg's existing dirty
 * records underneath it.  Caller holds dn_struct_rwlock as writer.
 */
static void
dnode_set_nlevels_impl(dnode_t *dn, int new_nlevels, dmu_tx_t *tx)
{
	uint64_t txgoff = tx->tx_txg & TXG_MASK;
	int old_nlevels = dn->dn_nlevels;
	dmu_buf_impl_t *db;
	list_t *list;
	dbuf_dirty_record_t *new, *dr, *dr_next;

	ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));

	dn->dn_nlevels = new_nlevels;

	ASSERT3U(new_nlevels, >, dn->dn_next_nlevels[txgoff]);
	dn->dn_next_nlevels[txgoff] = new_nlevels;

	/* dirty the left indirects */
	db = dbuf_hold_level(dn, old_nlevels, 0, FTAG);
	ASSERT(db != NULL);
	new = dbuf_dirty(db, tx);
	dbuf_rele(db, FTAG);

	/* transfer the dirty records to the new indirect */
	mutex_enter(&dn->dn_mtx);
	mutex_enter(&new->dt.di.dr_mtx);
	list = &dn->dn_dirty_records[txgoff];
	for (dr = list_head(list); dr; dr = dr_next) {
		/* save the successor: dr may be relinked below */
		dr_next = list_next(&dn->dn_dirty_records[txgoff], dr);
		if (dr->dr_dbuf->db_level != new_nlevels-1 &&
		    dr->dr_dbuf->db_blkid != DMU_BONUS_BLKID &&
		    dr->dr_dbuf->db_blkid != DMU_SPILL_BLKID) {
			ASSERT(dr->dr_dbuf->db_level == old_nlevels-1);
			list_remove(&dn->dn_dirty_records[txgoff], dr);
			list_insert_tail(&new->dt.di.dr_children, dr);
			dr->dr_parent = new;
		}
	}
	mutex_exit(&new->dt.di.dr_mtx);
	mutex_exit(&dn->dn_mtx);
}
1818eb633035STom Caputi 
1819eb633035STom Caputi int
1820eb633035STom Caputi dnode_set_nlevels(dnode_t *dn, int nlevels, dmu_tx_t *tx)
1821eb633035STom Caputi {
1822eb633035STom Caputi 	int ret = 0;
1823eb633035STom Caputi 
1824eb633035STom Caputi 	rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
1825eb633035STom Caputi 
1826eb633035STom Caputi 	if (dn->dn_nlevels == nlevels) {
1827eb633035STom Caputi 		ret = 0;
1828eb633035STom Caputi 		goto out;
1829eb633035STom Caputi 	} else if (nlevels < dn->dn_nlevels) {
1830eb633035STom Caputi 		ret = SET_ERROR(EINVAL);
1831eb633035STom Caputi 		goto out;
1832eb633035STom Caputi 	}
1833eb633035STom Caputi 
1834eb633035STom Caputi 	dnode_set_nlevels_impl(dn, nlevels, tx);
1835eb633035STom Caputi 
1836eb633035STom Caputi out:
1837eb633035STom Caputi 	rw_exit(&dn->dn_struct_rwlock);
1838eb633035STom Caputi 	return (ret);
1839eb633035STom Caputi }
1840eb633035STom Caputi 
/*
 * Grow the dnode's notion of its highest-numbered block to cover 'blkid',
 * recording the new maxblkid for this txg and, when needed, adding
 * indirection levels so the block tree can address the new maxblkid.
 * With 'force' set (used for raw sends), the given blkid is taken even if
 * it is lower than the current maxblkid, and nlevels is not grown here.
 *
 * read-holding callers must not rely on the lock being continuously held:
 * when 'have_read' is set and work is required, the rwlock may be dropped
 * and re-acquired as writer, then downgraded again before returning.
 */
void
dnode_new_blkid(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx, boolean_t have_read,
    boolean_t force)
{
	int epbs, new_nlevels;
	uint64_t sz;

	ASSERT(blkid != DMU_BONUS_BLKID);

	ASSERT(have_read ?
	    RW_READ_HELD(&dn->dn_struct_rwlock) :
	    RW_WRITE_HELD(&dn->dn_struct_rwlock));

	/*
	 * if we have a read-lock, check to see if we need to do any work
	 * before upgrading to a write-lock.
	 */
	if (have_read) {
		if (blkid <= dn->dn_maxblkid)
			return;

		if (!rw_tryupgrade(&dn->dn_struct_rwlock)) {
			/*
			 * Upgrade failed: drop and re-take as writer.  The
			 * lock is therefore not continuously held (see the
			 * function comment); dn_maxblkid is re-checked below
			 * since it may have changed in the window.
			 */
			rw_exit(&dn->dn_struct_rwlock);
			rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
		}
	}

	/*
	 * Raw sends (indicated by the force flag) require that we take the
	 * given blkid even if the value is lower than the current value.
	 */
	if (!force && blkid <= dn->dn_maxblkid)
		goto out;

	/*
	 * We use the (otherwise unused) top bit of dn_next_maxblkid[txgoff]
	 * to indicate that this field is set. This allows us to set the
	 * maxblkid to 0 on an existing object in dnode_sync().
	 */
	dn->dn_maxblkid = blkid;
	dn->dn_next_maxblkid[tx->tx_txg & TXG_MASK] =
	    blkid | DMU_NEXT_MAXBLKID_SET;

	/*
	 * Compute the number of levels necessary to support the new maxblkid.
	 * Raw sends will ensure nlevels is set correctly for us.
	 */
	new_nlevels = 1;
	epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
	/*
	 * Each additional level multiplies addressable blocks by 2^epbs;
	 * the "sz >= dn_nblkptr" clause guards against shift overflow.
	 */
	for (sz = dn->dn_nblkptr;
	    sz <= blkid && sz >= dn->dn_nblkptr; sz <<= epbs)
		new_nlevels++;

	if (!force) {
		if (new_nlevels > dn->dn_nlevels)
			dnode_set_nlevels_impl(dn, new_nlevels, tx);
	} else {
		/* Raw receive: caller must already have set nlevels. */
		ASSERT3U(dn->dn_nlevels, >=, new_nlevels);
	}

out:
	if (have_read)
		rw_downgrade(&dn->dn_struct_rwlock);
}
1906fa9e4066Sahrens 
190746e1baa6SMatthew Ahrens static void
190846e1baa6SMatthew Ahrens dnode_dirty_l1(dnode_t *dn, uint64_t l1blkid, dmu_tx_t *tx)
190946e1baa6SMatthew Ahrens {
191046e1baa6SMatthew Ahrens 	dmu_buf_impl_t *db = dbuf_hold_level(dn, 1, l1blkid, FTAG);
191146e1baa6SMatthew Ahrens 	if (db != NULL) {
191246e1baa6SMatthew Ahrens 		dmu_buf_will_dirty(&db->db, tx);
191346e1baa6SMatthew Ahrens 		dbuf_rele(db, FTAG);
191446e1baa6SMatthew Ahrens 	}
191546e1baa6SMatthew Ahrens }
191646e1baa6SMatthew Ahrens 
/*
 * Dirty all the in-core level-1 dbufs in the range specified by start_blkid
 * and end_blkid.  Only dbufs strictly between start_blkid and end_blkid
 * (exclusive on both ends, via the "+ 1" search key and "< end_blkid" test)
 * are dirtied here; the endpoints are handled separately by the caller.
 */
static void
dnode_dirty_l1range(dnode_t *dn, uint64_t start_blkid, uint64_t end_blkid,
    dmu_tx_t *tx)
{
	dmu_buf_impl_t db_search;	/* stack key for AVL lookup only */
	dmu_buf_impl_t *db;
	avl_index_t where;

	mutex_enter(&dn->dn_dbufs_mtx);

	db_search.db_level = 1;
	db_search.db_blkid = start_blkid + 1;
	db_search.db_state = DB_SEARCH;
	for (;;) {
		/*
		 * Re-find our position each pass: the lock is dropped while
		 * dirtying (below), so the tree may have changed.
		 */
		db = avl_find(&dn->dn_dbufs, &db_search, &where);
		if (db == NULL)
			db = avl_nearest(&dn->dn_dbufs, where, AVL_AFTER);

		if (db == NULL || db->db_level != 1 ||
		    db->db_blkid >= end_blkid) {
			break;
		}

		/*
		 * Setup the next blkid we want to search for.
		 */
		db_search.db_blkid = db->db_blkid + 1;
		ASSERT3U(db->db_blkid, >=, start_blkid);

		/*
		 * If the dbuf transitions to DB_EVICTING while we're trying
		 * to dirty it, then we will be unable to discover it in
		 * the dbuf hash table. This will result in a call to
		 * dbuf_create() which needs to acquire the dn_dbufs_mtx
		 * lock. To avoid a deadlock, we drop the lock before
		 * dirtying the level-1 dbuf.
		 */
		mutex_exit(&dn->dn_dbufs_mtx);
		dnode_dirty_l1(dn, db->db_blkid, tx);
		mutex_enter(&dn->dn_dbufs_mtx);
	}

#ifdef ZFS_DEBUG
	/*
	 * Walk all the in-core level-1 dbufs and verify they have been dirtied.
	 */
	db_search.db_level = 1;
	db_search.db_blkid = start_blkid + 1;
	db_search.db_state = DB_SEARCH;
	db = avl_find(&dn->dn_dbufs, &db_search, &where);
	if (db == NULL)
		db = avl_nearest(&dn->dn_dbufs, where, AVL_AFTER);
	for (; db != NULL; db = AVL_NEXT(&dn->dn_dbufs, db)) {
		if (db->db_level != 1 || db->db_blkid >= end_blkid)
			break;
		ASSERT(db->db_dirtycnt > 0);
	}
#endif
	mutex_exit(&dn->dn_dbufs_mtx);
}
1982738e2a3cSPaul Dagnelie 
/*
 * Free the byte range [off, off + len) of this dnode's data in transaction
 * 'tx'.  len == DMU_OBJECT_END means "free to end of object" (truncate).
 * Partial blocks at the edges of the range are zeroed in place; whole
 * blocks are recorded in dn_free_ranges and actually freed in syncing
 * context.  Indirect blocks covering the range are dirtied here so they
 * stay in memory until sync and so dirty-space accounting sees the free.
 */
void
dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx)
{
	dmu_buf_impl_t *db;
	uint64_t blkoff, blkid, nblks;
	int blksz, blkshift, head, tail;
	int trunc = FALSE;
	int epbs;

	rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
	blksz = dn->dn_datablksz;
	blkshift = dn->dn_datablkshift;
	epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;

	if (len == DMU_OBJECT_END) {
		len = UINT64_MAX - off;
		trunc = TRUE;
	}

	/*
	 * First, block align the region to free:
	 */
	if (ISP2(blksz)) {
		/* head = bytes from 'off' up to the next block boundary */
		head = P2NPHASE(off, blksz);
		blkoff = P2PHASE(off, blksz);
		if ((off >> blkshift) > dn->dn_maxblkid)
			goto out;
	} else {
		/* Non-power-of-2 blksz implies a single-block object. */
		ASSERT(dn->dn_maxblkid == 0);
		if (off == 0 && len >= blksz) {
			/*
			 * Freeing the whole block; fast-track this request.
			 */
			blkid = 0;
			nblks = 1;
			if (dn->dn_nlevels > 1)
				dnode_dirty_l1(dn, 0, tx);
			goto done;
		} else if (off >= blksz) {
			/* Freeing past end-of-data */
			goto out;
		} else {
			/* Freeing part of the block. */
			head = blksz - off;
			ASSERT3U(head, >, 0);
		}
		blkoff = off;
	}
	/* zero out any partial block data at the start of the range */
	if (head) {
		ASSERT3U(blkoff + head, ==, blksz);
		if (len < head)
			head = len;
		if (dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, 0, off),
		    TRUE, FALSE, FTAG, &db) == 0) {
			caddr_t data;

			/* don't dirty if it isn't on disk and isn't dirty */
			if (db->db_last_dirty ||
			    (db->db_blkptr && !BP_IS_HOLE(db->db_blkptr))) {
				/*
				 * Drop the struct lock around the dirty call;
				 * dmu_buf_will_dirty() may need to take it.
				 */
				rw_exit(&dn->dn_struct_rwlock);
				dmu_buf_will_dirty(&db->db, tx);
				rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
				data = db->db.db_data;
				bzero(data + blkoff, head);
			}
			dbuf_rele(db, FTAG);
		}
		off += head;
		len -= head;
	}

	/* If the range was less than one block, we're done */
	if (len == 0)
		goto out;

	/* If the remaining range is past end of file, we're done */
	if ((off >> blkshift) > dn->dn_maxblkid)
		goto out;

	/* Only power-of-2 block sizes can reach this point. */
	ASSERT(ISP2(blksz));
	if (trunc)
		tail = 0;
	else
		tail = P2PHASE(len, blksz);

	ASSERT0(P2PHASE(off, blksz));
	/* zero out any partial block data at the end of the range */
	if (tail) {
		if (len < tail)
			tail = len;
		if (dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, 0, off+len),
		    TRUE, FALSE, FTAG, &db) == 0) {
			/* don't dirty if not on disk and not dirty */
			if (db->db_last_dirty ||
			    (db->db_blkptr && !BP_IS_HOLE(db->db_blkptr))) {
				rw_exit(&dn->dn_struct_rwlock);
				dmu_buf_will_dirty(&db->db, tx);
				rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
				bzero(db->db.db_data, tail);
			}
			dbuf_rele(db, FTAG);
		}
		len -= tail;
	}

	/* If the range did not include a full block, we are done */
	if (len == 0)
		goto out;

	ASSERT(IS_P2ALIGNED(off, blksz));
	ASSERT(trunc || IS_P2ALIGNED(len, blksz));
	blkid = off >> blkshift;
	nblks = len >> blkshift;
	if (trunc)
		nblks += 1;

	/*
	 * Dirty all the indirect blocks in this range.  Note that only
	 * the first and last indirect blocks can actually be written
	 * (if they were partially freed) -- they must be dirtied, even if
	 * they do not exist on disk yet.  The interior blocks will
	 * be freed by free_children(), so they will not actually be written.
	 * Even though these interior blocks will not be written, we
	 * dirty them for two reasons:
	 *
	 *  - It ensures that the indirect blocks remain in memory until
	 *    syncing context.  (They have already been prefetched by
	 *    dmu_tx_hold_free(), so we don't have to worry about reading
	 *    them serially here.)
	 *
	 *  - The dirty space accounting will put pressure on the txg sync
	 *    mechanism to begin syncing, and to delay transactions if there
	 *    is a large amount of freeing.  Even though these indirect
	 *    blocks will not be written, we could need to write the same
	 *    amount of space if we copy the freed BPs into deadlists.
	 */
	if (dn->dn_nlevels > 1) {
		uint64_t first, last;

		first = blkid >> epbs;
		dnode_dirty_l1(dn, first, tx);
		if (trunc)
			last = dn->dn_maxblkid >> epbs;
		else
			last = (blkid + nblks - 1) >> epbs;
		if (last != first)
			dnode_dirty_l1(dn, last, tx);

		/* Dirty any in-core L1 dbufs strictly between the ends. */
		dnode_dirty_l1range(dn, first, last, tx);

		int shift = dn->dn_datablkshift + dn->dn_indblkshift -
		    SPA_BLKPTRSHIFT;
		for (uint64_t i = first + 1; i < last; i++) {
			/*
			 * Set i to the blockid of the next non-hole
			 * level-1 indirect block at or after i.  Note
			 * that dnode_next_offset() operates in terms of
			 * level-0-equivalent bytes.
			 */
			uint64_t ibyte = i << shift;
			int err = dnode_next_offset(dn, DNODE_FIND_HAVELOCK,
			    &ibyte, 2, 1, 0);
			i = ibyte >> shift;
			if (i >= last)
				break;

			/*
			 * Normally we should not see an error, either
			 * from dnode_next_offset() or dbuf_hold_level()
			 * (except for ESRCH from dnode_next_offset).
			 * If there is an i/o error, then when we read
			 * this block in syncing context, it will use
			 * ZIO_FLAG_MUSTSUCCEED, and thus hang/panic according
			 * to the "failmode" property.  dnode_next_offset()
			 * doesn't have a flag to indicate MUSTSUCCEED.
			 */
			if (err != 0)
				break;

			dnode_dirty_l1(dn, i, tx);
		}
	}

done:
	/*
	 * Add this range to the dnode range list.
	 * We will finish up this free operation in the syncing phase.
	 */
	mutex_enter(&dn->dn_mtx);
	int txgoff = tx->tx_txg & TXG_MASK;
	if (dn->dn_free_ranges[txgoff] == NULL) {
		dn->dn_free_ranges[txgoff] = range_tree_create(NULL, NULL);
	}
	/* clear-then-add coalesces this range with any overlapping one */
	range_tree_clear(dn->dn_free_ranges[txgoff], blkid, nblks);
	range_tree_add(dn->dn_free_ranges[txgoff], blkid, nblks);
	dprintf_dnode(dn, "blkid=%llu nblks=%llu txg=%llu\n",
	    blkid, nblks, tx->tx_txg);
	mutex_exit(&dn->dn_mtx);

	dbuf_free_range(dn, blkid, blkid + nblks - 1, tx);
	dnode_setdirty(dn, tx);
out:

	rw_exit(&dn->dn_struct_rwlock);
}
2189fa9e4066Sahrens 
21900a586ceaSMark Shellenbaum static boolean_t
21910a586ceaSMark Shellenbaum dnode_spill_freed(dnode_t *dn)
21920a586ceaSMark Shellenbaum {
21930a586ceaSMark Shellenbaum 	int i;
21940a586ceaSMark Shellenbaum 
21950a586ceaSMark Shellenbaum 	mutex_enter(&dn->dn_mtx);
21960a586ceaSMark Shellenbaum 	for (i = 0; i < TXG_SIZE; i++) {
21970a586ceaSMark Shellenbaum 		if (dn->dn_rm_spillblk[i] == DN_KILL_SPILLBLK)
21980a586ceaSMark Shellenbaum 			break;
21990a586ceaSMark Shellenbaum 	}
22000a586ceaSMark Shellenbaum 	mutex_exit(&dn->dn_mtx);
22010a586ceaSMark Shellenbaum 	return (i < TXG_SIZE);
22020a586ceaSMark Shellenbaum }
22030a586ceaSMark Shellenbaum 
2204fa9e4066Sahrens /* return TRUE if this blkid was freed in a recent txg, or FALSE if it wasn't */
2205fa9e4066Sahrens uint64_t
2206fa9e4066Sahrens dnode_block_freed(dnode_t *dn, uint64_t blkid)
2207fa9e4066Sahrens {
2208fa9e4066Sahrens 	void *dp = spa_get_dsl(dn->dn_objset->os_spa);
2209fa9e4066Sahrens 	int i;
2210fa9e4066Sahrens 
22110a586ceaSMark Shellenbaum 	if (blkid == DMU_BONUS_BLKID)
2212fa9e4066Sahrens 		return (FALSE);
2213fa9e4066Sahrens 
2214fa9e4066Sahrens 	/*
2215fa9e4066Sahrens 	 * If we're in the process of opening the pool, dp will not be
2216fa9e4066Sahrens 	 * set yet, but there shouldn't be anything dirty.
2217fa9e4066Sahrens 	 */
2218fa9e4066Sahrens 	if (dp == NULL)
2219fa9e4066Sahrens 		return (FALSE);
2220fa9e4066Sahrens 
2221fa9e4066Sahrens 	if (dn->dn_free_txg)
2222fa9e4066Sahrens 		return (TRUE);
2223fa9e4066Sahrens 
22240a586ceaSMark Shellenbaum 	if (blkid == DMU_SPILL_BLKID)
22250a586ceaSMark Shellenbaum 		return (dnode_spill_freed(dn));
22260a586ceaSMark Shellenbaum 
2227fa9e4066Sahrens 	mutex_enter(&dn->dn_mtx);
2228fa9e4066Sahrens 	for (i = 0; i < TXG_SIZE; i++) {
2229bf16b11eSMatthew Ahrens 		if (dn->dn_free_ranges[i] != NULL &&
2230bf16b11eSMatthew Ahrens 		    range_tree_contains(dn->dn_free_ranges[i], blkid, 1))
2231fa9e4066Sahrens 			break;
2232fa9e4066Sahrens 	}
2233fa9e4066Sahrens 	mutex_exit(&dn->dn_mtx);
2234fa9e4066Sahrens 	return (i < TXG_SIZE);
2235fa9e4066Sahrens }
2236fa9e4066Sahrens 
2237fa9e4066Sahrens /* call from syncing context when we actually write/free space for this dnode */
2238fa9e4066Sahrens void
223999653d4eSeschrock dnode_diduse_space(dnode_t *dn, int64_t delta)
2240fa9e4066Sahrens {
224199653d4eSeschrock 	uint64_t space;
224299653d4eSeschrock 	dprintf_dnode(dn, "dn=%p dnp=%p used=%llu delta=%lld\n",
2243fa9e4066Sahrens 	    dn, dn->dn_phys,
224499653d4eSeschrock 	    (u_longlong_t)dn->dn_phys->dn_used,
224599653d4eSeschrock 	    (longlong_t)delta);
2246fa9e4066Sahrens 
2247fa9e4066Sahrens 	mutex_enter(&dn->dn_mtx);
224899653d4eSeschrock 	space = DN_USED_BYTES(dn->dn_phys);
224999653d4eSeschrock 	if (delta > 0) {
225099653d4eSeschrock 		ASSERT3U(space + delta, >=, space); /* no overflow */
225199653d4eSeschrock 	} else {
225299653d4eSeschrock 		ASSERT3U(space, >=, -delta); /* no underflow */
225399653d4eSeschrock 	}
225499653d4eSeschrock 	space += delta;
2255e7437265Sahrens 	if (spa_version(dn->dn_objset->os_spa) < SPA_VERSION_DNODE_BYTES) {
225699653d4eSeschrock 		ASSERT((dn->dn_phys->dn_flags & DNODE_FLAG_USED_BYTES) == 0);
2257fb09f5aaSMadhav Suresh 		ASSERT0(P2PHASE(space, 1<<DEV_BSHIFT));
225899653d4eSeschrock 		dn->dn_phys->dn_used = space >> DEV_BSHIFT;
2259fa9e4066Sahrens 	} else {
226099653d4eSeschrock 		dn->dn_phys->dn_used = space;
226199653d4eSeschrock 		dn->dn_phys->dn_flags |= DNODE_FLAG_USED_BYTES;
2262fa9e4066Sahrens 	}
2263fa9e4066Sahrens 	mutex_exit(&dn->dn_mtx);
2264fa9e4066Sahrens }
2265fa9e4066Sahrens 
/*
 * Scans a block at the indicated "level" looking for a hole or data,
 * depending on 'flags'.
 *
 * If level > 0, then we are scanning an indirect block looking at its
 * pointers.  If level == 0, then we are looking at a block of dnodes.
 *
 * If we don't find what we are looking for in the block, we return ESRCH.
 * Otherwise, return with *offset pointing to the beginning (if searching
 * forwards) or end (if searching backwards) of the range covered by the
 * block pointer we matched on (or dnode).
 *
 * The basic search algorithm used below by dnode_next_offset() is to
 * use this function to search up the block tree (widen the search) until
 * we find something (i.e., we don't return ESRCH) and then search back
 * down the tree (narrow the search) until we reach our original search
 * level.
 */
static int
dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset,
    int lvl, uint64_t blkfill, uint64_t txg)
{
	dmu_buf_impl_t *db = NULL;
	void *data = NULL;
	uint64_t epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
	uint64_t epb = 1ULL << epbs;	/* entries (blkptrs) per block */
	uint64_t minfill, maxfill;
	boolean_t hole;
	int i, inc, error, span;

	dprintf("probing object %llu offset %llx level %d of %u\n",
	    dn->dn_object, *offset, lvl, dn->dn_phys->dn_nlevels);

	hole = ((flags & DNODE_FIND_HOLE) != 0);
	inc = (flags & DNODE_FIND_BACKWARDS) ? -1 : 1;
	ASSERT(txg == 0 || !hole);

	if (lvl == dn->dn_phys->dn_nlevels) {
		/* Topmost level: scan the blkptrs embedded in the dnode. */
		error = 0;
		epb = dn->dn_phys->dn_nblkptr;
		data = dn->dn_phys->dn_blkptr;
	} else {
		uint64_t blkid = dbuf_whichblock(dn, lvl, *offset);
		error = dbuf_hold_impl(dn, lvl, blkid, TRUE, FALSE, FTAG, &db);
		if (error) {
			if (error != ENOENT)
				return (error);
			if (hole)
				return (0);
			/*
			 * This can only happen when we are searching up
			 * the block tree for data.  We don't really need to
			 * adjust the offset, as we will just end up looking
			 * at the pointer to this block in its parent, and its
			 * going to be unallocated, so we will skip over it.
			 */
			return (SET_ERROR(ESRCH));
		}
		/*
		 * NO_DECRYPT: we only examine blkptr/fill metadata here,
		 * never the decrypted payload.
		 */
		error = dbuf_read(db, NULL,
		    DB_RF_CANFAIL | DB_RF_HAVESTRUCT | DB_RF_NO_DECRYPT);
		if (error) {
			dbuf_rele(db, FTAG);
			return (error);
		}
		data = db->db.db_data;
	}


	if (db != NULL && txg != 0 && (db->db_blkptr == NULL ||
	    db->db_blkptr->blk_birth <= txg ||
	    BP_IS_HOLE(db->db_blkptr))) {
		/*
		 * This can only happen when we are searching up the tree
		 * and these conditions mean that we need to keep climbing.
		 */
		error = SET_ERROR(ESRCH);
	} else if (lvl == 0) {
		/* Level 0 of the meta-dnode: scan individual dnodes. */
		dnode_phys_t *dnp = data;

		ASSERT(dn->dn_type == DMU_OT_DNODE);
		ASSERT(!(flags & DNODE_FIND_BACKWARDS));

		/* Step by dn_extra_slots + 1 to skip over large dnodes. */
		for (i = (*offset >> DNODE_SHIFT) & (blkfill - 1);
		    i < blkfill; i += dnp[i].dn_extra_slots + 1) {
			if ((dnp[i].dn_type == DMU_OT_NONE) == hole)
				break;
		}

		if (i == blkfill)
			error = SET_ERROR(ESRCH);

		*offset = (*offset & ~(DNODE_BLOCK_SIZE - 1)) +
		    (i << DNODE_SHIFT);
	} else {
		/* Indirect level: scan block pointers by fill count. */
		blkptr_t *bp = data;
		uint64_t start = *offset;
		/* bytes of level-0 data covered by one bp at this level */
		span = (lvl - 1) * epbs + dn->dn_datablkshift;
		minfill = 0;
		maxfill = blkfill << ((lvl - 1) * epbs);

		if (hole)
			maxfill--;	/* a hole is anything not 100% full */
		else
			minfill++;	/* data is anything not 0% full */

		*offset = *offset >> span;
		for (i = BF64_GET(*offset, 0, epbs);
		    i >= 0 && i < epb; i += inc) {
			if (BP_GET_FILL(&bp[i]) >= minfill &&
			    BP_GET_FILL(&bp[i]) <= maxfill &&
			    (hole || bp[i].blk_birth > txg))
				break;
			if (inc > 0 || *offset > 0)
				*offset += inc;
		}
		*offset = *offset << span;
		if (inc < 0) {
			/* traversing backwards; position offset at the end */
			ASSERT3U(*offset, <=, start);
			*offset = MIN(*offset + (1ULL << span) - 1, start);
		} else if (*offset < start) {
			*offset = start;
		}
		if (i < 0 || i >= epb)
			error = SET_ERROR(ESRCH);
	}

	if (db)
		dbuf_rele(db, FTAG);

	return (error);
}
2398fa9e4066Sahrens 
/*
 * Find the next hole, data, or sparse region at or after *offset.
 * The value 'blkfill' tells us how many items we expect to find
 * in an L0 data block; this value is 1 for normal objects,
 * DNODES_PER_BLOCK for the meta dnode, and some fraction of
 * DNODES_PER_BLOCK when searching for sparse regions thereof.
 *
 * Returns 0 and updates *offset on success, or ESRCH (or another errno
 * from dbuf I/O) on failure.  With DNODE_FIND_HAVELOCK the caller must
 * already hold dn_struct_rwlock; otherwise it is taken as reader here.
 *
 * Examples:
 *
 * dnode_next_offset(dn, flags, offset, 1, 1, 0);
 *	Finds the next/previous hole/data in a file.
 *	Used in dmu_offset_next().
 *
 * dnode_next_offset(mdn, flags, offset, 0, DNODES_PER_BLOCK, txg);
 *	Finds the next free/allocated dnode an objset's meta-dnode.
 *	Only finds objects that have new contents since txg (ie.
 *	bonus buffer changes and content removal are ignored).
 *	Used in dmu_object_next().
 *
 * dnode_next_offset(mdn, DNODE_FIND_HOLE, offset, 2, DNODES_PER_BLOCK >> 2, 0);
 *	Finds the next L2 meta-dnode bp that's at most 1/4 full.
 *	Used in dmu_object_alloc().
 */
int
dnode_next_offset(dnode_t *dn, int flags, uint64_t *offset,
    int minlvl, uint64_t blkfill, uint64_t txg)
{
	uint64_t initial_offset = *offset;
	int lvl, maxlvl;
	int error = 0;

	if (!(flags & DNODE_FIND_HAVELOCK))
		rw_enter(&dn->dn_struct_rwlock, RW_READER);

	if (dn->dn_phys->dn_nlevels == 0) {
		/* No block tree at all: nothing can match. */
		error = SET_ERROR(ESRCH);
		goto out;
	}

	if (dn->dn_datablkshift == 0) {
		/* Single (possibly odd-sized) block object. */
		if (*offset < dn->dn_datablksz) {
			if (flags & DNODE_FIND_HOLE)
				*offset = dn->dn_datablksz;
		} else {
			error = SET_ERROR(ESRCH);
		}
		goto out;
	}

	maxlvl = dn->dn_phys->dn_nlevels;

	/* Widen: climb the tree until some level reports a match. */
	for (lvl = minlvl; lvl <= maxlvl; lvl++) {
		error = dnode_next_offset_level(dn,
		    flags, offset, lvl, blkfill, txg);
		if (error != ESRCH)
			break;
	}

	/* Narrow: descend back to minlvl, refining *offset at each step. */
	while (error == 0 && --lvl >= minlvl) {
		error = dnode_next_offset_level(dn,
		    flags, offset, lvl, blkfill, txg);
	}

	/*
	 * There's always a "virtual hole" at the end of the object, even
	 * if all BP's which physically exist are non-holes.
	 */
	if ((flags & DNODE_FIND_HOLE) && error == ESRCH && txg == 0 &&
	    minlvl == 1 && blkfill == 1 && !(flags & DNODE_FIND_BACKWARDS)) {
		error = 0;
	}

	/* A match that moved backwards (or forwards, if reversed) is a miss. */
	if (error == 0 && (flags & DNODE_FIND_BACKWARDS ?
	    initial_offset < *offset : initial_offset > *offset))
		error = SET_ERROR(ESRCH);
out:
	if (!(flags & DNODE_FIND_HAVELOCK))
		rw_exit(&dn->dn_struct_rwlock);

	return (error);
}
2480