xref: /illumos-gate/usr/src/uts/common/fs/zfs/dnode.c (revision d4c2c737)
1fa9e4066Sahrens /*
2fa9e4066Sahrens  * CDDL HEADER START
3fa9e4066Sahrens  *
4fa9e4066Sahrens  * The contents of this file are subject to the terms of the
5f65e61c0Sahrens  * Common Development and Distribution License (the "License").
6f65e61c0Sahrens  * You may not use this file except in compliance with the License.
7fa9e4066Sahrens  *
8fa9e4066Sahrens  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9fa9e4066Sahrens  * or http://www.opensolaris.org/os/licensing.
10fa9e4066Sahrens  * See the License for the specific language governing permissions
11fa9e4066Sahrens  * and limitations under the License.
12fa9e4066Sahrens  *
13fa9e4066Sahrens  * When distributing Covered Code, include this CDDL HEADER in each
14fa9e4066Sahrens  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15fa9e4066Sahrens  * If applicable, add the following below this CDDL HEADER, with the
16fa9e4066Sahrens  * fields enclosed by brackets "[]" replaced with your own identifying
17fa9e4066Sahrens  * information: Portions Copyright [yyyy] [name of copyright owner]
18fa9e4066Sahrens  *
19fa9e4066Sahrens  * CDDL HEADER END
20fa9e4066Sahrens  */
21fa9e4066Sahrens /*
2206e0070dSMark Shellenbaum  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
234d7988d6SPaul Dagnelie  * Copyright (c) 2012, 2019 by Delphix. All rights reserved.
24bc9014e6SJustin Gibbs  * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
25c3d26abcSMatthew Ahrens  * Copyright (c) 2014 Integros [integros.com]
26f06dce2cSAndrew Stormont  * Copyright 2017 RackTop Systems.
27fa9e4066Sahrens  */
28fa9e4066Sahrens 
29fa9e4066Sahrens #include <sys/zfs_context.h>
30fa9e4066Sahrens #include <sys/dbuf.h>
31fa9e4066Sahrens #include <sys/dnode.h>
32fa9e4066Sahrens #include <sys/dmu.h>
33fa9e4066Sahrens #include <sys/dmu_impl.h>
34fa9e4066Sahrens #include <sys/dmu_tx.h>
35fa9e4066Sahrens #include <sys/dmu_objset.h>
36fa9e4066Sahrens #include <sys/dsl_dir.h>
37fa9e4066Sahrens #include <sys/dsl_dataset.h>
38fa9e4066Sahrens #include <sys/spa.h>
39fa9e4066Sahrens #include <sys/zio.h>
40fa9e4066Sahrens #include <sys/dmu_zfetch.h>
41bf16b11eSMatthew Ahrens #include <sys/range_tree.h>
42f67950b2SNasf-Fan #include <sys/zfs_project.h>
43fa9e4066Sahrens 
/*
 * Named kstat counters published as zfs:0:dnodestats (see dnode_init()).
 * Each entry is bumped at the corresponding event site via the
 * DNODE_STAT_BUMP() macro; initializer order follows dnode_stats_t.
 */
dnode_stats_t dnode_stats = {
	{ "dnode_hold_dbuf_hold",		KSTAT_DATA_UINT64 },
	{ "dnode_hold_dbuf_read",		KSTAT_DATA_UINT64 },
	{ "dnode_hold_alloc_hits",		KSTAT_DATA_UINT64 },
	{ "dnode_hold_alloc_misses",		KSTAT_DATA_UINT64 },
	{ "dnode_hold_alloc_interior",		KSTAT_DATA_UINT64 },
	{ "dnode_hold_alloc_lock_retry",	KSTAT_DATA_UINT64 },
	{ "dnode_hold_alloc_lock_misses",	KSTAT_DATA_UINT64 },
	{ "dnode_hold_alloc_type_none",		KSTAT_DATA_UINT64 },
	{ "dnode_hold_free_hits",		KSTAT_DATA_UINT64 },
	{ "dnode_hold_free_misses",		KSTAT_DATA_UINT64 },
	{ "dnode_hold_free_lock_misses",	KSTAT_DATA_UINT64 },
	{ "dnode_hold_free_lock_retry",		KSTAT_DATA_UINT64 },
	{ "dnode_hold_free_overflow",		KSTAT_DATA_UINT64 },
	{ "dnode_hold_free_refcount",		KSTAT_DATA_UINT64 },
	{ "dnode_free_interior_lock_retry",	KSTAT_DATA_UINT64 },
	{ "dnode_allocate",			KSTAT_DATA_UINT64 },
	{ "dnode_reallocate",			KSTAT_DATA_UINT64 },
	{ "dnode_buf_evict",			KSTAT_DATA_UINT64 },
	{ "dnode_alloc_next_chunk",		KSTAT_DATA_UINT64 },
	{ "dnode_alloc_race",			KSTAT_DATA_UINT64 },
	{ "dnode_alloc_next_block",		KSTAT_DATA_UINT64 },
	{ "dnode_move_invalid",			KSTAT_DATA_UINT64 },
	{ "dnode_move_recheck1",		KSTAT_DATA_UINT64 },
	{ "dnode_move_recheck2",		KSTAT_DATA_UINT64 },
	{ "dnode_move_special",			KSTAT_DATA_UINT64 },
	{ "dnode_move_handle",			KSTAT_DATA_UINT64 },
	{ "dnode_move_rwlock",			KSTAT_DATA_UINT64 },
	{ "dnode_move_active",			KSTAT_DATA_UINT64 },
};

/* kstat handle for dnode_stats; created in dnode_init(), torn down in fini. */
static kstat_t *dnode_ksp;
/* kmem cache from which every dnode_t is allocated. */
static kmem_cache_t *dnode_cache;

/* All-zero template used to assert that an unallocated phys dnode is empty. */
static dnode_phys_t dnode_phys_zero;

/* Default data block shift and indirect block shift for new objects. */
int zfs_default_bs = SPA_MINBLOCKSHIFT;
int zfs_default_ibs = DN_MAX_INDBLKSHIFT;

#ifdef	_KERNEL
/* Relocation callback registered with the dnode kmem cache (kernel only). */
static kmem_cbrc_t dnode_move(void *, void *, size_t, void *);
#endif	/* _KERNEL */
86744947dcSTom Erickson 
870f6d88adSAlex Reece static int
dbuf_compare(const void * x1,const void * x2)880f6d88adSAlex Reece dbuf_compare(const void *x1, const void *x2)
890f6d88adSAlex Reece {
900f6d88adSAlex Reece 	const dmu_buf_impl_t *d1 = x1;
910f6d88adSAlex Reece 	const dmu_buf_impl_t *d2 = x2;
920f6d88adSAlex Reece 
934d7988d6SPaul Dagnelie 	int cmp = TREE_CMP(d1->db_level, d2->db_level);
94c4ab0d3fSGvozden Neskovic 	if (likely(cmp))
95c4ab0d3fSGvozden Neskovic 		return (cmp);
960f6d88adSAlex Reece 
974d7988d6SPaul Dagnelie 	cmp = TREE_CMP(d1->db_blkid, d2->db_blkid);
98c4ab0d3fSGvozden Neskovic 	if (likely(cmp))
99c4ab0d3fSGvozden Neskovic 		return (cmp);
1000f6d88adSAlex Reece 
101a846f19dSAlex Reece 	if (d1->db_state == DB_SEARCH) {
102a846f19dSAlex Reece 		ASSERT3S(d2->db_state, !=, DB_SEARCH);
1030f6d88adSAlex Reece 		return (-1);
104a846f19dSAlex Reece 	} else if (d2->db_state == DB_SEARCH) {
105a846f19dSAlex Reece 		ASSERT3S(d1->db_state, !=, DB_SEARCH);
10686bb58aeSAlex Reece 		return (1);
10786bb58aeSAlex Reece 	}
10886bb58aeSAlex Reece 
1094d7988d6SPaul Dagnelie 	return (TREE_PCMP(d1, d2));
1100f6d88adSAlex Reece }
1110f6d88adSAlex Reece 
/* ARGSUSED */
/*
 * kmem cache constructor for dnode_t.  Initializes the locks,
 * condition variables, reference counts, and per-txg dirty-state
 * arrays so the object is in the pristine state that dnode_dest()
 * asserts when the dnode is returned to the cache.  Always succeeds.
 */
static int
dnode_cons(void *arg, void *unused, int kmflag)
{
	dnode_t *dn = arg;
	int i;

	rw_init(&dn->dn_struct_rwlock, NULL, RW_DEFAULT, NULL);
	mutex_init(&dn->dn_mtx, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&dn->dn_dbufs_mtx, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&dn->dn_notxholds, NULL, CV_DEFAULT, NULL);
	cv_init(&dn->dn_nodnholds, NULL, CV_DEFAULT, NULL);

	/*
	 * Every dbuf has a reference, and dropping a tracked reference is
	 * O(number of references), so don't track dn_holds.
	 */
	zfs_refcount_create_untracked(&dn->dn_holds);
	zfs_refcount_create(&dn->dn_tx_holds);
	list_link_init(&dn->dn_link);

	/* Clear the per-txg "pending change" arrays. */
	bzero(&dn->dn_next_nblkptr[0], sizeof (dn->dn_next_nblkptr));
	bzero(&dn->dn_next_nlevels[0], sizeof (dn->dn_next_nlevels));
	bzero(&dn->dn_next_indblkshift[0], sizeof (dn->dn_next_indblkshift));
	bzero(&dn->dn_next_bonustype[0], sizeof (dn->dn_next_bonustype));
	bzero(&dn->dn_rm_spillblk[0], sizeof (dn->dn_rm_spillblk));
	bzero(&dn->dn_next_bonuslen[0], sizeof (dn->dn_next_bonuslen));
	bzero(&dn->dn_next_blksz[0], sizeof (dn->dn_next_blksz));
	bzero(&dn->dn_next_maxblkid[0], sizeof (dn->dn_next_maxblkid));

	for (i = 0; i < TXG_SIZE; i++) {
		multilist_link_init(&dn->dn_dirty_link[i]);
		dn->dn_free_ranges[i] = NULL;
		list_create(&dn->dn_dirty_records[i],
		    sizeof (dbuf_dirty_record_t),
		    offsetof(dbuf_dirty_record_t, dr_dirty_node));
	}

	dn->dn_allocated_txg = 0;
	dn->dn_free_txg = 0;
	dn->dn_assigned_txg = 0;
	dn->dn_dirty_txg = 0;
	dn->dn_dirtyctx = 0;
	dn->dn_dirtyctx_firstset = NULL;
	dn->dn_bonus = NULL;
	dn->dn_have_spill = B_FALSE;
	dn->dn_zio = NULL;
	dn->dn_oldused = 0;
	dn->dn_oldflags = 0;
	dn->dn_olduid = 0;
	dn->dn_oldgid = 0;
	dn->dn_oldprojid = ZFS_DEFAULT_PROJID;
	dn->dn_newuid = 0;
	dn->dn_newgid = 0;
	dn->dn_newprojid = ZFS_DEFAULT_PROJID;
	dn->dn_id_flags = 0;

	dn->dn_dbufs_count = 0;
	/* This dnode's dbufs, kept in an AVL tree ordered by dbuf_compare(). */
	avl_create(&dn->dn_dbufs, dbuf_compare, sizeof (dmu_buf_impl_t),
	    offsetof(dmu_buf_impl_t, db_link));

	dn->dn_moved = 0;
	return (0);
}
176fa9e4066Sahrens 
/* ARGSUSED */
/*
 * kmem cache destructor for dnode_t.  Destroys the synchronization
 * primitives and refcounts, and asserts that the dnode was returned
 * to the cache in the same pristine state dnode_cons() established
 * (no pending per-txg changes, no holds, no dbufs).
 */
static void
dnode_dest(void *arg, void *unused)
{
	int i;
	dnode_t *dn = arg;

	rw_destroy(&dn->dn_struct_rwlock);
	mutex_destroy(&dn->dn_mtx);
	mutex_destroy(&dn->dn_dbufs_mtx);
	cv_destroy(&dn->dn_notxholds);
	cv_destroy(&dn->dn_nodnholds);
	zfs_refcount_destroy(&dn->dn_holds);
	zfs_refcount_destroy(&dn->dn_tx_holds);
	ASSERT(!list_link_active(&dn->dn_link));

	for (i = 0; i < TXG_SIZE; i++) {
		ASSERT(!multilist_link_active(&dn->dn_dirty_link[i]));
		ASSERT3P(dn->dn_free_ranges[i], ==, NULL);
		list_destroy(&dn->dn_dirty_records[i]);
		ASSERT0(dn->dn_next_nblkptr[i]);
		ASSERT0(dn->dn_next_nlevels[i]);
		ASSERT0(dn->dn_next_indblkshift[i]);
		ASSERT0(dn->dn_next_bonustype[i]);
		ASSERT0(dn->dn_rm_spillblk[i]);
		ASSERT0(dn->dn_next_bonuslen[i]);
		ASSERT0(dn->dn_next_blksz[i]);
		ASSERT0(dn->dn_next_maxblkid[i]);
	}

	ASSERT0(dn->dn_allocated_txg);
	ASSERT0(dn->dn_free_txg);
	ASSERT0(dn->dn_assigned_txg);
	ASSERT0(dn->dn_dirty_txg);
	ASSERT0(dn->dn_dirtyctx);
	ASSERT3P(dn->dn_dirtyctx_firstset, ==, NULL);
	ASSERT3P(dn->dn_bonus, ==, NULL);
	ASSERT(!dn->dn_have_spill);
	ASSERT3P(dn->dn_zio, ==, NULL);
	ASSERT0(dn->dn_oldused);
	ASSERT0(dn->dn_oldflags);
	ASSERT0(dn->dn_olduid);
	ASSERT0(dn->dn_oldgid);
	ASSERT0(dn->dn_oldprojid);
	ASSERT0(dn->dn_newuid);
	ASSERT0(dn->dn_newgid);
	ASSERT0(dn->dn_newprojid);
	ASSERT0(dn->dn_id_flags);

	ASSERT0(dn->dn_dbufs_count);
	avl_destroy(&dn->dn_dbufs);
}
229fa9e4066Sahrens 
/*
 * Set up the dnode subsystem: create the kmem cache used for all
 * dnode_t allocations and, in the kernel, register the dnode_move()
 * relocation callback and publish the "dnodestats" kstats.
 */
void
dnode_init(void)
{
	ASSERT(dnode_cache == NULL);
	dnode_cache = kmem_cache_create("dnode_t",
	    sizeof (dnode_t),
	    0, dnode_cons, dnode_dest, NULL, NULL, NULL, 0);
#ifdef	_KERNEL
	kmem_cache_set_move(dnode_cache, dnode_move);

	dnode_ksp = kstat_create("zfs", 0, "dnodestats", "misc",
	    KSTAT_TYPE_NAMED, sizeof (dnode_stats) / sizeof (kstat_named_t),
	    KSTAT_FLAG_VIRTUAL);
	if (dnode_ksp != NULL) {
		/* Virtual kstat: point ks_data at our static counter table. */
		dnode_ksp->ks_data = &dnode_stats;
		kstat_install(dnode_ksp);
	}
#endif	/* _KERNEL */
}
249fa9e4066Sahrens 
/*
 * Tear down the dnode subsystem: remove the kstats (if they were
 * created) and destroy the dnode kmem cache, resetting dnode_cache
 * to NULL so a subsequent dnode_init() passes its assertion.
 */
void
dnode_fini(void)
{
	if (dnode_ksp != NULL) {
		kstat_delete(dnode_ksp);
		dnode_ksp = NULL;
	}

	kmem_cache_destroy(dnode_cache);
	dnode_cache = NULL;
}
261fa9e4066Sahrens 
262fa9e4066Sahrens 
2639c9dc39aSek #ifdef ZFS_DEBUG
/*
 * Debug-only consistency checks on an in-core dnode.  Cheap invariants
 * (valid phys/objset pointers, handle back-pointer, valid object type)
 * are always asserted; the more expensive structural checks only run
 * when ZFS_DEBUG_DNODE_VERIFY is set in zfs_flags.
 */
void
dnode_verify(dnode_t *dn)
{
	int drop_struct_lock = FALSE;

	ASSERT(dn->dn_phys);
	ASSERT(dn->dn_objset);
	ASSERT(dn->dn_handle->dnh_dnode == dn);

	ASSERT(DMU_OT_IS_VALID(dn->dn_phys->dn_type));

	if (!(zfs_flags & ZFS_DEBUG_DNODE_VERIFY))
		return;

	/* Take the struct rwlock as reader unless we already hold it. */
	if (!RW_WRITE_HELD(&dn->dn_struct_rwlock)) {
		rw_enter(&dn->dn_struct_rwlock, RW_READER);
		drop_struct_lock = TRUE;
	}
	if (dn->dn_phys->dn_type != DMU_OT_NONE || dn->dn_allocated_txg != 0) {
		int i;
		int max_bonuslen = DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots);
		ASSERT3U(dn->dn_indblkshift, >=, 0);
		ASSERT3U(dn->dn_indblkshift, <=, SPA_MAXBLOCKSHIFT);
		/* dn_datablkshift is only nonzero for power-of-2 blocks. */
		if (dn->dn_datablkshift) {
			ASSERT3U(dn->dn_datablkshift, >=, SPA_MINBLOCKSHIFT);
			ASSERT3U(dn->dn_datablkshift, <=, SPA_MAXBLOCKSHIFT);
			ASSERT3U(1<<dn->dn_datablkshift, ==, dn->dn_datablksz);
		}
		ASSERT3U(dn->dn_nlevels, <=, 30);
		ASSERT(DMU_OT_IS_VALID(dn->dn_type));
		ASSERT3U(dn->dn_nblkptr, >=, 1);
		ASSERT3U(dn->dn_nblkptr, <=, DN_MAX_NBLKPTR);
		ASSERT3U(dn->dn_bonuslen, <=, max_bonuslen);
		ASSERT3U(dn->dn_datablksz, ==,
		    dn->dn_datablkszsec << SPA_MINBLOCKSHIFT);
		ASSERT3U(ISP2(dn->dn_datablksz), ==, dn->dn_datablkshift != 0);
		/* Extra block pointers eat into the available bonus space. */
		ASSERT3U((dn->dn_nblkptr - 1) * sizeof (blkptr_t) +
		    dn->dn_bonuslen, <=, max_bonuslen);
		for (i = 0; i < TXG_SIZE; i++) {
			ASSERT3U(dn->dn_next_nlevels[i], <=, dn->dn_nlevels);
		}
	}
	if (dn->dn_phys->dn_type != DMU_OT_NONE)
		ASSERT3U(dn->dn_phys->dn_nlevels, <=, dn->dn_nlevels);
	ASSERT(DMU_OBJECT_IS_SPECIAL(dn->dn_object) || dn->dn_dbuf != NULL);
	if (dn->dn_dbuf != NULL) {
		/* dn_phys must point into its backing dbuf's data buffer. */
		ASSERT3P(dn->dn_phys, ==,
		    (dnode_phys_t *)dn->dn_dbuf->db.db_data +
		    (dn->dn_object % (dn->dn_dbuf->db.db_size >> DNODE_SHIFT)));
	}
	if (drop_struct_lock)
		rw_exit(&dn->dn_struct_rwlock);
}
3179c9dc39aSek #endif
318fa9e4066Sahrens 
/*
 * Byteswap a single on-disk dnode in place (for reading a dnode block
 * written with the opposite endianness).  An unallocated dnode
 * (DMU_OT_NONE) is simply zeroed.
 */
void
dnode_byteswap(dnode_phys_t *dnp)
{
	uint64_t *buf64 = (void*)&dnp->dn_blkptr;
	int i;

	if (dnp->dn_type == DMU_OT_NONE) {
		bzero(dnp, sizeof (dnode_phys_t));
		return;
	}

	dnp->dn_datablkszsec = BSWAP_16(dnp->dn_datablkszsec);
	dnp->dn_bonuslen = BSWAP_16(dnp->dn_bonuslen);
	dnp->dn_extra_slots = BSWAP_8(dnp->dn_extra_slots);
	dnp->dn_maxblkid = BSWAP_64(dnp->dn_maxblkid);
	dnp->dn_used = BSWAP_64(dnp->dn_used);

	/*
	 * dn_nblkptr is only one byte, so it's OK to read it in either
	 * byte order.  We can't read dn_bonuslen.
	 */
	ASSERT(dnp->dn_indblkshift <= SPA_MAXBLOCKSHIFT);
	ASSERT(dnp->dn_nblkptr <= DN_MAX_NBLKPTR);
	/* Swap the block pointer array one 64-bit word at a time. */
	for (i = 0; i < dnp->dn_nblkptr * sizeof (blkptr_t)/8; i++)
		buf64[i] = BSWAP_64(buf64[i]);

	/*
	 * OK to check dn_bonuslen for zero, because it won't matter if
	 * we have the wrong byte order.  This is necessary because the
	 * dnode dnode is smaller than a regular dnode.
	 */
	if (dnp->dn_bonuslen != 0) {
		dmu_object_byteswap_t byteswap;
		ASSERT(DMU_OT_IS_VALID(dnp->dn_bonustype));
		/* Dispatch to the byteswap routine for the bonus type. */
		byteswap = DMU_OT_BYTESWAP(dnp->dn_bonustype);
		dmu_ot_byteswap[byteswap].ob_func(DN_BONUS(dnp),
		    DN_MAX_BONUS_LEN(dnp));
	}

	/* Swap SPILL block if we have one */
	if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR)
		byteswap_uint64_array(DN_SPILL_BLKPTR(dnp), sizeof (blkptr_t));

}
363fa9e4066Sahrens 
364fa9e4066Sahrens void
dnode_buf_byteswap(void * vbuf,size_t size)365fa9e4066Sahrens dnode_buf_byteswap(void *vbuf, size_t size)
366fa9e4066Sahrens {
36754811da5SToomas Soome 	int i = 0;
368fa9e4066Sahrens 
369fa9e4066Sahrens 	ASSERT3U(sizeof (dnode_phys_t), ==, (1<<DNODE_SHIFT));
370fa9e4066Sahrens 	ASSERT((size & (sizeof (dnode_phys_t)-1)) == 0);
371fa9e4066Sahrens 
37254811da5SToomas Soome 	while (i < size) {
37354811da5SToomas Soome 		dnode_phys_t *dnp = (void *)(((char *)vbuf) + i);
37454811da5SToomas Soome 		dnode_byteswap(dnp);
37554811da5SToomas Soome 
37654811da5SToomas Soome 		i += DNODE_MIN_SIZE;
37754811da5SToomas Soome 		if (dnp->dn_type != DMU_OT_NONE)
37854811da5SToomas Soome 			i += dnp->dn_extra_slots * DNODE_MIN_SIZE;
379fa9e4066Sahrens 	}
380fa9e4066Sahrens }
381fa9e4066Sahrens 
3821934e92fSmaybee void
dnode_setbonuslen(dnode_t * dn,int newsize,dmu_tx_t * tx)3831934e92fSmaybee dnode_setbonuslen(dnode_t *dn, int newsize, dmu_tx_t *tx)
3841934e92fSmaybee {
385e914ace2STim Schumacher 	ASSERT3U(zfs_refcount_count(&dn->dn_holds), >=, 1);
3861934e92fSmaybee 
3871934e92fSmaybee 	dnode_setdirty(dn, tx);
3881934e92fSmaybee 	rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
38954811da5SToomas Soome 	ASSERT3U(newsize, <=, DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots) -
3901934e92fSmaybee 	    (dn->dn_nblkptr-1) * sizeof (blkptr_t));
3911934e92fSmaybee 	dn->dn_bonuslen = newsize;
3921934e92fSmaybee 	if (newsize == 0)
3931934e92fSmaybee 		dn->dn_next_bonuslen[tx->tx_txg & TXG_MASK] = DN_ZERO_BONUSLEN;
3941934e92fSmaybee 	else
3951934e92fSmaybee 		dn->dn_next_bonuslen[tx->tx_txg & TXG_MASK] = dn->dn_bonuslen;
3961934e92fSmaybee 	rw_exit(&dn->dn_struct_rwlock);
3971934e92fSmaybee }
3981934e92fSmaybee 
/*
 * Change the object type of this dnode's bonus buffer, recording the
 * new type in the per-txg dn_next_bonustype slot for this tx's txg.
 */
void
dnode_setbonus_type(dnode_t *dn, dmu_object_type_t newtype, dmu_tx_t *tx)
{
	ASSERT3U(zfs_refcount_count(&dn->dn_holds), >=, 1);
	dnode_setdirty(dn, tx);
	rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
	dn->dn_bonustype = newtype;
	dn->dn_next_bonustype[tx->tx_txg & TXG_MASK] = dn->dn_bonustype;
	rw_exit(&dn->dn_struct_rwlock);
}
4090a586ceaSMark Shellenbaum 
/*
 * Mark this dnode's spill block for removal, recording DN_KILL_SPILLBLK
 * in the per-txg dn_rm_spillblk slot for this tx's txg and clearing the
 * in-core dn_have_spill flag.  Caller must hold dn_struct_rwlock as
 * writer (asserted).
 */
void
dnode_rm_spill(dnode_t *dn, dmu_tx_t *tx)
{
	ASSERT3U(zfs_refcount_count(&dn->dn_holds), >=, 1);
	ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));
	dnode_setdirty(dn, tx);
	dn->dn_rm_spillblk[tx->tx_txg&TXG_MASK] = DN_KILL_SPILLBLK;
	dn->dn_have_spill = B_FALSE;
}
4190a586ceaSMark Shellenbaum 
/*
 * Set the in-core data block size fields (dn_datablksz, dn_datablkszsec,
 * dn_datablkshift) from 'size', which must be a multiple of
 * SPA_MINBLOCKSIZE within [SPA_MINBLOCKSIZE, SPA_MAXBLOCKSIZE].
 * dn_datablkshift is only meaningful for power-of-2 sizes and is set
 * to 0 otherwise.
 */
static void
dnode_setdblksz(dnode_t *dn, int size)
{
	ASSERT0(P2PHASE(size, SPA_MINBLOCKSIZE));
	ASSERT3U(size, <=, SPA_MAXBLOCKSIZE);
	ASSERT3U(size, >=, SPA_MINBLOCKSIZE);
	/* The size in sectors must fit in the phys dn_datablkszsec field. */
	ASSERT3U(size >> SPA_MINBLOCKSHIFT, <,
	    1<<(sizeof (dn->dn_phys->dn_datablkszsec) * 8));
	dn->dn_datablksz = size;
	dn->dn_datablkszsec = size >> SPA_MINBLOCKSHIFT;
	dn->dn_datablkshift = ISP2(size) ? highbit64(size - 1) : 0;
}
432fa9e4066Sahrens 
/*
 * Construct an in-core dnode for 'object', backed by the on-disk
 * dnode_phys_t 'dnp' (which lives inside dbuf 'db', or is a special
 * dnode with no backing dbuf), and attach it to handle 'dnh'.  The
 * caller must hold the handle's zrlock (asserted).  Returns the new
 * dnode with the handle's dnh_dnode pointing at it.
 */
static dnode_t *
dnode_create(objset_t *os, dnode_phys_t *dnp, dmu_buf_impl_t *db,
    uint64_t object, dnode_handle_t *dnh)
{
	dnode_t *dn;

	dn = kmem_cache_alloc(dnode_cache, KM_SLEEP);
#ifdef _KERNEL
	ASSERT(!POINTER_IS_VALID(dn->dn_objset));
#endif /* _KERNEL */
	dn->dn_moved = 0;

	/*
	 * Defer setting dn_objset until the dnode is ready to be a candidate
	 * for the dnode_move() callback.
	 */
	dn->dn_object = object;
	dn->dn_dbuf = db;
	dn->dn_handle = dnh;
	dn->dn_phys = dnp;

	/* Copy the cached fields out of the phys dnode. */
	if (dnp->dn_datablkszsec) {
		dnode_setdblksz(dn, dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT);
	} else {
		dn->dn_datablksz = 0;
		dn->dn_datablkszsec = 0;
		dn->dn_datablkshift = 0;
	}
	dn->dn_indblkshift = dnp->dn_indblkshift;
	dn->dn_nlevels = dnp->dn_nlevels;
	dn->dn_type = dnp->dn_type;
	dn->dn_nblkptr = dnp->dn_nblkptr;
	dn->dn_checksum = dnp->dn_checksum;
	dn->dn_compress = dnp->dn_compress;
	dn->dn_bonustype = dnp->dn_bonustype;
	dn->dn_bonuslen = dnp->dn_bonuslen;
	dn->dn_num_slots = dnp->dn_extra_slots + 1;
	dn->dn_maxblkid = dnp->dn_maxblkid;
	dn->dn_have_spill = ((dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) != 0);
	dn->dn_id_flags = 0;

	dmu_zfetch_init(&dn->dn_zfetch, dn);

	ASSERT(DMU_OT_IS_VALID(dn->dn_phys->dn_type));
	ASSERT(zrl_is_locked(&dnh->dnh_zrlock));
	ASSERT(!DN_SLOT_IS_PTR(dnh->dnh_dnode));

	mutex_enter(&os->os_lock);

	/*
	 * Exclude special dnodes from os_dnodes so an empty os_dnodes
	 * signifies that the special dnodes have no references from
	 * their children (the entries in os_dnodes).  This allows
	 * dnode_destroy() to easily determine if the last child has
	 * been removed and then complete eviction of the objset.
	 */
	if (!DMU_OBJECT_IS_SPECIAL(object))
		list_insert_head(&os->os_dnodes, dn);
	membar_producer();

	/*
	 * Everything else must be valid before assigning dn_objset
	 * makes the dnode eligible for dnode_move().
	 */
	dn->dn_objset = os;

	dnh->dnh_dnode = dn;
	mutex_exit(&os->os_lock);

	/* Account this dnode's memory against the ARC. */
	arc_space_consume(sizeof (dnode_t), ARC_SPACE_OTHER);

	return (dn);
}
506fa9e4066Sahrens 
/*
 * Caller must be holding the dnode handle, which is released upon return.
 *
 * Tears down an in-core dnode: unlinks it from the objset's os_dnodes
 * list, frees the bonus dbuf and zfetch state, resets all reusable
 * fields to their constructed values, and returns the dnode to the
 * kmem cache.  If this removed the last entry from os_dnodes while
 * the objset is queued for eviction, completes the objset eviction.
 */
static void
dnode_destroy(dnode_t *dn)
{
	objset_t *os = dn->dn_objset;
	boolean_t complete_os_eviction = B_FALSE;

	ASSERT((dn->dn_id_flags & DN_ID_NEW_EXIST) == 0);

	mutex_enter(&os->os_lock);
	POINTER_INVALIDATE(&dn->dn_objset);
	/* Special dnodes were never put on os_dnodes; see dnode_create(). */
	if (!DMU_OBJECT_IS_SPECIAL(dn->dn_object)) {
		list_remove(&os->os_dnodes, dn);
		complete_os_eviction =
		    list_is_empty(&os->os_dnodes) &&
		    list_link_active(&os->os_evicting_node);
	}
	mutex_exit(&os->os_lock);

	/* the dnode can no longer move, so we can release the handle */
	if (!zrl_is_locked(&dn->dn_handle->dnh_zrlock))
		zrl_remove(&dn->dn_handle->dnh_zrlock);

	dn->dn_allocated_txg = 0;
	dn->dn_free_txg = 0;
	dn->dn_assigned_txg = 0;
	dn->dn_dirty_txg = 0;

	dn->dn_dirtyctx = 0;
	if (dn->dn_dirtyctx_firstset != NULL) {
		kmem_free(dn->dn_dirtyctx_firstset, 1);
		dn->dn_dirtyctx_firstset = NULL;
	}
	if (dn->dn_bonus != NULL) {
		mutex_enter(&dn->dn_bonus->db_mtx);
		dbuf_destroy(dn->dn_bonus);
		dn->dn_bonus = NULL;
	}
	dn->dn_zio = NULL;

	dn->dn_have_spill = B_FALSE;
	dn->dn_oldused = 0;
	dn->dn_oldflags = 0;
	dn->dn_olduid = 0;
	dn->dn_oldgid = 0;
	dn->dn_oldprojid = ZFS_DEFAULT_PROJID;
	dn->dn_newuid = 0;
	dn->dn_newgid = 0;
	dn->dn_newprojid = ZFS_DEFAULT_PROJID;
	dn->dn_id_flags = 0;

	dmu_zfetch_fini(&dn->dn_zfetch);
	kmem_cache_free(dnode_cache, dn);
	/* Release the ARC accounting taken in dnode_create(). */
	arc_space_return(sizeof (dnode_t), ARC_SPACE_OTHER);

	if (complete_os_eviction)
		dmu_objset_evict_done(os);
}
567fa9e4066Sahrens 
568fa9e4066Sahrens void
/*
 * Initialize a freshly-held, unallocated dnode as a new object of type
 * "ot" in transaction "tx".
 *
 * blocksize == 0 selects the default data block size; otherwise the value
 * is rounded up to a multiple of SPA_MINBLOCKSIZE.  ibs == 0 selects the
 * default indirect block shift; either way it is clamped to
 * [DN_MIN_INDBLKSHIFT, DN_MAX_INDBLKSHIFT].  dn_slots is the number of
 * contiguous dnode slots the object occupies; bonus space beyond the first
 * slot comes out of the extra slots (DN_SLOTS_TO_BONUSLEN).  bonustype /
 * bonuslen describe the bonus buffer; DMU_OT_SA may pass bonuslen == 0
 * and is given the maximum bonus space (a single block pointer).
 *
 * The caller must hold the dnode, and dn must be in the freed state
 * (type DMU_OT_NONE, zeroed phys, no pending per-txg state).
 */
dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs,
    dmu_object_type_t bonustype, int bonuslen, int dn_slots, dmu_tx_t *tx)
{
	int i;

	ASSERT3U(dn_slots, >, 0);
	ASSERT3U(dn_slots << DNODE_SHIFT, <=,
	    spa_maxdnodesize(dmu_objset_spa(dn->dn_objset)));
	ASSERT3U(blocksize, <=,
	    spa_maxblocksize(dmu_objset_spa(dn->dn_objset)));
	if (blocksize == 0)
		blocksize = 1 << zfs_default_bs;
	else
		blocksize = P2ROUNDUP(blocksize, SPA_MINBLOCKSIZE);

	if (ibs == 0)
		ibs = zfs_default_ibs;

	ibs = MIN(MAX(ibs, DN_MIN_INDBLKSHIFT), DN_MAX_INDBLKSHIFT);

	dprintf("os=%p obj=%" PRIu64 " txg=%" PRIu64
	    " blocksize=%d ibs=%d dn_slots=%d\n",
	    dn->dn_objset, dn->dn_object, tx->tx_txg, blocksize, ibs, dn_slots);
	DNODE_STAT_BUMP(dnode_allocate);

	/* The dnode must be in the freed state with no leftover state. */
	ASSERT(dn->dn_type == DMU_OT_NONE);
	ASSERT(bcmp(dn->dn_phys, &dnode_phys_zero, sizeof (dnode_phys_t)) == 0);
	ASSERT(dn->dn_phys->dn_type == DMU_OT_NONE);
	ASSERT(ot != DMU_OT_NONE);
	ASSERT(DMU_OT_IS_VALID(ot));
	/*
	 * Bonus type and length must be set together, except that SA
	 * objects may defer sizing the bonus buffer (bonuslen == 0).
	 */
	ASSERT((bonustype == DMU_OT_NONE && bonuslen == 0) ||
	    (bonustype == DMU_OT_SA && bonuslen == 0) ||
	    (bonustype != DMU_OT_NONE && bonuslen != 0));
	ASSERT(DMU_OT_IS_VALID(bonustype));
	ASSERT3U(bonuslen, <=, DN_SLOTS_TO_BONUSLEN(dn_slots));
	ASSERT0(dn->dn_maxblkid);
	ASSERT0(dn->dn_allocated_txg);
	ASSERT0(dn->dn_dirty_txg);
	ASSERT0(dn->dn_assigned_txg);
	ASSERT(zfs_refcount_is_zero(&dn->dn_tx_holds));
	ASSERT3U(zfs_refcount_count(&dn->dn_holds), <=, 1);
	ASSERT(avl_is_empty(&dn->dn_dbufs));

	/* A freed dnode may carry no pending per-txg changes. */
	for (i = 0; i < TXG_SIZE; i++) {
		ASSERT0(dn->dn_next_nblkptr[i]);
		ASSERT0(dn->dn_next_nlevels[i]);
		ASSERT0(dn->dn_next_indblkshift[i]);
		ASSERT0(dn->dn_next_bonuslen[i]);
		ASSERT0(dn->dn_next_bonustype[i]);
		ASSERT0(dn->dn_rm_spillblk[i]);
		ASSERT0(dn->dn_next_blksz[i]);
		ASSERT0(dn->dn_next_maxblkid[i]);
		ASSERT(!multilist_link_active(&dn->dn_dirty_link[i]));
		ASSERT3P(list_head(&dn->dn_dirty_records[i]), ==, NULL);
		ASSERT3P(dn->dn_free_ranges[i], ==, NULL);
	}

	dn->dn_type = ot;
	dnode_setdblksz(dn, blocksize);
	dn->dn_indblkshift = ibs;
	dn->dn_nlevels = 1;
	dn->dn_num_slots = dn_slots;
	if (bonustype == DMU_OT_SA) /* Maximize bonus space for SA */
		dn->dn_nblkptr = 1;
	else {
		/*
		 * Bonus space not consumed by the bonus buffer is turned
		 * into additional block pointers (one per blkptr_t of
		 * leftover room), capped at DN_MAX_NBLKPTR.
		 */
		dn->dn_nblkptr = MIN(DN_MAX_NBLKPTR,
		    1 + ((DN_SLOTS_TO_BONUSLEN(dn_slots) - bonuslen) >>
		    SPA_BLKPTRSHIFT));
	}

	dn->dn_bonustype = bonustype;
	dn->dn_bonuslen = bonuslen;
	dn->dn_checksum = ZIO_CHECKSUM_INHERIT;
	dn->dn_compress = ZIO_COMPRESS_INHERIT;
	dn->dn_dirtyctx = 0;

	dn->dn_free_txg = 0;
	if (dn->dn_dirtyctx_firstset) {
		kmem_free(dn->dn_dirtyctx_firstset, 1);
		dn->dn_dirtyctx_firstset = NULL;
	}

	dn->dn_allocated_txg = tx->tx_txg;
	dn->dn_id_flags = 0;

	/*
	 * Dirty the dnode and record the new settings in the per-txg
	 * "next" arrays so dnode_sync() writes them out in this txg.
	 */
	dnode_setdirty(dn, tx);
	dn->dn_next_indblkshift[tx->tx_txg & TXG_MASK] = ibs;
	dn->dn_next_bonuslen[tx->tx_txg & TXG_MASK] = dn->dn_bonuslen;
	dn->dn_next_bonustype[tx->tx_txg & TXG_MASK] = dn->dn_bonustype;
	dn->dn_next_blksz[tx->tx_txg & TXG_MASK] = dn->dn_datablksz;
}
661fa9e4066Sahrens 
/*
 * Convert an existing object in place to type "ot" with new block size,
 * bonus layout, and slot count, preserving the object number.  Interior
 * slot handles and unreferenced dbufs are discarded, and (unless
 * keep_spill is set) any spill block is removed.  The block size may only
 * change while the object holds no data (maxblkid == 0 and block 0 is a
 * hole or already freed) -- enforced by assertion below.
 */
void
dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize,
    dmu_object_type_t bonustype, int bonuslen, int dn_slots,
    boolean_t keep_spill, dmu_tx_t *tx)
{
	int nblkptr;

	ASSERT3U(blocksize, >=, SPA_MINBLOCKSIZE);
	ASSERT3U(blocksize, <=,
	    spa_maxblocksize(dmu_objset_spa(dn->dn_objset)));
	ASSERT0(blocksize % SPA_MINBLOCKSIZE);
	ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT || dmu_tx_private_ok(tx));
	ASSERT(tx->tx_txg != 0);
	/*
	 * Bonus type and length must be set together, except that SA
	 * objects may pass bonuslen == 0.
	 */
	ASSERT((bonustype == DMU_OT_NONE && bonuslen == 0) ||
	    (bonustype != DMU_OT_NONE && bonuslen != 0) ||
	    (bonustype == DMU_OT_SA && bonuslen == 0));
	ASSERT(DMU_OT_IS_VALID(bonustype));
	ASSERT3U(bonuslen, <=,
	    DN_BONUS_SIZE(spa_maxdnodesize(dmu_objset_spa(dn->dn_objset))));
	ASSERT3U(bonuslen, <=, DN_BONUS_SIZE(dn_slots << DNODE_SHIFT));

	/* Release the DN_SLOT_INTERIOR slots this object previously used. */
	dnode_free_interior_slots(dn);
	DNODE_STAT_BUMP(dnode_reallocate);

	/* clean up any unreferenced dbufs */
	dnode_evict_dbufs(dn);

	dn->dn_id_flags = 0;

	rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
	dnode_setdirty(dn, tx);
	if (dn->dn_datablksz != blocksize) {
		/* change blocksize */
		ASSERT(dn->dn_maxblkid == 0 &&
		    (BP_IS_HOLE(&dn->dn_phys->dn_blkptr[0]) ||
		    dnode_block_freed(dn, 0)));
		dnode_setdblksz(dn, blocksize);
		dn->dn_next_blksz[tx->tx_txg&TXG_MASK] = blocksize;
	}
	if (dn->dn_bonuslen != bonuslen)
		dn->dn_next_bonuslen[tx->tx_txg&TXG_MASK] = bonuslen;

	if (bonustype == DMU_OT_SA) /* Maximize bonus space for SA */
		nblkptr = 1;
	else
		/*
		 * Bonus space not consumed by the bonus buffer becomes
		 * additional block pointers, capped at DN_MAX_NBLKPTR.
		 */
		nblkptr = MIN(DN_MAX_NBLKPTR,
		    1 + ((DN_SLOTS_TO_BONUSLEN(dn_slots) - bonuslen) >>
		    SPA_BLKPTRSHIFT));
	/*
	 * Record changed settings in the per-txg "next" arrays so
	 * dnode_sync() applies them in this txg.
	 */
	if (dn->dn_bonustype != bonustype)
		dn->dn_next_bonustype[tx->tx_txg&TXG_MASK] = bonustype;
	if (dn->dn_nblkptr != nblkptr)
		dn->dn_next_nblkptr[tx->tx_txg&TXG_MASK] = nblkptr;
	if (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR && !keep_spill) {
		dbuf_rm_spill(dn, tx);
		dnode_rm_spill(dn, tx);
	}
	rw_exit(&dn->dn_struct_rwlock);

	/* change type */
	dn->dn_type = ot;

	/* change bonus size and type */
	mutex_enter(&dn->dn_mtx);
	dn->dn_bonustype = bonustype;
	dn->dn_bonuslen = bonuslen;
	dn->dn_num_slots = dn_slots;
	dn->dn_nblkptr = nblkptr;
	dn->dn_checksum = ZIO_CHECKSUM_INHERIT;
	dn->dn_compress = ZIO_COMPRESS_INHERIT;
	ASSERT3U(dn->dn_nblkptr, <=, DN_MAX_NBLKPTR);

	/* fix up the bonus db_size */
	if (dn->dn_bonus) {
		dn->dn_bonus->db.db_size =
		    DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots) -
		    (dn->dn_nblkptr - 1) * sizeof (blkptr_t);
		ASSERT(dn->dn_bonuslen <= dn->dn_bonus->db.db_size);
	}

	dn->dn_allocated_txg = tx->tx_txg;
	mutex_exit(&dn->dn_mtx);
}
744fa9e4066Sahrens 
745f06dce2cSAndrew Stormont #ifdef	_KERNEL
/*
 * Migrate the contents of dnode "odn" into the newly-allocated dnode
 * "ndn" on behalf of dnode_move().  The caller holds the locks that make
 * this safe (handle zrlock, os_lock/os->os_lock protocol in dnode_move()).
 * On return, ndn owns all of odn's state and the back pointers (handle,
 * zfetch) point at ndn; odn has been scrubbed so that the kmem destructor
 * accepts it and any later move callback recognizes it as invalid.
 */
static void
dnode_move_impl(dnode_t *odn, dnode_t *ndn)
{
	int i;

	/* No thread may hold any of odn's locks while its state moves. */
	ASSERT(!RW_LOCK_HELD(&odn->dn_struct_rwlock));
	ASSERT(MUTEX_NOT_HELD(&odn->dn_mtx));
	ASSERT(MUTEX_NOT_HELD(&odn->dn_dbufs_mtx));
	ASSERT(!RW_LOCK_HELD(&odn->dn_zfetch.zf_rwlock));

	/* Copy fields. */
	ndn->dn_objset = odn->dn_objset;
	ndn->dn_object = odn->dn_object;
	ndn->dn_dbuf = odn->dn_dbuf;
	ndn->dn_handle = odn->dn_handle;
	ndn->dn_phys = odn->dn_phys;
	ndn->dn_type = odn->dn_type;
	ndn->dn_bonuslen = odn->dn_bonuslen;
	ndn->dn_bonustype = odn->dn_bonustype;
	ndn->dn_nblkptr = odn->dn_nblkptr;
	ndn->dn_checksum = odn->dn_checksum;
	ndn->dn_compress = odn->dn_compress;
	ndn->dn_nlevels = odn->dn_nlevels;
	ndn->dn_indblkshift = odn->dn_indblkshift;
	ndn->dn_datablkshift = odn->dn_datablkshift;
	ndn->dn_datablkszsec = odn->dn_datablkszsec;
	ndn->dn_datablksz = odn->dn_datablksz;
	ndn->dn_maxblkid = odn->dn_maxblkid;
	ndn->dn_num_slots = odn->dn_num_slots;
	/* Per-txg pending-change arrays are copied wholesale. */
	bcopy(&odn->dn_next_type[0], &ndn->dn_next_type[0],
	    sizeof (odn->dn_next_type));
	bcopy(&odn->dn_next_nblkptr[0], &ndn->dn_next_nblkptr[0],
	    sizeof (odn->dn_next_nblkptr));
	bcopy(&odn->dn_next_nlevels[0], &ndn->dn_next_nlevels[0],
	    sizeof (odn->dn_next_nlevels));
	bcopy(&odn->dn_next_indblkshift[0], &ndn->dn_next_indblkshift[0],
	    sizeof (odn->dn_next_indblkshift));
	bcopy(&odn->dn_next_bonustype[0], &ndn->dn_next_bonustype[0],
	    sizeof (odn->dn_next_bonustype));
	bcopy(&odn->dn_rm_spillblk[0], &ndn->dn_rm_spillblk[0],
	    sizeof (odn->dn_rm_spillblk));
	bcopy(&odn->dn_next_bonuslen[0], &ndn->dn_next_bonuslen[0],
	    sizeof (odn->dn_next_bonuslen));
	bcopy(&odn->dn_next_blksz[0], &ndn->dn_next_blksz[0],
	    sizeof (odn->dn_next_blksz));
	bcopy(&odn->dn_next_maxblkid[0], &ndn->dn_next_maxblkid[0],
	    sizeof (odn->dn_next_maxblkid));
	for (i = 0; i < TXG_SIZE; i++) {
		list_move_tail(&ndn->dn_dirty_records[i],
		    &odn->dn_dirty_records[i]);
	}
	bcopy(&odn->dn_free_ranges[0], &ndn->dn_free_ranges[0],
	    sizeof (odn->dn_free_ranges));
	ndn->dn_allocated_txg = odn->dn_allocated_txg;
	ndn->dn_free_txg = odn->dn_free_txg;
	ndn->dn_assigned_txg = odn->dn_assigned_txg;
	ndn->dn_dirty_txg = odn->dn_dirty_txg;
	ndn->dn_dirtyctx = odn->dn_dirtyctx;
	ndn->dn_dirtyctx_firstset = odn->dn_dirtyctx_firstset;
	ASSERT(zfs_refcount_count(&odn->dn_tx_holds) == 0);
	zfs_refcount_transfer(&ndn->dn_holds, &odn->dn_holds);
	ASSERT(avl_is_empty(&ndn->dn_dbufs));
	avl_swap(&ndn->dn_dbufs, &odn->dn_dbufs);
	ndn->dn_dbufs_count = odn->dn_dbufs_count;
	ndn->dn_bonus = odn->dn_bonus;
	ndn->dn_have_spill = odn->dn_have_spill;
	ndn->dn_zio = odn->dn_zio;
	ndn->dn_oldused = odn->dn_oldused;
	ndn->dn_oldflags = odn->dn_oldflags;
	ndn->dn_olduid = odn->dn_olduid;
	ndn->dn_oldgid = odn->dn_oldgid;
	ndn->dn_oldprojid = odn->dn_oldprojid;
	ndn->dn_newuid = odn->dn_newuid;
	ndn->dn_newgid = odn->dn_newgid;
	ndn->dn_newprojid = odn->dn_newprojid;
	ndn->dn_id_flags = odn->dn_id_flags;
	dmu_zfetch_init(&ndn->dn_zfetch, NULL);
	list_move_tail(&ndn->dn_zfetch.zf_stream, &odn->dn_zfetch.zf_stream);
	ndn->dn_zfetch.zf_dnode = odn->dn_zfetch.zf_dnode;

	/*
	 * Update back pointers. Updating the handle fixes the back pointer of
	 * every descendant dbuf as well as the bonus dbuf.
	 */
	ASSERT(ndn->dn_handle->dnh_dnode == odn);
	ndn->dn_handle->dnh_dnode = ndn;
	if (ndn->dn_zfetch.zf_dnode == odn) {
		ndn->dn_zfetch.zf_dnode = ndn;
	}

	/*
	 * Invalidate the original dnode by clearing all of its back pointers.
	 */
	odn->dn_dbuf = NULL;
	odn->dn_handle = NULL;
	avl_create(&odn->dn_dbufs, dbuf_compare, sizeof (dmu_buf_impl_t),
	    offsetof(dmu_buf_impl_t, db_link));
	odn->dn_dbufs_count = 0;
	odn->dn_bonus = NULL;
	odn->dn_zfetch.zf_dnode = NULL;

	/*
	 * Set the low bit of the objset pointer to ensure that dnode_move()
	 * recognizes the dnode as invalid in any subsequent callback.
	 */
	POINTER_INVALIDATE(&odn->dn_objset);

	/*
	 * Satisfy the destructor.
	 */
	for (i = 0; i < TXG_SIZE; i++) {
		list_create(&odn->dn_dirty_records[i],
		    sizeof (dbuf_dirty_record_t),
		    offsetof(dbuf_dirty_record_t, dr_dirty_node));
		odn->dn_free_ranges[i] = NULL;
		odn->dn_next_nlevels[i] = 0;
		odn->dn_next_indblkshift[i] = 0;
		odn->dn_next_bonustype[i] = 0;
		odn->dn_rm_spillblk[i] = 0;
		odn->dn_next_bonuslen[i] = 0;
		odn->dn_next_blksz[i] = 0;
	}
	odn->dn_allocated_txg = 0;
	odn->dn_free_txg = 0;
	odn->dn_assigned_txg = 0;
	odn->dn_dirty_txg = 0;
	odn->dn_dirtyctx = 0;
	odn->dn_dirtyctx_firstset = NULL;
	odn->dn_have_spill = B_FALSE;
	odn->dn_zio = NULL;
	odn->dn_oldused = 0;
	odn->dn_oldflags = 0;
	odn->dn_olduid = 0;
	odn->dn_oldgid = 0;
	odn->dn_oldprojid = ZFS_DEFAULT_PROJID;
	odn->dn_newuid = 0;
	odn->dn_newgid = 0;
	odn->dn_newprojid = ZFS_DEFAULT_PROJID;
	odn->dn_id_flags = 0;

	/*
	 * Mark the dnode.
	 */
	ndn->dn_moved = 1;
	odn->dn_moved = (uint8_t)-1;
}
892744947dcSTom Erickson 
/*
 * kmem cache move callback for the dnode cache: attempt to relocate the
 * dnode in "buf" to the new allocation "newbuf" so kmem can defragment
 * the slab.  Returns KMEM_CBRC_YES when the move succeeded,
 * KMEM_CBRC_DONT_KNOW when buf cannot be identified as a live dnode,
 * KMEM_CBRC_NO when the dnode must never move (special dnodes), and
 * KMEM_CBRC_LATER when the dnode is busy and the move should be retried.
 */
/*ARGSUSED*/
static kmem_cbrc_t
dnode_move(void *buf, void *newbuf, size_t size, void *arg)
{
	dnode_t *odn = buf, *ndn = newbuf;
	objset_t *os;
	int64_t refcount;
	uint32_t dbufs;

	/*
	 * The dnode is on the objset's list of known dnodes if the objset
	 * pointer is valid. We set the low bit of the objset pointer when
	 * freeing the dnode to invalidate it, and the memory patterns written
	 * by kmem (baddcafe and deadbeef) set at least one of the two low bits.
	 * A newly created dnode sets the objset pointer last of all to indicate
	 * that the dnode is known and in a valid state to be moved by this
	 * function.
	 */
	os = odn->dn_objset;
	if (!POINTER_IS_VALID(os)) {
		DNODE_STAT_BUMP(dnode_move_invalid);
		return (KMEM_CBRC_DONT_KNOW);
	}

	/*
	 * Ensure that the objset does not go away during the move.
	 */
	rw_enter(&os_lock, RW_WRITER);
	if (os != odn->dn_objset) {
		rw_exit(&os_lock);
		DNODE_STAT_BUMP(dnode_move_recheck1);
		return (KMEM_CBRC_DONT_KNOW);
	}

	/*
	 * If the dnode is still valid, then so is the objset. We know that no
	 * valid objset can be freed while we hold os_lock, so we can safely
	 * ensure that the objset remains in use.
	 */
	mutex_enter(&os->os_lock);

	/*
	 * Recheck the objset pointer in case the dnode was removed just before
	 * acquiring the lock.
	 */
	if (os != odn->dn_objset) {
		mutex_exit(&os->os_lock);
		rw_exit(&os_lock);
		DNODE_STAT_BUMP(dnode_move_recheck2);
		return (KMEM_CBRC_DONT_KNOW);
	}

	/*
	 * At this point we know that as long as we hold os->os_lock, the dnode
	 * cannot be freed and fields within the dnode can be safely accessed.
	 * The objset listing this dnode cannot go away as long as this dnode is
	 * on its list.
	 */
	rw_exit(&os_lock);
	if (DMU_OBJECT_IS_SPECIAL(odn->dn_object)) {
		mutex_exit(&os->os_lock);
		DNODE_STAT_BUMP(dnode_move_special);
		return (KMEM_CBRC_NO);
	}
	ASSERT(odn->dn_dbuf != NULL); /* only "special" dnodes have no parent */

	/*
	 * Lock the dnode handle to prevent the dnode from obtaining any new
	 * holds. This also prevents the descendant dbufs and the bonus dbuf
	 * from accessing the dnode, so that we can discount their holds. The
	 * handle is safe to access because we know that while the dnode cannot
	 * go away, neither can its handle. Once we hold dnh_zrlock, we can
	 * safely move any dnode referenced only by dbufs.
	 */
	if (!zrl_tryenter(&odn->dn_handle->dnh_zrlock)) {
		mutex_exit(&os->os_lock);
		DNODE_STAT_BUMP(dnode_move_handle);
		return (KMEM_CBRC_LATER);
	}

	/*
	 * Ensure a consistent view of the dnode's holds and the dnode's dbufs.
	 * We need to guarantee that there is a hold for every dbuf in order to
	 * determine whether the dnode is actively referenced. Falsely matching
	 * a dbuf to an active hold would lead to an unsafe move. It's possible
	 * that a thread already having an active dnode hold is about to add a
	 * dbuf, and we can't compare hold and dbuf counts while the add is in
	 * progress.
	 */
	if (!rw_tryenter(&odn->dn_struct_rwlock, RW_WRITER)) {
		zrl_exit(&odn->dn_handle->dnh_zrlock);
		mutex_exit(&os->os_lock);
		DNODE_STAT_BUMP(dnode_move_rwlock);
		return (KMEM_CBRC_LATER);
	}

	/*
	 * A dbuf may be removed (evicted) without an active dnode hold. In that
	 * case, the dbuf count is decremented under the handle lock before the
	 * dbuf's hold is released. This order ensures that if we count the hold
	 * after the dbuf is removed but before its hold is released, we will
	 * treat the unmatched hold as active and exit safely. If we count the
	 * hold before the dbuf is removed, the hold is discounted, and the
	 * removal is blocked until the move completes.
	 */
	refcount = zfs_refcount_count(&odn->dn_holds);
	ASSERT(refcount >= 0);
	dbufs = odn->dn_dbufs_count;

	/* We can't have more dbufs than dnode holds. */
	ASSERT3U(dbufs, <=, refcount);
	DTRACE_PROBE3(dnode__move, dnode_t *, odn, int64_t, refcount,
	    uint32_t, dbufs);

	/*
	 * Any hold beyond those accounted to dbufs means a thread is
	 * actively using the dnode; it is not safe to move now.
	 */
	if (refcount > dbufs) {
		rw_exit(&odn->dn_struct_rwlock);
		zrl_exit(&odn->dn_handle->dnh_zrlock);
		mutex_exit(&os->os_lock);
		DNODE_STAT_BUMP(dnode_move_active);
		return (KMEM_CBRC_LATER);
	}

	rw_exit(&odn->dn_struct_rwlock);

	/*
	 * At this point we know that anyone with a hold on the dnode is not
	 * actively referencing it. The dnode is known and in a valid state to
	 * move. We're holding the locks needed to execute the critical section.
	 */
	dnode_move_impl(odn, ndn);

	list_link_replace(&odn->dn_link, &ndn->dn_link);
	/* If the dnode was safe to move, the refcount cannot have changed. */
	ASSERT(refcount == zfs_refcount_count(&ndn->dn_holds));
	ASSERT(dbufs == ndn->dn_dbufs_count);
	zrl_exit(&ndn->dn_handle->dnh_zrlock); /* handle has moved */
	mutex_exit(&os->os_lock);

	return (KMEM_CBRC_YES);
}
1033744947dcSTom Erickson #endif	/* _KERNEL */
1034744947dcSTom Erickson 
103554811da5SToomas Soome static void
dnode_slots_hold(dnode_children_t * children,int idx,int slots)103654811da5SToomas Soome dnode_slots_hold(dnode_children_t *children, int idx, int slots)
103754811da5SToomas Soome {
103854811da5SToomas Soome 	ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK);
103954811da5SToomas Soome 
104054811da5SToomas Soome 	for (int i = idx; i < idx + slots; i++) {
104154811da5SToomas Soome 		dnode_handle_t *dnh = &children->dnc_children[i];
104254811da5SToomas Soome 		zrl_add(&dnh->dnh_zrlock);
104354811da5SToomas Soome 	}
104454811da5SToomas Soome }
104554811da5SToomas Soome 
104654811da5SToomas Soome static void
dnode_slots_rele(dnode_children_t * children,int idx,int slots)104754811da5SToomas Soome dnode_slots_rele(dnode_children_t *children, int idx, int slots)
104854811da5SToomas Soome {
104954811da5SToomas Soome 	ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK);
105054811da5SToomas Soome 
105154811da5SToomas Soome 	for (int i = idx; i < idx + slots; i++) {
105254811da5SToomas Soome 		dnode_handle_t *dnh = &children->dnc_children[i];
105354811da5SToomas Soome 
105454811da5SToomas Soome 		if (zrl_is_locked(&dnh->dnh_zrlock))
105554811da5SToomas Soome 			zrl_exit(&dnh->dnh_zrlock);
105654811da5SToomas Soome 		else
105754811da5SToomas Soome 			zrl_remove(&dnh->dnh_zrlock);
105854811da5SToomas Soome 	}
105954811da5SToomas Soome }
106054811da5SToomas Soome 
106154811da5SToomas Soome static int
dnode_slots_tryenter(dnode_children_t * children,int idx,int slots)106254811da5SToomas Soome dnode_slots_tryenter(dnode_children_t *children, int idx, int slots)
106354811da5SToomas Soome {
106454811da5SToomas Soome 	ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK);
106554811da5SToomas Soome 
106654811da5SToomas Soome 	for (int i = idx; i < idx + slots; i++) {
106754811da5SToomas Soome 		dnode_handle_t *dnh = &children->dnc_children[i];
106854811da5SToomas Soome 
106954811da5SToomas Soome 		if (!zrl_tryenter(&dnh->dnh_zrlock)) {
107054811da5SToomas Soome 			for (int j = idx; j < i; j++) {
107154811da5SToomas Soome 				dnh = &children->dnc_children[j];
107254811da5SToomas Soome 				zrl_exit(&dnh->dnh_zrlock);
107354811da5SToomas Soome 			}
107454811da5SToomas Soome 
107554811da5SToomas Soome 			return (0);
107654811da5SToomas Soome 		}
107754811da5SToomas Soome 	}
107854811da5SToomas Soome 
107954811da5SToomas Soome 	return (1);
108054811da5SToomas Soome }
108154811da5SToomas Soome 
108254811da5SToomas Soome static void
dnode_set_slots(dnode_children_t * children,int idx,int slots,void * ptr)108354811da5SToomas Soome dnode_set_slots(dnode_children_t *children, int idx, int slots, void *ptr)
108454811da5SToomas Soome {
108554811da5SToomas Soome 	ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK);
108654811da5SToomas Soome 
108754811da5SToomas Soome 	for (int i = idx; i < idx + slots; i++) {
108854811da5SToomas Soome 		dnode_handle_t *dnh = &children->dnc_children[i];
108954811da5SToomas Soome 		dnh->dnh_dnode = ptr;
109054811da5SToomas Soome 	}
109154811da5SToomas Soome }
109254811da5SToomas Soome 
109354811da5SToomas Soome static boolean_t
dnode_check_slots_free(dnode_children_t * children,int idx,int slots)109454811da5SToomas Soome dnode_check_slots_free(dnode_children_t *children, int idx, int slots)
109554811da5SToomas Soome {
109654811da5SToomas Soome 	ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK);
109754811da5SToomas Soome 
1098aa02ea01STom Caputi 	/*
1099aa02ea01STom Caputi 	 * If all dnode slots are either already free or
1100aa02ea01STom Caputi 	 * evictable return B_TRUE.
1101aa02ea01STom Caputi 	 */
110254811da5SToomas Soome 	for (int i = idx; i < idx + slots; i++) {
110354811da5SToomas Soome 		dnode_handle_t *dnh = &children->dnc_children[i];
110454811da5SToomas Soome 		dnode_t *dn = dnh->dnh_dnode;
110554811da5SToomas Soome 
110654811da5SToomas Soome 		if (dn == DN_SLOT_FREE) {
110754811da5SToomas Soome 			continue;
110854811da5SToomas Soome 		} else if (DN_SLOT_IS_PTR(dn)) {
110954811da5SToomas Soome 			mutex_enter(&dn->dn_mtx);
1110aa02ea01STom Caputi 			boolean_t can_free = (dn->dn_type == DMU_OT_NONE &&
1111aa02ea01STom Caputi 			    zfs_refcount_is_zero(&dn->dn_holds) &&
1112aa02ea01STom Caputi 			    !DNODE_IS_DIRTY(dn));
111354811da5SToomas Soome 			mutex_exit(&dn->dn_mtx);
111454811da5SToomas Soome 
1115aa02ea01STom Caputi 			if (!can_free)
111654811da5SToomas Soome 				return (B_FALSE);
1117aa02ea01STom Caputi 			else
1118aa02ea01STom Caputi 				continue;
111954811da5SToomas Soome 		} else {
112054811da5SToomas Soome 			return (B_FALSE);
112154811da5SToomas Soome 		}
112254811da5SToomas Soome 	}
112354811da5SToomas Soome 
112454811da5SToomas Soome 	return (B_TRUE);
112554811da5SToomas Soome }
112654811da5SToomas Soome 
112754811da5SToomas Soome static void
dnode_reclaim_slots(dnode_children_t * children,int idx,int slots)112854811da5SToomas Soome dnode_reclaim_slots(dnode_children_t *children, int idx, int slots)
112954811da5SToomas Soome {
113054811da5SToomas Soome 	ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK);
113154811da5SToomas Soome 
113254811da5SToomas Soome 	for (int i = idx; i < idx + slots; i++) {
113354811da5SToomas Soome 		dnode_handle_t *dnh = &children->dnc_children[i];
113454811da5SToomas Soome 
113554811da5SToomas Soome 		ASSERT(zrl_is_locked(&dnh->dnh_zrlock));
113654811da5SToomas Soome 
113754811da5SToomas Soome 		if (DN_SLOT_IS_PTR(dnh->dnh_dnode)) {
113854811da5SToomas Soome 			ASSERT3S(dnh->dnh_dnode->dn_type, ==, DMU_OT_NONE);
113954811da5SToomas Soome 			dnode_destroy(dnh->dnh_dnode);
114054811da5SToomas Soome 			dnh->dnh_dnode = DN_SLOT_FREE;
114154811da5SToomas Soome 		}
114254811da5SToomas Soome 	}
114354811da5SToomas Soome }
114454811da5SToomas Soome 
114554811da5SToomas Soome void
dnode_free_interior_slots(dnode_t * dn)114654811da5SToomas Soome dnode_free_interior_slots(dnode_t *dn)
114754811da5SToomas Soome {
114854811da5SToomas Soome 	dnode_children_t *children = dmu_buf_get_user(&dn->dn_dbuf->db);
114954811da5SToomas Soome 	int epb = dn->dn_dbuf->db.db_size >> DNODE_SHIFT;
115054811da5SToomas Soome 	int idx = (dn->dn_object & (epb - 1)) + 1;
115154811da5SToomas Soome 	int slots = dn->dn_num_slots - 1;
115254811da5SToomas Soome 
115354811da5SToomas Soome 	if (slots == 0)
115454811da5SToomas Soome 		return;
115554811da5SToomas Soome 
115654811da5SToomas Soome 	ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK);
115754811da5SToomas Soome 
115854811da5SToomas Soome 	while (!dnode_slots_tryenter(children, idx, slots))
115954811da5SToomas Soome 		DNODE_STAT_BUMP(dnode_free_interior_lock_retry);
116054811da5SToomas Soome 
116154811da5SToomas Soome 	dnode_set_slots(children, idx, slots, DN_SLOT_FREE);
116254811da5SToomas Soome 	dnode_slots_rele(children, idx, slots);
116354811da5SToomas Soome }
116454811da5SToomas Soome 
/*
 * Tear down a dnode opened with dnode_special_open().  Waits for any
 * outstanding holds to be released, then destroys the in-core dnode and
 * its handle lock.
 */
void
dnode_special_close(dnode_handle_t *dnh)
{
	dnode_t *dn = dnh->dnh_dnode;

	/*
	 * Ensure dnode_rele_and_unlock() has released dn_mtx, after final
	 * zfs_refcount_remove()
	 */
	mutex_enter(&dn->dn_mtx);
	if (zfs_refcount_count(&dn->dn_holds) > 0)
		cv_wait(&dn->dn_nodnholds, &dn->dn_mtx);
	mutex_exit(&dn->dn_mtx);
	ASSERT3U(zfs_refcount_count(&dn->dn_holds), ==, 0);

	ASSERT(dn->dn_dbuf == NULL ||
	    dmu_buf_get_user(&dn->dn_dbuf->db) == NULL);
	/* Hold the handle lock across destruction of its dnode. */
	zrl_add(&dnh->dnh_zrlock);
	dnode_destroy(dn); /* implicit zrl_remove() */
	zrl_destroy(&dnh->dnh_zrlock);
	dnh->dnh_dnode = NULL;
}
1187fa9e4066Sahrens 
/*
 * Instantiate an in-core dnode directly from a dnode_phys_t, with no
 * parent dbuf (dnode_create() is passed NULL), storing it in the given
 * handle.  The handle's zrlock is initialized and held across the
 * creation; counterpart of dnode_special_close().
 */
void
dnode_special_open(objset_t *os, dnode_phys_t *dnp, uint64_t object,
    dnode_handle_t *dnh)
{
	dnode_t *dn;

	zrl_init(&dnh->dnh_zrlock);
	/* Freshly initialized lock: tryenter must succeed. */
	VERIFY3U(1, ==, zrl_tryenter(&dnh->dnh_zrlock));

	dn = dnode_create(os, dnp, NULL, object, dnh);
	DNODE_VERIFY(dn);

	zrl_exit(&dnh->dnh_zrlock);
}
1202fa9e4066Sahrens 
/*
 * Async eviction callback for a meta dnode dbuf's user data (its
 * dnode_children_t handle array): destroy any instantiated dnodes,
 * tear down every handle's zrlock, and free the array itself.
 */
static void
dnode_buf_evict_async(void *dbu)
{
	dnode_children_t *dnc = dbu;

	DNODE_STAT_BUMP(dnode_buf_evict);

	for (int i = 0; i < dnc->dnc_count; i++) {
		dnode_handle_t *dnh = &dnc->dnc_children[i];
		dnode_t *dn;

		/*
		 * The dnode handle lock guards against the dnode moving to
		 * another valid address, so there is no need here to guard
		 * against changes to or from NULL.
		 */
		if (!DN_SLOT_IS_PTR(dnh->dnh_dnode)) {
			/* Slot holds a sentinel, not a dnode pointer. */
			zrl_destroy(&dnh->dnh_zrlock);
			dnh->dnh_dnode = DN_SLOT_UNINIT;
			continue;
		}

		zrl_add(&dnh->dnh_zrlock);
		dn = dnh->dnh_dnode;
		/*
		 * If there are holds on this dnode, then there should
		 * be holds on the dnode's containing dbuf as well; thus
		 * it wouldn't be eligible for eviction and this function
		 * would not have been called.
		 */
		ASSERT(zfs_refcount_is_zero(&dn->dn_holds));
		ASSERT(zfs_refcount_is_zero(&dn->dn_tx_holds));

		dnode_destroy(dn); /* implicit zrl_remove() for first slot */
		zrl_destroy(&dnh->dnh_zrlock);
		dnh->dnh_dnode = DN_SLOT_UNINIT;
	}
	kmem_free(dnc, sizeof (dnode_children_t) +
	    dnc->dnc_count * sizeof (dnode_handle_t));
}
1243fa9e4066Sahrens 
1244fa9e4066Sahrens /*
124554811da5SToomas Soome  * When the DNODE_MUST_BE_FREE flag is set, the "slots" parameter is used
124654811da5SToomas Soome  * to ensure the hole at the specified object offset is large enough to
124754811da5SToomas Soome  * hold the dnode being created. The slots parameter is also used to ensure
124854811da5SToomas Soome  * a dnode does not span multiple dnode blocks. In both of these cases, if
124954811da5SToomas Soome  * a failure occurs, ENOSPC is returned. Keep in mind, these failure cases
125054811da5SToomas Soome  * are only possible when using DNODE_MUST_BE_FREE.
125154811da5SToomas Soome  *
125254811da5SToomas Soome  * If the DNODE_MUST_BE_ALLOCATED flag is set, "slots" must be 0.
125354811da5SToomas Soome  * dnode_hold_impl() will check if the requested dnode is already consumed
 * as an extra dnode slot by a large dnode, in which case it returns
125554811da5SToomas Soome  * ENOENT.
125654811da5SToomas Soome  *
1257d8849d7dSChunwei Chen  * If the DNODE_DRY_RUN flag is set, we don't actually hold the dnode, just
 * return whether the hold would succeed or not.  In this case, tag and
 * dnp should be set to NULL.
1260d8849d7dSChunwei Chen  *
1261ea8dc4b6Seschrock  * errors:
126254811da5SToomas Soome  * EINVAL - invalid object number or flags.
126354811da5SToomas Soome  * ENOSPC - hole too small to fulfill "slots" request (DNODE_MUST_BE_FREE)
126454811da5SToomas Soome  * EEXIST - Refers to an allocated dnode (DNODE_MUST_BE_FREE)
126554811da5SToomas Soome  *        - Refers to a freeing dnode (DNODE_MUST_BE_FREE)
126654811da5SToomas Soome  *        - Refers to an interior dnode slot (DNODE_MUST_BE_ALLOCATED)
126754811da5SToomas Soome  * ENOENT - The requested dnode is not allocated (DNODE_MUST_BE_ALLOCATED)
126854811da5SToomas Soome  *        - The requested dnode is being freed (DNODE_MUST_BE_ALLOCATED)
 * EIO    - i/o error when reading the meta dnode dbuf.
1270ea8dc4b6Seschrock  * succeeds even for free dnodes.
1271fa9e4066Sahrens  */
/* Hold a dnode by object number; see the block comment above for flags/errors. */
int
dnode_hold_impl(objset_t *os, uint64_t object, int flag, int slots,
    void *tag, dnode_t **dnp)
{
	int epb, idx, err;
	int drop_struct_lock = FALSE;
	int type;
	uint64_t blk;
	dnode_t *mdn, *dn;
	dmu_buf_impl_t *db;
	dnode_children_t *dnc;
	dnode_phys_t *dn_block;
	dnode_handle_t *dnh;

	ASSERT(!(flag & DNODE_MUST_BE_ALLOCATED) || (slots == 0));
	ASSERT(!(flag & DNODE_MUST_BE_FREE) || (slots > 0));
	IMPLY(flag & DNODE_DRY_RUN, (tag == NULL) && (dnp == NULL));

	/*
	 * If you are holding the spa config lock as writer, you shouldn't
	 * be asking the DMU to do *anything* unless it's the root pool
	 * which may require us to read from the root filesystem while
	 * holding some (not all) of the locks as writer.
	 */
	ASSERT(spa_config_held(os->os_spa, SCL_ALL, RW_WRITER) == 0 ||
	    (spa_is_root(os->os_spa) &&
	    spa_config_held(os->os_spa, SCL_STATE, RW_WRITER)));

	ASSERT((flag & DNODE_MUST_BE_ALLOCATED) || (flag & DNODE_MUST_BE_FREE));

	/*
	 * Special accounting objects live in the objset itself, not in the
	 * meta dnode block, so they are handled separately up front.
	 */
	if (object == DMU_USERUSED_OBJECT || object == DMU_GROUPUSED_OBJECT ||
	    object == DMU_PROJECTUSED_OBJECT) {
		if (object == DMU_USERUSED_OBJECT)
			dn = DMU_USERUSED_DNODE(os);
		else if (object == DMU_GROUPUSED_OBJECT)
			dn = DMU_GROUPUSED_DNODE(os);
		else
			dn = DMU_PROJECTUSED_DNODE(os);
		if (dn == NULL)
			return (SET_ERROR(ENOENT));
		type = dn->dn_type;
		if ((flag & DNODE_MUST_BE_ALLOCATED) && type == DMU_OT_NONE)
			return (SET_ERROR(ENOENT));
		if ((flag & DNODE_MUST_BE_FREE) && type != DMU_OT_NONE)
			return (SET_ERROR(EEXIST));
		DNODE_VERIFY(dn);
		/* Don't actually hold if dry run, just return 0 */
		if (!(flag & DNODE_DRY_RUN)) {
			(void) zfs_refcount_add(&dn->dn_holds, tag);
			*dnp = dn;
		}
		return (0);
	}

	if (object == 0 || object >= DN_MAX_OBJECT)
		return (SET_ERROR(EINVAL));

	mdn = DMU_META_DNODE(os);
	ASSERT(mdn->dn_object == DMU_META_DNODE_OBJECT);

	DNODE_VERIFY(mdn);

	if (!RW_WRITE_HELD(&mdn->dn_struct_rwlock)) {
		rw_enter(&mdn->dn_struct_rwlock, RW_READER);
		drop_struct_lock = TRUE;
	}

	/* Find and read the meta dnode dbuf containing this object's slot. */
	blk = dbuf_whichblock(mdn, 0, object * sizeof (dnode_phys_t));
	db = dbuf_hold(mdn, blk, FTAG);
	if (drop_struct_lock)
		rw_exit(&mdn->dn_struct_rwlock);
	if (db == NULL) {
		DNODE_STAT_BUMP(dnode_hold_dbuf_hold);
		return (SET_ERROR(EIO));
	}
	/*
	 * We do not need to decrypt to read the dnode so it doesn't matter
	 * if we get the encrypted or decrypted version.
	 */
	err = dbuf_read(db, NULL, DB_RF_CANFAIL | DB_RF_NO_DECRYPT);
	if (err) {
		DNODE_STAT_BUMP(dnode_hold_dbuf_read);
		dbuf_rele(db, FTAG);
		return (err);
	}

	ASSERT3U(db->db.db_size, >=, 1<<DNODE_SHIFT);
	epb = db->db.db_size >> DNODE_SHIFT;

	/* idx is this object's slot within its meta dnode block */
	idx = object & (epb - 1);
	dn_block = (dnode_phys_t *)db->db.db_data;

	ASSERT(DB_DNODE(db)->dn_type == DMU_OT_DNODE);
	dnc = dmu_buf_get_user(&db->db);
	dnh = NULL;
	if (dnc == NULL) {
		dnode_children_t *winner;
		int skip = 0;

		/* First hold of this block: build its handle array. */
		dnc = kmem_zalloc(sizeof (dnode_children_t) +
		    epb * sizeof (dnode_handle_t), KM_SLEEP);
		dnc->dnc_count = epb;
		dnh = &dnc->dnc_children[0];

		/* Initialize dnode slot status from dnode_phys_t */
		for (int i = 0; i < epb; i++) {
			zrl_init(&dnh[i].dnh_zrlock);

			/* skip counts interior slots already accounted for */
			if (skip) {
				skip--;
				continue;
			}

			if (dn_block[i].dn_type != DMU_OT_NONE) {
				int interior = dn_block[i].dn_extra_slots;

				dnode_set_slots(dnc, i, 1, DN_SLOT_ALLOCATED);
				dnode_set_slots(dnc, i + 1, interior,
				    DN_SLOT_INTERIOR);
				skip = interior;
			} else {
				dnh[i].dnh_dnode = DN_SLOT_FREE;
				skip = 0;
			}
		}

		dmu_buf_init_user(&dnc->dnc_dbu, NULL,
		    dnode_buf_evict_async, NULL);
		winner = dmu_buf_set_user(&db->db, &dnc->dnc_dbu);
		if (winner != NULL) {
			/* Lost the attach race; discard ours, use winner's. */
			for (int i = 0; i < epb; i++)
				zrl_destroy(&dnh[i].dnh_zrlock);

			kmem_free(dnc, sizeof (dnode_children_t) +
			    epb * sizeof (dnode_handle_t));
			dnc = winner;
		}
	}

	ASSERT(dnc->dnc_count == epb);
	dn = DN_SLOT_UNINIT;

	if (flag & DNODE_MUST_BE_ALLOCATED) {
		slots = 1;

		/* Retry until we observe or create the in-core dnode. */
		while (dn == DN_SLOT_UNINIT) {
			dnode_slots_hold(dnc, idx, slots);
			dnh = &dnc->dnc_children[idx];

			if (DN_SLOT_IS_PTR(dnh->dnh_dnode)) {
				dn = dnh->dnh_dnode;
				break;
			} else if (dnh->dnh_dnode == DN_SLOT_INTERIOR) {
				DNODE_STAT_BUMP(dnode_hold_alloc_interior);
				dnode_slots_rele(dnc, idx, slots);
				dbuf_rele(db, FTAG);
				return (SET_ERROR(EEXIST));
			} else if (dnh->dnh_dnode != DN_SLOT_ALLOCATED) {
				DNODE_STAT_BUMP(dnode_hold_alloc_misses);
				dnode_slots_rele(dnc, idx, slots);
				dbuf_rele(db, FTAG);
				return (SET_ERROR(ENOENT));
			}

			/* Upgrade from hold to exclusive slot lock. */
			dnode_slots_rele(dnc, idx, slots);
			if (!dnode_slots_tryenter(dnc, idx, slots)) {
				DNODE_STAT_BUMP(dnode_hold_alloc_lock_retry);
				continue;
			}

			/*
			 * Someone else won the race and called dnode_create()
			 * after we checked DN_SLOT_IS_PTR() above but before
			 * we acquired the lock.
			 */
			if (DN_SLOT_IS_PTR(dnh->dnh_dnode)) {
				DNODE_STAT_BUMP(dnode_hold_alloc_lock_misses);
				dn = dnh->dnh_dnode;
			} else {
				dn = dnode_create(os, dn_block + idx, db,
				    object, dnh);
			}
		}

		mutex_enter(&dn->dn_mtx);
		if (dn->dn_type == DMU_OT_NONE || dn->dn_free_txg != 0) {
			DNODE_STAT_BUMP(dnode_hold_alloc_type_none);
			mutex_exit(&dn->dn_mtx);
			dnode_slots_rele(dnc, idx, slots);
			dbuf_rele(db, FTAG);
			return (SET_ERROR(ENOENT));
		}

		/* Don't actually hold if dry run, just return 0 */
		if (flag & DNODE_DRY_RUN) {
			mutex_exit(&dn->dn_mtx);
			dnode_slots_rele(dnc, idx, slots);
			dbuf_rele(db, FTAG);
			return (0);
		}

		DNODE_STAT_BUMP(dnode_hold_alloc_hits);
	} else if (flag & DNODE_MUST_BE_FREE) {

		/* A multi-slot dnode may not span dnode blocks. */
		if (idx + slots - 1 >= DNODES_PER_BLOCK) {
			DNODE_STAT_BUMP(dnode_hold_free_overflow);
			dbuf_rele(db, FTAG);
			return (SET_ERROR(ENOSPC));
		}

		while (dn == DN_SLOT_UNINIT) {
			dnode_slots_hold(dnc, idx, slots);

			if (!dnode_check_slots_free(dnc, idx, slots)) {
				DNODE_STAT_BUMP(dnode_hold_free_misses);
				dnode_slots_rele(dnc, idx, slots);
				dbuf_rele(db, FTAG);
				return (SET_ERROR(ENOSPC));
			}

			dnode_slots_rele(dnc, idx, slots);
			if (!dnode_slots_tryenter(dnc, idx, slots)) {
				DNODE_STAT_BUMP(dnode_hold_free_lock_retry);
				continue;
			}

			/* Recheck now that the slot locks are held. */
			if (!dnode_check_slots_free(dnc, idx, slots)) {
				DNODE_STAT_BUMP(dnode_hold_free_lock_misses);
				dnode_slots_rele(dnc, idx, slots);
				dbuf_rele(db, FTAG);
				return (SET_ERROR(ENOSPC));
			}

			/*
			 * Allocated but otherwise free dnodes which would
			 * be in the interior of a multi-slot dnodes need
			 * to be freed.  Single slot dnodes can be safely
			 * re-purposed as a performance optimization.
			 */
			if (slots > 1)
				dnode_reclaim_slots(dnc, idx + 1, slots - 1);

			dnh = &dnc->dnc_children[idx];
			if (DN_SLOT_IS_PTR(dnh->dnh_dnode)) {
				dn = dnh->dnh_dnode;
			} else {
				dn = dnode_create(os, dn_block + idx, db,
				    object, dnh);
			}
		}

		mutex_enter(&dn->dn_mtx);
		if (!zfs_refcount_is_zero(&dn->dn_holds) || dn->dn_free_txg) {
			DNODE_STAT_BUMP(dnode_hold_free_refcount);
			mutex_exit(&dn->dn_mtx);
			dnode_slots_rele(dnc, idx, slots);
			dbuf_rele(db, FTAG);
			return (SET_ERROR(EEXIST));
		}

		/* Don't actually hold if dry run, just return 0 */
		if (flag & DNODE_DRY_RUN) {
			mutex_exit(&dn->dn_mtx);
			dnode_slots_rele(dnc, idx, slots);
			dbuf_rele(db, FTAG);
			return (0);
		}

		dnode_set_slots(dnc, idx + 1, slots - 1, DN_SLOT_INTERIOR);
		DNODE_STAT_BUMP(dnode_hold_free_hits);
	} else {
		dbuf_rele(db, FTAG);
		return (SET_ERROR(EINVAL));
	}

	ASSERT0(dn->dn_free_txg);

	/* The first hold on a dnode also takes a hold on its parent dbuf. */
	if (zfs_refcount_add(&dn->dn_holds, tag) == 1)
		dbuf_add_ref(db, dnh);

	mutex_exit(&dn->dn_mtx);

	/* Now we can rely on the hold to prevent the dnode from moving. */
	dnode_slots_rele(dnc, idx, slots);

	DNODE_VERIFY(dn);
	ASSERT3P(dn->dn_dbuf, ==, db);
	ASSERT3U(dn->dn_object, ==, object);
	dbuf_rele(db, FTAG);

	*dnp = dn;
	return (0);
}
1566fa9e4066Sahrens 
/*
 * Return a held dnode in *dnp if the object is allocated, or an error
 * (e.g. ENOENT) if not.
 */
1570ea8dc4b6Seschrock int
dnode_hold(objset_t * os,uint64_t object,void * tag,dnode_t ** dnp)1571503ad85cSMatthew Ahrens dnode_hold(objset_t *os, uint64_t object, void *tag, dnode_t **dnp)
1572fa9e4066Sahrens {
157354811da5SToomas Soome 	return (dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, 0, tag,
157454811da5SToomas Soome 	    dnp));
1575fa9e4066Sahrens }
1576fa9e4066Sahrens 
15771934e92fSmaybee /*
15781934e92fSmaybee  * Can only add a reference if there is already at least one
15791934e92fSmaybee  * reference on the dnode.  Returns FALSE if unable to add a
15801934e92fSmaybee  * new reference.
15811934e92fSmaybee  */
15821934e92fSmaybee boolean_t
dnode_add_ref(dnode_t * dn,void * tag)1583ea8dc4b6Seschrock dnode_add_ref(dnode_t *dn, void *tag)
1584fa9e4066Sahrens {
15851934e92fSmaybee 	mutex_enter(&dn->dn_mtx);
1586e914ace2STim Schumacher 	if (zfs_refcount_is_zero(&dn->dn_holds)) {
15871934e92fSmaybee 		mutex_exit(&dn->dn_mtx);
15881934e92fSmaybee 		return (FALSE);
15891934e92fSmaybee 	}
1590e914ace2STim Schumacher 	VERIFY(1 < zfs_refcount_add(&dn->dn_holds, tag));
15911934e92fSmaybee 	mutex_exit(&dn->dn_mtx);
15921934e92fSmaybee 	return (TRUE);
1593fa9e4066Sahrens }
1594fa9e4066Sahrens 
/*
 * Release one hold on the dnode.  Acquires dn_mtx, which
 * dnode_rele_and_unlock() drops before returning.
 */
void
dnode_rele(dnode_t *dn, void *tag)
{
	mutex_enter(&dn->dn_mtx);
	dnode_rele_and_unlock(dn, tag, B_FALSE);
}
1601cd485b49SJustin T. Gibbs 
/*
 * Release one hold on the dnode.  Caller must hold dn_mtx; it is
 * dropped here.  When the last hold is released, waiters on
 * dn_nodnholds (see dnode_special_close()) are woken and the dnode's
 * hold on its parent dbuf, if any, is released.
 */
void
dnode_rele_and_unlock(dnode_t *dn, void *tag, boolean_t evicting)
{
	uint64_t refs;
	/* Get while the hold prevents the dnode from moving. */
	dmu_buf_impl_t *db = dn->dn_dbuf;
	dnode_handle_t *dnh = dn->dn_handle;

	refs = zfs_refcount_remove(&dn->dn_holds, tag);
	if (refs == 0)
		cv_broadcast(&dn->dn_nodnholds);
	mutex_exit(&dn->dn_mtx);
	/* dnode could get destroyed at this point, so don't use it anymore */

	/*
	 * It's unsafe to release the last hold on a dnode by dnode_rele() or
	 * indirectly by dbuf_rele() while relying on the dnode handle to
	 * prevent the dnode from moving, since releasing the last hold could
	 * result in the dnode's parent dbuf evicting its dnode handles. For
	 * that reason anyone calling dnode_rele() or dbuf_rele() without some
	 * other direct or indirect hold on the dnode must first drop the dnode
	 * handle.
	 */
	ASSERT(refs > 0 || dnh->dnh_zrlock.zr_owner != curthread);

	/* NOTE: the DNODE_DNODE does not have a dn_dbuf */
	if (refs == 0 && db != NULL) {
		/*
		 * Another thread could add a hold to the dnode handle in
		 * dnode_hold_impl() while holding the parent dbuf. Since the
		 * hold on the parent dbuf prevents the handle from being
		 * destroyed, the hold on the handle is OK. We can't yet assert
		 * that the handle has zero references, but that will be
		 * asserted anyway when the handle gets destroyed.
		 */
		mutex_enter(&db->db_mtx);
		dbuf_rele_and_unlock(db, dnh, evicting);
	}
}
1641fa9e4066Sahrens 
1642d8849d7dSChunwei Chen /*
1643d8849d7dSChunwei Chen  * Test whether we can create a dnode at the specified location.
1644d8849d7dSChunwei Chen  */
/*
 * Returns 0 if a dnode occupying "slots" slots could be allocated at
 * "object", or an error from dnode_hold_impl() (e.g. ENOSPC, EEXIST)
 * if not.  DNODE_DRY_RUN means no hold is taken and no dnode is
 * instantiated, so tag/dnp are NULL.
 */
int
dnode_try_claim(objset_t *os, uint64_t object, int slots)
{
	return (dnode_hold_impl(os, object, DNODE_MUST_BE_FREE | DNODE_DRY_RUN,
	    slots, NULL, NULL));
}
1651d8849d7dSChunwei Chen 
/*
 * Mark the dnode dirty in the given transaction: insert it on the
 * objset's per-txg dirty-dnode multilist, take a "dirty hold", and
 * dirty its containing dbuf and dataset.  Idempotent per txg.
 */
void
dnode_setdirty(dnode_t *dn, dmu_tx_t *tx)
{
	objset_t *os = dn->dn_objset;
	uint64_t txg = tx->tx_txg;

	/* Special objects are not tracked on the dirty list. */
	if (DMU_OBJECT_IS_SPECIAL(dn->dn_object)) {
		dsl_dataset_dirty(os->os_dsl_dataset, tx);
		return;
	}

	DNODE_VERIFY(dn);

#ifdef ZFS_DEBUG
	mutex_enter(&dn->dn_mtx);
	ASSERT(dn->dn_phys->dn_type || dn->dn_allocated_txg);
	ASSERT(dn->dn_free_txg == 0 || dn->dn_free_txg >= txg);
	mutex_exit(&dn->dn_mtx);
#endif

	/*
	 * Determine old uid/gid when necessary
	 */
	dmu_objset_userquota_get_ids(dn, B_TRUE, tx);

	multilist_t *dirtylist = os->os_dirty_dnodes[txg & TXG_MASK];
	multilist_sublist_t *mls = multilist_sublist_lock_obj(dirtylist, dn);

	/*
	 * If we are already marked dirty, we're done.
	 */
	if (multilist_link_active(&dn->dn_dirty_link[txg & TXG_MASK])) {
		multilist_sublist_unlock(mls);
		return;
	}

	ASSERT(!zfs_refcount_is_zero(&dn->dn_holds) ||
	    !avl_is_empty(&dn->dn_dbufs));
	ASSERT(dn->dn_datablksz != 0);
	ASSERT0(dn->dn_next_bonuslen[txg&TXG_MASK]);
	ASSERT0(dn->dn_next_blksz[txg&TXG_MASK]);
	ASSERT0(dn->dn_next_bonustype[txg&TXG_MASK]);

	dprintf_ds(os->os_dsl_dataset, "obj=%llu txg=%llu\n",
	    dn->dn_object, txg);

	multilist_sublist_insert_head(mls, dn);

	multilist_sublist_unlock(mls);

	/*
	 * The dnode maintains a hold on its containing dbuf as
	 * long as there are holds on it.  Each instantiated child
	 * dbuf maintains a hold on the dnode.  When the last child
	 * drops its hold, the dnode will drop its hold on the
	 * containing dbuf. We add a "dirty hold" here so that the
	 * dnode will hang around after we finish processing its
	 * children.
	 */
	VERIFY(dnode_add_ref(dn, (void *)(uintptr_t)tx->tx_txg));

	(void) dbuf_dirty(dn->dn_dbuf, tx);

	dsl_dataset_dirty(os->os_dsl_dataset, tx);
}
1717fa9e4066Sahrens 
1718fa9e4066Sahrens void
dnode_free(dnode_t * dn,dmu_tx_t * tx)1719fa9e4066Sahrens dnode_free(dnode_t *dn, dmu_tx_t *tx)
1720fa9e4066Sahrens {
1721fa9e4066Sahrens 	mutex_enter(&dn->dn_mtx);
1722fa9e4066Sahrens 	if (dn->dn_type == DMU_OT_NONE || dn->dn_free_txg) {
1723fa9e4066Sahrens 		mutex_exit(&dn->dn_mtx);
1724fa9e4066Sahrens 		return;
1725fa9e4066Sahrens 	}
1726fa9e4066Sahrens 	dn->dn_free_txg = tx->tx_txg;
1727fa9e4066Sahrens 	mutex_exit(&dn->dn_mtx);
1728fa9e4066Sahrens 
172994c2d0ebSMatthew Ahrens 	dnode_setdirty(dn, tx);
1730fa9e4066Sahrens }
1731fa9e4066Sahrens 
1732fa9e4066Sahrens /*
1733fa9e4066Sahrens  * Try to change the block size for the indicated dnode.  This can only
1734fa9e4066Sahrens  * succeed if there are no blocks allocated or dirty beyond first block
1735fa9e4066Sahrens  */
/*
 * Change the data block size (and optionally the indirect block shift)
 * of the dnode in this transaction.  Returns 0 on success, ENOTSUP if
 * the change is not possible (blocks beyond the first are allocated or
 * cached, or ibs change requested on a multi-level dnode).
 */
int
dnode_set_blksz(dnode_t *dn, uint64_t size, int ibs, dmu_tx_t *tx)
{
	dmu_buf_impl_t *db;
	int err;

	ASSERT3U(size, <=, spa_maxblocksize(dmu_objset_spa(dn->dn_objset)));
	/* 0 means "minimum"; otherwise round up to a SPA_MINBLOCKSIZE multiple */
	if (size == 0)
		size = SPA_MINBLOCKSIZE;
	else
		size = P2ROUNDUP(size, SPA_MINBLOCKSIZE);

	/* ibs == 0 signals "leave the indirect block shift alone" below */
	if (ibs == dn->dn_indblkshift)
		ibs = 0;

	if (size >> SPA_MINBLOCKSHIFT == dn->dn_datablkszsec && ibs == 0)
		return (0);

	rw_enter(&dn->dn_struct_rwlock, RW_WRITER);

	/* Check for any allocated blocks beyond the first */
	if (dn->dn_maxblkid != 0)
		goto fail;

	/* Any cached dbuf other than block 0 / bonus / spill blocks the change. */
	mutex_enter(&dn->dn_dbufs_mtx);
	for (db = avl_first(&dn->dn_dbufs); db != NULL;
	    db = AVL_NEXT(&dn->dn_dbufs, db)) {
		if (db->db_blkid != 0 && db->db_blkid != DMU_BONUS_BLKID &&
		    db->db_blkid != DMU_SPILL_BLKID) {
			mutex_exit(&dn->dn_dbufs_mtx);
			goto fail;
		}
	}
	mutex_exit(&dn->dn_dbufs_mtx);

	if (ibs && dn->dn_nlevels != 1)
		goto fail;

	/* resize the old block */
	err = dbuf_hold_impl(dn, 0, 0, TRUE, FALSE, FTAG, &db);
	if (err == 0) {
		dbuf_new_size(db, size, tx);
	} else if (err != ENOENT) {
		/* ENOENT means there is no block 0 to resize; anything else fails */
		goto fail;
	}

	dnode_setdblksz(dn, size);
	dnode_setdirty(dn, tx);
	dn->dn_next_blksz[tx->tx_txg&TXG_MASK] = size;
	if (ibs) {
		dn->dn_indblkshift = ibs;
		dn->dn_next_indblkshift[tx->tx_txg&TXG_MASK] = ibs;
	}
	/* rele after we have fixed the blocksize in the dnode */
	if (db)
		dbuf_rele(db, FTAG);

	rw_exit(&dn->dn_struct_rwlock);
	return (0);

fail:
	rw_exit(&dn->dn_struct_rwlock);
	return (SET_ERROR(ENOTSUP));
}
1800fa9e4066Sahrens 
/*
 * Increase the indirection depth of this dnode to new_nlevels.
 *
 * Called with dn_struct_rwlock held as writer.  new_nlevels must exceed
 * any level change already recorded for this txg (asserted below).  The
 * old top-level block becomes a child of the new, taller tree, so dirty
 * records that used to hang directly off the dnode are re-parented onto
 * the newly dirtied top-level indirect.
 */
static void
dnode_set_nlevels_impl(dnode_t *dn, int new_nlevels, dmu_tx_t *tx)
{
	uint64_t txgoff = tx->tx_txg & TXG_MASK;	/* per-txg slot index */
	int old_nlevels = dn->dn_nlevels;
	dmu_buf_impl_t *db;
	list_t *list;
	dbuf_dirty_record_t *new, *dr, *dr_next;

	ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));

	dn->dn_nlevels = new_nlevels;

	ASSERT3U(new_nlevels, >, dn->dn_next_nlevels[txgoff]);
	dn->dn_next_nlevels[txgoff] = new_nlevels;

	/* dirty the left indirects */
	db = dbuf_hold_level(dn, old_nlevels, 0, FTAG);
	ASSERT(db != NULL);
	new = dbuf_dirty(db, tx);
	dbuf_rele(db, FTAG);

	/*
	 * Transfer the dirty records to the new indirect.  Anything dirtied
	 * below the new top level — excluding the bonus and spill buffers,
	 * which are not part of the block tree — now belongs under "new".
	 */
	mutex_enter(&dn->dn_mtx);
	mutex_enter(&new->dt.di.dr_mtx);
	list = &dn->dn_dirty_records[txgoff];
	for (dr = list_head(list); dr; dr = dr_next) {
		/* fetch the successor first; dr may be moved off this list */
		dr_next = list_next(&dn->dn_dirty_records[txgoff], dr);
		if (dr->dr_dbuf->db_level != new_nlevels-1 &&
		    dr->dr_dbuf->db_blkid != DMU_BONUS_BLKID &&
		    dr->dr_dbuf->db_blkid != DMU_SPILL_BLKID) {
			ASSERT(dr->dr_dbuf->db_level == old_nlevels-1);
			list_remove(&dn->dn_dirty_records[txgoff], dr);
			list_insert_tail(&new->dt.di.dr_children, dr);
			dr->dr_parent = new;
		}
	}
	mutex_exit(&new->dt.di.dr_mtx);
	mutex_exit(&dn->dn_mtx);
}
1841eb633035STom Caputi 
1842eb633035STom Caputi int
dnode_set_nlevels(dnode_t * dn,int nlevels,dmu_tx_t * tx)1843eb633035STom Caputi dnode_set_nlevels(dnode_t *dn, int nlevels, dmu_tx_t *tx)
1844eb633035STom Caputi {
1845eb633035STom Caputi 	int ret = 0;
1846eb633035STom Caputi 
1847eb633035STom Caputi 	rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
1848eb633035STom Caputi 
1849eb633035STom Caputi 	if (dn->dn_nlevels == nlevels) {
1850eb633035STom Caputi 		ret = 0;
1851eb633035STom Caputi 		goto out;
1852eb633035STom Caputi 	} else if (nlevels < dn->dn_nlevels) {
1853eb633035STom Caputi 		ret = SET_ERROR(EINVAL);
1854eb633035STom Caputi 		goto out;
1855eb633035STom Caputi 	}
1856eb633035STom Caputi 
1857eb633035STom Caputi 	dnode_set_nlevels_impl(dn, nlevels, tx);
1858eb633035STom Caputi 
1859eb633035STom Caputi out:
1860eb633035STom Caputi 	rw_exit(&dn->dn_struct_rwlock);
1861eb633035STom Caputi 	return (ret);
1862eb633035STom Caputi }
1863eb633035STom Caputi 
/*
 * Ensure dn_maxblkid covers blkid (with force set, take blkid even if it
 * is lower than the current value), record the new maximum for this txg,
 * and grow the indirection level if the new maximum no longer fits.
 *
 * Callers hold dn_struct_rwlock; have_read says whether it is held as
 * reader or writer.  Read-holding callers must not rely on the lock being
 * continuously held: we may drop and re-acquire it to upgrade to a writer,
 * and we return with it downgraded back to a reader.
 */
void
dnode_new_blkid(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx, boolean_t have_read,
    boolean_t force)
{
	int epbs, new_nlevels;
	uint64_t sz;

	ASSERT(blkid != DMU_BONUS_BLKID);

	ASSERT(have_read ?
	    RW_READ_HELD(&dn->dn_struct_rwlock) :
	    RW_WRITE_HELD(&dn->dn_struct_rwlock));

	/*
	 * if we have a read-lock, check to see if we need to do any work
	 * before upgrading to a write-lock.
	 */
	if (have_read) {
		if (blkid <= dn->dn_maxblkid)
			return;

		/*
		 * If the upgrade fails we must drop the lock entirely, so
		 * dn_maxblkid may have grown meanwhile; it is re-checked
		 * below under the write lock.
		 */
		if (!rw_tryupgrade(&dn->dn_struct_rwlock)) {
			rw_exit(&dn->dn_struct_rwlock);
			rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
		}
	}

	/*
	 * Raw sends (indicated by the force flag) require that we take the
	 * given blkid even if the value is lower than the current value.
	 */
	if (!force && blkid <= dn->dn_maxblkid)
		goto out;

	/*
	 * We use the (otherwise unused) top bit of dn_next_maxblkid[txgoff]
	 * to indicate that this field is set. This allows us to set the
	 * maxblkid to 0 on an existing object in dnode_sync().
	 */
	dn->dn_maxblkid = blkid;
	dn->dn_next_maxblkid[tx->tx_txg & TXG_MASK] =
	    blkid | DMU_NEXT_MAXBLKID_SET;

	/*
	 * Compute the number of levels necessary to support the new maxblkid.
	 * Raw sends will ensure nlevels is set correctly for us.
	 */
	new_nlevels = 1;
	epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
	for (sz = dn->dn_nblkptr;
	    sz <= blkid && sz >= dn->dn_nblkptr; sz <<= epbs)
		new_nlevels++;

	if (!force) {
		if (new_nlevels > dn->dn_nlevels)
			dnode_set_nlevels_impl(dn, new_nlevels, tx);
	} else {
		ASSERT3U(dn->dn_nlevels, >=, new_nlevels);
	}

out:
	if (have_read)
		rw_downgrade(&dn->dn_struct_rwlock);
}
1929fa9e4066Sahrens 
193046e1baa6SMatthew Ahrens static void
dnode_dirty_l1(dnode_t * dn,uint64_t l1blkid,dmu_tx_t * tx)193146e1baa6SMatthew Ahrens dnode_dirty_l1(dnode_t *dn, uint64_t l1blkid, dmu_tx_t *tx)
193246e1baa6SMatthew Ahrens {
193346e1baa6SMatthew Ahrens 	dmu_buf_impl_t *db = dbuf_hold_level(dn, 1, l1blkid, FTAG);
193446e1baa6SMatthew Ahrens 	if (db != NULL) {
193546e1baa6SMatthew Ahrens 		dmu_buf_will_dirty(&db->db, tx);
193646e1baa6SMatthew Ahrens 		dbuf_rele(db, FTAG);
193746e1baa6SMatthew Ahrens 	}
193846e1baa6SMatthew Ahrens }
193946e1baa6SMatthew Ahrens 
/*
 * Dirty all the in-core level-1 dbufs in the range specified by start_blkid
 * and end_blkid.
 *
 * The range is exclusive at both ends: the scan begins at start_blkid + 1
 * and stops before end_blkid.  The caller is responsible for dirtying the
 * endpoints themselves.
 */
static void
dnode_dirty_l1range(dnode_t *dn, uint64_t start_blkid, uint64_t end_blkid,
    dmu_tx_t *tx)
{
	dmu_buf_impl_t db_search;	/* stack key for the AVL lookup */
	dmu_buf_impl_t *db;
	avl_index_t where;

	mutex_enter(&dn->dn_dbufs_mtx);

	db_search.db_level = 1;
	db_search.db_blkid = start_blkid + 1;
	db_search.db_state = DB_SEARCH;
	for (;;) {

		/* find the next level-1 dbuf at or after db_search.db_blkid */
		db = avl_find(&dn->dn_dbufs, &db_search, &where);
		if (db == NULL)
			db = avl_nearest(&dn->dn_dbufs, where, AVL_AFTER);

		if (db == NULL || db->db_level != 1 ||
		    db->db_blkid >= end_blkid) {
			break;
		}

		/*
		 * Setup the next blkid we want to search for.
		 */
		db_search.db_blkid = db->db_blkid + 1;
		ASSERT3U(db->db_blkid, >=, start_blkid);

		/*
		 * If the dbuf transitions to DB_EVICTING while we're trying
		 * to dirty it, then we will be unable to discover it in
		 * the dbuf hash table. This will result in a call to
		 * dbuf_create() which needs to acquire the dn_dbufs_mtx
		 * lock. To avoid a deadlock, we drop the lock before
		 * dirtying the level-1 dbuf.
		 */
		mutex_exit(&dn->dn_dbufs_mtx);
		dnode_dirty_l1(dn, db->db_blkid, tx);
		mutex_enter(&dn->dn_dbufs_mtx);
	}

#ifdef ZFS_DEBUG
	/*
	 * Walk all the in-core level-1 dbufs and verify they have been dirtied.
	 */
	db_search.db_level = 1;
	db_search.db_blkid = start_blkid + 1;
	db_search.db_state = DB_SEARCH;
	db = avl_find(&dn->dn_dbufs, &db_search, &where);
	if (db == NULL)
		db = avl_nearest(&dn->dn_dbufs, where, AVL_AFTER);
	for (; db != NULL; db = AVL_NEXT(&dn->dn_dbufs, db)) {
		if (db->db_level != 1 || db->db_blkid >= end_blkid)
			break;
		ASSERT(db->db_dirtycnt > 0);
	}
#endif
	mutex_exit(&dn->dn_dbufs_mtx);
}
2005738e2a3cSPaul Dagnelie 
/*
 * Free the byte range [off, off + len) of this dnode in open context;
 * len == DMU_OBJECT_END means "through the end of the object".
 *
 * Partial blocks at the head and tail of the range are zeroed in place.
 * Whole blocks are recorded in dn_free_ranges[txg] and actually freed
 * later, in syncing context; the indirect blocks covering the range are
 * dirtied here so they stay resident (and are accounted for) until then.
 */
void
dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx)
{
	dmu_buf_impl_t *db;
	uint64_t blkoff, blkid, nblks;
	int blksz, blkshift, head, tail;
	int trunc = FALSE;
	int epbs;

	blksz = dn->dn_datablksz;
	blkshift = dn->dn_datablkshift;
	epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;

	if (len == DMU_OBJECT_END) {
		len = UINT64_MAX - off;
		trunc = TRUE;
	}

	/*
	 * First, block align the region to free:
	 */
	if (ISP2(blksz)) {
		/* power-of-2 blocksize: the object may span many blocks */
		head = P2NPHASE(off, blksz);
		blkoff = P2PHASE(off, blksz);
		if ((off >> blkshift) > dn->dn_maxblkid)
			return;
	} else {
		/* odd blocksize: only a single block can exist */
		ASSERT(dn->dn_maxblkid == 0);
		if (off == 0 && len >= blksz) {
			/*
			 * Freeing the whole block; fast-track this request.
			 */
			blkid = 0;
			nblks = 1;
			if (dn->dn_nlevels > 1) {
				rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
				dnode_dirty_l1(dn, 0, tx);
				rw_exit(&dn->dn_struct_rwlock);
			}
			goto done;
		} else if (off >= blksz) {
			/* Freeing past end-of-data */
			return;
		} else {
			/* Freeing part of the block. */
			head = blksz - off;
			ASSERT3U(head, >, 0);
		}
		blkoff = off;
	}
	/* zero out any partial block data at the start of the range */
	if (head) {
		int res;
		ASSERT3U(blkoff + head, ==, blksz);
		if (len < head)
			head = len;
		rw_enter(&dn->dn_struct_rwlock, RW_READER);
		res = dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, 0, off),
		    TRUE, FALSE, FTAG, &db);
		rw_exit(&dn->dn_struct_rwlock);
		if (res == 0) {
			caddr_t data;
			boolean_t dirty;

			db_lock_type_t dblt = dmu_buf_lock_parent(db, RW_READER,
			    FTAG);
			/* don't dirty if it isn't on disk and isn't dirty */
			dirty = db->db_last_dirty ||
			    (db->db_blkptr && !BP_IS_HOLE(db->db_blkptr));
			dmu_buf_unlock_parent(db, dblt, FTAG);
			if (dirty) {
				dmu_buf_will_dirty(&db->db, tx);
				data = db->db.db_data;
				bzero(data + blkoff, head);
			}
			dbuf_rele(db, FTAG);
		}
		off += head;
		len -= head;
	}

	/* If the range was less than one block, we're done */
	if (len == 0)
		return;

	/* If the remaining range is past end of file, we're done */
	if ((off >> blkshift) > dn->dn_maxblkid)
		return;

	/* only a power-of-2 blocksize reaches this point (see above) */
	ASSERT(ISP2(blksz));
	if (trunc)
		tail = 0;
	else
		tail = P2PHASE(len, blksz);

	ASSERT0(P2PHASE(off, blksz));
	/* zero out any partial block data at the end of the range */
	if (tail) {
		int res;
		if (len < tail)
			tail = len;
		rw_enter(&dn->dn_struct_rwlock, RW_READER);
		res = dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, 0, off+len),
		    TRUE, FALSE, FTAG, &db);
		rw_exit(&dn->dn_struct_rwlock);
		if (res == 0) {
			boolean_t dirty;
			/* don't dirty if not on disk and not dirty */
			db_lock_type_t type = dmu_buf_lock_parent(db, RW_READER,
			    FTAG);
			dirty = db->db_last_dirty ||
			    (db->db_blkptr && !BP_IS_HOLE(db->db_blkptr));
			dmu_buf_unlock_parent(db, type, FTAG);
			if (dirty) {
				dmu_buf_will_dirty(&db->db, tx);
				bzero(db->db.db_data, tail);
			}
			dbuf_rele(db, FTAG);
		}
		len -= tail;
	}

	/* If the range did not include a full block, we are done */
	if (len == 0)
		return;

	ASSERT(IS_P2ALIGNED(off, blksz));
	ASSERT(trunc || IS_P2ALIGNED(len, blksz));
	blkid = off >> blkshift;
	nblks = len >> blkshift;
	if (trunc)
		nblks += 1;

	/*
	 * Dirty all the indirect blocks in this range.  Note that only
	 * the first and last indirect blocks can actually be written
	 * (if they were partially freed) -- they must be dirtied, even if
	 * they do not exist on disk yet.  The interior blocks will
	 * be freed by free_children(), so they will not actually be written.
	 * Even though these interior blocks will not be written, we
	 * dirty them for two reasons:
	 *
	 *  - It ensures that the indirect blocks remain in memory until
	 *    syncing context.  (They have already been prefetched by
	 *    dmu_tx_hold_free(), so we don't have to worry about reading
	 *    them serially here.)
	 *
	 *  - The dirty space accounting will put pressure on the txg sync
	 *    mechanism to begin syncing, and to delay transactions if there
	 *    is a large amount of freeing.  Even though these indirect
	 *    blocks will not be written, we could need to write the same
	 *    amount of space if we copy the freed BPs into deadlists.
	 */
	if (dn->dn_nlevels > 1) {
		rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
		uint64_t first, last;

		first = blkid >> epbs;
		dnode_dirty_l1(dn, first, tx);
		if (trunc)
			last = dn->dn_maxblkid >> epbs;
		else
			last = (blkid + nblks - 1) >> epbs;
		if (last != first)
			dnode_dirty_l1(dn, last, tx);

		/* dirty any in-core L1s strictly between first and last */
		dnode_dirty_l1range(dn, first, last, tx);

		int shift = dn->dn_datablkshift + dn->dn_indblkshift -
		    SPA_BLKPTRSHIFT;
		for (uint64_t i = first + 1; i < last; i++) {
			/*
			 * Set i to the blockid of the next non-hole
			 * level-1 indirect block at or after i.  Note
			 * that dnode_next_offset() operates in terms of
			 * level-0-equivalent bytes.
			 */
			uint64_t ibyte = i << shift;
			int err = dnode_next_offset(dn, DNODE_FIND_HAVELOCK,
			    &ibyte, 2, 1, 0);
			i = ibyte >> shift;
			if (i >= last)
				break;

			/*
			 * Normally we should not see an error, either
			 * from dnode_next_offset() or dbuf_hold_level()
			 * (except for ESRCH from dnode_next_offset).
			 * If there is an i/o error, then when we read
			 * this block in syncing context, it will use
			 * ZIO_FLAG_MUSTSUCCEED, and thus hang/panic according
			 * to the "failmode" property.  dnode_next_offset()
			 * doesn't have a flag to indicate MUSTSUCCEED.
			 */
			if (err != 0)
				break;

			dnode_dirty_l1(dn, i, tx);
		}
		rw_exit(&dn->dn_struct_rwlock);
	}

done:
	/*
	 * Add this range to the dnode range list.
	 * We will finish up this free operation in the syncing phase.
	 */
	mutex_enter(&dn->dn_mtx);
	int txgoff = tx->tx_txg & TXG_MASK;
	if (dn->dn_free_ranges[txgoff] == NULL) {
		dn->dn_free_ranges[txgoff] = range_tree_create(NULL,
		    RANGE_SEG64, NULL, 0, 0);
	}
	range_tree_clear(dn->dn_free_ranges[txgoff], blkid, nblks);
	range_tree_add(dn->dn_free_ranges[txgoff], blkid, nblks);
	dprintf_dnode(dn, "blkid=%llu nblks=%llu txg=%llu\n",
	    blkid, nblks, tx->tx_txg);
	mutex_exit(&dn->dn_mtx);

	/* evict any cached dbufs in the freed range */
	dbuf_free_range(dn, blkid, blkid + nblks - 1, tx);
	dnode_setdirty(dn, tx);
}
2228fa9e4066Sahrens 
22290a586ceaSMark Shellenbaum static boolean_t
dnode_spill_freed(dnode_t * dn)22300a586ceaSMark Shellenbaum dnode_spill_freed(dnode_t *dn)
22310a586ceaSMark Shellenbaum {
22320a586ceaSMark Shellenbaum 	int i;
22330a586ceaSMark Shellenbaum 
22340a586ceaSMark Shellenbaum 	mutex_enter(&dn->dn_mtx);
22350a586ceaSMark Shellenbaum 	for (i = 0; i < TXG_SIZE; i++) {
22360a586ceaSMark Shellenbaum 		if (dn->dn_rm_spillblk[i] == DN_KILL_SPILLBLK)
22370a586ceaSMark Shellenbaum 			break;
22380a586ceaSMark Shellenbaum 	}
22390a586ceaSMark Shellenbaum 	mutex_exit(&dn->dn_mtx);
22400a586ceaSMark Shellenbaum 	return (i < TXG_SIZE);
22410a586ceaSMark Shellenbaum }
22420a586ceaSMark Shellenbaum 
2243fa9e4066Sahrens /* return TRUE if this blkid was freed in a recent txg, or FALSE if it wasn't */
2244fa9e4066Sahrens uint64_t
dnode_block_freed(dnode_t * dn,uint64_t blkid)2245fa9e4066Sahrens dnode_block_freed(dnode_t *dn, uint64_t blkid)
2246fa9e4066Sahrens {
2247fa9e4066Sahrens 	void *dp = spa_get_dsl(dn->dn_objset->os_spa);
2248fa9e4066Sahrens 	int i;
2249fa9e4066Sahrens 
22500a586ceaSMark Shellenbaum 	if (blkid == DMU_BONUS_BLKID)
2251fa9e4066Sahrens 		return (FALSE);
2252fa9e4066Sahrens 
2253fa9e4066Sahrens 	/*
2254fa9e4066Sahrens 	 * If we're in the process of opening the pool, dp will not be
2255fa9e4066Sahrens 	 * set yet, but there shouldn't be anything dirty.
2256fa9e4066Sahrens 	 */
2257fa9e4066Sahrens 	if (dp == NULL)
2258fa9e4066Sahrens 		return (FALSE);
2259fa9e4066Sahrens 
2260fa9e4066Sahrens 	if (dn->dn_free_txg)
2261fa9e4066Sahrens 		return (TRUE);
2262fa9e4066Sahrens 
22630a586ceaSMark Shellenbaum 	if (blkid == DMU_SPILL_BLKID)
22640a586ceaSMark Shellenbaum 		return (dnode_spill_freed(dn));
22650a586ceaSMark Shellenbaum 
2266fa9e4066Sahrens 	mutex_enter(&dn->dn_mtx);
2267fa9e4066Sahrens 	for (i = 0; i < TXG_SIZE; i++) {
2268bf16b11eSMatthew Ahrens 		if (dn->dn_free_ranges[i] != NULL &&
2269bf16b11eSMatthew Ahrens 		    range_tree_contains(dn->dn_free_ranges[i], blkid, 1))
2270fa9e4066Sahrens 			break;
2271fa9e4066Sahrens 	}
2272fa9e4066Sahrens 	mutex_exit(&dn->dn_mtx);
2273fa9e4066Sahrens 	return (i < TXG_SIZE);
2274fa9e4066Sahrens }
2275fa9e4066Sahrens 
2276fa9e4066Sahrens /* call from syncing context when we actually write/free space for this dnode */
2277fa9e4066Sahrens void
dnode_diduse_space(dnode_t * dn,int64_t delta)227899653d4eSeschrock dnode_diduse_space(dnode_t *dn, int64_t delta)
2279fa9e4066Sahrens {
228099653d4eSeschrock 	uint64_t space;
228199653d4eSeschrock 	dprintf_dnode(dn, "dn=%p dnp=%p used=%llu delta=%lld\n",
2282fa9e4066Sahrens 	    dn, dn->dn_phys,
228399653d4eSeschrock 	    (u_longlong_t)dn->dn_phys->dn_used,
228499653d4eSeschrock 	    (longlong_t)delta);
2285fa9e4066Sahrens 
2286fa9e4066Sahrens 	mutex_enter(&dn->dn_mtx);
228799653d4eSeschrock 	space = DN_USED_BYTES(dn->dn_phys);
228899653d4eSeschrock 	if (delta > 0) {
228999653d4eSeschrock 		ASSERT3U(space + delta, >=, space); /* no overflow */
229099653d4eSeschrock 	} else {
229199653d4eSeschrock 		ASSERT3U(space, >=, -delta); /* no underflow */
229299653d4eSeschrock 	}
229399653d4eSeschrock 	space += delta;
2294e7437265Sahrens 	if (spa_version(dn->dn_objset->os_spa) < SPA_VERSION_DNODE_BYTES) {
229599653d4eSeschrock 		ASSERT((dn->dn_phys->dn_flags & DNODE_FLAG_USED_BYTES) == 0);
2296fb09f5aaSMadhav Suresh 		ASSERT0(P2PHASE(space, 1<<DEV_BSHIFT));
229799653d4eSeschrock 		dn->dn_phys->dn_used = space >> DEV_BSHIFT;
2298fa9e4066Sahrens 	} else {
229999653d4eSeschrock 		dn->dn_phys->dn_used = space;
230099653d4eSeschrock 		dn->dn_phys->dn_flags |= DNODE_FLAG_USED_BYTES;
2301fa9e4066Sahrens 	}
2302fa9e4066Sahrens 	mutex_exit(&dn->dn_mtx);
2303fa9e4066Sahrens }
2304fa9e4066Sahrens 
230576256205SMark Maybee /*
2306f7170741SWill Andrews  * Scans a block at the indicated "level" looking for a hole or data,
2307f7170741SWill Andrews  * depending on 'flags'.
2308f7170741SWill Andrews  *
2309f7170741SWill Andrews  * If level > 0, then we are scanning an indirect block looking at its
2310f7170741SWill Andrews  * pointers.  If level == 0, then we are looking at a block of dnodes.
2311f7170741SWill Andrews  *
2312f7170741SWill Andrews  * If we don't find what we are looking for in the block, we return ESRCH.
2313f7170741SWill Andrews  * Otherwise, return with *offset pointing to the beginning (if searching
2314f7170741SWill Andrews  * forwards) or end (if searching backwards) of the range covered by the
2315f7170741SWill Andrews  * block pointer we matched on (or dnode).
231676256205SMark Maybee  *
231776256205SMark Maybee  * The basic search algorithm used below by dnode_next_offset() is to
231876256205SMark Maybee  * use this function to search up the block tree (widen the search) until
231976256205SMark Maybee  * we find something (i.e., we don't return ESRCH) and then search back
232076256205SMark Maybee  * down the tree (narrow the search) until we reach our original search
232176256205SMark Maybee  * level.
232276256205SMark Maybee  */
2323fa9e4066Sahrens static int
dnode_next_offset_level(dnode_t * dn,int flags,uint64_t * offset,int lvl,uint64_t blkfill,uint64_t txg)2324cdb0ab79Smaybee dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset,
2325a2cdcdd2SPaul Dagnelie     int lvl, uint64_t blkfill, uint64_t txg)
2326fa9e4066Sahrens {
2327fa9e4066Sahrens 	dmu_buf_impl_t *db = NULL;
2328fa9e4066Sahrens 	void *data = NULL;
2329fa9e4066Sahrens 	uint64_t epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
2330fa9e4066Sahrens 	uint64_t epb = 1ULL << epbs;
2331fa9e4066Sahrens 	uint64_t minfill, maxfill;
2332cdb0ab79Smaybee 	boolean_t hole;
2333cdb0ab79Smaybee 	int i, inc, error, span;
2334fa9e4066Sahrens 
2335fa9e4066Sahrens 	dprintf("probing object %llu offset %llx level %d of %u\n",
2336fa9e4066Sahrens 	    dn->dn_object, *offset, lvl, dn->dn_phys->dn_nlevels);
2337fa9e4066Sahrens 
23389704bf7fSPaul Dagnelie 	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
23399704bf7fSPaul Dagnelie 
234014843421SMatthew Ahrens 	hole = ((flags & DNODE_FIND_HOLE) != 0);
2341cdb0ab79Smaybee 	inc = (flags & DNODE_FIND_BACKWARDS) ? -1 : 1;
23421c8564a7SMark Maybee 	ASSERT(txg == 0 || !hole);
2343cdb0ab79Smaybee 
2344fa9e4066Sahrens 	if (lvl == dn->dn_phys->dn_nlevels) {
2345fa9e4066Sahrens 		error = 0;
2346fa9e4066Sahrens 		epb = dn->dn_phys->dn_nblkptr;
2347fa9e4066Sahrens 		data = dn->dn_phys->dn_blkptr;
2348fa9e4066Sahrens 	} else {
2349a2cdcdd2SPaul Dagnelie 		uint64_t blkid = dbuf_whichblock(dn, lvl, *offset);
2350a2cdcdd2SPaul Dagnelie 		error = dbuf_hold_impl(dn, lvl, blkid, TRUE, FALSE, FTAG, &db);
2351fa9e4066Sahrens 		if (error) {
23521c8564a7SMark Maybee 			if (error != ENOENT)
23531c8564a7SMark Maybee 				return (error);
23541c8564a7SMark Maybee 			if (hole)
23551c8564a7SMark Maybee 				return (0);
23561c8564a7SMark Maybee 			/*
23571c8564a7SMark Maybee 			 * This can only happen when we are searching up
23581c8564a7SMark Maybee 			 * the block tree for data.  We don't really need to
23591c8564a7SMark Maybee 			 * adjust the offset, as we will just end up looking
23601c8564a7SMark Maybee 			 * at the pointer to this block in its parent, and its
23611c8564a7SMark Maybee 			 * going to be unallocated, so we will skip over it.
23621c8564a7SMark Maybee 			 */
2363be6fd75aSMatthew Ahrens 			return (SET_ERROR(ESRCH));
2364fa9e4066Sahrens 		}
2365eb633035STom Caputi 		error = dbuf_read(db, NULL,
2366eb633035STom Caputi 		    DB_RF_CANFAIL | DB_RF_HAVESTRUCT | DB_RF_NO_DECRYPT);
236798572ac1Sahrens 		if (error) {
236898572ac1Sahrens 			dbuf_rele(db, FTAG);
236998572ac1Sahrens 			return (error);
237098572ac1Sahrens 		}
2371fa9e4066Sahrens 		data = db->db.db_data;
23729704bf7fSPaul Dagnelie 		rw_enter(&db->db_rwlock, RW_READER);
2373fa9e4066Sahrens 	}
2374fa9e4066Sahrens 
237543466aaeSMax Grossman 	if (db != NULL && txg != 0 && (db->db_blkptr == NULL ||
237643466aaeSMax Grossman 	    db->db_blkptr->blk_birth <= txg ||
237743466aaeSMax Grossman 	    BP_IS_HOLE(db->db_blkptr))) {
23781c8564a7SMark Maybee 		/*
23791c8564a7SMark Maybee 		 * This can only happen when we are searching up the tree
23801c8564a7SMark Maybee 		 * and these conditions mean that we need to keep climbing.
23811c8564a7SMark Maybee 		 */
2382be6fd75aSMatthew Ahrens 		error = SET_ERROR(ESRCH);
23836754306eSahrens 	} else if (lvl == 0) {
2384fa9e4066Sahrens 		dnode_phys_t *dnp = data;
238554811da5SToomas Soome 
2386fa9e4066Sahrens 		ASSERT(dn->dn_type == DMU_OT_DNODE);
238754811da5SToomas Soome 		ASSERT(!(flags & DNODE_FIND_BACKWARDS));
2388fa9e4066Sahrens 
238954811da5SToomas Soome 		for (i = (*offset >> DNODE_SHIFT) & (blkfill - 1);
239054811da5SToomas Soome 		    i < blkfill; i += dnp[i].dn_extra_slots + 1) {
239108f3f137SJonathan W Adams 			if ((dnp[i].dn_type == DMU_OT_NONE) == hole)
2392fa9e4066Sahrens 				break;
2393fa9e4066Sahrens 		}
239454811da5SToomas Soome 
239554811da5SToomas Soome 		if (i == blkfill)
2396be6fd75aSMatthew Ahrens 			error = SET_ERROR(ESRCH);
239754811da5SToomas Soome 
239854811da5SToomas Soome 		*offset = (*offset & ~(DNODE_BLOCK_SIZE - 1)) +
239954811da5SToomas Soome 		    (i << DNODE_SHIFT);
2400fa9e4066Sahrens 	} else {
2401fa9e4066Sahrens 		blkptr_t *bp = data;
240276256205SMark Maybee 		uint64_t start = *offset;
2403fa9e4066Sahrens 		span = (lvl - 1) * epbs + dn->dn_datablkshift;
2404fa9e4066Sahrens 		minfill = 0;
2405fa9e4066Sahrens 		maxfill = blkfill << ((lvl - 1) * epbs);
2406fa9e4066Sahrens 
2407fa9e4066Sahrens 		if (hole)
2408fa9e4066Sahrens 			maxfill--;
2409fa9e4066Sahrens 		else
2410fa9e4066Sahrens 			minfill++;
2411fa9e4066Sahrens 
241276256205SMark Maybee 		*offset = *offset >> span;
241376256205SMark Maybee 		for (i = BF64_GET(*offset, 0, epbs);
2414cdb0ab79Smaybee 		    i >= 0 && i < epb; i += inc) {
24155d7b4d43SMatthew Ahrens 			if (BP_GET_FILL(&bp[i]) >= minfill &&
24165d7b4d43SMatthew Ahrens 			    BP_GET_FILL(&bp[i]) <= maxfill &&
24171c8564a7SMark Maybee 			    (hole || bp[i].blk_birth > txg))
2418fa9e4066Sahrens 				break;
241976256205SMark Maybee 			if (inc > 0 || *offset > 0)
242076256205SMark Maybee 				*offset += inc;
242176256205SMark Maybee 		}
242276256205SMark Maybee 		*offset = *offset << span;
242376256205SMark Maybee 		if (inc < 0) {
242476256205SMark Maybee 			/* traversing backwards; position offset at the end */
242576256205SMark Maybee 			ASSERT3U(*offset, <=, start);
242676256205SMark Maybee 			*offset = MIN(*offset + (1ULL << span) - 1, start);
242776256205SMark Maybee 		} else if (*offset < start) {
242876256205SMark Maybee 			*offset = start;
2429fa9e4066Sahrens 		}
243076256205SMark Maybee 		if (i < 0 || i >= epb)
2431be6fd75aSMatthew Ahrens 			error = SET_ERROR(ESRCH);
2432fa9e4066Sahrens 	}
2433fa9e4066Sahrens 
24349704bf7fSPaul Dagnelie 	if (db != NULL) {
24359704bf7fSPaul Dagnelie 		rw_exit(&db->db_rwlock);
2436ea8dc4b6Seschrock 		dbuf_rele(db, FTAG);
24379704bf7fSPaul Dagnelie 	}
2438fa9e4066Sahrens 
2439fa9e4066Sahrens 	return (error);
2440fa9e4066Sahrens }
2441fa9e4066Sahrens 
2442fa9e4066Sahrens /*
2443fa9e4066Sahrens  * Find the next hole, data, or sparse region at or after *offset.
2444fa9e4066Sahrens  * The value 'blkfill' tells us how many items we expect to find
2445fa9e4066Sahrens  * in an L0 data block; this value is 1 for normal objects,
2446fa9e4066Sahrens  * DNODES_PER_BLOCK for the meta dnode, and some fraction of
2447fa9e4066Sahrens  * DNODES_PER_BLOCK when searching for sparse regions thereof.
24486754306eSahrens  *
2449fa9e4066Sahrens  * Examples:
2450fa9e4066Sahrens  *
2451cdb0ab79Smaybee  * dnode_next_offset(dn, flags, offset, 1, 1, 0);
2452cdb0ab79Smaybee  *	Finds the next/previous hole/data in a file.
2453fa9e4066Sahrens  *	Used in dmu_offset_next().
2454fa9e4066Sahrens  *
2455cdb0ab79Smaybee  * dnode_next_offset(mdn, flags, offset, 0, DNODES_PER_BLOCK, txg);
2456fa9e4066Sahrens  *	Finds the next free/allocated dnode in an objset's meta-dnode.
24576754306eSahrens  *	Only finds objects that have new contents since txg (ie.
24586754306eSahrens  *	bonus buffer changes and content removal are ignored).
2459fa9e4066Sahrens  *	Used in dmu_object_next().
2460fa9e4066Sahrens  *
2461cdb0ab79Smaybee  * dnode_next_offset(mdn, DNODE_FIND_HOLE, offset, 2, DNODES_PER_BLOCK >> 2, 0);
2462fa9e4066Sahrens  *	Finds the next L2 meta-dnode bp that's at most 1/4 full.
2463fa9e4066Sahrens  *	Used in dmu_object_alloc().
2464fa9e4066Sahrens  */
2465fa9e4066Sahrens int
dnode_next_offset(dnode_t * dn,int flags,uint64_t * offset,int minlvl,uint64_t blkfill,uint64_t txg)2466cdb0ab79Smaybee dnode_next_offset(dnode_t *dn, int flags, uint64_t *offset,
24676754306eSahrens     int minlvl, uint64_t blkfill, uint64_t txg)
2468fa9e4066Sahrens {
2469cdb0ab79Smaybee 	uint64_t initial_offset = *offset;
2470fa9e4066Sahrens 	int lvl, maxlvl;
2471fa9e4066Sahrens 	int error = 0;
2472fa9e4066Sahrens 
	/*
	 * The caller may already hold dn_struct_rwlock; only take (and
	 * later drop) it ourselves when DNODE_FIND_HAVELOCK is not set.
	 */
2473cdb0ab79Smaybee 	if (!(flags & DNODE_FIND_HAVELOCK))
2474cdb0ab79Smaybee 		rw_enter(&dn->dn_struct_rwlock, RW_READER);
2475fa9e4066Sahrens 
	/* No levels means there is no block tree to search at all. */
2476fa9e4066Sahrens 	if (dn->dn_phys->dn_nlevels == 0) {
2477be6fd75aSMatthew Ahrens 		error = SET_ERROR(ESRCH);
2478cdb0ab79Smaybee 		goto out;
2479fa9e4066Sahrens 	}
2480fa9e4066Sahrens 
	/*
	 * dn_datablkshift == 0: there is no power-of-two block shift to
	 * navigate by, so the code below treats the whole object as one
	 * block of dn_datablksz bytes.  A data search from inside that
	 * block succeeds with *offset unchanged; a hole search lands on
	 * the implicit hole at end-of-block; a search starting past the
	 * block finds nothing.
	 */
2481fa9e4066Sahrens 	if (dn->dn_datablkshift == 0) {
2482fa9e4066Sahrens 		if (*offset < dn->dn_datablksz) {
2483cdb0ab79Smaybee 			if (flags & DNODE_FIND_HOLE)
2484fa9e4066Sahrens 				*offset = dn->dn_datablksz;
2485fa9e4066Sahrens 		} else {
2486be6fd75aSMatthew Ahrens 			error = SET_ERROR(ESRCH);
2487fa9e4066Sahrens 		}
2488cdb0ab79Smaybee 		goto out;
2489fa9e4066Sahrens 	}
2490fa9e4066Sahrens 
2491fa9e4066Sahrens 	maxlvl = dn->dn_phys->dn_nlevels;
2492fa9e4066Sahrens 
	/*
	 * Climb the tree from minlvl upward until some level reports a
	 * match.  ESRCH from a level means "keep climbing"; anything
	 * else (success or a real error) stops the ascent.
	 */
2493fa9e4066Sahrens 	for (lvl = minlvl; lvl <= maxlvl; lvl++) {
24946754306eSahrens 		error = dnode_next_offset_level(dn,
2495cdb0ab79Smaybee 		    flags, offset, lvl, blkfill, txg);
249698572ac1Sahrens 		if (error != ESRCH)
2497fa9e4066Sahrens 			break;
2498fa9e4066Sahrens 	}
2499fa9e4066Sahrens 
	/*
	 * A match was found at level 'lvl'; walk back down to minlvl,
	 * refining *offset within the matching block at each level.
	 */
2500cdb0ab79Smaybee 	while (error == 0 && --lvl >= minlvl) {
25016754306eSahrens 		error = dnode_next_offset_level(dn,
2502cdb0ab79Smaybee 		    flags, offset, lvl, blkfill, txg);
25036754306eSahrens 	}
2504fa9e4066Sahrens 
25050fbc0cd0SMatthew Ahrens 	/*
25060fbc0cd0SMatthew Ahrens 	 * There's always a "virtual hole" at the end of the object, even
25070fbc0cd0SMatthew Ahrens 	 * if all BP's which physically exist are non-holes.
25080fbc0cd0SMatthew Ahrens 	 */
25090fbc0cd0SMatthew Ahrens 	if ((flags & DNODE_FIND_HOLE) && error == ESRCH && txg == 0 &&
25100fbc0cd0SMatthew Ahrens 	    minlvl == 1 && blkfill == 1 && !(flags & DNODE_FIND_BACKWARDS)) {
25110fbc0cd0SMatthew Ahrens 		error = 0;
25120fbc0cd0SMatthew Ahrens 	}
25130fbc0cd0SMatthew Ahrens 
	/*
	 * If the result landed on the wrong side of the starting offset
	 * (before it for a forward search, after it for a backward one),
	 * there is nothing to find in the requested direction.
	 */
2514cdb0ab79Smaybee 	if (error == 0 && (flags & DNODE_FIND_BACKWARDS ?
2515cdb0ab79Smaybee 	    initial_offset < *offset : initial_offset > *offset))
2516be6fd75aSMatthew Ahrens 		error = SET_ERROR(ESRCH);
2517cdb0ab79Smaybee out:
2518cdb0ab79Smaybee 	if (!(flags & DNODE_FIND_HAVELOCK))
2519cdb0ab79Smaybee 		rw_exit(&dn->dn_struct_rwlock);
2520fa9e4066Sahrens 
2521fa9e4066Sahrens 	return (error);
2522fa9e4066Sahrens }
2523