xref: /illumos-gate/usr/src/uts/common/fs/zfs/dnode.c (revision c4ab0d3f)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23  * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
24  * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
25  * Copyright (c) 2014 Integros [integros.com]
26  * Copyright 2017 RackTop Systems.
27  */
28 
29 #include <sys/zfs_context.h>
30 #include <sys/dbuf.h>
31 #include <sys/dnode.h>
32 #include <sys/dmu.h>
33 #include <sys/dmu_impl.h>
34 #include <sys/dmu_tx.h>
35 #include <sys/dmu_objset.h>
36 #include <sys/dsl_dir.h>
37 #include <sys/dsl_dataset.h>
38 #include <sys/spa.h>
39 #include <sys/zio.h>
40 #include <sys/dmu_zfetch.h>
41 #include <sys/range_tree.h>
42 
43 dnode_stats_t dnode_stats = {
44 	{ "dnode_hold_dbuf_hold",		KSTAT_DATA_UINT64 },
45 	{ "dnode_hold_dbuf_read",		KSTAT_DATA_UINT64 },
46 	{ "dnode_hold_alloc_hits",		KSTAT_DATA_UINT64 },
47 	{ "dnode_hold_alloc_misses",		KSTAT_DATA_UINT64 },
48 	{ "dnode_hold_alloc_interior",		KSTAT_DATA_UINT64 },
49 	{ "dnode_hold_alloc_lock_retry",	KSTAT_DATA_UINT64 },
50 	{ "dnode_hold_alloc_lock_misses",	KSTAT_DATA_UINT64 },
51 	{ "dnode_hold_alloc_type_none",		KSTAT_DATA_UINT64 },
52 	{ "dnode_hold_free_hits",		KSTAT_DATA_UINT64 },
53 	{ "dnode_hold_free_misses",		KSTAT_DATA_UINT64 },
54 	{ "dnode_hold_free_lock_misses",	KSTAT_DATA_UINT64 },
55 	{ "dnode_hold_free_lock_retry",		KSTAT_DATA_UINT64 },
56 	{ "dnode_hold_free_overflow",		KSTAT_DATA_UINT64 },
57 	{ "dnode_hold_free_refcount",		KSTAT_DATA_UINT64 },
58 	{ "dnode_hold_free_txg",		KSTAT_DATA_UINT64 },
59 	{ "dnode_free_interior_lock_retry",	KSTAT_DATA_UINT64 },
60 	{ "dnode_allocate",			KSTAT_DATA_UINT64 },
61 	{ "dnode_reallocate",			KSTAT_DATA_UINT64 },
62 	{ "dnode_buf_evict",			KSTAT_DATA_UINT64 },
63 	{ "dnode_alloc_next_chunk",		KSTAT_DATA_UINT64 },
64 	{ "dnode_alloc_race",			KSTAT_DATA_UINT64 },
65 	{ "dnode_alloc_next_block",		KSTAT_DATA_UINT64 },
66 	{ "dnode_move_invalid",			KSTAT_DATA_UINT64 },
67 	{ "dnode_move_recheck1",		KSTAT_DATA_UINT64 },
68 	{ "dnode_move_recheck2",		KSTAT_DATA_UINT64 },
69 	{ "dnode_move_special",			KSTAT_DATA_UINT64 },
70 	{ "dnode_move_handle",			KSTAT_DATA_UINT64 },
71 	{ "dnode_move_rwlock",			KSTAT_DATA_UINT64 },
72 	{ "dnode_move_active",			KSTAT_DATA_UINT64 },
73 };
74 
75 static kstat_t *dnode_ksp;
76 static kmem_cache_t *dnode_cache;
77 
78 static dnode_phys_t dnode_phys_zero;
79 
80 int zfs_default_bs = SPA_MINBLOCKSHIFT;
81 int zfs_default_ibs = DN_MAX_INDBLKSHIFT;
82 
83 #ifdef	_KERNEL
84 static kmem_cbrc_t dnode_move(void *, void *, size_t, void *);
85 #endif	/* _KERNEL */
86 
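/*
 * Comparison function for the per-dnode AVL tree of dbufs (dn_dbufs).
 * Dbufs sort by level, then by block id; a DB_SEARCH sentinel sorts
 * ahead of any real dbuf with the same level and block id, and ties
 * between real dbufs are broken by pointer value.
 */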
87 static int
88 dbuf_compare(const void *x1, const void *x2)
89 {
90 	const dmu_buf_impl_t *d1 = x1;
91 	const dmu_buf_impl_t *d2 = x2;
92 
93 	int cmp = AVL_CMP(d1->db_level, d2->db_level);
94 	if (likely(cmp))
95 		return (cmp);
96 
97 	cmp = AVL_CMP(d1->db_blkid, d2->db_blkid);
98 	if (likely(cmp))
99 		return (cmp);
100 
101 	if (d1->db_state == DB_SEARCH) {
102 		ASSERT3S(d2->db_state, !=, DB_SEARCH);
103 		return (-1);
104 	} else if (d2->db_state == DB_SEARCH) {
105 		ASSERT3S(d1->db_state, !=, DB_SEARCH);
106 		return (1);
107 	}
108 
109 	return (AVL_PCMP(d1, d2));
110 }
111 
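/*
 * kmem cache constructor: initialize the locks, reference counts,
 * per-txg dirty state, and dbuf AVL tree of a dnode_t.
 */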
112 /* ARGSUSED */
113 static int
114 dnode_cons(void *arg, void *unused, int kmflag)
115 {
116 	dnode_t *dn = arg;
117 	int i;
118 
119 	rw_init(&dn->dn_struct_rwlock, NULL, RW_DEFAULT, NULL);
120 	mutex_init(&dn->dn_mtx, NULL, MUTEX_DEFAULT, NULL);
121 	mutex_init(&dn->dn_dbufs_mtx, NULL, MUTEX_DEFAULT, NULL);
122 	cv_init(&dn->dn_notxholds, NULL, CV_DEFAULT, NULL);
123 
124 	/*
125 	 * Every dbuf has a reference, and dropping a tracked reference is
126 	 * O(number of references), so don't track dn_holds.
127 	 */
128 	zfs_refcount_create_untracked(&dn->dn_holds);
129 	zfs_refcount_create(&dn->dn_tx_holds);
130 	list_link_init(&dn->dn_link);
131 
132 	bzero(&dn->dn_next_nblkptr[0], sizeof (dn->dn_next_nblkptr));
133 	bzero(&dn->dn_next_nlevels[0], sizeof (dn->dn_next_nlevels));
134 	bzero(&dn->dn_next_indblkshift[0], sizeof (dn->dn_next_indblkshift));
135 	bzero(&dn->dn_next_bonustype[0], sizeof (dn->dn_next_bonustype));
136 	bzero(&dn->dn_rm_spillblk[0], sizeof (dn->dn_rm_spillblk));
137 	bzero(&dn->dn_next_bonuslen[0], sizeof (dn->dn_next_bonuslen));
138 	bzero(&dn->dn_next_blksz[0], sizeof (dn->dn_next_blksz));
139 
140 	for (i = 0; i < TXG_SIZE; i++) {
141 		multilist_link_init(&dn->dn_dirty_link[i]);
142 		dn->dn_free_ranges[i] = NULL;
143 		list_create(&dn->dn_dirty_records[i],
144 		    sizeof (dbuf_dirty_record_t),
145 		    offsetof(dbuf_dirty_record_t, dr_dirty_node));
146 	}
147 
148 	dn->dn_allocated_txg = 0;
149 	dn->dn_free_txg = 0;
150 	dn->dn_assigned_txg = 0;
151 	dn->dn_dirty_txg = 0;
152 	dn->dn_dirtyctx = 0;
153 	dn->dn_dirtyctx_firstset = NULL;
154 	dn->dn_bonus = NULL;
155 	dn->dn_have_spill = B_FALSE;
156 	dn->dn_zio = NULL;
157 	dn->dn_oldused = 0;
158 	dn->dn_oldflags = 0;
159 	dn->dn_olduid = 0;
160 	dn->dn_oldgid = 0;
161 	dn->dn_newuid = 0;
162 	dn->dn_newgid = 0;
163 	dn->dn_id_flags = 0;
164 
165 	dn->dn_dbufs_count = 0;
166 	avl_create(&dn->dn_dbufs, dbuf_compare, sizeof (dmu_buf_impl_t),
167 	    offsetof(dmu_buf_impl_t, db_link));
168 
169 	dn->dn_moved = 0;
170 	return (0);
171 }
172 
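/*
 * kmem cache destructor: assert that the dnode_t has been returned to
 * its constructed state and tear down what dnode_cons() set up.
 */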
173 /* ARGSUSED */
174 static void
175 dnode_dest(void *arg, void *unused)
176 {
177 	int i;
178 	dnode_t *dn = arg;
179 
180 	rw_destroy(&dn->dn_struct_rwlock);
181 	mutex_destroy(&dn->dn_mtx);
182 	mutex_destroy(&dn->dn_dbufs_mtx);
183 	cv_destroy(&dn->dn_notxholds);
184 	zfs_refcount_destroy(&dn->dn_holds);
185 	zfs_refcount_destroy(&dn->dn_tx_holds);
186 	ASSERT(!list_link_active(&dn->dn_link));
187 
188 	for (i = 0; i < TXG_SIZE; i++) {
189 		ASSERT(!multilist_link_active(&dn->dn_dirty_link[i]));
190 		ASSERT3P(dn->dn_free_ranges[i], ==, NULL);
191 		list_destroy(&dn->dn_dirty_records[i]);
192 		ASSERT0(dn->dn_next_nblkptr[i]);
193 		ASSERT0(dn->dn_next_nlevels[i]);
194 		ASSERT0(dn->dn_next_indblkshift[i]);
195 		ASSERT0(dn->dn_next_bonustype[i]);
196 		ASSERT0(dn->dn_rm_spillblk[i]);
197 		ASSERT0(dn->dn_next_bonuslen[i]);
198 		ASSERT0(dn->dn_next_blksz[i]);
199 	}
200 
201 	ASSERT0(dn->dn_allocated_txg);
202 	ASSERT0(dn->dn_free_txg);
203 	ASSERT0(dn->dn_assigned_txg);
204 	ASSERT0(dn->dn_dirty_txg);
205 	ASSERT0(dn->dn_dirtyctx);
206 	ASSERT3P(dn->dn_dirtyctx_firstset, ==, NULL);
207 	ASSERT3P(dn->dn_bonus, ==, NULL);
208 	ASSERT(!dn->dn_have_spill);
209 	ASSERT3P(dn->dn_zio, ==, NULL);
210 	ASSERT0(dn->dn_oldused);
211 	ASSERT0(dn->dn_oldflags);
212 	ASSERT0(dn->dn_olduid);
213 	ASSERT0(dn->dn_oldgid);
214 	ASSERT0(dn->dn_newuid);
215 	ASSERT0(dn->dn_newgid);
216 	ASSERT0(dn->dn_id_flags);
217 
218 	ASSERT0(dn->dn_dbufs_count);
219 	avl_destroy(&dn->dn_dbufs);
220 }
221 
222 void
223 dnode_init(void)
224 {
225 	ASSERT(dnode_cache == NULL);
226 	dnode_cache = kmem_cache_create("dnode_t",
227 	    sizeof (dnode_t),
228 	    0, dnode_cons, dnode_dest, NULL, NULL, NULL, 0);
229 #ifdef	_KERNEL
230 	kmem_cache_set_move(dnode_cache, dnode_move);
231 
232 	dnode_ksp = kstat_create("zfs", 0, "dnodestats", "misc",
233 	    KSTAT_TYPE_NAMED, sizeof (dnode_stats) / sizeof (kstat_named_t),
234 	    KSTAT_FLAG_VIRTUAL);
235 	if (dnode_ksp != NULL) {
236 		dnode_ksp->ks_data = &dnode_stats;
237 		kstat_install(dnode_ksp);
238 	}
239 #endif	/* _KERNEL */
240 }
241 
242 void
243 dnode_fini(void)
244 {
245 	if (dnode_ksp != NULL) {
246 		kstat_delete(dnode_ksp);
247 		dnode_ksp = NULL;
248 	}
249 
250 	kmem_cache_destroy(dnode_cache);
251 	dnode_cache = NULL;
252 }
253 
254 
255 #ifdef ZFS_DEBUG
256 void
257 dnode_verify(dnode_t *dn)
258 {
259 	int drop_struct_lock = FALSE;
260 
261 	ASSERT(dn->dn_phys);
262 	ASSERT(dn->dn_objset);
263 	ASSERT(dn->dn_handle->dnh_dnode == dn);
264 
265 	ASSERT(DMU_OT_IS_VALID(dn->dn_phys->dn_type));
266 
267 	if (!(zfs_flags & ZFS_DEBUG_DNODE_VERIFY))
268 		return;
269 
270 	if (!RW_WRITE_HELD(&dn->dn_struct_rwlock)) {
271 		rw_enter(&dn->dn_struct_rwlock, RW_READER);
272 		drop_struct_lock = TRUE;
273 	}
274 	if (dn->dn_phys->dn_type != DMU_OT_NONE || dn->dn_allocated_txg != 0) {
275 		int i;
276 		int max_bonuslen = DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots);
277 		ASSERT3U(dn->dn_indblkshift, >=, 0);
278 		ASSERT3U(dn->dn_indblkshift, <=, SPA_MAXBLOCKSHIFT);
279 		if (dn->dn_datablkshift) {
280 			ASSERT3U(dn->dn_datablkshift, >=, SPA_MINBLOCKSHIFT);
281 			ASSERT3U(dn->dn_datablkshift, <=, SPA_MAXBLOCKSHIFT);
282 			ASSERT3U(1<<dn->dn_datablkshift, ==, dn->dn_datablksz);
283 		}
284 		ASSERT3U(dn->dn_nlevels, <=, 30);
285 		ASSERT(DMU_OT_IS_VALID(dn->dn_type));
286 		ASSERT3U(dn->dn_nblkptr, >=, 1);
287 		ASSERT3U(dn->dn_nblkptr, <=, DN_MAX_NBLKPTR);
288 		ASSERT3U(dn->dn_bonuslen, <=, max_bonuslen);
289 		ASSERT3U(dn->dn_datablksz, ==,
290 		    dn->dn_datablkszsec << SPA_MINBLOCKSHIFT);
291 		ASSERT3U(ISP2(dn->dn_datablksz), ==, dn->dn_datablkshift != 0);
292 		ASSERT3U((dn->dn_nblkptr - 1) * sizeof (blkptr_t) +
293 		    dn->dn_bonuslen, <=, max_bonuslen);
294 		for (i = 0; i < TXG_SIZE; i++) {
295 			ASSERT3U(dn->dn_next_nlevels[i], <=, dn->dn_nlevels);
296 		}
297 	}
298 	if (dn->dn_phys->dn_type != DMU_OT_NONE)
299 		ASSERT3U(dn->dn_phys->dn_nlevels, <=, dn->dn_nlevels);
300 	ASSERT(DMU_OBJECT_IS_SPECIAL(dn->dn_object) || dn->dn_dbuf != NULL);
301 	if (dn->dn_dbuf != NULL) {
302 		ASSERT3P(dn->dn_phys, ==,
303 		    (dnode_phys_t *)dn->dn_dbuf->db.db_data +
304 		    (dn->dn_object % (dn->dn_dbuf->db.db_size >> DNODE_SHIFT)));
305 	}
306 	if (drop_struct_lock)
307 		rw_exit(&dn->dn_struct_rwlock);
308 }
309 #endif
310 
311 void
312 dnode_byteswap(dnode_phys_t *dnp)
313 {
314 	uint64_t *buf64 = (void*)&dnp->dn_blkptr;
315 	int i;
316 
317 	if (dnp->dn_type == DMU_OT_NONE) {
318 		bzero(dnp, sizeof (dnode_phys_t));
319 		return;
320 	}
321 
322 	dnp->dn_datablkszsec = BSWAP_16(dnp->dn_datablkszsec);
323 	dnp->dn_bonuslen = BSWAP_16(dnp->dn_bonuslen);
324 	dnp->dn_extra_slots = BSWAP_8(dnp->dn_extra_slots);
325 	dnp->dn_maxblkid = BSWAP_64(dnp->dn_maxblkid);
326 	dnp->dn_used = BSWAP_64(dnp->dn_used);
327 
328 	/*
329 	 * dn_nblkptr is only one byte, so it's OK to read it in either
330 	 * byte order.  We can't read dn_bonuslen.
331 	 */
332 	ASSERT(dnp->dn_indblkshift <= SPA_MAXBLOCKSHIFT);
333 	ASSERT(dnp->dn_nblkptr <= DN_MAX_NBLKPTR);
334 	for (i = 0; i < dnp->dn_nblkptr * sizeof (blkptr_t)/8; i++)
335 		buf64[i] = BSWAP_64(buf64[i]);
336 
337 	/*
338 	 * OK to check dn_bonuslen for zero, because it won't matter if
339 	 * we have the wrong byte order.  This is necessary because the
340 	 * dnode dnode is smaller than a regular dnode.
341 	 */
342 	if (dnp->dn_bonuslen != 0) {
343 		/*
344 		 * Note that the bonus length calculated here may be
345 		 * longer than the actual bonus buffer.  This is because
346 		 * we always put the bonus buffer after the last block
347 		 * pointer (instead of packing it against the end of the
348 		 * dnode buffer).
349 		 */
350 		int off = (dnp->dn_nblkptr-1) * sizeof (blkptr_t);
351 		int slots = dnp->dn_extra_slots + 1;
352 		size_t len = DN_SLOTS_TO_BONUSLEN(slots) - off;
353 		ASSERT(DMU_OT_IS_VALID(dnp->dn_bonustype));
354 		dmu_object_byteswap_t byteswap =
355 		    DMU_OT_BYTESWAP(dnp->dn_bonustype);
356 		dmu_ot_byteswap[byteswap].ob_func(dnp->dn_bonus + off, len);
357 	}
358 
359 	/* Swap SPILL block if we have one */
360 	if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR)
361 		byteswap_uint64_array(DN_SPILL_BLKPTR(dnp), sizeof (blkptr_t));
362 
363 }
364 
365 void
366 dnode_buf_byteswap(void *vbuf, size_t size)
367 {
368 	int i = 0;
369 
370 	ASSERT3U(sizeof (dnode_phys_t), ==, (1<<DNODE_SHIFT));
371 	ASSERT((size & (sizeof (dnode_phys_t)-1)) == 0);
372 
373 	while (i < size) {
374 		dnode_phys_t *dnp = (void *)(((char *)vbuf) + i);
375 		dnode_byteswap(dnp);
376 
377 		i += DNODE_MIN_SIZE;
378 		if (dnp->dn_type != DMU_OT_NONE)
379 			i += dnp->dn_extra_slots * DNODE_MIN_SIZE;
380 	}
381 }
382 
383 void
384 dnode_setbonuslen(dnode_t *dn, int newsize, dmu_tx_t *tx)
385 {
386 	ASSERT3U(zfs_refcount_count(&dn->dn_holds), >=, 1);
387 
388 	dnode_setdirty(dn, tx);
389 	rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
390 	ASSERT3U(newsize, <=, DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots) -
391 	    (dn->dn_nblkptr-1) * sizeof (blkptr_t));
392 	dn->dn_bonuslen = newsize;
393 	if (newsize == 0)
394 		dn->dn_next_bonuslen[tx->tx_txg & TXG_MASK] = DN_ZERO_BONUSLEN;
395 	else
396 		dn->dn_next_bonuslen[tx->tx_txg & TXG_MASK] = dn->dn_bonuslen;
397 	rw_exit(&dn->dn_struct_rwlock);
398 }
399 
400 void
401 dnode_setbonus_type(dnode_t *dn, dmu_object_type_t newtype, dmu_tx_t *tx)
402 {
403 	ASSERT3U(zfs_refcount_count(&dn->dn_holds), >=, 1);
404 	dnode_setdirty(dn, tx);
405 	rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
406 	dn->dn_bonustype = newtype;
407 	dn->dn_next_bonustype[tx->tx_txg & TXG_MASK] = dn->dn_bonustype;
408 	rw_exit(&dn->dn_struct_rwlock);
409 }
410 
411 void
412 dnode_rm_spill(dnode_t *dn, dmu_tx_t *tx)
413 {
414 	ASSERT3U(zfs_refcount_count(&dn->dn_holds), >=, 1);
415 	ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));
416 	dnode_setdirty(dn, tx);
417 	dn->dn_rm_spillblk[tx->tx_txg&TXG_MASK] = DN_KILL_SPILLBLK;
418 	dn->dn_have_spill = B_FALSE;
419 }
420 
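/*
 * Cache the data block size in the in-core dnode: the size in bytes,
 * the size in 512-byte sectors, and the block shift (0 when the size
 * is not a power of two).
 */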
421 static void
422 dnode_setdblksz(dnode_t *dn, int size)
423 {
424 	ASSERT0(P2PHASE(size, SPA_MINBLOCKSIZE));
425 	ASSERT3U(size, <=, SPA_MAXBLOCKSIZE);
426 	ASSERT3U(size, >=, SPA_MINBLOCKSIZE);
427 	ASSERT3U(size >> SPA_MINBLOCKSHIFT, <,
428 	    1<<(sizeof (dn->dn_phys->dn_datablkszsec) * 8));
429 	dn->dn_datablksz = size;
430 	dn->dn_datablkszsec = size >> SPA_MINBLOCKSHIFT;
431 	dn->dn_datablkshift = ISP2(size) ? highbit64(size - 1) : 0;
432 }
433 
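/*
 * Allocate and initialize an in-core dnode from its on-disk image.
 * Non-special dnodes are added to the objset's os_dnodes list, and the
 * dnode only becomes eligible for dnode_move() once dn_objset is set
 * at the very end.
 */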
434 static dnode_t *
435 dnode_create(objset_t *os, dnode_phys_t *dnp, dmu_buf_impl_t *db,
436     uint64_t object, dnode_handle_t *dnh)
437 {
438 	dnode_t *dn;
439 
440 	dn = kmem_cache_alloc(dnode_cache, KM_SLEEP);
441 #ifdef _KERNEL
442 	ASSERT(!POINTER_IS_VALID(dn->dn_objset));
443 #endif /* _KERNEL */
444 	dn->dn_moved = 0;
445 
446 	/*
447 	 * Defer setting dn_objset until the dnode is ready to be a candidate
448 	 * for the dnode_move() callback.
449 	 */
450 	dn->dn_object = object;
451 	dn->dn_dbuf = db;
452 	dn->dn_handle = dnh;
453 	dn->dn_phys = dnp;
454 
455 	if (dnp->dn_datablkszsec) {
456 		dnode_setdblksz(dn, dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT);
457 	} else {
458 		dn->dn_datablksz = 0;
459 		dn->dn_datablkszsec = 0;
460 		dn->dn_datablkshift = 0;
461 	}
462 	dn->dn_indblkshift = dnp->dn_indblkshift;
463 	dn->dn_nlevels = dnp->dn_nlevels;
464 	dn->dn_type = dnp->dn_type;
465 	dn->dn_nblkptr = dnp->dn_nblkptr;
466 	dn->dn_checksum = dnp->dn_checksum;
467 	dn->dn_compress = dnp->dn_compress;
468 	dn->dn_bonustype = dnp->dn_bonustype;
469 	dn->dn_bonuslen = dnp->dn_bonuslen;
470 	dn->dn_num_slots = dnp->dn_extra_slots + 1;
471 	dn->dn_maxblkid = dnp->dn_maxblkid;
472 	dn->dn_have_spill = ((dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) != 0);
473 	dn->dn_id_flags = 0;
474 
475 	dmu_zfetch_init(&dn->dn_zfetch, dn);
476 
477 	ASSERT(DMU_OT_IS_VALID(dn->dn_phys->dn_type));
478 	ASSERT(zrl_is_locked(&dnh->dnh_zrlock));
479 	ASSERT(!DN_SLOT_IS_PTR(dnh->dnh_dnode));
480 
481 	mutex_enter(&os->os_lock);
482 
483 	/*
484 	 * Exclude special dnodes from os_dnodes so an empty os_dnodes
485 	 * signifies that the special dnodes have no references from
486 	 * their children (the entries in os_dnodes).  This allows
487 	 * dnode_destroy() to easily determine if the last child has
488 	 * been removed and then complete eviction of the objset.
489 	 */
490 	if (!DMU_OBJECT_IS_SPECIAL(object))
491 		list_insert_head(&os->os_dnodes, dn);
492 	membar_producer();
493 
494 	/*
495 	 * Everything else must be valid before assigning dn_objset
496 	 * makes the dnode eligible for dnode_move().
497 	 */
498 	dn->dn_objset = os;
499 
500 	dnh->dnh_dnode = dn;
501 	mutex_exit(&os->os_lock);
502 
503 	arc_space_consume(sizeof (dnode_t), ARC_SPACE_OTHER);
504 
505 	return (dn);
506 }
507 
508 /*
509  * Caller must be holding the dnode handle, which is released upon return.
510  */
511 static void
512 dnode_destroy(dnode_t *dn)
513 {
514 	objset_t *os = dn->dn_objset;
515 	boolean_t complete_os_eviction = B_FALSE;
516 
517 	ASSERT((dn->dn_id_flags & DN_ID_NEW_EXIST) == 0);
518 
519 	mutex_enter(&os->os_lock);
520 	POINTER_INVALIDATE(&dn->dn_objset);
521 	if (!DMU_OBJECT_IS_SPECIAL(dn->dn_object)) {
522 		list_remove(&os->os_dnodes, dn);
523 		complete_os_eviction =
524 		    list_is_empty(&os->os_dnodes) &&
525 		    list_link_active(&os->os_evicting_node);
526 	}
527 	mutex_exit(&os->os_lock);
528 
529 	/* the dnode can no longer move, so we can release the handle */
530 	if (!zrl_is_locked(&dn->dn_handle->dnh_zrlock))
531 		zrl_remove(&dn->dn_handle->dnh_zrlock);
532 
533 	dn->dn_allocated_txg = 0;
534 	dn->dn_free_txg = 0;
535 	dn->dn_assigned_txg = 0;
536 	dn->dn_dirty_txg = 0;
537 
538 	dn->dn_dirtyctx = 0;
539 	if (dn->dn_dirtyctx_firstset != NULL) {
540 		kmem_free(dn->dn_dirtyctx_firstset, 1);
541 		dn->dn_dirtyctx_firstset = NULL;
542 	}
543 	if (dn->dn_bonus != NULL) {
544 		mutex_enter(&dn->dn_bonus->db_mtx);
545 		dbuf_destroy(dn->dn_bonus);
546 		dn->dn_bonus = NULL;
547 	}
548 	dn->dn_zio = NULL;
549 
550 	dn->dn_have_spill = B_FALSE;
551 	dn->dn_oldused = 0;
552 	dn->dn_oldflags = 0;
553 	dn->dn_olduid = 0;
554 	dn->dn_oldgid = 0;
555 	dn->dn_newuid = 0;
556 	dn->dn_newgid = 0;
557 	dn->dn_id_flags = 0;
558 
559 	dmu_zfetch_fini(&dn->dn_zfetch);
560 	kmem_cache_free(dnode_cache, dn);
561 	arc_space_return(sizeof (dnode_t), ARC_SPACE_OTHER);
562 
563 	if (complete_os_eviction)
564 		dmu_objset_evict_done(os);
565 }
566 
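/*
 * Initialize a freshly held, free dnode as a newly allocated object:
 * set its type, block and indirect block sizes, bonus buffer layout,
 * and slot count, and mark the changes dirty in the given transaction.
 */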
567 void
568 dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs,
569     dmu_object_type_t bonustype, int bonuslen, int dn_slots, dmu_tx_t *tx)
570 {
571 	int i;
572 
573 	ASSERT3U(dn_slots, >, 0);
574 	ASSERT3U(dn_slots << DNODE_SHIFT, <=,
575 	    spa_maxdnodesize(dmu_objset_spa(dn->dn_objset)));
576 	ASSERT3U(blocksize, <=,
577 	    spa_maxblocksize(dmu_objset_spa(dn->dn_objset)));
578 	if (blocksize == 0)
579 		blocksize = 1 << zfs_default_bs;
580 	else
581 		blocksize = P2ROUNDUP(blocksize, SPA_MINBLOCKSIZE);
582 
583 	if (ibs == 0)
584 		ibs = zfs_default_ibs;
585 
586 	ibs = MIN(MAX(ibs, DN_MIN_INDBLKSHIFT), DN_MAX_INDBLKSHIFT);
587 
588 	dprintf("os=%p obj=%" PRIu64 " txg=%" PRIu64
589 	    " blocksize=%d ibs=%d dn_slots=%d\n",
590 	    dn->dn_objset, dn->dn_object, tx->tx_txg, blocksize, ibs, dn_slots);
591 	DNODE_STAT_BUMP(dnode_allocate);
592 
593 	ASSERT(dn->dn_type == DMU_OT_NONE);
594 	ASSERT(bcmp(dn->dn_phys, &dnode_phys_zero, sizeof (dnode_phys_t)) == 0);
595 	ASSERT(dn->dn_phys->dn_type == DMU_OT_NONE);
596 	ASSERT(ot != DMU_OT_NONE);
597 	ASSERT(DMU_OT_IS_VALID(ot));
598 	ASSERT((bonustype == DMU_OT_NONE && bonuslen == 0) ||
599 	    (bonustype == DMU_OT_SA && bonuslen == 0) ||
600 	    (bonustype != DMU_OT_NONE && bonuslen != 0));
601 	ASSERT(DMU_OT_IS_VALID(bonustype));
602 	ASSERT3U(bonuslen, <=, DN_SLOTS_TO_BONUSLEN(dn_slots));
603 	ASSERT(dn->dn_type == DMU_OT_NONE);
604 	ASSERT0(dn->dn_maxblkid);
605 	ASSERT0(dn->dn_allocated_txg);
606 	ASSERT0(dn->dn_dirty_txg);
607 	ASSERT0(dn->dn_assigned_txg);
608 	ASSERT(zfs_refcount_is_zero(&dn->dn_tx_holds));
609 	ASSERT3U(zfs_refcount_count(&dn->dn_holds), <=, 1);
610 	ASSERT(avl_is_empty(&dn->dn_dbufs));
611 
612 	for (i = 0; i < TXG_SIZE; i++) {
613 		ASSERT0(dn->dn_next_nblkptr[i]);
614 		ASSERT0(dn->dn_next_nlevels[i]);
615 		ASSERT0(dn->dn_next_indblkshift[i]);
616 		ASSERT0(dn->dn_next_bonuslen[i]);
617 		ASSERT0(dn->dn_next_bonustype[i]);
618 		ASSERT0(dn->dn_rm_spillblk[i]);
619 		ASSERT0(dn->dn_next_blksz[i]);
620 		ASSERT(!multilist_link_active(&dn->dn_dirty_link[i]));
621 		ASSERT3P(list_head(&dn->dn_dirty_records[i]), ==, NULL);
622 		ASSERT3P(dn->dn_free_ranges[i], ==, NULL);
623 	}
624 
625 	dn->dn_type = ot;
626 	dnode_setdblksz(dn, blocksize);
627 	dn->dn_indblkshift = ibs;
628 	dn->dn_nlevels = 1;
629 	dn->dn_num_slots = dn_slots;
630 	if (bonustype == DMU_OT_SA) /* Maximize bonus space for SA */
631 		dn->dn_nblkptr = 1;
632 	else {
633 		dn->dn_nblkptr = MIN(DN_MAX_NBLKPTR,
634 		    1 + ((DN_SLOTS_TO_BONUSLEN(dn_slots) - bonuslen) >>
635 		    SPA_BLKPTRSHIFT));
636 	}
637 
638 	dn->dn_bonustype = bonustype;
639 	dn->dn_bonuslen = bonuslen;
640 	dn->dn_checksum = ZIO_CHECKSUM_INHERIT;
641 	dn->dn_compress = ZIO_COMPRESS_INHERIT;
642 	dn->dn_dirtyctx = 0;
643 
644 	dn->dn_free_txg = 0;
645 	if (dn->dn_dirtyctx_firstset) {
646 		kmem_free(dn->dn_dirtyctx_firstset, 1);
647 		dn->dn_dirtyctx_firstset = NULL;
648 	}
649 
650 	dn->dn_allocated_txg = tx->tx_txg;
651 	dn->dn_id_flags = 0;
652 
653 	dnode_setdirty(dn, tx);
654 	dn->dn_next_indblkshift[tx->tx_txg & TXG_MASK] = ibs;
655 	dn->dn_next_bonuslen[tx->tx_txg & TXG_MASK] = dn->dn_bonuslen;
656 	dn->dn_next_bonustype[tx->tx_txg & TXG_MASK] = dn->dn_bonustype;
657 	dn->dn_next_blksz[tx->tx_txg & TXG_MASK] = dn->dn_datablksz;
658 }
659 
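/*
 * Reinitialize an existing object in place with a new type, block size,
 * and bonus layout while keeping the same object number.
 */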
660 void
661 dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize,
662     dmu_object_type_t bonustype, int bonuslen, int dn_slots, dmu_tx_t *tx)
663 {
664 	int nblkptr;
665 
666 	ASSERT3U(blocksize, >=, SPA_MINBLOCKSIZE);
667 	ASSERT3U(blocksize, <=,
668 	    spa_maxblocksize(dmu_objset_spa(dn->dn_objset)));
669 	ASSERT0(blocksize % SPA_MINBLOCKSIZE);
670 	ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT || dmu_tx_private_ok(tx));
671 	ASSERT(tx->tx_txg != 0);
672 	ASSERT((bonustype == DMU_OT_NONE && bonuslen == 0) ||
673 	    (bonustype != DMU_OT_NONE && bonuslen != 0) ||
674 	    (bonustype == DMU_OT_SA && bonuslen == 0));
675 	ASSERT(DMU_OT_IS_VALID(bonustype));
676 	ASSERT3U(bonuslen, <=,
677 	    DN_BONUS_SIZE(spa_maxdnodesize(dmu_objset_spa(dn->dn_objset))));
678 	ASSERT3U(bonuslen, <=, DN_BONUS_SIZE(dn_slots << DNODE_SHIFT));
679 
680 	dnode_free_interior_slots(dn);
681 	DNODE_STAT_BUMP(dnode_reallocate);
682 
683 	/* clean up any unreferenced dbufs */
684 	dnode_evict_dbufs(dn);
685 
686 	dn->dn_id_flags = 0;
687 
688 	rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
689 	dnode_setdirty(dn, tx);
690 	if (dn->dn_datablksz != blocksize) {
691 		/* change blocksize */
692 		ASSERT(dn->dn_maxblkid == 0 &&
693 		    (BP_IS_HOLE(&dn->dn_phys->dn_blkptr[0]) ||
694 		    dnode_block_freed(dn, 0)));
695 		dnode_setdblksz(dn, blocksize);
696 		dn->dn_next_blksz[tx->tx_txg&TXG_MASK] = blocksize;
697 	}
698 	if (dn->dn_bonuslen != bonuslen)
699 		dn->dn_next_bonuslen[tx->tx_txg&TXG_MASK] = bonuslen;
700 
701 	if (bonustype == DMU_OT_SA) /* Maximize bonus space for SA */
702 		nblkptr = 1;
703 	else
704 		nblkptr = MIN(DN_MAX_NBLKPTR,
705 		    1 + ((DN_SLOTS_TO_BONUSLEN(dn_slots) - bonuslen) >>
706 		    SPA_BLKPTRSHIFT));
707 	if (dn->dn_bonustype != bonustype)
708 		dn->dn_next_bonustype[tx->tx_txg&TXG_MASK] = bonustype;
709 	if (dn->dn_nblkptr != nblkptr)
710 		dn->dn_next_nblkptr[tx->tx_txg&TXG_MASK] = nblkptr;
711 	if (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR) {
712 		dbuf_rm_spill(dn, tx);
713 		dnode_rm_spill(dn, tx);
714 	}
715 	rw_exit(&dn->dn_struct_rwlock);
716 
717 	/* change type */
718 	dn->dn_type = ot;
719 
720 	/* change bonus size and type */
721 	mutex_enter(&dn->dn_mtx);
722 	dn->dn_bonustype = bonustype;
723 	dn->dn_bonuslen = bonuslen;
724 	dn->dn_num_slots = dn_slots;
725 	dn->dn_nblkptr = nblkptr;
726 	dn->dn_checksum = ZIO_CHECKSUM_INHERIT;
727 	dn->dn_compress = ZIO_COMPRESS_INHERIT;
728 	ASSERT3U(dn->dn_nblkptr, <=, DN_MAX_NBLKPTR);
729 
730 	/* fix up the bonus db_size */
731 	if (dn->dn_bonus) {
732 		dn->dn_bonus->db.db_size =
733 		    DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots) -
734 		    (dn->dn_nblkptr - 1) * sizeof (blkptr_t);
735 		ASSERT(dn->dn_bonuslen <= dn->dn_bonus->db.db_size);
736 	}
737 
738 	dn->dn_allocated_txg = tx->tx_txg;
739 	mutex_exit(&dn->dn_mtx);
740 }
741 
742 #ifdef	_KERNEL
743 static void
744 dnode_move_impl(dnode_t *odn, dnode_t *ndn)
745 {
746 	int i;
747 
748 	ASSERT(!RW_LOCK_HELD(&odn->dn_struct_rwlock));
749 	ASSERT(MUTEX_NOT_HELD(&odn->dn_mtx));
750 	ASSERT(MUTEX_NOT_HELD(&odn->dn_dbufs_mtx));
751 	ASSERT(!RW_LOCK_HELD(&odn->dn_zfetch.zf_rwlock));
752 
753 	/* Copy fields. */
754 	ndn->dn_objset = odn->dn_objset;
755 	ndn->dn_object = odn->dn_object;
756 	ndn->dn_dbuf = odn->dn_dbuf;
757 	ndn->dn_handle = odn->dn_handle;
758 	ndn->dn_phys = odn->dn_phys;
759 	ndn->dn_type = odn->dn_type;
760 	ndn->dn_bonuslen = odn->dn_bonuslen;
761 	ndn->dn_bonustype = odn->dn_bonustype;
762 	ndn->dn_nblkptr = odn->dn_nblkptr;
763 	ndn->dn_checksum = odn->dn_checksum;
764 	ndn->dn_compress = odn->dn_compress;
765 	ndn->dn_nlevels = odn->dn_nlevels;
766 	ndn->dn_indblkshift = odn->dn_indblkshift;
767 	ndn->dn_datablkshift = odn->dn_datablkshift;
768 	ndn->dn_datablkszsec = odn->dn_datablkszsec;
769 	ndn->dn_datablksz = odn->dn_datablksz;
770 	ndn->dn_maxblkid = odn->dn_maxblkid;
771 	ndn->dn_num_slots = odn->dn_num_slots;
772 	bcopy(&odn->dn_next_type[0], &ndn->dn_next_type[0],
773 	    sizeof (odn->dn_next_type));
774 	bcopy(&odn->dn_next_nblkptr[0], &ndn->dn_next_nblkptr[0],
775 	    sizeof (odn->dn_next_nblkptr));
776 	bcopy(&odn->dn_next_nlevels[0], &ndn->dn_next_nlevels[0],
777 	    sizeof (odn->dn_next_nlevels));
778 	bcopy(&odn->dn_next_indblkshift[0], &ndn->dn_next_indblkshift[0],
779 	    sizeof (odn->dn_next_indblkshift));
780 	bcopy(&odn->dn_next_bonustype[0], &ndn->dn_next_bonustype[0],
781 	    sizeof (odn->dn_next_bonustype));
782 	bcopy(&odn->dn_rm_spillblk[0], &ndn->dn_rm_spillblk[0],
783 	    sizeof (odn->dn_rm_spillblk));
784 	bcopy(&odn->dn_next_bonuslen[0], &ndn->dn_next_bonuslen[0],
785 	    sizeof (odn->dn_next_bonuslen));
786 	bcopy(&odn->dn_next_blksz[0], &ndn->dn_next_blksz[0],
787 	    sizeof (odn->dn_next_blksz));
788 	for (i = 0; i < TXG_SIZE; i++) {
789 		list_move_tail(&ndn->dn_dirty_records[i],
790 		    &odn->dn_dirty_records[i]);
791 	}
792 	bcopy(&odn->dn_free_ranges[0], &ndn->dn_free_ranges[0],
793 	    sizeof (odn->dn_free_ranges));
794 	ndn->dn_allocated_txg = odn->dn_allocated_txg;
795 	ndn->dn_free_txg = odn->dn_free_txg;
796 	ndn->dn_assigned_txg = odn->dn_assigned_txg;
797 	ndn->dn_dirty_txg = odn->dn_dirty_txg;
798 	ndn->dn_dirtyctx = odn->dn_dirtyctx;
799 	ndn->dn_dirtyctx_firstset = odn->dn_dirtyctx_firstset;
800 	ASSERT(zfs_refcount_count(&odn->dn_tx_holds) == 0);
801 	zfs_refcount_transfer(&ndn->dn_holds, &odn->dn_holds);
802 	ASSERT(avl_is_empty(&ndn->dn_dbufs));
803 	avl_swap(&ndn->dn_dbufs, &odn->dn_dbufs);
804 	ndn->dn_dbufs_count = odn->dn_dbufs_count;
805 	ndn->dn_bonus = odn->dn_bonus;
806 	ndn->dn_have_spill = odn->dn_have_spill;
807 	ndn->dn_zio = odn->dn_zio;
808 	ndn->dn_oldused = odn->dn_oldused;
809 	ndn->dn_oldflags = odn->dn_oldflags;
810 	ndn->dn_olduid = odn->dn_olduid;
811 	ndn->dn_oldgid = odn->dn_oldgid;
812 	ndn->dn_newuid = odn->dn_newuid;
813 	ndn->dn_newgid = odn->dn_newgid;
814 	ndn->dn_id_flags = odn->dn_id_flags;
815 	dmu_zfetch_init(&ndn->dn_zfetch, NULL);
816 	list_move_tail(&ndn->dn_zfetch.zf_stream, &odn->dn_zfetch.zf_stream);
817 	ndn->dn_zfetch.zf_dnode = odn->dn_zfetch.zf_dnode;
818 
819 	/*
820 	 * Update back pointers. Updating the handle fixes the back pointer of
821 	 * every descendant dbuf as well as the bonus dbuf.
822 	 */
823 	ASSERT(ndn->dn_handle->dnh_dnode == odn);
824 	ndn->dn_handle->dnh_dnode = ndn;
825 	if (ndn->dn_zfetch.zf_dnode == odn) {
826 		ndn->dn_zfetch.zf_dnode = ndn;
827 	}
828 
829 	/*
830 	 * Invalidate the original dnode by clearing all of its back pointers.
831 	 */
832 	odn->dn_dbuf = NULL;
833 	odn->dn_handle = NULL;
834 	avl_create(&odn->dn_dbufs, dbuf_compare, sizeof (dmu_buf_impl_t),
835 	    offsetof(dmu_buf_impl_t, db_link));
836 	odn->dn_dbufs_count = 0;
837 	odn->dn_bonus = NULL;
838 	odn->dn_zfetch.zf_dnode = NULL;
839 
840 	/*
841 	 * Set the low bit of the objset pointer to ensure that dnode_move()
842 	 * recognizes the dnode as invalid in any subsequent callback.
843 	 */
844 	POINTER_INVALIDATE(&odn->dn_objset);
845 
846 	/*
847 	 * Satisfy the destructor.
848 	 */
849 	for (i = 0; i < TXG_SIZE; i++) {
850 		list_create(&odn->dn_dirty_records[i],
851 		    sizeof (dbuf_dirty_record_t),
852 		    offsetof(dbuf_dirty_record_t, dr_dirty_node));
853 		odn->dn_free_ranges[i] = NULL;
854 		odn->dn_next_nlevels[i] = 0;
855 		odn->dn_next_indblkshift[i] = 0;
856 		odn->dn_next_bonustype[i] = 0;
857 		odn->dn_rm_spillblk[i] = 0;
858 		odn->dn_next_bonuslen[i] = 0;
859 		odn->dn_next_blksz[i] = 0;
860 	}
861 	odn->dn_allocated_txg = 0;
862 	odn->dn_free_txg = 0;
863 	odn->dn_assigned_txg = 0;
864 	odn->dn_dirty_txg = 0;
865 	odn->dn_dirtyctx = 0;
866 	odn->dn_dirtyctx_firstset = NULL;
867 	odn->dn_have_spill = B_FALSE;
868 	odn->dn_zio = NULL;
869 	odn->dn_oldused = 0;
870 	odn->dn_oldflags = 0;
871 	odn->dn_olduid = 0;
872 	odn->dn_oldgid = 0;
873 	odn->dn_newuid = 0;
874 	odn->dn_newgid = 0;
875 	odn->dn_id_flags = 0;
876 
877 	/*
878 	 * Mark the dnode.
879 	 */
880 	ndn->dn_moved = 1;
881 	odn->dn_moved = (uint8_t)-1;
882 }
883 
884 /*ARGSUSED*/
885 static kmem_cbrc_t
886 dnode_move(void *buf, void *newbuf, size_t size, void *arg)
887 {
888 	dnode_t *odn = buf, *ndn = newbuf;
889 	objset_t *os;
890 	int64_t refcount;
891 	uint32_t dbufs;
892 
893 	/*
894 	 * The dnode is on the objset's list of known dnodes if the objset
895 	 * pointer is valid. We set the low bit of the objset pointer when
896 	 * freeing the dnode to invalidate it, and the memory patterns written
897 	 * by kmem (baddcafe and deadbeef) set at least one of the two low bits.
898 	 * A newly created dnode sets the objset pointer last of all to indicate
899 	 * that the dnode is known and in a valid state to be moved by this
900 	 * function.
901 	 */
902 	os = odn->dn_objset;
903 	if (!POINTER_IS_VALID(os)) {
904 		DNODE_STAT_BUMP(dnode_move_invalid);
905 		return (KMEM_CBRC_DONT_KNOW);
906 	}
907 
908 	/*
909 	 * Ensure that the objset does not go away during the move.
910 	 */
911 	rw_enter(&os_lock, RW_WRITER);
912 	if (os != odn->dn_objset) {
913 		rw_exit(&os_lock);
914 		DNODE_STAT_BUMP(dnode_move_recheck1);
915 		return (KMEM_CBRC_DONT_KNOW);
916 	}
917 
918 	/*
919 	 * If the dnode is still valid, then so is the objset. We know that no
920 	 * valid objset can be freed while we hold os_lock, so we can safely
921 	 * ensure that the objset remains in use.
922 	 */
923 	mutex_enter(&os->os_lock);
924 
925 	/*
926 	 * Recheck the objset pointer in case the dnode was removed just before
927 	 * acquiring the lock.
928 	 */
929 	if (os != odn->dn_objset) {
930 		mutex_exit(&os->os_lock);
931 		rw_exit(&os_lock);
932 		DNODE_STAT_BUMP(dnode_move_recheck2);
933 		return (KMEM_CBRC_DONT_KNOW);
934 	}
935 
936 	/*
937 	 * At this point we know that as long as we hold os->os_lock, the dnode
938 	 * cannot be freed and fields within the dnode can be safely accessed.
939 	 * The objset listing this dnode cannot go away as long as this dnode is
940 	 * on its list.
941 	 */
942 	rw_exit(&os_lock);
943 	if (DMU_OBJECT_IS_SPECIAL(odn->dn_object)) {
944 		mutex_exit(&os->os_lock);
945 		DNODE_STAT_BUMP(dnode_move_special);
946 		return (KMEM_CBRC_NO);
947 	}
948 	ASSERT(odn->dn_dbuf != NULL); /* only "special" dnodes have no parent */
949 
950 	/*
951 	 * Lock the dnode handle to prevent the dnode from obtaining any new
952 	 * holds. This also prevents the descendant dbufs and the bonus dbuf
953 	 * from accessing the dnode, so that we can discount their holds. The
954 	 * handle is safe to access because we know that while the dnode cannot
955 	 * go away, neither can its handle. Once we hold dnh_zrlock, we can
956 	 * safely move any dnode referenced only by dbufs.
957 	 */
958 	if (!zrl_tryenter(&odn->dn_handle->dnh_zrlock)) {
959 		mutex_exit(&os->os_lock);
960 		DNODE_STAT_BUMP(dnode_move_handle);
961 		return (KMEM_CBRC_LATER);
962 	}
963 
964 	/*
965 	 * Ensure a consistent view of the dnode's holds and the dnode's dbufs.
966 	 * We need to guarantee that there is a hold for every dbuf in order to
967 	 * determine whether the dnode is actively referenced. Falsely matching
968 	 * a dbuf to an active hold would lead to an unsafe move. It's possible
969 	 * that a thread already having an active dnode hold is about to add a
970 	 * dbuf, and we can't compare hold and dbuf counts while the add is in
971 	 * progress.
972 	 */
973 	if (!rw_tryenter(&odn->dn_struct_rwlock, RW_WRITER)) {
974 		zrl_exit(&odn->dn_handle->dnh_zrlock);
975 		mutex_exit(&os->os_lock);
976 		DNODE_STAT_BUMP(dnode_move_rwlock);
977 		return (KMEM_CBRC_LATER);
978 	}
979 
980 	/*
981 	 * A dbuf may be removed (evicted) without an active dnode hold. In that
982 	 * case, the dbuf count is decremented under the handle lock before the
983 	 * dbuf's hold is released. This order ensures that if we count the hold
984 	 * after the dbuf is removed but before its hold is released, we will
985 	 * treat the unmatched hold as active and exit safely. If we count the
986 	 * hold before the dbuf is removed, the hold is discounted, and the
987 	 * removal is blocked until the move completes.
988 	 */
989 	refcount = zfs_refcount_count(&odn->dn_holds);
990 	ASSERT(refcount >= 0);
991 	dbufs = odn->dn_dbufs_count;
992 
993 	/* We can't have more dbufs than dnode holds. */
994 	ASSERT3U(dbufs, <=, refcount);
995 	DTRACE_PROBE3(dnode__move, dnode_t *, odn, int64_t, refcount,
996 	    uint32_t, dbufs);
997 
998 	if (refcount > dbufs) {
999 		rw_exit(&odn->dn_struct_rwlock);
1000 		zrl_exit(&odn->dn_handle->dnh_zrlock);
1001 		mutex_exit(&os->os_lock);
1002 		DNODE_STAT_BUMP(dnode_move_active);
1003 		return (KMEM_CBRC_LATER);
1004 	}
1005 
1006 	rw_exit(&odn->dn_struct_rwlock);
1007 
1008 	/*
1009 	 * At this point we know that anyone with a hold on the dnode is not
1010 	 * actively referencing it. The dnode is known and in a valid state to
1011 	 * move. We're holding the locks needed to execute the critical section.
1012 	 */
1013 	dnode_move_impl(odn, ndn);
1014 
1015 	list_link_replace(&odn->dn_link, &ndn->dn_link);
1016 	/* If the dnode was safe to move, the refcount cannot have changed. */
1017 	ASSERT(refcount == zfs_refcount_count(&ndn->dn_holds));
1018 	ASSERT(dbufs == ndn->dn_dbufs_count);
1019 	zrl_exit(&ndn->dn_handle->dnh_zrlock); /* handle has moved */
1020 	mutex_exit(&os->os_lock);
1021 
1022 	return (KMEM_CBRC_YES);
1023 }
1024 #endif	/* _KERNEL */
1025 
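/*
 * The helpers below operate on the zrlock-protected handles of a range
 * of dnode slots within one dnode block: dnode_slots_hold() takes a
 * hold on each handle, dnode_slots_rele() drops whichever kind of hold
 * is currently held, and dnode_slots_tryenter() tries to lock every
 * handle in the range, backing out and returning 0 if any attempt fails.
 */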
1026 static void
1027 dnode_slots_hold(dnode_children_t *children, int idx, int slots)
1028 {
1029 	ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK);
1030 
1031 	for (int i = idx; i < idx + slots; i++) {
1032 		dnode_handle_t *dnh = &children->dnc_children[i];
1033 		zrl_add(&dnh->dnh_zrlock);
1034 	}
1035 }
1036 
1037 static void
1038 dnode_slots_rele(dnode_children_t *children, int idx, int slots)
1039 {
1040 	ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK);
1041 
1042 	for (int i = idx; i < idx + slots; i++) {
1043 		dnode_handle_t *dnh = &children->dnc_children[i];
1044 
1045 		if (zrl_is_locked(&dnh->dnh_zrlock))
1046 			zrl_exit(&dnh->dnh_zrlock);
1047 		else
1048 			zrl_remove(&dnh->dnh_zrlock);
1049 	}
1050 }
1051 
1052 static int
1053 dnode_slots_tryenter(dnode_children_t *children, int idx, int slots)
1054 {
1055 	ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK);
1056 
1057 	for (int i = idx; i < idx + slots; i++) {
1058 		dnode_handle_t *dnh = &children->dnc_children[i];
1059 
1060 		if (!zrl_tryenter(&dnh->dnh_zrlock)) {
1061 			for (int j = idx; j < i; j++) {
1062 				dnh = &children->dnc_children[j];
1063 				zrl_exit(&dnh->dnh_zrlock);
1064 			}
1065 
1066 			return (0);
1067 		}
1068 	}
1069 
1070 	return (1);
1071 }
1072 
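/*
 * Point every handle in the slot range at the given value, either a
 * dnode_t pointer or one of the DN_SLOT_* sentinels.
 */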
1073 static void
1074 dnode_set_slots(dnode_children_t *children, int idx, int slots, void *ptr)
1075 {
1076 	ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK);
1077 
1078 	for (int i = idx; i < idx + slots; i++) {
1079 		dnode_handle_t *dnh = &children->dnc_children[i];
1080 		dnh->dnh_dnode = ptr;
1081 	}
1082 }
1083 
1084 static boolean_t
1085 dnode_check_slots_free(dnode_children_t *children, int idx, int slots)
1086 {
1087 	ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK);
1088 
1089 	/*
1090 	 * If all dnode slots are either already free or
1091 	 * evictable, return B_TRUE.
1092 	 */
1093 	for (int i = idx; i < idx + slots; i++) {
1094 		dnode_handle_t *dnh = &children->dnc_children[i];
1095 		dnode_t *dn = dnh->dnh_dnode;
1096 
1097 		if (dn == DN_SLOT_FREE) {
1098 			continue;
1099 		} else if (DN_SLOT_IS_PTR(dn)) {
1100 			mutex_enter(&dn->dn_mtx);
1101 			boolean_t can_free = (dn->dn_type == DMU_OT_NONE &&
1102 			    zfs_refcount_is_zero(&dn->dn_holds) &&
1103 			    !DNODE_IS_DIRTY(dn));
1104 			mutex_exit(&dn->dn_mtx);
1105 
1106 			if (!can_free)
1107 				return (B_FALSE);
1108 			else
1109 				continue;
1110 		} else {
1111 			return (B_FALSE);
1112 		}
1113 	}
1114 
1115 	return (B_TRUE);
1116 }
1117 
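/*
 * Destroy any dnode_t still attached to a handle in the slot range
 * (each such dnode must be of type DMU_OT_NONE) and mark the slots
 * free for reuse.  The caller must hold the slot handles.
 */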
1118 static void
1119 dnode_reclaim_slots(dnode_children_t *children, int idx, int slots)
1120 {
1121 	ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK);
1122 
1123 	for (int i = idx; i < idx + slots; i++) {
1124 		dnode_handle_t *dnh = &children->dnc_children[i];
1125 
1126 		ASSERT(zrl_is_locked(&dnh->dnh_zrlock));
1127 
1128 		if (DN_SLOT_IS_PTR(dnh->dnh_dnode)) {
1129 			ASSERT3S(dnh->dnh_dnode->dn_type, ==, DMU_OT_NONE);
1130 			dnode_destroy(dnh->dnh_dnode);
1131 			dnh->dnh_dnode = DN_SLOT_FREE;
1132 		}
1133 	}
1134 }
1135 
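/*
 * Return the interior slots of a multi-slot dnode to the free state so
 * the space can be claimed by a later allocation.
 */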
1136 void
1137 dnode_free_interior_slots(dnode_t *dn)
1138 {
1139 	dnode_children_t *children = dmu_buf_get_user(&dn->dn_dbuf->db);
1140 	int epb = dn->dn_dbuf->db.db_size >> DNODE_SHIFT;
1141 	int idx = (dn->dn_object & (epb - 1)) + 1;
1142 	int slots = dn->dn_num_slots - 1;
1143 
1144 	if (slots == 0)
1145 		return;
1146 
1147 	ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK);
1148 
1149 	while (!dnode_slots_tryenter(children, idx, slots))
1150 		DNODE_STAT_BUMP(dnode_free_interior_lock_retry);
1151 
1152 	dnode_set_slots(children, idx, slots, DN_SLOT_FREE);
1153 	dnode_slots_rele(children, idx, slots);
1154 }
1155 
1156 void
1157 dnode_special_close(dnode_handle_t *dnh)
1158 {
1159 	dnode_t *dn = dnh->dnh_dnode;
1160 
1161 	/*
1162 	 * Wait for final references to the dnode to clear.  This can
1163 	 * only happen if the arc is asynchronously evicting state that
1164 	 * has a hold on this dnode while we are trying to evict this
1165 	 * dnode.
1166 	 */
1167 	while (zfs_refcount_count(&dn->dn_holds) > 0)
1168 		delay(1);
1169 	ASSERT(dn->dn_dbuf == NULL ||
1170 	    dmu_buf_get_user(&dn->dn_dbuf->db) == NULL);
1171 	zrl_add(&dnh->dnh_zrlock);
1172 	dnode_destroy(dn); /* implicit zrl_remove() */
1173 	zrl_destroy(&dnh->dnh_zrlock);
1174 	dnh->dnh_dnode = NULL;
1175 }
1176 
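/*
 * Open a "special" dnode, one whose dnode_phys_t is embedded in the
 * objset rather than stored in a dnode block, so it has no parent dbuf.
 */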
1177 void
1178 dnode_special_open(objset_t *os, dnode_phys_t *dnp, uint64_t object,
1179     dnode_handle_t *dnh)
1180 {
1181 	dnode_t *dn;
1182 
1183 	zrl_init(&dnh->dnh_zrlock);
1184 	zrl_tryenter(&dnh->dnh_zrlock);
1185 
1186 	dn = dnode_create(os, dnp, NULL, object, dnh);
1187 	DNODE_VERIFY(dn);
1188 
1189 	zrl_exit(&dnh->dnh_zrlock);
1190 }
1191 
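/*
 * Dbuf user eviction callback for a block of dnodes: destroy each
 * instantiated child dnode (all of which must be unreferenced), tear
 * down the slot handles, and free the dnode_children_t itself.
 */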
1192 static void
1193 dnode_buf_evict_async(void *dbu)
1194 {
1195 	dnode_children_t *dnc = dbu;
1196 
1197 	DNODE_STAT_BUMP(dnode_buf_evict);
1198 
1199 	for (int i = 0; i < dnc->dnc_count; i++) {
1200 		dnode_handle_t *dnh = &dnc->dnc_children[i];
1201 		dnode_t *dn;
1202 
1203 		/*
1204 		 * The dnode handle lock guards against the dnode moving to
1205 		 * another valid address, so there is no need here to guard
1206 		 * against changes to or from NULL.
1207 		 */
1208 		if (!DN_SLOT_IS_PTR(dnh->dnh_dnode)) {
1209 			zrl_destroy(&dnh->dnh_zrlock);
1210 			dnh->dnh_dnode = DN_SLOT_UNINIT;
1211 			continue;
1212 		}
1213 
1214 		zrl_add(&dnh->dnh_zrlock);
1215 		dn = dnh->dnh_dnode;
1216 		/*
1217 		 * If there are holds on this dnode, then there should
1218 		 * be holds on the dnode's containing dbuf as well; thus
1219 		 * it wouldn't be eligible for eviction and this function
1220 		 * would not have been called.
1221 		 */
1222 		ASSERT(zfs_refcount_is_zero(&dn->dn_holds));
1223 		ASSERT(zfs_refcount_is_zero(&dn->dn_tx_holds));
1224 
1225 		dnode_destroy(dn); /* implicit zrl_remove() for first slot */
1226 		zrl_destroy(&dnh->dnh_zrlock);
1227 		dnh->dnh_dnode = DN_SLOT_UNINIT;
1228 	}
1229 	kmem_free(dnc, sizeof (dnode_children_t) +
1230 	    dnc->dnc_count * sizeof (dnode_handle_t));
1231 }
1232 
1233 /*
1234  * When the DNODE_MUST_BE_FREE flag is set, the "slots" parameter is used
1235  * to ensure the hole at the specified object offset is large enough to
1236  * hold the dnode being created. The slots parameter is also used to ensure
1237  * a dnode does not span multiple dnode blocks. In both of these cases, if
1238  * a failure occurs, ENOSPC is returned. Keep in mind, these failure cases
1239  * are only possible when using DNODE_MUST_BE_FREE.
1240  *
1241  * If the DNODE_MUST_BE_ALLOCATED flag is set, "slots" must be 0.
1242  * dnode_hold_impl() will check if the requested dnode is already consumed
1243  * as an extra dnode slot by a large dnode, in which case it returns
1244  * ENOENT.
1245  *
1246  * errors:
1247  * EINVAL - invalid object number or flags.
1248  * ENOSPC - hole too small to fulfill "slots" request (DNODE_MUST_BE_FREE)
1249  * EEXIST - Refers to an allocated dnode (DNODE_MUST_BE_FREE)
1250  *        - Refers to a freeing dnode (DNODE_MUST_BE_FREE)
1251  *        - Refers to an interior dnode slot (DNODE_MUST_BE_ALLOCATED)
1252  * ENOENT - The requested dnode is not allocated (DNODE_MUST_BE_ALLOCATED)
1253  *        - The requested dnode is being freed (DNODE_MUST_BE_ALLOCATED)
1254  * EIO    - i/o error when reading the meta dnode dbuf.
1255  * succeeds even for free dnodes.
1256  */
1257 int
1258 dnode_hold_impl(objset_t *os, uint64_t object, int flag, int slots,
1259     void *tag, dnode_t **dnp)
1260 {
1261 	int epb, idx, err;
1262 	int drop_struct_lock = FALSE;
1263 	int type;
1264 	uint64_t blk;
1265 	dnode_t *mdn, *dn;
1266 	dmu_buf_impl_t *db;
1267 	dnode_children_t *dnc;
1268 	dnode_phys_t *dn_block;
1269 	dnode_handle_t *dnh;
1270 
1271 	ASSERT(!(flag & DNODE_MUST_BE_ALLOCATED) || (slots == 0));
1272 	ASSERT(!(flag & DNODE_MUST_BE_FREE) || (slots > 0));
1273 
1274 	/*
1275 	 * If you are holding the spa config lock as writer, you shouldn't
1276 	 * be asking the DMU to do *anything* unless it's the root pool
1277 	 * which may require us to read from the root filesystem while
1278 	 * holding some (not all) of the locks as writer.
1279 	 */
1280 	ASSERT(spa_config_held(os->os_spa, SCL_ALL, RW_WRITER) == 0 ||
1281 	    (spa_is_root(os->os_spa) &&
1282 	    spa_config_held(os->os_spa, SCL_STATE, RW_WRITER)));
1283 
1284 	ASSERT((flag & DNODE_MUST_BE_ALLOCATED) || (flag & DNODE_MUST_BE_FREE));
1285 
1286 	if (object == DMU_USERUSED_OBJECT || object == DMU_GROUPUSED_OBJECT) {
1287 		dn = (object == DMU_USERUSED_OBJECT) ?
1288 		    DMU_USERUSED_DNODE(os) : DMU_GROUPUSED_DNODE(os);
1289 		if (dn == NULL)
1290 			return (SET_ERROR(ENOENT));
1291 		type = dn->dn_type;
1292 		if ((flag & DNODE_MUST_BE_ALLOCATED) && type == DMU_OT_NONE)
1293 			return (SET_ERROR(ENOENT));
1294 		if ((flag & DNODE_MUST_BE_FREE) && type != DMU_OT_NONE)
1295 			return (SET_ERROR(EEXIST));
1296 		DNODE_VERIFY(dn);
1297 		(void) zfs_refcount_add(&dn->dn_holds, tag);
1298 		*dnp = dn;
1299 		return (0);
1300 	}
1301 
1302 	if (object == 0 || object >= DN_MAX_OBJECT)
1303 		return (SET_ERROR(EINVAL));
1304 
1305 	mdn = DMU_META_DNODE(os);
1306 	ASSERT(mdn->dn_object == DMU_META_DNODE_OBJECT);
1307 
1308 	DNODE_VERIFY(mdn);
1309 
1310 	if (!RW_WRITE_HELD(&mdn->dn_struct_rwlock)) {
1311 		rw_enter(&mdn->dn_struct_rwlock, RW_READER);
1312 		drop_struct_lock = TRUE;
1313 	}
1314 
1315 	blk = dbuf_whichblock(mdn, 0, object * sizeof (dnode_phys_t));
1316 
1317 	db = dbuf_hold(mdn, blk, FTAG);
1318 	if (drop_struct_lock)
1319 		rw_exit(&mdn->dn_struct_rwlock);
1320 	if (db == NULL) {
1321 		DNODE_STAT_BUMP(dnode_hold_dbuf_hold);
1322 		return (SET_ERROR(EIO));
1323 	}
1324 	err = dbuf_read(db, NULL, DB_RF_CANFAIL);
1325 	if (err) {
1326 		DNODE_STAT_BUMP(dnode_hold_dbuf_read);
1327 		dbuf_rele(db, FTAG);
1328 		return (err);
1329 	}
1330 
1331 	ASSERT3U(db->db.db_size, >=, 1<<DNODE_SHIFT);
1332 	epb = db->db.db_size >> DNODE_SHIFT;
1333 
1334 	idx = object & (epb - 1);
1335 	dn_block = (dnode_phys_t *)db->db.db_data;
1336 
1337 	ASSERT(DB_DNODE(db)->dn_type == DMU_OT_DNODE);
1338 	dnc = dmu_buf_get_user(&db->db);
1339 	dnh = NULL;
1340 	if (dnc == NULL) {
1341 		dnode_children_t *winner;
1342 		int skip = 0;
1343 
1344 		dnc = kmem_zalloc(sizeof (dnode_children_t) +
1345 		    epb * sizeof (dnode_handle_t), KM_SLEEP);
1346 		dnc->dnc_count = epb;
1347 		dnh = &dnc->dnc_children[0];
1348 
1349 		/* Initialize dnode slot status from dnode_phys_t */
1350 		for (int i = 0; i < epb; i++) {
1351 			zrl_init(&dnh[i].dnh_zrlock);
1352 
1353 			if (skip) {
1354 				skip--;
1355 				continue;
1356 			}
1357 
1358 			if (dn_block[i].dn_type != DMU_OT_NONE) {
1359 				int interior = dn_block[i].dn_extra_slots;
1360 
1361 				dnode_set_slots(dnc, i, 1, DN_SLOT_ALLOCATED);
1362 				dnode_set_slots(dnc, i + 1, interior,
1363 				    DN_SLOT_INTERIOR);
1364 				skip = interior;
1365 			} else {
1366 				dnh[i].dnh_dnode = DN_SLOT_FREE;
1367 				skip = 0;
1368 			}
1369 		}
1370 
1371 		dmu_buf_init_user(&dnc->dnc_dbu, NULL,
1372 		    dnode_buf_evict_async, NULL);
1373 		winner = dmu_buf_set_user(&db->db, &dnc->dnc_dbu);
1374 		if (winner != NULL) {
1375 
1376 			for (int i = 0; i < epb; i++)
1377 				zrl_destroy(&dnh[i].dnh_zrlock);
1378 
1379 			kmem_free(dnc, sizeof (dnode_children_t) +
1380 			    epb * sizeof (dnode_handle_t));
1381 			dnc = winner;
1382 		}
1383 	}
1384 
1385 	ASSERT(dnc->dnc_count == epb);
1386 	dn = DN_SLOT_UNINIT;
1387 
1388 	if (flag & DNODE_MUST_BE_ALLOCATED) {
1389 		slots = 1;
1390 
1391 		while (dn == DN_SLOT_UNINIT) {
1392 			dnode_slots_hold(dnc, idx, slots);
1393 			dnh = &dnc->dnc_children[idx];
1394 
1395 			if (DN_SLOT_IS_PTR(dnh->dnh_dnode)) {
1396 				dn = dnh->dnh_dnode;
1397 				break;
1398 			} else if (dnh->dnh_dnode == DN_SLOT_INTERIOR) {
1399 				DNODE_STAT_BUMP(dnode_hold_alloc_interior);
1400 				dnode_slots_rele(dnc, idx, slots);
1401 				dbuf_rele(db, FTAG);
1402 				return (SET_ERROR(EEXIST));
1403 			} else if (dnh->dnh_dnode != DN_SLOT_ALLOCATED) {
1404 				DNODE_STAT_BUMP(dnode_hold_alloc_misses);
1405 				dnode_slots_rele(dnc, idx, slots);
1406 				dbuf_rele(db, FTAG);
1407 				return (SET_ERROR(ENOENT));
1408 			}
1409 
1410 			dnode_slots_rele(dnc, idx, slots);
1411 			if (!dnode_slots_tryenter(dnc, idx, slots)) {
1412 				DNODE_STAT_BUMP(dnode_hold_alloc_lock_retry);
1413 				continue;
1414 			}
1415 
1416 			/*
1417 			 * Someone else won the race and called dnode_create()
1418 			 * after we checked DN_SLOT_IS_PTR() above but before
1419 			 * we acquired the lock.
1420 			 */
1421 			if (DN_SLOT_IS_PTR(dnh->dnh_dnode)) {
1422 				DNODE_STAT_BUMP(dnode_hold_alloc_lock_misses);
1423 				dn = dnh->dnh_dnode;
1424 			} else {
1425 				dn = dnode_create(os, dn_block + idx, db,
1426 				    object, dnh);
1427 			}
1428 		}
1429 
1430 		mutex_enter(&dn->dn_mtx);
1431 		if (dn->dn_type == DMU_OT_NONE || dn->dn_free_txg != 0) {
1432 			DNODE_STAT_BUMP(dnode_hold_alloc_type_none);
1433 			mutex_exit(&dn->dn_mtx);
1434 			dnode_slots_rele(dnc, idx, slots);
1435 			dbuf_rele(db, FTAG);
1436 			return (SET_ERROR(ENOENT));
1437 		}
1438 
1439 		DNODE_STAT_BUMP(dnode_hold_alloc_hits);
1440 	} else if (flag & DNODE_MUST_BE_FREE) {
1441 
1442 		if (idx + slots - 1 >= DNODES_PER_BLOCK) {
1443 			DNODE_STAT_BUMP(dnode_hold_free_overflow);
1444 			dbuf_rele(db, FTAG);
1445 			return (SET_ERROR(ENOSPC));
1446 		}
1447 
1448 		while (dn == DN_SLOT_UNINIT) {
1449 			dnode_slots_hold(dnc, idx, slots);
1450 
1451 			if (!dnode_check_slots_free(dnc, idx, slots)) {
1452 				DNODE_STAT_BUMP(dnode_hold_free_misses);
1453 				dnode_slots_rele(dnc, idx, slots);
1454 				dbuf_rele(db, FTAG);
1455 				return (SET_ERROR(ENOSPC));
1456 			}
1457 
1458 			dnode_slots_rele(dnc, idx, slots);
1459 			if (!dnode_slots_tryenter(dnc, idx, slots)) {
1460 				DNODE_STAT_BUMP(dnode_hold_free_lock_retry);
1461 				continue;
1462 			}
1463 
1464 			if (!dnode_check_slots_free(dnc, idx, slots)) {
1465 				DNODE_STAT_BUMP(dnode_hold_free_lock_misses);
1466 				dnode_slots_rele(dnc, idx, slots);
1467 				dbuf_rele(db, FTAG);
1468 				return (SET_ERROR(ENOSPC));
1469 			}
1470 
1471 			/*
1472 			 * Allocated but otherwise free dnodes which would
1473 			 * be in the interior of a multi-slot dnode need
1474 			 * to be freed.  Single slot dnodes can be safely
1475 			 * re-purposed as a performance optimization.
1476 			 */
1477 			if (slots > 1)
1478 				dnode_reclaim_slots(dnc, idx + 1, slots - 1);
1479 
1480 			dnh = &dnc->dnc_children[idx];
1481 			if (DN_SLOT_IS_PTR(dnh->dnh_dnode)) {
1482 				dn = dnh->dnh_dnode;
1483 			} else {
1484 				dn = dnode_create(os, dn_block + idx, db,
1485 				    object, dnh);
1486 			}
1487 		}
1488 
1489 		mutex_enter(&dn->dn_mtx);
1490 		if (!zfs_refcount_is_zero(&dn->dn_holds) || dn->dn_free_txg) {
1491 			DNODE_STAT_BUMP(dnode_hold_free_refcount);
1492 			mutex_exit(&dn->dn_mtx);
1493 			dnode_slots_rele(dnc, idx, slots);
1494 			dbuf_rele(db, FTAG);
1495 			return (SET_ERROR(EEXIST));
1496 		}
1497 
1498 		dnode_set_slots(dnc, idx + 1, slots - 1, DN_SLOT_INTERIOR);
1499 		DNODE_STAT_BUMP(dnode_hold_free_hits);
1500 	} else {
1501 		dbuf_rele(db, FTAG);
1502 		return (SET_ERROR(EINVAL));
1503 	}
1504 
1505 	if (dn->dn_free_txg) {
1506 		DNODE_STAT_BUMP(dnode_hold_free_txg);
1507 		type = dn->dn_type;
1508 		mutex_exit(&dn->dn_mtx);
1509 		dnode_slots_rele(dnc, idx, slots);
1510 		dbuf_rele(db, FTAG);
1511 		return (SET_ERROR((flag & DNODE_MUST_BE_ALLOCATED) ?
1512 		    ENOENT : EEXIST));
1513 	}
1514 
1515 	if (zfs_refcount_add(&dn->dn_holds, tag) == 1)
1516 		dbuf_add_ref(db, dnh);
1517 
1518 	mutex_exit(&dn->dn_mtx);
1519 
1520 	/* Now we can rely on the hold to prevent the dnode from moving. */
1521 	dnode_slots_rele(dnc, idx, slots);
1522 
1523 	DNODE_VERIFY(dn);
1524 	ASSERT3P(dn->dn_dbuf, ==, db);
1525 	ASSERT3U(dn->dn_object, ==, object);
1526 	dbuf_rele(db, FTAG);
1527 
1528 	*dnp = dn;
1529 	return (0);
1530 }
1531 
1532 /*
1533  * Return 0 with a held dnode in *dnp if the object is allocated, or an error if not.
1534  */
1535 int
1536 dnode_hold(objset_t *os, uint64_t object, void *tag, dnode_t **dnp)
1537 {
1538 	return (dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, 0, tag,
1539 	    dnp));
1540 }
1541 
1542 /*
1543  * Can only add a reference if there is already at least one
1544  * reference on the dnode.  Returns FALSE if unable to add a
1545  * new reference.
1546  */
1547 boolean_t
1548 dnode_add_ref(dnode_t *dn, void *tag)
1549 {
1550 	mutex_enter(&dn->dn_mtx);
1551 	if (zfs_refcount_is_zero(&dn->dn_holds)) {
1552 		mutex_exit(&dn->dn_mtx);
1553 		return (FALSE);
1554 	}
1555 	VERIFY(1 < zfs_refcount_add(&dn->dn_holds, tag));
1556 	mutex_exit(&dn->dn_mtx);
1557 	return (TRUE);
1558 }
1559 
1560 void
1561 dnode_rele(dnode_t *dn, void *tag)
1562 {
1563 	mutex_enter(&dn->dn_mtx);
1564 	dnode_rele_and_unlock(dn, tag, B_FALSE);
1565 }
1566 
1567 void
1568 dnode_rele_and_unlock(dnode_t *dn, void *tag, boolean_t evicting)
1569 {
1570 	uint64_t refs;
1571 	/* Get while the hold prevents the dnode from moving. */
1572 	dmu_buf_impl_t *db = dn->dn_dbuf;
1573 	dnode_handle_t *dnh = dn->dn_handle;
1574 
1575 	refs = zfs_refcount_remove(&dn->dn_holds, tag);
1576 	mutex_exit(&dn->dn_mtx);
1577 
1578 	/*
1579 	 * It's unsafe to release the last hold on a dnode by dnode_rele() or
1580 	 * indirectly by dbuf_rele() while relying on the dnode handle to
1581 	 * prevent the dnode from moving, since releasing the last hold could
1582 	 * result in the dnode's parent dbuf evicting its dnode handles. For
1583 	 * that reason anyone calling dnode_rele() or dbuf_rele() without some
1584 	 * other direct or indirect hold on the dnode must first drop the dnode
1585 	 * handle.
1586 	 */
1587 	ASSERT(refs > 0 || dnh->dnh_zrlock.zr_owner != curthread);
1588 
1589 	/* NOTE: the DNODE_DNODE does not have a dn_dbuf */
1590 	if (refs == 0 && db != NULL) {
1591 		/*
1592 		 * Another thread could add a hold to the dnode handle in
1593 		 * dnode_hold_impl() while holding the parent dbuf. Since the
1594 		 * hold on the parent dbuf prevents the handle from being
1595 		 * destroyed, the hold on the handle is OK. We can't yet assert
1596 		 * that the handle has zero references, but that will be
1597 		 * asserted anyway when the handle gets destroyed.
1598 		 */
1599 		mutex_enter(&db->db_mtx);
1600 		dbuf_rele_and_unlock(db, dnh, evicting);
1601 	}
1602 }
1603 
1604 void
1605 dnode_setdirty(dnode_t *dn, dmu_tx_t *tx)
1606 {
1607 	objset_t *os = dn->dn_objset;
1608 	uint64_t txg = tx->tx_txg;
1609 
1610 	if (DMU_OBJECT_IS_SPECIAL(dn->dn_object)) {
1611 		dsl_dataset_dirty(os->os_dsl_dataset, tx);
1612 		return;
1613 	}
1614 
1615 	DNODE_VERIFY(dn);
1616 
1617 #ifdef ZFS_DEBUG
1618 	mutex_enter(&dn->dn_mtx);
1619 	ASSERT(dn->dn_phys->dn_type || dn->dn_allocated_txg);
1620 	ASSERT(dn->dn_free_txg == 0 || dn->dn_free_txg >= txg);
1621 	mutex_exit(&dn->dn_mtx);
1622 #endif
1623 
1624 	/*
1625 	 * Determine old uid/gid when necessary
1626 	 */
1627 	dmu_objset_userquota_get_ids(dn, B_TRUE, tx);
1628 
1629 	multilist_t *dirtylist = os->os_dirty_dnodes[txg & TXG_MASK];
1630 	multilist_sublist_t *mls = multilist_sublist_lock_obj(dirtylist, dn);
1631 
1632 	/*
1633 	 * If we are already marked dirty, we're done.
1634 	 */
1635 	if (multilist_link_active(&dn->dn_dirty_link[txg & TXG_MASK])) {
1636 		multilist_sublist_unlock(mls);
1637 		return;
1638 	}
1639 
1640 	ASSERT(!zfs_refcount_is_zero(&dn->dn_holds) ||
1641 	    !avl_is_empty(&dn->dn_dbufs));
1642 	ASSERT(dn->dn_datablksz != 0);
1643 	ASSERT0(dn->dn_next_bonuslen[txg&TXG_MASK]);
1644 	ASSERT0(dn->dn_next_blksz[txg&TXG_MASK]);
1645 	ASSERT0(dn->dn_next_bonustype[txg&TXG_MASK]);
1646 
1647 	dprintf_ds(os->os_dsl_dataset, "obj=%llu txg=%llu\n",
1648 	    dn->dn_object, txg);
1649 
1650 	multilist_sublist_insert_head(mls, dn);
1651 
1652 	multilist_sublist_unlock(mls);
1653 
1654 	/*
1655 	 * The dnode maintains a hold on its containing dbuf as
1656 	 * long as there are holds on it.  Each instantiated child
1657 	 * dbuf maintains a hold on the dnode.  When the last child
1658 	 * drops its hold, the dnode will drop its hold on the
1659 	 * containing dbuf. We add a "dirty hold" here so that the
1660 	 * dnode will hang around after we finish processing its
1661 	 * children.
1662 	 */
1663 	VERIFY(dnode_add_ref(dn, (void *)(uintptr_t)tx->tx_txg));
1664 
1665 	(void) dbuf_dirty(dn->dn_dbuf, tx);
1666 
1667 	dsl_dataset_dirty(os->os_dsl_dataset, tx);
1668 }
1669 
1670 void
1671 dnode_free(dnode_t *dn, dmu_tx_t *tx)
1672 {
1673 	mutex_enter(&dn->dn_mtx);
1674 	if (dn->dn_type == DMU_OT_NONE || dn->dn_free_txg) {
1675 		mutex_exit(&dn->dn_mtx);
1676 		return;
1677 	}
1678 	dn->dn_free_txg = tx->tx_txg;
1679 	mutex_exit(&dn->dn_mtx);
1680 
1681 	dnode_setdirty(dn, tx);
1682 }
1683 
1684 /*
1685  * Try to change the block size for the indicated dnode.  This can only
1686  * succeed if there are no blocks allocated or dirty beyond the first block.
1687  */
1688 int
1689 dnode_set_blksz(dnode_t *dn, uint64_t size, int ibs, dmu_tx_t *tx)
1690 {
1691 	dmu_buf_impl_t *db;
1692 	int err;
1693 
1694 	ASSERT3U(size, <=, spa_maxblocksize(dmu_objset_spa(dn->dn_objset)));
1695 	if (size == 0)
1696 		size = SPA_MINBLOCKSIZE;
1697 	else
1698 		size = P2ROUNDUP(size, SPA_MINBLOCKSIZE);
1699 
1700 	if (ibs == dn->dn_indblkshift)
1701 		ibs = 0;
1702 
1703 	if (size >> SPA_MINBLOCKSHIFT == dn->dn_datablkszsec && ibs == 0)
1704 		return (0);
1705 
1706 	rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
1707 
1708 	/* Check for any allocated blocks beyond the first */
1709 	if (dn->dn_maxblkid != 0)
1710 		goto fail;
1711 
1712 	mutex_enter(&dn->dn_dbufs_mtx);
1713 	for (db = avl_first(&dn->dn_dbufs); db != NULL;
1714 	    db = AVL_NEXT(&dn->dn_dbufs, db)) {
1715 		if (db->db_blkid != 0 && db->db_blkid != DMU_BONUS_BLKID &&
1716 		    db->db_blkid != DMU_SPILL_BLKID) {
1717 			mutex_exit(&dn->dn_dbufs_mtx);
1718 			goto fail;
1719 		}
1720 	}
1721 	mutex_exit(&dn->dn_dbufs_mtx);
1722 
1723 	if (ibs && dn->dn_nlevels != 1)
1724 		goto fail;
1725 
1726 	/* resize the old block */
1727 	err = dbuf_hold_impl(dn, 0, 0, TRUE, FALSE, FTAG, &db);
1728 	if (err == 0)
1729 		dbuf_new_size(db, size, tx);
1730 	else if (err != ENOENT)
1731 		goto fail;
1732 
1733 	dnode_setdblksz(dn, size);
1734 	dnode_setdirty(dn, tx);
1735 	dn->dn_next_blksz[tx->tx_txg&TXG_MASK] = size;
1736 	if (ibs) {
1737 		dn->dn_indblkshift = ibs;
1738 		dn->dn_next_indblkshift[tx->tx_txg&TXG_MASK] = ibs;
1739 	}
1740 	/* rele after we have fixed the blocksize in the dnode */
1741 	if (db)
1742 		dbuf_rele(db, FTAG);
1743 
1744 	rw_exit(&dn->dn_struct_rwlock);
1745 	return (0);
1746 
1747 fail:
1748 	rw_exit(&dn->dn_struct_rwlock);
1749 	return (SET_ERROR(ENOTSUP));
1750 }
1751 
1752 /* read-holding callers must not rely on the lock being continuously held */
1753 void
1754 dnode_new_blkid(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx, boolean_t have_read)
1755 {
1756 	uint64_t txgoff = tx->tx_txg & TXG_MASK;
1757 	int epbs, new_nlevels;
1758 	uint64_t sz;
1759 
1760 	ASSERT(blkid != DMU_BONUS_BLKID);
1761 
1762 	ASSERT(have_read ?
1763 	    RW_READ_HELD(&dn->dn_struct_rwlock) :
1764 	    RW_WRITE_HELD(&dn->dn_struct_rwlock));
1765 
1766 	/*
1767 	 * if we have a read-lock, check to see if we need to do any work
1768 	 * before upgrading to a write-lock.
1769 	 */
1770 	if (have_read) {
1771 		if (blkid <= dn->dn_maxblkid)
1772 			return;
1773 
1774 		if (!rw_tryupgrade(&dn->dn_struct_rwlock)) {
1775 			rw_exit(&dn->dn_struct_rwlock);
1776 			rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
1777 		}
1778 	}
1779 
1780 	if (blkid <= dn->dn_maxblkid)
1781 		goto out;
1782 
1783 	dn->dn_maxblkid = blkid;
1784 
1785 	/*
1786 	 * Compute the number of levels necessary to support the new maxblkid.
1787 	 */
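	/*
	 * For example (illustrative numbers, assuming the default 128K
	 * indirect block size, i.e. dn_indblkshift == 17 and thus
	 * epbs == 10, with dn_nblkptr == 3): one level covers blkids
	 * 0..2, two levels cover 0..(3 << 10) - 1 == 3071, three levels
	 * cover 0..(3 << 20) - 1, and so on -- each additional level
	 * widens the addressable range by a factor of 2^epbs.
	 */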
1788 	new_nlevels = 1;
1789 	epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
1790 	for (sz = dn->dn_nblkptr;
1791 	    sz <= blkid && sz >= dn->dn_nblkptr; sz <<= epbs)
1792 		new_nlevels++;
1793 
1794 	if (new_nlevels > dn->dn_nlevels) {
1795 		int old_nlevels = dn->dn_nlevels;
1796 		dmu_buf_impl_t *db;
1797 		list_t *list;
1798 		dbuf_dirty_record_t *new, *dr, *dr_next;
1799 
1800 		dn->dn_nlevels = new_nlevels;
1801 
1802 		ASSERT3U(new_nlevels, >, dn->dn_next_nlevels[txgoff]);
1803 		dn->dn_next_nlevels[txgoff] = new_nlevels;
1804 
1805 		/* dirty the left indirects */
1806 		db = dbuf_hold_level(dn, old_nlevels, 0, FTAG);
1807 		ASSERT(db != NULL);
1808 		new = dbuf_dirty(db, tx);
1809 		dbuf_rele(db, FTAG);
1810 
1811 		/* transfer the dirty records to the new indirect */
1812 		mutex_enter(&dn->dn_mtx);
1813 		mutex_enter(&new->dt.di.dr_mtx);
1814 		list = &dn->dn_dirty_records[txgoff];
1815 		for (dr = list_head(list); dr; dr = dr_next) {
1816 			dr_next = list_next(&dn->dn_dirty_records[txgoff], dr);
1817 			if (dr->dr_dbuf->db_level != new_nlevels-1 &&
1818 			    dr->dr_dbuf->db_blkid != DMU_BONUS_BLKID &&
1819 			    dr->dr_dbuf->db_blkid != DMU_SPILL_BLKID) {
1820 				ASSERT(dr->dr_dbuf->db_level == old_nlevels-1);
1821 				list_remove(&dn->dn_dirty_records[txgoff], dr);
1822 				list_insert_tail(&new->dt.di.dr_children, dr);
1823 				dr->dr_parent = new;
1824 			}
1825 		}
1826 		mutex_exit(&new->dt.di.dr_mtx);
1827 		mutex_exit(&dn->dn_mtx);
1828 	}
1829 
1830 out:
1831 	if (have_read)
1832 		rw_downgrade(&dn->dn_struct_rwlock);
1833 }
1834 
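/*
 * Dirty the level-1 (indirect) dbuf with block id l1blkid, if it can be held.
 */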
1835 static void
1836 dnode_dirty_l1(dnode_t *dn, uint64_t l1blkid, dmu_tx_t *tx)
1837 {
1838 	dmu_buf_impl_t *db = dbuf_hold_level(dn, 1, l1blkid, FTAG);
1839 	if (db != NULL) {
1840 		dmu_buf_will_dirty(&db->db, tx);
1841 		dbuf_rele(db, FTAG);
1842 	}
1843 }
1844 
1845 /*
1846  * Dirty all the in-core level-1 dbufs in the range specified by start_blkid
1847  * and end_blkid.
1848  */
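/*
 * The search below starts at start_blkid + 1 and stops before end_blkid, so
 * both endpoints are excluded; dnode_free_range() dirties those two level-1
 * blocks itself via dnode_dirty_l1().
 */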
1849 static void
1850 dnode_dirty_l1range(dnode_t *dn, uint64_t start_blkid, uint64_t end_blkid,
1851     dmu_tx_t *tx)
1852 {
1853 	dmu_buf_impl_t db_search;
1854 	dmu_buf_impl_t *db;
1855 	avl_index_t where;
1856 
1857 	mutex_enter(&dn->dn_dbufs_mtx);
1858 
1859 	db_search.db_level = 1;
1860 	db_search.db_blkid = start_blkid + 1;
1861 	db_search.db_state = DB_SEARCH;
1862 	for (;;) {
1863 
1864 		db = avl_find(&dn->dn_dbufs, &db_search, &where);
1865 		if (db == NULL)
1866 			db = avl_nearest(&dn->dn_dbufs, where, AVL_AFTER);
1867 
1868 		if (db == NULL || db->db_level != 1 ||
1869 		    db->db_blkid >= end_blkid) {
1870 			break;
1871 		}
1872 
1873 		/*
1874 		 * Setup the next blkid we want to search for.
1875 		 * Set up the next blkid we want to search for.
1876 		db_search.db_blkid = db->db_blkid + 1;
1877 		ASSERT3U(db->db_blkid, >=, start_blkid);
1878 
1879 		/*
1880 		 * If the dbuf transitions to DB_EVICTING while we're trying
1881 		 * to dirty it, then we will be unable to discover it in
1882 		 * the dbuf hash table. This will result in a call to
1883 		 * dbuf_create() which needs to acquire the dn_dbufs_mtx
1884 		 * lock. To avoid a deadlock, we drop the lock before
1885 		 * dirtying the level-1 dbuf.
1886 		 */
1887 		mutex_exit(&dn->dn_dbufs_mtx);
1888 		dnode_dirty_l1(dn, db->db_blkid, tx);
1889 		mutex_enter(&dn->dn_dbufs_mtx);
1890 	}
1891 
1892 #ifdef ZFS_DEBUG
1893 	/*
1894 	 * Walk all the in-core level-1 dbufs and verify they have been dirtied.
1895 	 */
1896 	db_search.db_level = 1;
1897 	db_search.db_blkid = start_blkid + 1;
1898 	db_search.db_state = DB_SEARCH;
1899 	db = avl_find(&dn->dn_dbufs, &db_search, &where);
1900 	if (db == NULL)
1901 		db = avl_nearest(&dn->dn_dbufs, where, AVL_AFTER);
1902 	for (; db != NULL; db = AVL_NEXT(&dn->dn_dbufs, db)) {
1903 		if (db->db_level != 1 || db->db_blkid >= end_blkid)
1904 			break;
1905 		ASSERT(db->db_dirtycnt > 0);
1906 	}
1907 #endif
1908 	mutex_exit(&dn->dn_dbufs_mtx);
1909 }
1910 
1911 void
1912 dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx)
1913 {
1914 	dmu_buf_impl_t *db;
1915 	uint64_t blkoff, blkid, nblks;
1916 	int blksz, blkshift, head, tail;
1917 	int trunc = FALSE;
1918 	int epbs;
1919 
1920 	rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
1921 	blksz = dn->dn_datablksz;
1922 	blkshift = dn->dn_datablkshift;
1923 	epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
1924 
1925 	if (len == DMU_OBJECT_END) {
1926 		len = UINT64_MAX - off;
1927 		trunc = TRUE;
1928 	}
1929 
1930 	/*
1931 	 * First, block align the region to free:
1932 	 */
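	/*
	 * Worked example (illustrative, assuming a power-of-2 data block
	 * size of 128K): freeing off=100K len=300K yields head == 28K
	 * (the remainder of block 0, which is zeroed below), whole blocks
	 * 1 and 2 (blkid == 1, nblks == 2, freed below), and tail == 16K
	 * (the start of block 3, which is zeroed).
	 */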
1933 	if (ISP2(blksz)) {
1934 		head = P2NPHASE(off, blksz);
1935 		blkoff = P2PHASE(off, blksz);
1936 		if ((off >> blkshift) > dn->dn_maxblkid)
1937 			goto out;
1938 	} else {
1939 		ASSERT(dn->dn_maxblkid == 0);
1940 		if (off == 0 && len >= blksz) {
1941 			/*
1942 			 * Freeing the whole block; fast-track this request.
1943 			 */
1944 			blkid = 0;
1945 			nblks = 1;
1946 			if (dn->dn_nlevels > 1)
1947 				dnode_dirty_l1(dn, 0, tx);
1948 			goto done;
1949 		} else if (off >= blksz) {
1950 			/* Freeing past end-of-data */
1951 			goto out;
1952 		} else {
1953 			/* Freeing part of the block. */
1954 			head = blksz - off;
1955 			ASSERT3U(head, >, 0);
1956 		}
1957 		blkoff = off;
1958 	}
1959 	/* zero out any partial block data at the start of the range */
1960 	if (head) {
1961 		ASSERT3U(blkoff + head, ==, blksz);
1962 		if (len < head)
1963 			head = len;
1964 		if (dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, 0, off),
1965 		    TRUE, FALSE, FTAG, &db) == 0) {
1966 			caddr_t data;
1967 
1968 			/* don't dirty if it isn't on disk and isn't dirty */
1969 			if (db->db_last_dirty ||
1970 			    (db->db_blkptr && !BP_IS_HOLE(db->db_blkptr))) {
1971 				rw_exit(&dn->dn_struct_rwlock);
1972 				dmu_buf_will_dirty(&db->db, tx);
1973 				rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
1974 				data = db->db.db_data;
1975 				bzero(data + blkoff, head);
1976 			}
1977 			dbuf_rele(db, FTAG);
1978 		}
1979 		off += head;
1980 		len -= head;
1981 	}
1982 
1983 	/* If the range was less than one block, we're done */
1984 	if (len == 0)
1985 		goto out;
1986 
1987 	/* If the remaining range is past end of file, we're done */
1988 	if ((off >> blkshift) > dn->dn_maxblkid)
1989 		goto out;
1990 
1991 	ASSERT(ISP2(blksz));
1992 	if (trunc)
1993 		tail = 0;
1994 	else
1995 		tail = P2PHASE(len, blksz);
1996 
1997 	ASSERT0(P2PHASE(off, blksz));
1998 	/* zero out any partial block data at the end of the range */
1999 	if (tail) {
2000 		if (len < tail)
2001 			tail = len;
2002 		if (dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, 0, off+len),
2003 		    TRUE, FALSE, FTAG, &db) == 0) {
2004 			/* don't dirty if not on disk and not dirty */
2005 			if (db->db_last_dirty ||
2006 			    (db->db_blkptr && !BP_IS_HOLE(db->db_blkptr))) {
2007 				rw_exit(&dn->dn_struct_rwlock);
2008 				dmu_buf_will_dirty(&db->db, tx);
2009 				rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
2010 				bzero(db->db.db_data, tail);
2011 			}
2012 			dbuf_rele(db, FTAG);
2013 		}
2014 		len -= tail;
2015 	}
2016 
2017 	/* If the range did not include a full block, we are done */
2018 	if (len == 0)
2019 		goto out;
2020 
2021 	ASSERT(IS_P2ALIGNED(off, blksz));
2022 	ASSERT(trunc || IS_P2ALIGNED(len, blksz));
2023 	blkid = off >> blkshift;
2024 	nblks = len >> blkshift;
2025 	if (trunc)
2026 		nblks += 1;
2027 
2028 	/*
2029 	 * Dirty all the indirect blocks in this range.  Note that only
2030 	 * the first and last indirect blocks can actually be written
2031 	 * (if they were partially freed) -- they must be dirtied, even if
2032 	 * they do not exist on disk yet.  The interior blocks will
2033 	 * be freed by free_children(), so they will not actually be written.
2034 	 * Even though these interior blocks will not be written, we
2035 	 * dirty them for two reasons:
2036 	 *
2037 	 *  - It ensures that the indirect blocks remain in memory until
2038 	 *    syncing context.  (They have already been prefetched by
2039 	 *    dmu_tx_hold_free(), so we don't have to worry about reading
2040 	 *    them serially here.)
2041 	 *
2042 	 *  - The dirty space accounting will put pressure on the txg sync
2043 	 *    mechanism to begin syncing, and to delay transactions if there
2044 	 *    is a large amount of freeing.  Even though these indirect
2045 	 *    blocks will not be written, we could need to write the same
2046 	 *    amount of space if we copy the freed BPs into deadlists.
2047 	 */
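	/*
	 * For example (illustrative, with epbs == 10, i.e. 1024 block
	 * pointers per indirect block): freeing blkids 512..3583 gives
	 * first == 0 and last == 3.  Level-1 blocks 0 and 3 are only
	 * partially freed and may actually be written; blocks 1 and 2
	 * are interior and will be freed entirely by free_children().
	 */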
2048 	if (dn->dn_nlevels > 1) {
2049 		uint64_t first, last;
2050 
2051 		first = blkid >> epbs;
2052 		dnode_dirty_l1(dn, first, tx);
2053 		if (trunc)
2054 			last = dn->dn_maxblkid >> epbs;
2055 		else
2056 			last = (blkid + nblks - 1) >> epbs;
2057 		if (last != first)
2058 			dnode_dirty_l1(dn, last, tx);
2059 
2060 		dnode_dirty_l1range(dn, first, last, tx);
2061 
2062 		int shift = dn->dn_datablkshift + dn->dn_indblkshift -
2063 		    SPA_BLKPTRSHIFT;
2064 		for (uint64_t i = first + 1; i < last; i++) {
2065 			/*
2066 			 * Set i to the blockid of the next non-hole
2067 			 * level-1 indirect block at or after i.  Note
2068 			 * that dnode_next_offset() operates in terms of
2069 			 * level-0-equivalent bytes.
2070 			 */
2071 			uint64_t ibyte = i << shift;
2072 			int err = dnode_next_offset(dn, DNODE_FIND_HAVELOCK,
2073 			    &ibyte, 2, 1, 0);
2074 			i = ibyte >> shift;
2075 			if (i >= last)
2076 				break;
2077 
2078 			/*
2079 			 * Normally we should not see an error, either
2080 			 * from dnode_next_offset() or dbuf_hold_level()
2081 			 * (except for ESRCH from dnode_next_offset).
2082 			 * If there is an i/o error, then when we read
2083 			 * this block in syncing context, it will use
2084 			 * ZIO_FLAG_MUSTSUCCEED, and thus hang/panic according
2085 			 * to the "failmode" property.  dnode_next_offset()
2086 			 * doesn't have a flag to indicate MUSTSUCCEED.
2087 			 */
2088 			if (err != 0)
2089 				break;
2090 
2091 			dnode_dirty_l1(dn, i, tx);
2092 		}
2093 	}
2094 
2095 done:
2096 	/*
2097 	 * Add this range to the dnode range list.
2098 	 * We will finish up this free operation in the syncing phase.
2099 	 */
2100 	mutex_enter(&dn->dn_mtx);
2101 	int txgoff = tx->tx_txg & TXG_MASK;
2102 	if (dn->dn_free_ranges[txgoff] == NULL) {
2103 		dn->dn_free_ranges[txgoff] = range_tree_create(NULL, NULL);
2104 	}
2105 	range_tree_clear(dn->dn_free_ranges[txgoff], blkid, nblks);
2106 	range_tree_add(dn->dn_free_ranges[txgoff], blkid, nblks);
2107 	dprintf_dnode(dn, "blkid=%llu nblks=%llu txg=%llu\n",
2108 	    blkid, nblks, tx->tx_txg);
2109 	mutex_exit(&dn->dn_mtx);
2110 
2111 	dbuf_free_range(dn, blkid, blkid + nblks - 1, tx);
2112 	dnode_setdirty(dn, tx);
2113 out:
2114 
2115 	rw_exit(&dn->dn_struct_rwlock);
2116 }
2117 
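/*
 * Return TRUE if the spill block has been marked for removal
 * (DN_KILL_SPILLBLK) in any pending txg.
 */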
2118 static boolean_t
2119 dnode_spill_freed(dnode_t *dn)
2120 {
2121 	int i;
2122 
2123 	mutex_enter(&dn->dn_mtx);
2124 	for (i = 0; i < TXG_SIZE; i++) {
2125 		if (dn->dn_rm_spillblk[i] == DN_KILL_SPILLBLK)
2126 			break;
2127 	}
2128 	mutex_exit(&dn->dn_mtx);
2129 	return (i < TXG_SIZE);
2130 }
2131 
2132 /* return TRUE if this blkid was freed in a recent txg, or FALSE if it wasn't */
2133 uint64_t
2134 dnode_block_freed(dnode_t *dn, uint64_t blkid)
2135 {
2136 	void *dp = spa_get_dsl(dn->dn_objset->os_spa);
2137 	int i;
2138 
2139 	if (blkid == DMU_BONUS_BLKID)
2140 		return (FALSE);
2141 
2142 	/*
2143 	 * If we're in the process of opening the pool, dp will not be
2144 	 * set yet, but there shouldn't be anything dirty.
2145 	 */
2146 	if (dp == NULL)
2147 		return (FALSE);
2148 
2149 	if (dn->dn_free_txg)
2150 		return (TRUE);
2151 
2152 	if (blkid == DMU_SPILL_BLKID)
2153 		return (dnode_spill_freed(dn));
2154 
2155 	mutex_enter(&dn->dn_mtx);
2156 	for (i = 0; i < TXG_SIZE; i++) {
2157 		if (dn->dn_free_ranges[i] != NULL &&
2158 		    range_tree_contains(dn->dn_free_ranges[i], blkid, 1))
2159 			break;
2160 	}
2161 	mutex_exit(&dn->dn_mtx);
2162 	return (i < TXG_SIZE);
2163 }
2164 
2165 /* call from syncing context when we actually write/free space for this dnode */
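/*
 * On pools older than SPA_VERSION_DNODE_BYTES, dn_used is maintained in
 * 512-byte (DEV_BSHIFT) units; on newer pools it is kept in bytes and
 * DNODE_FLAG_USED_BYTES is set to indicate that.
 */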
2166 void
2167 dnode_diduse_space(dnode_t *dn, int64_t delta)
2168 {
2169 	uint64_t space;
2170 	dprintf_dnode(dn, "dn=%p dnp=%p used=%llu delta=%lld\n",
2171 	    dn, dn->dn_phys,
2172 	    (u_longlong_t)dn->dn_phys->dn_used,
2173 	    (longlong_t)delta);
2174 
2175 	mutex_enter(&dn->dn_mtx);
2176 	space = DN_USED_BYTES(dn->dn_phys);
2177 	if (delta > 0) {
2178 		ASSERT3U(space + delta, >=, space); /* no overflow */
2179 	} else {
2180 		ASSERT3U(space, >=, -delta); /* no underflow */
2181 	}
2182 	space += delta;
2183 	if (spa_version(dn->dn_objset->os_spa) < SPA_VERSION_DNODE_BYTES) {
2184 		ASSERT((dn->dn_phys->dn_flags & DNODE_FLAG_USED_BYTES) == 0);
2185 		ASSERT0(P2PHASE(space, 1<<DEV_BSHIFT));
2186 		dn->dn_phys->dn_used = space >> DEV_BSHIFT;
2187 	} else {
2188 		dn->dn_phys->dn_used = space;
2189 		dn->dn_phys->dn_flags |= DNODE_FLAG_USED_BYTES;
2190 	}
2191 	mutex_exit(&dn->dn_mtx);
2192 }
2193 
2194 /*
2195  * Scans a block at the indicated "level" looking for a hole or data,
2196  * depending on 'flags'.
2197  *
2198  * If level > 0, then we are scanning an indirect block looking at its
2199  * pointers.  If level == 0, then we are looking at a block of dnodes.
2200  *
2201  * If we don't find what we are looking for in the block, we return ESRCH.
2202  * Otherwise, return with *offset pointing to the beginning (if searching
2203  * forwards) or end (if searching backwards) of the range covered by the
2204  * block pointer we matched on (or dnode).
2205  *
2206  * The basic search algorithm used below by dnode_next_offset() is to
2207  * use this function to search up the block tree (widen the search) until
2208  * we find something (i.e., we don't return ESRCH) and then search back
2209  * down the tree (narrow the search) until we reach our original search
2210  * level.
2211  */
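/*
 * For a sense of scale (illustrative, assuming 128K data blocks and
 * epbs == 10): at lvl == 1 each block pointer examined below spans
 * 2^17 == 128K of level-0 bytes, at lvl == 2 it spans 2^27 == 128M,
 * and so on -- span == (lvl - 1) * epbs + dn_datablkshift.
 */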
2212 static int
2213 dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset,
2214     int lvl, uint64_t blkfill, uint64_t txg)
2215 {
2216 	dmu_buf_impl_t *db = NULL;
2217 	void *data = NULL;
2218 	uint64_t epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
2219 	uint64_t epb = 1ULL << epbs;
2220 	uint64_t minfill, maxfill;
2221 	boolean_t hole;
2222 	int i, inc, error, span;
2223 
2224 	dprintf("probing object %llu offset %llx level %d of %u\n",
2225 	    dn->dn_object, *offset, lvl, dn->dn_phys->dn_nlevels);
2226 
2227 	hole = ((flags & DNODE_FIND_HOLE) != 0);
2228 	inc = (flags & DNODE_FIND_BACKWARDS) ? -1 : 1;
2229 	ASSERT(txg == 0 || !hole);
2230 
2231 	if (lvl == dn->dn_phys->dn_nlevels) {
2232 		error = 0;
2233 		epb = dn->dn_phys->dn_nblkptr;
2234 		data = dn->dn_phys->dn_blkptr;
2235 	} else {
2236 		uint64_t blkid = dbuf_whichblock(dn, lvl, *offset);
2237 		error = dbuf_hold_impl(dn, lvl, blkid, TRUE, FALSE, FTAG, &db);
2238 		if (error) {
2239 			if (error != ENOENT)
2240 				return (error);
2241 			if (hole)
2242 				return (0);
2243 			/*
2244 			 * This can only happen when we are searching up
2245 			 * the block tree for data.  We don't really need to
2246 			 * adjust the offset, as we will just end up looking
2247 			 * at the pointer to this block in its parent, and its
2248 			 * at the pointer to this block in its parent, and it's
2249 			 */
2250 			return (SET_ERROR(ESRCH));
2251 		}
2252 		error = dbuf_read(db, NULL, DB_RF_CANFAIL | DB_RF_HAVESTRUCT);
2253 		if (error) {
2254 			dbuf_rele(db, FTAG);
2255 			return (error);
2256 		}
2257 		data = db->db.db_data;
2258 	}
2259 
2260 
2261 	if (db != NULL && txg != 0 && (db->db_blkptr == NULL ||
2262 	    db->db_blkptr->blk_birth <= txg ||
2263 	    BP_IS_HOLE(db->db_blkptr))) {
2264 		/*
2265 		 * This can only happen when we are searching up the tree
2266 		 * and these conditions mean that we need to keep climbing.
2267 		 */
2268 		error = SET_ERROR(ESRCH);
2269 	} else if (lvl == 0) {
2270 		dnode_phys_t *dnp = data;
2271 
2272 		ASSERT(dn->dn_type == DMU_OT_DNODE);
2273 		ASSERT(!(flags & DNODE_FIND_BACKWARDS));
2274 
2275 		for (i = (*offset >> DNODE_SHIFT) & (blkfill - 1);
2276 		    i < blkfill; i += dnp[i].dn_extra_slots + 1) {
2277 			if ((dnp[i].dn_type == DMU_OT_NONE) == hole)
2278 				break;
2279 		}
2280 
2281 		if (i == blkfill)
2282 			error = SET_ERROR(ESRCH);
2283 
2284 		*offset = (*offset & ~(DNODE_BLOCK_SIZE - 1)) +
2285 		    (i << DNODE_SHIFT);
2286 	} else {
2287 		blkptr_t *bp = data;
2288 		uint64_t start = *offset;
2289 		span = (lvl - 1) * epbs + dn->dn_datablkshift;
2290 		minfill = 0;
2291 		maxfill = blkfill << ((lvl - 1) * epbs);
2292 
2293 		if (hole)
2294 			maxfill--;
2295 		else
2296 			minfill++;
2297 
2298 		*offset = *offset >> span;
2299 		for (i = BF64_GET(*offset, 0, epbs);
2300 		    i >= 0 && i < epb; i += inc) {
2301 			if (BP_GET_FILL(&bp[i]) >= minfill &&
2302 			    BP_GET_FILL(&bp[i]) <= maxfill &&
2303 			    (hole || bp[i].blk_birth > txg))
2304 				break;
2305 			if (inc > 0 || *offset > 0)
2306 				*offset += inc;
2307 		}
2308 		*offset = *offset << span;
2309 		if (inc < 0) {
2310 			/* traversing backwards; position offset at the end */
2311 			ASSERT3U(*offset, <=, start);
2312 			*offset = MIN(*offset + (1ULL << span) - 1, start);
2313 		} else if (*offset < start) {
2314 			*offset = start;
2315 		}
2316 		if (i < 0 || i >= epb)
2317 			error = SET_ERROR(ESRCH);
2318 	}
2319 
2320 	if (db)
2321 		dbuf_rele(db, FTAG);
2322 
2323 	return (error);
2324 }
2325 
2326 /*
2327  * Find the next hole, data, or sparse region at or after *offset.
2328  * The value 'blkfill' tells us how many items we expect to find
2329  * in an L0 data block; this value is 1 for normal objects,
2330  * DNODES_PER_BLOCK for the meta dnode, and some fraction of
2331  * DNODES_PER_BLOCK when searching for sparse regions thereof.
2332  *
2333  * Examples:
2334  *
2335  * dnode_next_offset(dn, flags, offset, 1, 1, 0);
2336  *	Finds the next/previous hole/data in a file.
2337  *	Used in dmu_offset_next().
2338  *
2339  * dnode_next_offset(mdn, flags, offset, 0, DNODES_PER_BLOCK, txg);
2340  *	Finds the next free/allocated dnode in an objset's meta-dnode.
2341  *	Only finds objects that have new contents since txg (i.e.
2342  *	bonus buffer changes and content removal are ignored).
2343  *	Used in dmu_object_next().
2344  *
2345  * dnode_next_offset(mdn, DNODE_FIND_HOLE, offset, 2, DNODES_PER_BLOCK >> 2, 0);
2346  *	Finds the next L2 meta-dnode bp that's at most 1/4 full.
2347  *	Used in dmu_object_alloc().
2348  */
2349 int
2350 dnode_next_offset(dnode_t *dn, int flags, uint64_t *offset,
2351     int minlvl, uint64_t blkfill, uint64_t txg)
2352 {
2353 	uint64_t initial_offset = *offset;
2354 	int lvl, maxlvl;
2355 	int error = 0;
2356 
2357 	if (!(flags & DNODE_FIND_HAVELOCK))
2358 		rw_enter(&dn->dn_struct_rwlock, RW_READER);
2359 
2360 	if (dn->dn_phys->dn_nlevels == 0) {
2361 		error = SET_ERROR(ESRCH);
2362 		goto out;
2363 	}
2364 
2365 	if (dn->dn_datablkshift == 0) {
2366 		if (*offset < dn->dn_datablksz) {
2367 			if (flags & DNODE_FIND_HOLE)
2368 				*offset = dn->dn_datablksz;
2369 		} else {
2370 			error = SET_ERROR(ESRCH);
2371 		}
2372 		goto out;
2373 	}
2374 
2375 	maxlvl = dn->dn_phys->dn_nlevels;
2376 
2377 	for (lvl = minlvl; lvl <= maxlvl; lvl++) {
2378 		error = dnode_next_offset_level(dn,
2379 		    flags, offset, lvl, blkfill, txg);
2380 		if (error != ESRCH)
2381 			break;
2382 	}
2383 
2384 	while (error == 0 && --lvl >= minlvl) {
2385 		error = dnode_next_offset_level(dn,
2386 		    flags, offset, lvl, blkfill, txg);
2387 	}
2388 
2389 	/*
2390 	 * There's always a "virtual hole" at the end of the object, even
2391  * if all BPs which physically exist are non-holes.
2392 	 */
2393 	if ((flags & DNODE_FIND_HOLE) && error == ESRCH && txg == 0 &&
2394 	    minlvl == 1 && blkfill == 1 && !(flags & DNODE_FIND_BACKWARDS)) {
2395 		error = 0;
2396 	}
2397 
2398 	if (error == 0 && (flags & DNODE_FIND_BACKWARDS ?
2399 	    initial_offset < *offset : initial_offset > *offset))
2400 		error = SET_ERROR(ESRCH);
2401 out:
2402 	if (!(flags & DNODE_FIND_HAVELOCK))
2403 		rw_exit(&dn->dn_struct_rwlock);
2404 
2405 	return (error);
2406 }
2407