1fa9e4066Sahrens /*
2fa9e4066Sahrens * CDDL HEADER START
3fa9e4066Sahrens *
4fa9e4066Sahrens * The contents of this file are subject to the terms of the
5f65e61c0Sahrens * Common Development and Distribution License (the "License").
6f65e61c0Sahrens * You may not use this file except in compliance with the License.
7fa9e4066Sahrens *
8fa9e4066Sahrens * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9fa9e4066Sahrens * or http://www.opensolaris.org/os/licensing.
10fa9e4066Sahrens * See the License for the specific language governing permissions
11fa9e4066Sahrens * and limitations under the License.
12fa9e4066Sahrens *
13fa9e4066Sahrens * When distributing Covered Code, include this CDDL HEADER in each
14fa9e4066Sahrens * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15fa9e4066Sahrens * If applicable, add the following below this CDDL HEADER, with the
16fa9e4066Sahrens * fields enclosed by brackets "[]" replaced with your own identifying
17fa9e4066Sahrens * information: Portions Copyright [yyyy] [name of copyright owner]
18fa9e4066Sahrens *
19fa9e4066Sahrens * CDDL HEADER END
20fa9e4066Sahrens */
21fa9e4066Sahrens /*
2206e0070dSMark Shellenbaum * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
234d7988d6SPaul Dagnelie * Copyright (c) 2012, 2019 by Delphix. All rights reserved.
24bc9014e6SJustin Gibbs * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
25c3d26abcSMatthew Ahrens * Copyright (c) 2014 Integros [integros.com]
26f06dce2cSAndrew Stormont * Copyright 2017 RackTop Systems.
27fa9e4066Sahrens */
28fa9e4066Sahrens
29fa9e4066Sahrens #include <sys/zfs_context.h>
30fa9e4066Sahrens #include <sys/dbuf.h>
31fa9e4066Sahrens #include <sys/dnode.h>
32fa9e4066Sahrens #include <sys/dmu.h>
33fa9e4066Sahrens #include <sys/dmu_impl.h>
34fa9e4066Sahrens #include <sys/dmu_tx.h>
35fa9e4066Sahrens #include <sys/dmu_objset.h>
36fa9e4066Sahrens #include <sys/dsl_dir.h>
37fa9e4066Sahrens #include <sys/dsl_dataset.h>
38fa9e4066Sahrens #include <sys/spa.h>
39fa9e4066Sahrens #include <sys/zio.h>
40fa9e4066Sahrens #include <sys/dmu_zfetch.h>
41bf16b11eSMatthew Ahrens #include <sys/range_tree.h>
42f67950b2SNasf-Fan #include <sys/zfs_project.h>
43fa9e4066Sahrens
/*
 * Named kstat counters exported as zfs:0:dnodestats (see dnode_init()).
 * Individual entries are bumped via DNODE_STAT_BUMP() throughout this
 * file to track dnode hold, allocation, free and move activity.
 */
dnode_stats_t dnode_stats = {
	{ "dnode_hold_dbuf_hold",	KSTAT_DATA_UINT64 },
	{ "dnode_hold_dbuf_read",	KSTAT_DATA_UINT64 },
	{ "dnode_hold_alloc_hits",	KSTAT_DATA_UINT64 },
	{ "dnode_hold_alloc_misses",	KSTAT_DATA_UINT64 },
	{ "dnode_hold_alloc_interior",	KSTAT_DATA_UINT64 },
	{ "dnode_hold_alloc_lock_retry",	KSTAT_DATA_UINT64 },
	{ "dnode_hold_alloc_lock_misses",	KSTAT_DATA_UINT64 },
	{ "dnode_hold_alloc_type_none",	KSTAT_DATA_UINT64 },
	{ "dnode_hold_free_hits",	KSTAT_DATA_UINT64 },
	{ "dnode_hold_free_misses",	KSTAT_DATA_UINT64 },
	{ "dnode_hold_free_lock_misses",	KSTAT_DATA_UINT64 },
	{ "dnode_hold_free_lock_retry",	KSTAT_DATA_UINT64 },
	{ "dnode_hold_free_overflow",	KSTAT_DATA_UINT64 },
	{ "dnode_hold_free_refcount",	KSTAT_DATA_UINT64 },
	{ "dnode_free_interior_lock_retry",	KSTAT_DATA_UINT64 },
	{ "dnode_allocate",		KSTAT_DATA_UINT64 },
	{ "dnode_reallocate",		KSTAT_DATA_UINT64 },
	{ "dnode_buf_evict",		KSTAT_DATA_UINT64 },
	{ "dnode_alloc_next_chunk",	KSTAT_DATA_UINT64 },
	{ "dnode_alloc_race",		KSTAT_DATA_UINT64 },
	{ "dnode_alloc_next_block",	KSTAT_DATA_UINT64 },
	{ "dnode_move_invalid",		KSTAT_DATA_UINT64 },
	{ "dnode_move_recheck1",	KSTAT_DATA_UINT64 },
	{ "dnode_move_recheck2",	KSTAT_DATA_UINT64 },
	{ "dnode_move_special",		KSTAT_DATA_UINT64 },
	{ "dnode_move_handle",		KSTAT_DATA_UINT64 },
	{ "dnode_move_rwlock",		KSTAT_DATA_UINT64 },
	{ "dnode_move_active",		KSTAT_DATA_UINT64 },
};

/* kstat handle for dnode_stats; NULL if kstat_create() failed or pre-init */
static kstat_t *dnode_ksp;
/* kmem cache from which every dnode_t in the system is allocated */
static kmem_cache_t *dnode_cache;

/* all-zero template used to assert that an on-disk dnode is untouched */
static dnode_phys_t dnode_phys_zero;

/* default data block shift and indirect block shift for new dnodes */
int zfs_default_bs = SPA_MINBLOCKSHIFT;
int zfs_default_ibs = DN_MAX_INDBLKSHIFT;

#ifdef _KERNEL
/* kmem move callback letting the allocator relocate cached dnodes */
static kmem_cbrc_t dnode_move(void *, void *, size_t, void *);
#endif /* _KERNEL */
86744947dcSTom Erickson
870f6d88adSAlex Reece static int
dbuf_compare(const void * x1,const void * x2)880f6d88adSAlex Reece dbuf_compare(const void *x1, const void *x2)
890f6d88adSAlex Reece {
900f6d88adSAlex Reece const dmu_buf_impl_t *d1 = x1;
910f6d88adSAlex Reece const dmu_buf_impl_t *d2 = x2;
920f6d88adSAlex Reece
934d7988d6SPaul Dagnelie int cmp = TREE_CMP(d1->db_level, d2->db_level);
94c4ab0d3fSGvozden Neskovic if (likely(cmp))
95c4ab0d3fSGvozden Neskovic return (cmp);
960f6d88adSAlex Reece
974d7988d6SPaul Dagnelie cmp = TREE_CMP(d1->db_blkid, d2->db_blkid);
98c4ab0d3fSGvozden Neskovic if (likely(cmp))
99c4ab0d3fSGvozden Neskovic return (cmp);
1000f6d88adSAlex Reece
101a846f19dSAlex Reece if (d1->db_state == DB_SEARCH) {
102a846f19dSAlex Reece ASSERT3S(d2->db_state, !=, DB_SEARCH);
1030f6d88adSAlex Reece return (-1);
104a846f19dSAlex Reece } else if (d2->db_state == DB_SEARCH) {
105a846f19dSAlex Reece ASSERT3S(d1->db_state, !=, DB_SEARCH);
10686bb58aeSAlex Reece return (1);
10786bb58aeSAlex Reece }
10886bb58aeSAlex Reece
1094d7988d6SPaul Dagnelie return (TREE_PCMP(d1, d2));
1100f6d88adSAlex Reece }
1110f6d88adSAlex Reece
/*
 * kmem cache constructor for dnode_t.  Initializes the locks, condition
 * variables, refcounts, per-txg dirty-state arrays and the dbuf AVL tree
 * of a freshly allocated dnode.  Always succeeds and returns 0; the
 * kmflag argument is unused.
 */
/* ARGSUSED */
static int
dnode_cons(void *arg, void *unused, int kmflag)
{
	dnode_t *dn = arg;
	int i;

	rw_init(&dn->dn_struct_rwlock, NULL, RW_DEFAULT, NULL);
	mutex_init(&dn->dn_mtx, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&dn->dn_dbufs_mtx, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&dn->dn_notxholds, NULL, CV_DEFAULT, NULL);
	cv_init(&dn->dn_nodnholds, NULL, CV_DEFAULT, NULL);

	/*
	 * Every dbuf has a reference, and dropping a tracked reference is
	 * O(number of references), so don't track dn_holds.
	 */
	zfs_refcount_create_untracked(&dn->dn_holds);
	zfs_refcount_create(&dn->dn_tx_holds);
	list_link_init(&dn->dn_link);

	/* Per-txg pending-change arrays all start out zeroed. */
	bzero(&dn->dn_next_nblkptr[0], sizeof (dn->dn_next_nblkptr));
	bzero(&dn->dn_next_nlevels[0], sizeof (dn->dn_next_nlevels));
	bzero(&dn->dn_next_indblkshift[0], sizeof (dn->dn_next_indblkshift));
	bzero(&dn->dn_next_bonustype[0], sizeof (dn->dn_next_bonustype));
	bzero(&dn->dn_rm_spillblk[0], sizeof (dn->dn_rm_spillblk));
	bzero(&dn->dn_next_bonuslen[0], sizeof (dn->dn_next_bonuslen));
	bzero(&dn->dn_next_blksz[0], sizeof (dn->dn_next_blksz));
	bzero(&dn->dn_next_maxblkid[0], sizeof (dn->dn_next_maxblkid));

	for (i = 0; i < TXG_SIZE; i++) {
		multilist_link_init(&dn->dn_dirty_link[i]);
		dn->dn_free_ranges[i] = NULL;
		list_create(&dn->dn_dirty_records[i],
		    sizeof (dbuf_dirty_record_t),
		    offsetof(dbuf_dirty_record_t, dr_dirty_node));
	}

	dn->dn_allocated_txg = 0;
	dn->dn_free_txg = 0;
	dn->dn_assigned_txg = 0;
	dn->dn_dirty_txg = 0;
	dn->dn_dirtyctx = 0;
	dn->dn_dirtyctx_firstset = NULL;
	dn->dn_bonus = NULL;
	dn->dn_have_spill = B_FALSE;
	dn->dn_zio = NULL;
	dn->dn_oldused = 0;
	dn->dn_oldflags = 0;
	dn->dn_olduid = 0;
	dn->dn_oldgid = 0;
	dn->dn_oldprojid = ZFS_DEFAULT_PROJID;
	dn->dn_newuid = 0;
	dn->dn_newgid = 0;
	dn->dn_newprojid = ZFS_DEFAULT_PROJID;
	dn->dn_id_flags = 0;

	dn->dn_dbufs_count = 0;
	avl_create(&dn->dn_dbufs, dbuf_compare, sizeof (dmu_buf_impl_t),
	    offsetof(dmu_buf_impl_t, db_link));

	dn->dn_moved = 0;
	return (0);
}
176fa9e4066Sahrens
/*
 * kmem cache destructor for dnode_t.  Tears down the synchronization
 * primitives, refcounts, dirty-record lists and dbuf tree created by
 * dnode_cons(), and asserts that the dnode was returned to the cache in
 * a fully quiesced state (no pending per-txg changes, no holds, no
 * attached bonus/zio).
 */
/* ARGSUSED */
static void
dnode_dest(void *arg, void *unused)
{
	int i;
	dnode_t *dn = arg;

	rw_destroy(&dn->dn_struct_rwlock);
	mutex_destroy(&dn->dn_mtx);
	mutex_destroy(&dn->dn_dbufs_mtx);
	cv_destroy(&dn->dn_notxholds);
	cv_destroy(&dn->dn_nodnholds);
	zfs_refcount_destroy(&dn->dn_holds);
	zfs_refcount_destroy(&dn->dn_tx_holds);
	ASSERT(!list_link_active(&dn->dn_link));

	for (i = 0; i < TXG_SIZE; i++) {
		ASSERT(!multilist_link_active(&dn->dn_dirty_link[i]));
		ASSERT3P(dn->dn_free_ranges[i], ==, NULL);
		list_destroy(&dn->dn_dirty_records[i]);
		/* No per-txg change may still be pending at destroy time. */
		ASSERT0(dn->dn_next_nblkptr[i]);
		ASSERT0(dn->dn_next_nlevels[i]);
		ASSERT0(dn->dn_next_indblkshift[i]);
		ASSERT0(dn->dn_next_bonustype[i]);
		ASSERT0(dn->dn_rm_spillblk[i]);
		ASSERT0(dn->dn_next_bonuslen[i]);
		ASSERT0(dn->dn_next_blksz[i]);
		ASSERT0(dn->dn_next_maxblkid[i]);
	}

	ASSERT0(dn->dn_allocated_txg);
	ASSERT0(dn->dn_free_txg);
	ASSERT0(dn->dn_assigned_txg);
	ASSERT0(dn->dn_dirty_txg);
	ASSERT0(dn->dn_dirtyctx);
	ASSERT3P(dn->dn_dirtyctx_firstset, ==, NULL);
	ASSERT3P(dn->dn_bonus, ==, NULL);
	ASSERT(!dn->dn_have_spill);
	ASSERT3P(dn->dn_zio, ==, NULL);
	ASSERT0(dn->dn_oldused);
	ASSERT0(dn->dn_oldflags);
	ASSERT0(dn->dn_olduid);
	ASSERT0(dn->dn_oldgid);
	ASSERT0(dn->dn_oldprojid);
	ASSERT0(dn->dn_newuid);
	ASSERT0(dn->dn_newgid);
	ASSERT0(dn->dn_newprojid);
	ASSERT0(dn->dn_id_flags);

	ASSERT0(dn->dn_dbufs_count);
	avl_destroy(&dn->dn_dbufs);
}
229fa9e4066Sahrens
/*
 * One-time module initialization: create the dnode kmem cache and, in
 * kernel context, register the dnode_move() callback and install the
 * zfs:0:dnodestats kstat.  kstat creation failure is tolerated (the
 * counters simply aren't exported).
 */
void
dnode_init(void)
{
	ASSERT(dnode_cache == NULL);
	dnode_cache = kmem_cache_create("dnode_t",
	    sizeof (dnode_t),
	    0, dnode_cons, dnode_dest, NULL, NULL, NULL, 0);
#ifdef _KERNEL
	kmem_cache_set_move(dnode_cache, dnode_move);

	dnode_ksp = kstat_create("zfs", 0, "dnodestats", "misc",
	    KSTAT_TYPE_NAMED, sizeof (dnode_stats) / sizeof (kstat_named_t),
	    KSTAT_FLAG_VIRTUAL);
	if (dnode_ksp != NULL) {
		dnode_ksp->ks_data = &dnode_stats;
		kstat_install(dnode_ksp);
	}
#endif /* _KERNEL */
}
249fa9e4066Sahrens
250fa9e4066Sahrens void
dnode_fini(void)251fa9e4066Sahrens dnode_fini(void)
252fa9e4066Sahrens {
25354811da5SToomas Soome if (dnode_ksp != NULL) {
25454811da5SToomas Soome kstat_delete(dnode_ksp);
25554811da5SToomas Soome dnode_ksp = NULL;
25654811da5SToomas Soome }
25754811da5SToomas Soome
258fa9e4066Sahrens kmem_cache_destroy(dnode_cache);
259744947dcSTom Erickson dnode_cache = NULL;
260fa9e4066Sahrens }
261fa9e4066Sahrens
262fa9e4066Sahrens
2639c9dc39aSek #ifdef ZFS_DEBUG
/*
 * Debug-only sanity check of a dnode's in-core state against its
 * on-disk image.  Cheap invariants are always checked; the expensive
 * ones run only when ZFS_DEBUG_DNODE_VERIFY is set in zfs_flags, taking
 * dn_struct_rwlock as reader if the caller doesn't already hold it as
 * writer.
 */
void
dnode_verify(dnode_t *dn)
{
	int drop_struct_lock = FALSE;

	ASSERT(dn->dn_phys);
	ASSERT(dn->dn_objset);
	ASSERT(dn->dn_handle->dnh_dnode == dn);

	ASSERT(DMU_OT_IS_VALID(dn->dn_phys->dn_type));

	if (!(zfs_flags & ZFS_DEBUG_DNODE_VERIFY))
		return;

	if (!RW_WRITE_HELD(&dn->dn_struct_rwlock)) {
		rw_enter(&dn->dn_struct_rwlock, RW_READER);
		drop_struct_lock = TRUE;
	}
	/* Only allocated (or in-flight) dnodes have meaningful geometry. */
	if (dn->dn_phys->dn_type != DMU_OT_NONE || dn->dn_allocated_txg != 0) {
		int i;
		int max_bonuslen = DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots);
		ASSERT3U(dn->dn_indblkshift, >=, 0);
		ASSERT3U(dn->dn_indblkshift, <=, SPA_MAXBLOCKSHIFT);
		/* dn_datablkshift == 0 means a non-power-of-2 block size. */
		if (dn->dn_datablkshift) {
			ASSERT3U(dn->dn_datablkshift, >=, SPA_MINBLOCKSHIFT);
			ASSERT3U(dn->dn_datablkshift, <=, SPA_MAXBLOCKSHIFT);
			ASSERT3U(1<<dn->dn_datablkshift, ==, dn->dn_datablksz);
		}
		ASSERT3U(dn->dn_nlevels, <=, 30);
		ASSERT(DMU_OT_IS_VALID(dn->dn_type));
		ASSERT3U(dn->dn_nblkptr, >=, 1);
		ASSERT3U(dn->dn_nblkptr, <=, DN_MAX_NBLKPTR);
		ASSERT3U(dn->dn_bonuslen, <=, max_bonuslen);
		ASSERT3U(dn->dn_datablksz, ==,
		    dn->dn_datablkszsec << SPA_MINBLOCKSHIFT);
		ASSERT3U(ISP2(dn->dn_datablksz), ==, dn->dn_datablkshift != 0);
		/* blkptrs and bonus buffer share the same fixed-size area */
		ASSERT3U((dn->dn_nblkptr - 1) * sizeof (blkptr_t) +
		    dn->dn_bonuslen, <=, max_bonuslen);
		for (i = 0; i < TXG_SIZE; i++) {
			ASSERT3U(dn->dn_next_nlevels[i], <=, dn->dn_nlevels);
		}
	}
	if (dn->dn_phys->dn_type != DMU_OT_NONE)
		ASSERT3U(dn->dn_phys->dn_nlevels, <=, dn->dn_nlevels);
	ASSERT(DMU_OBJECT_IS_SPECIAL(dn->dn_object) || dn->dn_dbuf != NULL);
	if (dn->dn_dbuf != NULL) {
		/* dn_phys must point into the backing dbuf at the right slot */
		ASSERT3P(dn->dn_phys, ==,
		    (dnode_phys_t *)dn->dn_dbuf->db.db_data +
		    (dn->dn_object % (dn->dn_dbuf->db.db_size >> DNODE_SHIFT)));
	}
	if (drop_struct_lock)
		rw_exit(&dn->dn_struct_rwlock);
}
3179c9dc39aSek #endif
318fa9e4066Sahrens
/*
 * Byte-swap a single on-disk dnode in place.  An unallocated dnode
 * (DMU_OT_NONE) is simply zeroed.  Otherwise the fixed fields, the
 * block pointer array, the bonus buffer (via its type-specific swap
 * function) and, if present, the spill block pointer are each swapped.
 */
void
dnode_byteswap(dnode_phys_t *dnp)
{
	uint64_t *buf64 = (void*)&dnp->dn_blkptr;
	int i;

	if (dnp->dn_type == DMU_OT_NONE) {
		bzero(dnp, sizeof (dnode_phys_t));
		return;
	}

	dnp->dn_datablkszsec = BSWAP_16(dnp->dn_datablkszsec);
	dnp->dn_bonuslen = BSWAP_16(dnp->dn_bonuslen);
	dnp->dn_extra_slots = BSWAP_8(dnp->dn_extra_slots);
	dnp->dn_maxblkid = BSWAP_64(dnp->dn_maxblkid);
	dnp->dn_used = BSWAP_64(dnp->dn_used);

	/*
	 * dn_nblkptr is only one byte, so it's OK to read it in either
	 * byte order.  We can't read dn_bonuslen.
	 */
	ASSERT(dnp->dn_indblkshift <= SPA_MAXBLOCKSHIFT);
	ASSERT(dnp->dn_nblkptr <= DN_MAX_NBLKPTR);
	/* swap the blkptr array as raw 64-bit words */
	for (i = 0; i < dnp->dn_nblkptr * sizeof (blkptr_t)/8; i++)
		buf64[i] = BSWAP_64(buf64[i]);

	/*
	 * OK to check dn_bonuslen for zero, because it won't matter if
	 * we have the wrong byte order.  This is necessary because the
	 * dnode dnode is smaller than a regular dnode.
	 */
	if (dnp->dn_bonuslen != 0) {
		dmu_object_byteswap_t byteswap;
		ASSERT(DMU_OT_IS_VALID(dnp->dn_bonustype));
		byteswap = DMU_OT_BYTESWAP(dnp->dn_bonustype);
		dmu_ot_byteswap[byteswap].ob_func(DN_BONUS(dnp),
		    DN_MAX_BONUS_LEN(dnp));
	}

	/* Swap SPILL block if we have one */
	if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR)
		byteswap_uint64_array(DN_SPILL_BLKPTR(dnp), sizeof (blkptr_t));

}
363fa9e4066Sahrens
364fa9e4066Sahrens void
dnode_buf_byteswap(void * vbuf,size_t size)365fa9e4066Sahrens dnode_buf_byteswap(void *vbuf, size_t size)
366fa9e4066Sahrens {
36754811da5SToomas Soome int i = 0;
368fa9e4066Sahrens
369fa9e4066Sahrens ASSERT3U(sizeof (dnode_phys_t), ==, (1<<DNODE_SHIFT));
370fa9e4066Sahrens ASSERT((size & (sizeof (dnode_phys_t)-1)) == 0);
371fa9e4066Sahrens
37254811da5SToomas Soome while (i < size) {
37354811da5SToomas Soome dnode_phys_t *dnp = (void *)(((char *)vbuf) + i);
37454811da5SToomas Soome dnode_byteswap(dnp);
37554811da5SToomas Soome
37654811da5SToomas Soome i += DNODE_MIN_SIZE;
37754811da5SToomas Soome if (dnp->dn_type != DMU_OT_NONE)
37854811da5SToomas Soome i += dnp->dn_extra_slots * DNODE_MIN_SIZE;
379fa9e4066Sahrens }
380fa9e4066Sahrens }
381fa9e4066Sahrens
/*
 * Change the length of the dnode's bonus buffer to `newsize' bytes and
 * record the new length for this txg so it reaches the on-disk dnode at
 * sync time.  A newsize of 0 is recorded as the DN_ZERO_BONUSLEN
 * sentinel (0 itself means "no change pending").  The new length must
 * fit in the space left over by the block pointer array.  Caller must
 * have a hold on the dnode.
 */
void
dnode_setbonuslen(dnode_t *dn, int newsize, dmu_tx_t *tx)
{
	ASSERT3U(zfs_refcount_count(&dn->dn_holds), >=, 1);

	dnode_setdirty(dn, tx);
	rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
	ASSERT3U(newsize, <=, DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots) -
	    (dn->dn_nblkptr-1) * sizeof (blkptr_t));
	dn->dn_bonuslen = newsize;
	if (newsize == 0)
		dn->dn_next_bonuslen[tx->tx_txg & TXG_MASK] = DN_ZERO_BONUSLEN;
	else
		dn->dn_next_bonuslen[tx->tx_txg & TXG_MASK] = dn->dn_bonuslen;
	rw_exit(&dn->dn_struct_rwlock);
}
3981934e92fSmaybee
/*
 * Change the type of the dnode's bonus buffer to `newtype' and record
 * the change for this txg so it is written to the on-disk dnode at
 * sync time.  Caller must have a hold on the dnode.
 */
void
dnode_setbonus_type(dnode_t *dn, dmu_object_type_t newtype, dmu_tx_t *tx)
{
	ASSERT3U(zfs_refcount_count(&dn->dn_holds), >=, 1);
	dnode_setdirty(dn, tx);
	rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
	dn->dn_bonustype = newtype;
	dn->dn_next_bonustype[tx->tx_txg & TXG_MASK] = dn->dn_bonustype;
	rw_exit(&dn->dn_struct_rwlock);
}
4090a586ceaSMark Shellenbaum
/*
 * Remove the dnode's spill block: mark the spill blkptr for destruction
 * in this txg (DN_KILL_SPILLBLK) and clear dn_have_spill.  Caller must
 * have a hold on the dnode and hold dn_struct_rwlock as writer.
 */
void
dnode_rm_spill(dnode_t *dn, dmu_tx_t *tx)
{
	ASSERT3U(zfs_refcount_count(&dn->dn_holds), >=, 1);
	ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));
	dnode_setdirty(dn, tx);
	dn->dn_rm_spillblk[tx->tx_txg&TXG_MASK] = DN_KILL_SPILLBLK;
	dn->dn_have_spill = B_FALSE;
}
4190a586ceaSMark Shellenbaum
/*
 * Set the dnode's data block size to `size' bytes, keeping the derived
 * fields in sync: dn_datablkszsec (size in 512-byte sectors) and
 * dn_datablkshift (log2 of the size, or 0 when the size is not a power
 * of two).  size must be a multiple of SPA_MINBLOCKSIZE and small
 * enough for the sector count to fit in dn_datablkszsec.
 */
static void
dnode_setdblksz(dnode_t *dn, int size)
{
	ASSERT0(P2PHASE(size, SPA_MINBLOCKSIZE));
	ASSERT3U(size, <=, SPA_MAXBLOCKSIZE);
	ASSERT3U(size, >=, SPA_MINBLOCKSIZE);
	ASSERT3U(size >> SPA_MINBLOCKSHIFT, <,
	    1<<(sizeof (dn->dn_phys->dn_datablkszsec) * 8));
	dn->dn_datablksz = size;
	dn->dn_datablkszsec = size >> SPA_MINBLOCKSHIFT;
	dn->dn_datablkshift = ISP2(size) ? highbit64(size - 1) : 0;
}
432fa9e4066Sahrens
/*
 * Construct the in-core dnode_t for on-disk dnode `dnp' (object number
 * `object' of objset `os'), backed by dbuf `db' (NULL for special
 * objects) and owned by handle `dnh'.  The caller must hold dnh_zrlock.
 * The new dnode is published through dnh->dnh_dnode and, for ordinary
 * objects, linked onto os_dnodes.  Cannot fail (KM_SLEEP allocation).
 */
static dnode_t *
dnode_create(objset_t *os, dnode_phys_t *dnp, dmu_buf_impl_t *db,
    uint64_t object, dnode_handle_t *dnh)
{
	dnode_t *dn;

	dn = kmem_cache_alloc(dnode_cache, KM_SLEEP);
#ifdef _KERNEL
	ASSERT(!POINTER_IS_VALID(dn->dn_objset));
#endif /* _KERNEL */
	dn->dn_moved = 0;

	/*
	 * Defer setting dn_objset until the dnode is ready to be a candidate
	 * for the dnode_move() callback.
	 */
	dn->dn_object = object;
	dn->dn_dbuf = db;
	dn->dn_handle = dnh;
	dn->dn_phys = dnp;

	/* Copy the geometry cached from the on-disk dnode. */
	if (dnp->dn_datablkszsec) {
		dnode_setdblksz(dn, dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT);
	} else {
		dn->dn_datablksz = 0;
		dn->dn_datablkszsec = 0;
		dn->dn_datablkshift = 0;
	}
	dn->dn_indblkshift = dnp->dn_indblkshift;
	dn->dn_nlevels = dnp->dn_nlevels;
	dn->dn_type = dnp->dn_type;
	dn->dn_nblkptr = dnp->dn_nblkptr;
	dn->dn_checksum = dnp->dn_checksum;
	dn->dn_compress = dnp->dn_compress;
	dn->dn_bonustype = dnp->dn_bonustype;
	dn->dn_bonuslen = dnp->dn_bonuslen;
	dn->dn_num_slots = dnp->dn_extra_slots + 1;
	dn->dn_maxblkid = dnp->dn_maxblkid;
	dn->dn_have_spill = ((dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) != 0);
	dn->dn_id_flags = 0;

	dmu_zfetch_init(&dn->dn_zfetch, dn);

	ASSERT(DMU_OT_IS_VALID(dn->dn_phys->dn_type));
	ASSERT(zrl_is_locked(&dnh->dnh_zrlock));
	ASSERT(!DN_SLOT_IS_PTR(dnh->dnh_dnode));

	mutex_enter(&os->os_lock);

	/*
	 * Exclude special dnodes from os_dnodes so an empty os_dnodes
	 * signifies that the special dnodes have no references from
	 * their children (the entries in os_dnodes). This allows
	 * dnode_destroy() to easily determine if the last child has
	 * been removed and then complete eviction of the objset.
	 */
	if (!DMU_OBJECT_IS_SPECIAL(object))
		list_insert_head(&os->os_dnodes, dn);
	/* order all prior stores before dn_objset becomes visible */
	membar_producer();

	/*
	 * Everything else must be valid before assigning dn_objset
	 * makes the dnode eligible for dnode_move().
	 */
	dn->dn_objset = os;

	dnh->dnh_dnode = dn;
	mutex_exit(&os->os_lock);

	arc_space_consume(sizeof (dnode_t), ARC_SPACE_OTHER);

	return (dn);
}
506fa9e4066Sahrens
/*
 * Tear down an in-core dnode and return it to the dnode cache: unlink
 * it from os_dnodes, release any bonus dbuf and zfetch state, and reset
 * the fields dnode_dest() asserts on.  If this was the last entry on
 * os_dnodes of an objset being evicted, finish the objset eviction.
 *
 * Caller must be holding the dnode handle, which is released upon return.
 */
static void
dnode_destroy(dnode_t *dn)
{
	objset_t *os = dn->dn_objset;
	boolean_t complete_os_eviction = B_FALSE;

	ASSERT((dn->dn_id_flags & DN_ID_NEW_EXIST) == 0);

	mutex_enter(&os->os_lock);
	/* make the dnode ineligible for dnode_move() */
	POINTER_INVALIDATE(&dn->dn_objset);
	if (!DMU_OBJECT_IS_SPECIAL(dn->dn_object)) {
		list_remove(&os->os_dnodes, dn);
		complete_os_eviction =
		    list_is_empty(&os->os_dnodes) &&
		    list_link_active(&os->os_evicting_node);
	}
	mutex_exit(&os->os_lock);

	/* the dnode can no longer move, so we can release the handle */
	if (!zrl_is_locked(&dn->dn_handle->dnh_zrlock))
		zrl_remove(&dn->dn_handle->dnh_zrlock);

	dn->dn_allocated_txg = 0;
	dn->dn_free_txg = 0;
	dn->dn_assigned_txg = 0;
	dn->dn_dirty_txg = 0;

	dn->dn_dirtyctx = 0;
	if (dn->dn_dirtyctx_firstset != NULL) {
		kmem_free(dn->dn_dirtyctx_firstset, 1);
		dn->dn_dirtyctx_firstset = NULL;
	}
	if (dn->dn_bonus != NULL) {
		/* dbuf_destroy() consumes db_mtx */
		mutex_enter(&dn->dn_bonus->db_mtx);
		dbuf_destroy(dn->dn_bonus);
		dn->dn_bonus = NULL;
	}
	dn->dn_zio = NULL;

	dn->dn_have_spill = B_FALSE;
	dn->dn_oldused = 0;
	dn->dn_oldflags = 0;
	dn->dn_olduid = 0;
	dn->dn_oldgid = 0;
	dn->dn_oldprojid = ZFS_DEFAULT_PROJID;
	dn->dn_newuid = 0;
	dn->dn_newgid = 0;
	dn->dn_newprojid = ZFS_DEFAULT_PROJID;
	dn->dn_id_flags = 0;

	dmu_zfetch_fini(&dn->dn_zfetch);
	kmem_cache_free(dnode_cache, dn);
	arc_space_return(sizeof (dnode_t), ARC_SPACE_OTHER);

	if (complete_os_eviction)
		dmu_objset_evict_done(os);
}
567fa9e4066Sahrens
568fa9e4066Sahrens void
dnode_allocate(dnode_t * dn,dmu_object_type_t ot,int blocksize,int ibs,dmu_object_type_t bonustype,int bonuslen,int dn_slots,dmu_tx_t * tx)569fa9e4066Sahrens dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs,
57054811da5SToomas Soome dmu_object_type_t bonustype, int bonuslen, int dn_slots, dmu_tx_t *tx)
571fa9e4066Sahrens {
572fa9e4066Sahrens int i;
573fa9e4066Sahrens
57454811da5SToomas Soome ASSERT3U(dn_slots, >, 0);
57554811da5SToomas Soome ASSERT3U(dn_slots << DNODE_SHIFT, <=,
57654811da5SToomas Soome spa_maxdnodesize(dmu_objset_spa(dn->dn_objset)));
577b5152584SMatthew Ahrens ASSERT3U(blocksize, <=,
578b5152584SMatthew Ahrens spa_maxblocksize(dmu_objset_spa(dn->dn_objset)));
579fa9e4066Sahrens if (blocksize == 0)
580fa9e4066Sahrens blocksize = 1 << zfs_default_bs;
5813b83abddSahrens else
5823b83abddSahrens blocksize = P2ROUNDUP(blocksize, SPA_MINBLOCKSIZE);
583fa9e4066Sahrens
584fa9e4066Sahrens if (ibs == 0)
585fa9e4066Sahrens ibs = zfs_default_ibs;
586fa9e4066Sahrens
587fa9e4066Sahrens ibs = MIN(MAX(ibs, DN_MIN_INDBLKSHIFT), DN_MAX_INDBLKSHIFT);
588fa9e4066Sahrens
58954811da5SToomas Soome dprintf("os=%p obj=%" PRIu64 " txg=%" PRIu64
59054811da5SToomas Soome " blocksize=%d ibs=%d dn_slots=%d\n",
59154811da5SToomas Soome dn->dn_objset, dn->dn_object, tx->tx_txg, blocksize, ibs, dn_slots);
59254811da5SToomas Soome DNODE_STAT_BUMP(dnode_allocate);
593fa9e4066Sahrens
594fa9e4066Sahrens ASSERT(dn->dn_type == DMU_OT_NONE);
595fa9e4066Sahrens ASSERT(bcmp(dn->dn_phys, &dnode_phys_zero, sizeof (dnode_phys_t)) == 0);
596fa9e4066Sahrens ASSERT(dn->dn_phys->dn_type == DMU_OT_NONE);
597fa9e4066Sahrens ASSERT(ot != DMU_OT_NONE);
598ad135b5dSChristopher Siden ASSERT(DMU_OT_IS_VALID(ot));
599fa9e4066Sahrens ASSERT((bonustype == DMU_OT_NONE && bonuslen == 0) ||
6000a586ceaSMark Shellenbaum (bonustype == DMU_OT_SA && bonuslen == 0) ||
601fa9e4066Sahrens (bonustype != DMU_OT_NONE && bonuslen != 0));
602ad135b5dSChristopher Siden ASSERT(DMU_OT_IS_VALID(bonustype));
60354811da5SToomas Soome ASSERT3U(bonuslen, <=, DN_SLOTS_TO_BONUSLEN(dn_slots));
604fa9e4066Sahrens ASSERT(dn->dn_type == DMU_OT_NONE);
605fb09f5aaSMadhav Suresh ASSERT0(dn->dn_maxblkid);
606fb09f5aaSMadhav Suresh ASSERT0(dn->dn_allocated_txg);
607aa02ea01STom Caputi ASSERT0(dn->dn_dirty_txg);
608fb09f5aaSMadhav Suresh ASSERT0(dn->dn_assigned_txg);
609e914ace2STim Schumacher ASSERT(zfs_refcount_is_zero(&dn->dn_tx_holds));
610e914ace2STim Schumacher ASSERT3U(zfs_refcount_count(&dn->dn_holds), <=, 1);
6110f6d88adSAlex Reece ASSERT(avl_is_empty(&dn->dn_dbufs));
612fa9e4066Sahrens
613fa9e4066Sahrens for (i = 0; i < TXG_SIZE; i++) {
614fb09f5aaSMadhav Suresh ASSERT0(dn->dn_next_nblkptr[i]);
615fb09f5aaSMadhav Suresh ASSERT0(dn->dn_next_nlevels[i]);
616fb09f5aaSMadhav Suresh ASSERT0(dn->dn_next_indblkshift[i]);
617fb09f5aaSMadhav Suresh ASSERT0(dn->dn_next_bonuslen[i]);
618fb09f5aaSMadhav Suresh ASSERT0(dn->dn_next_bonustype[i]);
619fb09f5aaSMadhav Suresh ASSERT0(dn->dn_rm_spillblk[i]);
620fb09f5aaSMadhav Suresh ASSERT0(dn->dn_next_blksz[i]);
621eb633035STom Caputi ASSERT0(dn->dn_next_maxblkid[i]);
622aa02ea01STom Caputi ASSERT(!multilist_link_active(&dn->dn_dirty_link[i]));
623c717a561Smaybee ASSERT3P(list_head(&dn->dn_dirty_records[i]), ==, NULL);
624bf16b11eSMatthew Ahrens ASSERT3P(dn->dn_free_ranges[i], ==, NULL);
625fa9e4066Sahrens }
626fa9e4066Sahrens
627fa9e4066Sahrens dn->dn_type = ot;
628fa9e4066Sahrens dnode_setdblksz(dn, blocksize);
629fa9e4066Sahrens dn->dn_indblkshift = ibs;
630fa9e4066Sahrens dn->dn_nlevels = 1;
63154811da5SToomas Soome dn->dn_num_slots = dn_slots;
6320a586ceaSMark Shellenbaum if (bonustype == DMU_OT_SA) /* Maximize bonus space for SA */
6330a586ceaSMark Shellenbaum dn->dn_nblkptr = 1;
63454811da5SToomas Soome else {
63554811da5SToomas Soome dn->dn_nblkptr = MIN(DN_MAX_NBLKPTR,
63654811da5SToomas Soome 1 + ((DN_SLOTS_TO_BONUSLEN(dn_slots) - bonuslen) >>
63754811da5SToomas Soome SPA_BLKPTRSHIFT));
63854811da5SToomas Soome }
63954811da5SToomas Soome
640fa9e4066Sahrens dn->dn_bonustype = bonustype;
641fa9e4066Sahrens dn->dn_bonuslen = bonuslen;
642fa9e4066Sahrens dn->dn_checksum = ZIO_CHECKSUM_INHERIT;
643fa9e4066Sahrens dn->dn_compress = ZIO_COMPRESS_INHERIT;
644fa9e4066Sahrens dn->dn_dirtyctx = 0;
645fa9e4066Sahrens
646fa9e4066Sahrens dn->dn_free_txg = 0;
647fa9e4066Sahrens if (dn->dn_dirtyctx_firstset) {
648fa9e4066Sahrens kmem_free(dn->dn_dirtyctx_firstset, 1);
649fa9e4066Sahrens dn->dn_dirtyctx_firstset = NULL;
650fa9e4066Sahrens }
651fa9e4066Sahrens
652fa9e4066Sahrens dn->dn_allocated_txg = tx->tx_txg;
6530a586ceaSMark Shellenbaum dn->dn_id_flags = 0;
654f676ed34Sahrens
655fa9e4066Sahrens dnode_setdirty(dn, tx);
656f676ed34Sahrens dn->dn_next_indblkshift[tx->tx_txg & TXG_MASK] = ibs;
6571934e92fSmaybee dn->dn_next_bonuslen[tx->tx_txg & TXG_MASK] = dn->dn_bonuslen;
6580a586ceaSMark Shellenbaum dn->dn_next_bonustype[tx->tx_txg & TXG_MASK] = dn->dn_bonustype;
659f676ed34Sahrens dn->dn_next_blksz[tx->tx_txg & TXG_MASK] = dn->dn_datablksz;
660fa9e4066Sahrens }
661fa9e4066Sahrens
/*
 * Re-initialize an existing dnode in place so its object number can be
 * reused with a new type, block size, and bonus layout.  The dnode must
 * already be effectively empty (the asserts below check a zero maxblkid
 * or a freed/hole block 0).  All layout changes are recorded in the
 * per-txg dn_next_* arrays so dnode_sync() can rewrite the on-disk
 * dnode in tx's txg.
 *
 * keep_spill: when set, an existing spill block pointer is preserved
 * instead of being removed along with the old bonus layout.
 *
 * Locking: takes dn_struct_rwlock as writer around the layout changes,
 * and dn_mtx around the in-core bonus/type updates.
 */
662fa9e4066Sahrens void
dnode_reallocate(dnode_t * dn,dmu_object_type_t ot,int blocksize,dmu_object_type_t bonustype,int bonuslen,int dn_slots,boolean_t keep_spill,dmu_tx_t * tx)663fa9e4066Sahrens dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize,
664eb633035STom Caputi dmu_object_type_t bonustype, int bonuslen, int dn_slots,
665eb633035STom Caputi boolean_t keep_spill, dmu_tx_t *tx)
666fa9e4066Sahrens {
6672bf405a2SMark Maybee int nblkptr;
668c543ec06Sahrens
669fa9e4066Sahrens ASSERT3U(blocksize, >=, SPA_MINBLOCKSIZE);
670b5152584SMatthew Ahrens ASSERT3U(blocksize, <=,
671b5152584SMatthew Ahrens spa_maxblocksize(dmu_objset_spa(dn->dn_objset)));
672fb09f5aaSMadhav Suresh ASSERT0(blocksize % SPA_MINBLOCKSIZE);
673ea8dc4b6Seschrock ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT || dmu_tx_private_ok(tx));
674fa9e4066Sahrens ASSERT(tx->tx_txg != 0);
675fa9e4066Sahrens ASSERT((bonustype == DMU_OT_NONE && bonuslen == 0) ||
67606e0070dSMark Shellenbaum (bonustype != DMU_OT_NONE && bonuslen != 0) ||
67706e0070dSMark Shellenbaum (bonustype == DMU_OT_SA && bonuslen == 0));
678ad135b5dSChristopher Siden ASSERT(DMU_OT_IS_VALID(bonustype));
67954811da5SToomas Soome ASSERT3U(bonuslen, <=,
68054811da5SToomas Soome DN_BONUS_SIZE(spa_maxdnodesize(dmu_objset_spa(dn->dn_objset))));
681946342a2SFabian Grünbichler ASSERT3U(bonuslen, <=, DN_BONUS_SIZE(dn_slots << DNODE_SHIFT));
68254811da5SToomas Soome
/*
 * Release any interior slots a previous multi-slot (large) dnode at
 * this object number occupied; the reallocated dnode's slot count may
 * differ.
 */
68354811da5SToomas Soome dnode_free_interior_slots(dn);
68454811da5SToomas Soome DNODE_STAT_BUMP(dnode_reallocate);
685c543ec06Sahrens
686ea8dc4b6Seschrock /* clean up any unreferenced dbufs */
6871934e92fSmaybee dnode_evict_dbufs(dn);
688da03de99SMark Maybee
68928d97a71SMark Shellenbaum dn->dn_id_flags = 0;
69028d97a71SMark Shellenbaum
691fa9e4066Sahrens rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
692fa9e4066Sahrens dnode_setdirty(dn, tx);
693fa9e4066Sahrens if (dn->dn_datablksz != blocksize) {
6942bf405a2SMark Maybee /* change blocksize */
6952bf405a2SMark Maybee ASSERT(dn->dn_maxblkid == 0 &&
6962bf405a2SMark Maybee (BP_IS_HOLE(&dn->dn_phys->dn_blkptr[0]) ||
6972bf405a2SMark Maybee dnode_block_freed(dn, 0)));
6982bf405a2SMark Maybee dnode_setdblksz(dn, blocksize);
6992bf405a2SMark Maybee dn->dn_next_blksz[tx->tx_txg&TXG_MASK] = blocksize;
7002bf405a2SMark Maybee }
7012bf405a2SMark Maybee if (dn->dn_bonuslen != bonuslen)
7022bf405a2SMark Maybee dn->dn_next_bonuslen[tx->tx_txg&TXG_MASK] = bonuslen;
70306e0070dSMark Shellenbaum
/*
 * The bonus buffer and the block pointer array share the tail of the
 * dnode: the larger the bonus, the fewer embedded block pointers fit.
 */
70406e0070dSMark Shellenbaum if (bonustype == DMU_OT_SA) /* Maximize bonus space for SA */
70506e0070dSMark Shellenbaum nblkptr = 1;
70606e0070dSMark Shellenbaum else
70754811da5SToomas Soome nblkptr = MIN(DN_MAX_NBLKPTR,
70854811da5SToomas Soome 1 + ((DN_SLOTS_TO_BONUSLEN(dn_slots) - bonuslen) >>
70954811da5SToomas Soome SPA_BLKPTRSHIFT));
7100a586ceaSMark Shellenbaum if (dn->dn_bonustype != bonustype)
7110a586ceaSMark Shellenbaum dn->dn_next_bonustype[tx->tx_txg&TXG_MASK] = bonustype;
712da03de99SMark Maybee if (dn->dn_nblkptr != nblkptr)
713da03de99SMark Maybee dn->dn_next_nblkptr[tx->tx_txg&TXG_MASK] = nblkptr;
/* Drop the old spill block unless the caller asked to keep it. */
714eb633035STom Caputi if (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR && !keep_spill) {
71506e0070dSMark Shellenbaum dbuf_rm_spill(dn, tx);
71606e0070dSMark Shellenbaum dnode_rm_spill(dn, tx);
7170a586ceaSMark Shellenbaum }
718fa9e4066Sahrens rw_exit(&dn->dn_struct_rwlock);
719fa9e4066Sahrens
720fa9e4066Sahrens /* change type */
721fa9e4066Sahrens dn->dn_type = ot;
722fa9e4066Sahrens
723fa9e4066Sahrens /* change bonus size and type */
724fa9e4066Sahrens mutex_enter(&dn->dn_mtx);
725fa9e4066Sahrens dn->dn_bonustype = bonustype;
726fa9e4066Sahrens dn->dn_bonuslen = bonuslen;
72754811da5SToomas Soome dn->dn_num_slots = dn_slots;
728da03de99SMark Maybee dn->dn_nblkptr = nblkptr;
729fa9e4066Sahrens dn->dn_checksum = ZIO_CHECKSUM_INHERIT;
730fa9e4066Sahrens dn->dn_compress = ZIO_COMPRESS_INHERIT;
731fa9e4066Sahrens ASSERT3U(dn->dn_nblkptr, <=, DN_MAX_NBLKPTR);
732fa9e4066Sahrens
733da03de99SMark Maybee /* fix up the bonus db_size */
734da03de99SMark Maybee if (dn->dn_bonus) {
7351934e92fSmaybee dn->dn_bonus->db.db_size =
73654811da5SToomas Soome DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots) -
73754811da5SToomas Soome (dn->dn_nblkptr - 1) * sizeof (blkptr_t);
7381934e92fSmaybee ASSERT(dn->dn_bonuslen <= dn->dn_bonus->db.db_size);
7391934e92fSmaybee }
740432f72fdSahrens
741fa9e4066Sahrens dn->dn_allocated_txg = tx->tx_txg;
742fa9e4066Sahrens mutex_exit(&dn->dn_mtx);
743fa9e4066Sahrens }
744fa9e4066Sahrens
745f06dce2cSAndrew Stormont #ifdef _KERNEL
/*
 * Transfer the entire in-core state of dnode odn into the freshly
 * constructed dnode ndn, as part of a kmem slab-defragmentation move
 * (see dnode_move() below).  Three phases:
 *
 *   1. Copy/move every field, list, and refcount from odn to ndn.
 *   2. Repoint the handle (and zfetch) back pointers at ndn, which
 *      transitively fixes every descendant dbuf and the bonus dbuf.
 *   3. Scrub odn back to a state that satisfies the kmem cache
 *      destructor, and poison its objset pointer so any later
 *      dnode_move() callback recognizes it as invalid.
 *
 * Caller must guarantee exclusive access: no locks on odn may be held
 * (asserted below) and the handle zrlock plus os_lock are held by
 * dnode_move().
 */
746744947dcSTom Erickson static void
dnode_move_impl(dnode_t * odn,dnode_t * ndn)747744947dcSTom Erickson dnode_move_impl(dnode_t *odn, dnode_t *ndn)
748744947dcSTom Erickson {
749744947dcSTom Erickson int i;
750744947dcSTom Erickson
751744947dcSTom Erickson ASSERT(!RW_LOCK_HELD(&odn->dn_struct_rwlock));
752744947dcSTom Erickson ASSERT(MUTEX_NOT_HELD(&odn->dn_mtx));
753744947dcSTom Erickson ASSERT(MUTEX_NOT_HELD(&odn->dn_dbufs_mtx));
754744947dcSTom Erickson ASSERT(!RW_LOCK_HELD(&odn->dn_zfetch.zf_rwlock));
755744947dcSTom Erickson
756744947dcSTom Erickson /* Copy fields. */
757744947dcSTom Erickson ndn->dn_objset = odn->dn_objset;
758744947dcSTom Erickson ndn->dn_object = odn->dn_object;
759744947dcSTom Erickson ndn->dn_dbuf = odn->dn_dbuf;
760744947dcSTom Erickson ndn->dn_handle = odn->dn_handle;
761744947dcSTom Erickson ndn->dn_phys = odn->dn_phys;
762744947dcSTom Erickson ndn->dn_type = odn->dn_type;
763744947dcSTom Erickson ndn->dn_bonuslen = odn->dn_bonuslen;
764744947dcSTom Erickson ndn->dn_bonustype = odn->dn_bonustype;
765744947dcSTom Erickson ndn->dn_nblkptr = odn->dn_nblkptr;
766744947dcSTom Erickson ndn->dn_checksum = odn->dn_checksum;
767744947dcSTom Erickson ndn->dn_compress = odn->dn_compress;
768744947dcSTom Erickson ndn->dn_nlevels = odn->dn_nlevels;
769744947dcSTom Erickson ndn->dn_indblkshift = odn->dn_indblkshift;
770744947dcSTom Erickson ndn->dn_datablkshift = odn->dn_datablkshift;
771744947dcSTom Erickson ndn->dn_datablkszsec = odn->dn_datablkszsec;
772744947dcSTom Erickson ndn->dn_datablksz = odn->dn_datablksz;
773744947dcSTom Erickson ndn->dn_maxblkid = odn->dn_maxblkid;
77454811da5SToomas Soome ndn->dn_num_slots = odn->dn_num_slots;
/* Per-txg pending-change arrays are copied wholesale. */
775c7fbe46dSMatthew Ahrens bcopy(&odn->dn_next_type[0], &ndn->dn_next_type[0],
776c7fbe46dSMatthew Ahrens sizeof (odn->dn_next_type));
777744947dcSTom Erickson bcopy(&odn->dn_next_nblkptr[0], &ndn->dn_next_nblkptr[0],
778744947dcSTom Erickson sizeof (odn->dn_next_nblkptr));
779744947dcSTom Erickson bcopy(&odn->dn_next_nlevels[0], &ndn->dn_next_nlevels[0],
780744947dcSTom Erickson sizeof (odn->dn_next_nlevels));
781744947dcSTom Erickson bcopy(&odn->dn_next_indblkshift[0], &ndn->dn_next_indblkshift[0],
782744947dcSTom Erickson sizeof (odn->dn_next_indblkshift));
783744947dcSTom Erickson bcopy(&odn->dn_next_bonustype[0], &ndn->dn_next_bonustype[0],
784744947dcSTom Erickson sizeof (odn->dn_next_bonustype));
785744947dcSTom Erickson bcopy(&odn->dn_rm_spillblk[0], &ndn->dn_rm_spillblk[0],
786744947dcSTom Erickson sizeof (odn->dn_rm_spillblk));
787744947dcSTom Erickson bcopy(&odn->dn_next_bonuslen[0], &ndn->dn_next_bonuslen[0],
788744947dcSTom Erickson sizeof (odn->dn_next_bonuslen));
789744947dcSTom Erickson bcopy(&odn->dn_next_blksz[0], &ndn->dn_next_blksz[0],
790744947dcSTom Erickson sizeof (odn->dn_next_blksz));
791eb633035STom Caputi bcopy(&odn->dn_next_maxblkid[0], &ndn->dn_next_maxblkid[0],
792eb633035STom Caputi sizeof (odn->dn_next_maxblkid));
793744947dcSTom Erickson for (i = 0; i < TXG_SIZE; i++) {
794744947dcSTom Erickson list_move_tail(&ndn->dn_dirty_records[i],
795744947dcSTom Erickson &odn->dn_dirty_records[i]);
796744947dcSTom Erickson }
797bf16b11eSMatthew Ahrens bcopy(&odn->dn_free_ranges[0], &ndn->dn_free_ranges[0],
798bf16b11eSMatthew Ahrens sizeof (odn->dn_free_ranges));
799744947dcSTom Erickson ndn->dn_allocated_txg = odn->dn_allocated_txg;
800744947dcSTom Erickson ndn->dn_free_txg = odn->dn_free_txg;
801744947dcSTom Erickson ndn->dn_assigned_txg = odn->dn_assigned_txg;
802aa02ea01STom Caputi ndn->dn_dirty_txg = odn->dn_dirty_txg;
803744947dcSTom Erickson ndn->dn_dirtyctx = odn->dn_dirtyctx;
804744947dcSTom Erickson ndn->dn_dirtyctx_firstset = odn->dn_dirtyctx_firstset;
805e914ace2STim Schumacher ASSERT(zfs_refcount_count(&odn->dn_tx_holds) == 0);
806e914ace2STim Schumacher zfs_refcount_transfer(&ndn->dn_holds, &odn->dn_holds);
8070f6d88adSAlex Reece ASSERT(avl_is_empty(&ndn->dn_dbufs));
8080f6d88adSAlex Reece avl_swap(&ndn->dn_dbufs, &odn->dn_dbufs);
809744947dcSTom Erickson ndn->dn_dbufs_count = odn->dn_dbufs_count;
810744947dcSTom Erickson ndn->dn_bonus = odn->dn_bonus;
811744947dcSTom Erickson ndn->dn_have_spill = odn->dn_have_spill;
812744947dcSTom Erickson ndn->dn_zio = odn->dn_zio;
813744947dcSTom Erickson ndn->dn_oldused = odn->dn_oldused;
814744947dcSTom Erickson ndn->dn_oldflags = odn->dn_oldflags;
815744947dcSTom Erickson ndn->dn_olduid = odn->dn_olduid;
816744947dcSTom Erickson ndn->dn_oldgid = odn->dn_oldgid;
817f67950b2SNasf-Fan ndn->dn_oldprojid = odn->dn_oldprojid;
818744947dcSTom Erickson ndn->dn_newuid = odn->dn_newuid;
819744947dcSTom Erickson ndn->dn_newgid = odn->dn_newgid;
820f67950b2SNasf-Fan ndn->dn_newprojid = odn->dn_newprojid;
821744947dcSTom Erickson ndn->dn_id_flags = odn->dn_id_flags;
822744947dcSTom Erickson dmu_zfetch_init(&ndn->dn_zfetch, NULL);
823744947dcSTom Erickson list_move_tail(&ndn->dn_zfetch.zf_stream, &odn->dn_zfetch.zf_stream);
824744947dcSTom Erickson ndn->dn_zfetch.zf_dnode = odn->dn_zfetch.zf_dnode;
825744947dcSTom Erickson
826744947dcSTom Erickson /*
827744947dcSTom Erickson * Update back pointers. Updating the handle fixes the back pointer of
828744947dcSTom Erickson * every descendant dbuf as well as the bonus dbuf.
829744947dcSTom Erickson */
830744947dcSTom Erickson ASSERT(ndn->dn_handle->dnh_dnode == odn);
831744947dcSTom Erickson ndn->dn_handle->dnh_dnode = ndn;
832744947dcSTom Erickson if (ndn->dn_zfetch.zf_dnode == odn) {
833744947dcSTom Erickson ndn->dn_zfetch.zf_dnode = ndn;
834744947dcSTom Erickson }
835744947dcSTom Erickson
836744947dcSTom Erickson /*
837744947dcSTom Erickson * Invalidate the original dnode by clearing all of its back pointers.
838744947dcSTom Erickson */
839744947dcSTom Erickson odn->dn_dbuf = NULL;
840744947dcSTom Erickson odn->dn_handle = NULL;
8410f6d88adSAlex Reece avl_create(&odn->dn_dbufs, dbuf_compare, sizeof (dmu_buf_impl_t),
842744947dcSTom Erickson offsetof(dmu_buf_impl_t, db_link));
843744947dcSTom Erickson odn->dn_dbufs_count = 0;
844744947dcSTom Erickson odn->dn_bonus = NULL;
845744947dcSTom Erickson odn->dn_zfetch.zf_dnode = NULL;
846744947dcSTom Erickson
847744947dcSTom Erickson /*
848744947dcSTom Erickson * Set the low bit of the objset pointer to ensure that dnode_move()
849744947dcSTom Erickson * recognizes the dnode as invalid in any subsequent callback.
850744947dcSTom Erickson */
851744947dcSTom Erickson POINTER_INVALIDATE(&odn->dn_objset);
852744947dcSTom Erickson
853744947dcSTom Erickson /*
854744947dcSTom Erickson * Satisfy the destructor.
855744947dcSTom Erickson */
856744947dcSTom Erickson for (i = 0; i < TXG_SIZE; i++) {
857744947dcSTom Erickson list_create(&odn->dn_dirty_records[i],
858744947dcSTom Erickson sizeof (dbuf_dirty_record_t),
859744947dcSTom Erickson offsetof(dbuf_dirty_record_t, dr_dirty_node));
860bf16b11eSMatthew Ahrens odn->dn_free_ranges[i] = NULL;
861744947dcSTom Erickson odn->dn_next_nlevels[i] = 0;
862744947dcSTom Erickson odn->dn_next_indblkshift[i] = 0;
863744947dcSTom Erickson odn->dn_next_bonustype[i] = 0;
864744947dcSTom Erickson odn->dn_rm_spillblk[i] = 0;
865744947dcSTom Erickson odn->dn_next_bonuslen[i] = 0;
866744947dcSTom Erickson odn->dn_next_blksz[i] = 0;
867744947dcSTom Erickson }
868744947dcSTom Erickson odn->dn_allocated_txg = 0;
869744947dcSTom Erickson odn->dn_free_txg = 0;
870744947dcSTom Erickson odn->dn_assigned_txg = 0;
871aa02ea01STom Caputi odn->dn_dirty_txg = 0;
872744947dcSTom Erickson odn->dn_dirtyctx = 0;
873744947dcSTom Erickson odn->dn_dirtyctx_firstset = NULL;
874744947dcSTom Erickson odn->dn_have_spill = B_FALSE;
875744947dcSTom Erickson odn->dn_zio = NULL;
876744947dcSTom Erickson odn->dn_oldused = 0;
877744947dcSTom Erickson odn->dn_oldflags = 0;
878744947dcSTom Erickson odn->dn_olduid = 0;
879744947dcSTom Erickson odn->dn_oldgid = 0;
880f67950b2SNasf-Fan odn->dn_oldprojid = ZFS_DEFAULT_PROJID;
881744947dcSTom Erickson odn->dn_newuid = 0;
882744947dcSTom Erickson odn->dn_newgid = 0;
883f67950b2SNasf-Fan odn->dn_newprojid = ZFS_DEFAULT_PROJID;
884744947dcSTom Erickson odn->dn_id_flags = 0;
885744947dcSTom Erickson
886744947dcSTom Erickson /*
887744947dcSTom Erickson * Mark the dnode.
888744947dcSTom Erickson */
889744947dcSTom Erickson ndn->dn_moved = 1;
890744947dcSTom Erickson odn->dn_moved = (uint8_t)-1;
891744947dcSTom Erickson }
892744947dcSTom Erickson
/*
 * kmem cache move callback: asked by the allocator whether the dnode in
 * buf may be relocated to newbuf (slab defragmentation).  Returns:
 *
 *   KMEM_CBRC_DONT_KNOW - buf is not (or no longer) a known, valid dnode.
 *   KMEM_CBRC_NO        - "special" dnodes (no parent dbuf) never move.
 *   KMEM_CBRC_LATER     - locks contended or the dnode has active holds
 *                         beyond its dbufs; retry another time.
 *   KMEM_CBRC_YES       - the dnode was moved via dnode_move_impl().
 *
 * The validity/lock dance below (objset pointer check, os_lock,
 * os->os_lock recheck, handle zrlock, dn_struct_rwlock tryenter) is
 * order-sensitive; each step is explained inline.
 */
893744947dcSTom Erickson /*ARGSUSED*/
894744947dcSTom Erickson static kmem_cbrc_t
dnode_move(void * buf,void * newbuf,size_t size,void * arg)895744947dcSTom Erickson dnode_move(void *buf, void *newbuf, size_t size, void *arg)
896744947dcSTom Erickson {
897744947dcSTom Erickson dnode_t *odn = buf, *ndn = newbuf;
898744947dcSTom Erickson objset_t *os;
899744947dcSTom Erickson int64_t refcount;
900744947dcSTom Erickson uint32_t dbufs;
901744947dcSTom Erickson
902744947dcSTom Erickson /*
903744947dcSTom Erickson * The dnode is on the objset's list of known dnodes if the objset
904744947dcSTom Erickson * pointer is valid. We set the low bit of the objset pointer when
905744947dcSTom Erickson * freeing the dnode to invalidate it, and the memory patterns written
906744947dcSTom Erickson * by kmem (baddcafe and deadbeef) set at least one of the two low bits.
907744947dcSTom Erickson * A newly created dnode sets the objset pointer last of all to indicate
908744947dcSTom Erickson * that the dnode is known and in a valid state to be moved by this
909744947dcSTom Erickson * function.
910744947dcSTom Erickson */
911744947dcSTom Erickson os = odn->dn_objset;
912744947dcSTom Erickson if (!POINTER_IS_VALID(os)) {
91354811da5SToomas Soome DNODE_STAT_BUMP(dnode_move_invalid);
914744947dcSTom Erickson return (KMEM_CBRC_DONT_KNOW);
915744947dcSTom Erickson }
916744947dcSTom Erickson
917744947dcSTom Erickson /*
918744947dcSTom Erickson * Ensure that the objset does not go away during the move.
919744947dcSTom Erickson */
920744947dcSTom Erickson rw_enter(&os_lock, RW_WRITER);
921744947dcSTom Erickson if (os != odn->dn_objset) {
922744947dcSTom Erickson rw_exit(&os_lock);
92354811da5SToomas Soome DNODE_STAT_BUMP(dnode_move_recheck1);
924744947dcSTom Erickson return (KMEM_CBRC_DONT_KNOW);
925744947dcSTom Erickson }
926744947dcSTom Erickson
927744947dcSTom Erickson /*
928744947dcSTom Erickson * If the dnode is still valid, then so is the objset. We know that no
929744947dcSTom Erickson * valid objset can be freed while we hold os_lock, so we can safely
930744947dcSTom Erickson * ensure that the objset remains in use.
931744947dcSTom Erickson */
932744947dcSTom Erickson mutex_enter(&os->os_lock);
933744947dcSTom Erickson
934744947dcSTom Erickson /*
935744947dcSTom Erickson * Recheck the objset pointer in case the dnode was removed just before
936744947dcSTom Erickson * acquiring the lock.
937744947dcSTom Erickson */
938744947dcSTom Erickson if (os != odn->dn_objset) {
939744947dcSTom Erickson mutex_exit(&os->os_lock);
940744947dcSTom Erickson rw_exit(&os_lock);
94154811da5SToomas Soome DNODE_STAT_BUMP(dnode_move_recheck2);
942744947dcSTom Erickson return (KMEM_CBRC_DONT_KNOW);
943744947dcSTom Erickson }
944744947dcSTom Erickson
945744947dcSTom Erickson /*
946744947dcSTom Erickson * At this point we know that as long as we hold os->os_lock, the dnode
947744947dcSTom Erickson * cannot be freed and fields within the dnode can be safely accessed.
948744947dcSTom Erickson * The objset listing this dnode cannot go away as long as this dnode is
949744947dcSTom Erickson * on its list.
950744947dcSTom Erickson */
951744947dcSTom Erickson rw_exit(&os_lock);
952744947dcSTom Erickson if (DMU_OBJECT_IS_SPECIAL(odn->dn_object)) {
953744947dcSTom Erickson mutex_exit(&os->os_lock);
95454811da5SToomas Soome DNODE_STAT_BUMP(dnode_move_special);
955744947dcSTom Erickson return (KMEM_CBRC_NO);
956744947dcSTom Erickson }
957744947dcSTom Erickson ASSERT(odn->dn_dbuf != NULL); /* only "special" dnodes have no parent */
958744947dcSTom Erickson
959744947dcSTom Erickson /*
960744947dcSTom Erickson * Lock the dnode handle to prevent the dnode from obtaining any new
961744947dcSTom Erickson * holds. This also prevents the descendant dbufs and the bonus dbuf
962744947dcSTom Erickson * from accessing the dnode, so that we can discount their holds. The
963744947dcSTom Erickson * handle is safe to access because we know that while the dnode cannot
964744947dcSTom Erickson * go away, neither can its handle. Once we hold dnh_zrlock, we can
965744947dcSTom Erickson * safely move any dnode referenced only by dbufs.
966744947dcSTom Erickson */
967744947dcSTom Erickson if (!zrl_tryenter(&odn->dn_handle->dnh_zrlock)) {
968744947dcSTom Erickson mutex_exit(&os->os_lock);
96954811da5SToomas Soome DNODE_STAT_BUMP(dnode_move_handle);
970744947dcSTom Erickson return (KMEM_CBRC_LATER);
971744947dcSTom Erickson }
972744947dcSTom Erickson
973744947dcSTom Erickson /*
974744947dcSTom Erickson * Ensure a consistent view of the dnode's holds and the dnode's dbufs.
975744947dcSTom Erickson * We need to guarantee that there is a hold for every dbuf in order to
976744947dcSTom Erickson * determine whether the dnode is actively referenced. Falsely matching
977744947dcSTom Erickson * a dbuf to an active hold would lead to an unsafe move. It's possible
978744947dcSTom Erickson * that a thread already having an active dnode hold is about to add a
979744947dcSTom Erickson * dbuf, and we can't compare hold and dbuf counts while the add is in
980744947dcSTom Erickson * progress.
981744947dcSTom Erickson */
982744947dcSTom Erickson if (!rw_tryenter(&odn->dn_struct_rwlock, RW_WRITER)) {
983744947dcSTom Erickson zrl_exit(&odn->dn_handle->dnh_zrlock);
984744947dcSTom Erickson mutex_exit(&os->os_lock);
98554811da5SToomas Soome DNODE_STAT_BUMP(dnode_move_rwlock);
986744947dcSTom Erickson return (KMEM_CBRC_LATER);
987744947dcSTom Erickson }
988744947dcSTom Erickson
989744947dcSTom Erickson /*
990744947dcSTom Erickson * A dbuf may be removed (evicted) without an active dnode hold. In that
991744947dcSTom Erickson * case, the dbuf count is decremented under the handle lock before the
992744947dcSTom Erickson * dbuf's hold is released. This order ensures that if we count the hold
993744947dcSTom Erickson * after the dbuf is removed but before its hold is released, we will
994744947dcSTom Erickson * treat the unmatched hold as active and exit safely. If we count the
995744947dcSTom Erickson * hold before the dbuf is removed, the hold is discounted, and the
996744947dcSTom Erickson * removal is blocked until the move completes.
997744947dcSTom Erickson */
998e914ace2STim Schumacher refcount = zfs_refcount_count(&odn->dn_holds);
999744947dcSTom Erickson ASSERT(refcount >= 0);
1000744947dcSTom Erickson dbufs = odn->dn_dbufs_count;
1001744947dcSTom Erickson
1002744947dcSTom Erickson /* We can't have more dbufs than dnode holds. */
1003744947dcSTom Erickson ASSERT3U(dbufs, <=, refcount);
1004744947dcSTom Erickson DTRACE_PROBE3(dnode__move, dnode_t *, odn, int64_t, refcount,
1005744947dcSTom Erickson uint32_t, dbufs);
1006744947dcSTom Erickson
1007744947dcSTom Erickson if (refcount > dbufs) {
1008744947dcSTom Erickson rw_exit(&odn->dn_struct_rwlock);
1009744947dcSTom Erickson zrl_exit(&odn->dn_handle->dnh_zrlock);
1010744947dcSTom Erickson mutex_exit(&os->os_lock);
101154811da5SToomas Soome DNODE_STAT_BUMP(dnode_move_active);
1012744947dcSTom Erickson return (KMEM_CBRC_LATER);
1013744947dcSTom Erickson }
1014744947dcSTom Erickson
1015744947dcSTom Erickson rw_exit(&odn->dn_struct_rwlock);
1016744947dcSTom Erickson
1017744947dcSTom Erickson /*
1018744947dcSTom Erickson * At this point we know that anyone with a hold on the dnode is not
1019744947dcSTom Erickson * actively referencing it. The dnode is known and in a valid state to
1020744947dcSTom Erickson * move. We're holding the locks needed to execute the critical section.
1021744947dcSTom Erickson */
1022744947dcSTom Erickson dnode_move_impl(odn, ndn);
1023744947dcSTom Erickson
1024744947dcSTom Erickson list_link_replace(&odn->dn_link, &ndn->dn_link);
1025744947dcSTom Erickson /* If the dnode was safe to move, the refcount cannot have changed. */
1026e914ace2STim Schumacher ASSERT(refcount == zfs_refcount_count(&ndn->dn_holds));
1027744947dcSTom Erickson ASSERT(dbufs == ndn->dn_dbufs_count);
1028744947dcSTom Erickson zrl_exit(&ndn->dn_handle->dnh_zrlock); /* handle has moved */
1029744947dcSTom Erickson mutex_exit(&os->os_lock);
1030744947dcSTom Erickson
1031744947dcSTom Erickson return (KMEM_CBRC_YES);
1032744947dcSTom Erickson }
1033744947dcSTom Erickson #endif /* _KERNEL */
1034744947dcSTom Erickson
103554811da5SToomas Soome static void
dnode_slots_hold(dnode_children_t * children,int idx,int slots)103654811da5SToomas Soome dnode_slots_hold(dnode_children_t *children, int idx, int slots)
103754811da5SToomas Soome {
103854811da5SToomas Soome ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK);
103954811da5SToomas Soome
104054811da5SToomas Soome for (int i = idx; i < idx + slots; i++) {
104154811da5SToomas Soome dnode_handle_t *dnh = &children->dnc_children[i];
104254811da5SToomas Soome zrl_add(&dnh->dnh_zrlock);
104354811da5SToomas Soome }
104454811da5SToomas Soome }
104554811da5SToomas Soome
104654811da5SToomas Soome static void
dnode_slots_rele(dnode_children_t * children,int idx,int slots)104754811da5SToomas Soome dnode_slots_rele(dnode_children_t *children, int idx, int slots)
104854811da5SToomas Soome {
104954811da5SToomas Soome ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK);
105054811da5SToomas Soome
105154811da5SToomas Soome for (int i = idx; i < idx + slots; i++) {
105254811da5SToomas Soome dnode_handle_t *dnh = &children->dnc_children[i];
105354811da5SToomas Soome
105454811da5SToomas Soome if (zrl_is_locked(&dnh->dnh_zrlock))
105554811da5SToomas Soome zrl_exit(&dnh->dnh_zrlock);
105654811da5SToomas Soome else
105754811da5SToomas Soome zrl_remove(&dnh->dnh_zrlock);
105854811da5SToomas Soome }
105954811da5SToomas Soome }
106054811da5SToomas Soome
106154811da5SToomas Soome static int
dnode_slots_tryenter(dnode_children_t * children,int idx,int slots)106254811da5SToomas Soome dnode_slots_tryenter(dnode_children_t *children, int idx, int slots)
106354811da5SToomas Soome {
106454811da5SToomas Soome ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK);
106554811da5SToomas Soome
106654811da5SToomas Soome for (int i = idx; i < idx + slots; i++) {
106754811da5SToomas Soome dnode_handle_t *dnh = &children->dnc_children[i];
106854811da5SToomas Soome
106954811da5SToomas Soome if (!zrl_tryenter(&dnh->dnh_zrlock)) {
107054811da5SToomas Soome for (int j = idx; j < i; j++) {
107154811da5SToomas Soome dnh = &children->dnc_children[j];
107254811da5SToomas Soome zrl_exit(&dnh->dnh_zrlock);
107354811da5SToomas Soome }
107454811da5SToomas Soome
107554811da5SToomas Soome return (0);
107654811da5SToomas Soome }
107754811da5SToomas Soome }
107854811da5SToomas Soome
107954811da5SToomas Soome return (1);
108054811da5SToomas Soome }
108154811da5SToomas Soome
108254811da5SToomas Soome static void
dnode_set_slots(dnode_children_t * children,int idx,int slots,void * ptr)108354811da5SToomas Soome dnode_set_slots(dnode_children_t *children, int idx, int slots, void *ptr)
108454811da5SToomas Soome {
108554811da5SToomas Soome ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK);
108654811da5SToomas Soome
108754811da5SToomas Soome for (int i = idx; i < idx + slots; i++) {
108854811da5SToomas Soome dnode_handle_t *dnh = &children->dnc_children[i];
108954811da5SToomas Soome dnh->dnh_dnode = ptr;
109054811da5SToomas Soome }
109154811da5SToomas Soome }
109254811da5SToomas Soome
109354811da5SToomas Soome static boolean_t
dnode_check_slots_free(dnode_children_t * children,int idx,int slots)109454811da5SToomas Soome dnode_check_slots_free(dnode_children_t *children, int idx, int slots)
109554811da5SToomas Soome {
109654811da5SToomas Soome ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK);
109754811da5SToomas Soome
1098aa02ea01STom Caputi /*
1099aa02ea01STom Caputi * If all dnode slots are either already free or
1100aa02ea01STom Caputi * evictable return B_TRUE.
1101aa02ea01STom Caputi */
110254811da5SToomas Soome for (int i = idx; i < idx + slots; i++) {
110354811da5SToomas Soome dnode_handle_t *dnh = &children->dnc_children[i];
110454811da5SToomas Soome dnode_t *dn = dnh->dnh_dnode;
110554811da5SToomas Soome
110654811da5SToomas Soome if (dn == DN_SLOT_FREE) {
110754811da5SToomas Soome continue;
110854811da5SToomas Soome } else if (DN_SLOT_IS_PTR(dn)) {
110954811da5SToomas Soome mutex_enter(&dn->dn_mtx);
1110aa02ea01STom Caputi boolean_t can_free = (dn->dn_type == DMU_OT_NONE &&
1111aa02ea01STom Caputi zfs_refcount_is_zero(&dn->dn_holds) &&
1112aa02ea01STom Caputi !DNODE_IS_DIRTY(dn));
111354811da5SToomas Soome mutex_exit(&dn->dn_mtx);
111454811da5SToomas Soome
1115aa02ea01STom Caputi if (!can_free)
111654811da5SToomas Soome return (B_FALSE);
1117aa02ea01STom Caputi else
1118aa02ea01STom Caputi continue;
111954811da5SToomas Soome } else {
112054811da5SToomas Soome return (B_FALSE);
112154811da5SToomas Soome }
112254811da5SToomas Soome }
112354811da5SToomas Soome
112454811da5SToomas Soome return (B_TRUE);
112554811da5SToomas Soome }
112654811da5SToomas Soome
112754811da5SToomas Soome static void
dnode_reclaim_slots(dnode_children_t * children,int idx,int slots)112854811da5SToomas Soome dnode_reclaim_slots(dnode_children_t *children, int idx, int slots)
112954811da5SToomas Soome {
113054811da5SToomas Soome ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK);
113154811da5SToomas Soome
113254811da5SToomas Soome for (int i = idx; i < idx + slots; i++) {
113354811da5SToomas Soome dnode_handle_t *dnh = &children->dnc_children[i];
113454811da5SToomas Soome
113554811da5SToomas Soome ASSERT(zrl_is_locked(&dnh->dnh_zrlock));
113654811da5SToomas Soome
113754811da5SToomas Soome if (DN_SLOT_IS_PTR(dnh->dnh_dnode)) {
113854811da5SToomas Soome ASSERT3S(dnh->dnh_dnode->dn_type, ==, DMU_OT_NONE);
113954811da5SToomas Soome dnode_destroy(dnh->dnh_dnode);
114054811da5SToomas Soome dnh->dnh_dnode = DN_SLOT_FREE;
114154811da5SToomas Soome }
114254811da5SToomas Soome }
114354811da5SToomas Soome }
114454811da5SToomas Soome
114554811da5SToomas Soome void
dnode_free_interior_slots(dnode_t * dn)114654811da5SToomas Soome dnode_free_interior_slots(dnode_t *dn)
114754811da5SToomas Soome {
114854811da5SToomas Soome dnode_children_t *children = dmu_buf_get_user(&dn->dn_dbuf->db);
114954811da5SToomas Soome int epb = dn->dn_dbuf->db.db_size >> DNODE_SHIFT;
115054811da5SToomas Soome int idx = (dn->dn_object & (epb - 1)) + 1;
115154811da5SToomas Soome int slots = dn->dn_num_slots - 1;
115254811da5SToomas Soome
115354811da5SToomas Soome if (slots == 0)
115454811da5SToomas Soome return;
115554811da5SToomas Soome
115654811da5SToomas Soome ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK);
115754811da5SToomas Soome
115854811da5SToomas Soome while (!dnode_slots_tryenter(children, idx, slots))
115954811da5SToomas Soome DNODE_STAT_BUMP(dnode_free_interior_lock_retry);
116054811da5SToomas Soome
116154811da5SToomas Soome dnode_set_slots(children, idx, slots, DN_SLOT_FREE);
116254811da5SToomas Soome dnode_slots_rele(children, idx, slots);
116354811da5SToomas Soome }
116454811da5SToomas Soome
/*
 * Tear down a dnode opened with dnode_special_open().  Blocks until the
 * last hold has been released (signalled via dn_nodnholds by
 * dnode_rele_and_unlock(), per the comment below), then destroys the
 * dnode and clears the handle.
 */
1165fa9e4066Sahrens void
dnode_special_close(dnode_handle_t * dnh)1166744947dcSTom Erickson dnode_special_close(dnode_handle_t *dnh)
1167fa9e4066Sahrens {
1168744947dcSTom Erickson dnode_t *dn = dnh->dnh_dnode;
1169744947dcSTom Erickson
1170ea8dc4b6Seschrock /*
1171b390f3a9SJohn Poduska * Ensure dnode_rele_and_unlock() has released dn_mtx, after final
1172b390f3a9SJohn Poduska * zfs_refcount_remove()
1173ea8dc4b6Seschrock */
1174b390f3a9SJohn Poduska mutex_enter(&dn->dn_mtx);
1175b390f3a9SJohn Poduska if (zfs_refcount_count(&dn->dn_holds) > 0)
1176b390f3a9SJohn Poduska cv_wait(&dn->dn_nodnholds, &dn->dn_mtx);
1177b390f3a9SJohn Poduska mutex_exit(&dn->dn_mtx);
1178b390f3a9SJohn Poduska ASSERT3U(zfs_refcount_count(&dn->dn_holds), ==, 0);
1179b390f3a9SJohn Poduska
1180bc9014e6SJustin Gibbs ASSERT(dn->dn_dbuf == NULL ||
1181bc9014e6SJustin Gibbs dmu_buf_get_user(&dn->dn_dbuf->db) == NULL);
/* Hold the handle's zrlock across destruction, as dnode_destroy expects. */
1182744947dcSTom Erickson zrl_add(&dnh->dnh_zrlock);
1183744947dcSTom Erickson dnode_destroy(dn); /* implicit zrl_remove() */
1184744947dcSTom Erickson zrl_destroy(&dnh->dnh_zrlock);
1185744947dcSTom Erickson dnh->dnh_dnode = NULL;
1186fa9e4066Sahrens }
1187fa9e4066Sahrens
1188bc9014e6SJustin Gibbs void
dnode_special_open(objset_t * os,dnode_phys_t * dnp,uint64_t object,dnode_handle_t * dnh)1189744947dcSTom Erickson dnode_special_open(objset_t *os, dnode_phys_t *dnp, uint64_t object,
1190744947dcSTom Erickson dnode_handle_t *dnh)
1191fa9e4066Sahrens {
1192bc9014e6SJustin Gibbs dnode_t *dn;
1193bc9014e6SJustin Gibbs
1194744947dcSTom Erickson zrl_init(&dnh->dnh_zrlock);
1195d061fa1fSToomas Soome VERIFY3U(1, ==, zrl_tryenter(&dnh->dnh_zrlock));
119654811da5SToomas Soome
119754811da5SToomas Soome dn = dnode_create(os, dnp, NULL, object, dnh);
11989c9dc39aSek DNODE_VERIFY(dn);
119954811da5SToomas Soome
120054811da5SToomas Soome zrl_exit(&dnh->dnh_zrlock);
1201fa9e4066Sahrens }
1202fa9e4066Sahrens
/*
 * dbuf user-eviction callback for a dnode block: destroy every child
 * dnode still attached to the block's handle array, then free the
 * dnode_children_t itself.  Holds on any child would have pinned the
 * containing dbuf, so eviction implies every child is unreferenced
 * (asserted below).
 */
1203fa9e4066Sahrens static void
dnode_buf_evict_async(void * dbu)120440510e8eSJosef 'Jeff' Sipek dnode_buf_evict_async(void *dbu)
1205fa9e4066Sahrens {
120654811da5SToomas Soome dnode_children_t *dnc = dbu;
120754811da5SToomas Soome
120854811da5SToomas Soome DNODE_STAT_BUMP(dnode_buf_evict);
1209fa9e4066Sahrens
121054811da5SToomas Soome for (int i = 0; i < dnc->dnc_count; i++) {
121154811da5SToomas Soome dnode_handle_t *dnh = &dnc->dnc_children[i];
1212744947dcSTom Erickson dnode_t *dn;
1213fa9e4066Sahrens
1214744947dcSTom Erickson /*
1215744947dcSTom Erickson * The dnode handle lock guards against the dnode moving to
1216744947dcSTom Erickson * another valid address, so there is no need here to guard
1217744947dcSTom Erickson * against changes to or from NULL.
1218744947dcSTom Erickson */
/* Slots holding a sentinel (no dnode_t) only need their lock torn down. */
121954811da5SToomas Soome if (!DN_SLOT_IS_PTR(dnh->dnh_dnode)) {
1220744947dcSTom Erickson zrl_destroy(&dnh->dnh_zrlock);
122154811da5SToomas Soome dnh->dnh_dnode = DN_SLOT_UNINIT;
1222fa9e4066Sahrens continue;
1223744947dcSTom Erickson }
1224744947dcSTom Erickson
1225744947dcSTom Erickson zrl_add(&dnh->dnh_zrlock);
1226744947dcSTom Erickson dn = dnh->dnh_dnode;
1227fa9e4066Sahrens /*
1228fa9e4066Sahrens * If there are holds on this dnode, then there should
1229fa9e4066Sahrens * be holds on the dnode's containing dbuf as well; thus
1230744947dcSTom Erickson * it wouldn't be eligible for eviction and this function
1231fa9e4066Sahrens * would not have been called.
1232fa9e4066Sahrens */
1233e914ace2STim Schumacher ASSERT(zfs_refcount_is_zero(&dn->dn_holds));
1234e914ace2STim Schumacher ASSERT(zfs_refcount_is_zero(&dn->dn_tx_holds));
1235fa9e4066Sahrens
123654811da5SToomas Soome dnode_destroy(dn); /* implicit zrl_remove() for first slot */
1237744947dcSTom Erickson zrl_destroy(&dnh->dnh_zrlock);
123854811da5SToomas Soome dnh->dnh_dnode = DN_SLOT_UNINIT;
1239fa9e4066Sahrens }
124054811da5SToomas Soome kmem_free(dnc, sizeof (dnode_children_t) +
124154811da5SToomas Soome dnc->dnc_count * sizeof (dnode_handle_t));
1242fa9e4066Sahrens }
1243fa9e4066Sahrens
/*
 * When the DNODE_MUST_BE_FREE flag is set, the "slots" parameter is used
 * to ensure the hole at the specified object offset is large enough to
 * hold the dnode being created. The slots parameter is also used to ensure
 * a dnode does not span multiple dnode blocks. In both of these cases, if
 * a failure occurs, ENOSPC is returned. Keep in mind, these failure cases
 * are only possible when using DNODE_MUST_BE_FREE.
 *
 * If the DNODE_MUST_BE_ALLOCATED flag is set, "slots" must be 0.
 * dnode_hold_impl() will check if the requested dnode is already consumed
 * as an extra dnode slot by a large dnode, in which case it returns
 * ENOENT.
 *
 * If the DNODE_DRY_RUN flag is set, we don't actually hold the dnode, just
 * return whether the hold would succeed or not. tag and dnp should be set
 * to NULL in this case.
 *
 * errors:
 * EINVAL - invalid object number or flags.
 * ENOSPC - hole too small to fulfill "slots" request (DNODE_MUST_BE_FREE)
 * EEXIST - Refers to an allocated dnode (DNODE_MUST_BE_FREE)
 *	  - Refers to a freeing dnode (DNODE_MUST_BE_FREE)
 *	  - Refers to an interior dnode slot (DNODE_MUST_BE_ALLOCATED)
 * ENOENT - The requested dnode is not allocated (DNODE_MUST_BE_ALLOCATED)
 *	  - The requested dnode is being freed (DNODE_MUST_BE_ALLOCATED)
 * EIO - i/o error when reading the meta dnode dbuf.
 *
 * succeeds even for free dnodes.
 */
int
dnode_hold_impl(objset_t *os, uint64_t object, int flag, int slots,
    void *tag, dnode_t **dnp)
{
	int epb, idx, err;
	int drop_struct_lock = FALSE;
	int type;
	uint64_t blk;
	dnode_t *mdn, *dn;
	dmu_buf_impl_t *db;
	dnode_children_t *dnc;
	dnode_phys_t *dn_block;
	dnode_handle_t *dnh;

	ASSERT(!(flag & DNODE_MUST_BE_ALLOCATED) || (slots == 0));
	ASSERT(!(flag & DNODE_MUST_BE_FREE) || (slots > 0));
	IMPLY(flag & DNODE_DRY_RUN, (tag == NULL) && (dnp == NULL));

	/*
	 * If you are holding the spa config lock as writer, you shouldn't
	 * be asking the DMU to do *anything* unless it's the root pool
	 * which may require us to read from the root filesystem while
	 * holding some (not all) of the locks as writer.
	 */
	ASSERT(spa_config_held(os->os_spa, SCL_ALL, RW_WRITER) == 0 ||
	    (spa_is_root(os->os_spa) &&
	    spa_config_held(os->os_spa, SCL_STATE, RW_WRITER)));

	ASSERT((flag & DNODE_MUST_BE_ALLOCATED) || (flag & DNODE_MUST_BE_FREE));

	/*
	 * The user/group/project accounting dnodes are attached directly
	 * to the objset, not stored in a dnode block, so they are handled
	 * here without consulting the meta dnode.
	 */
	if (object == DMU_USERUSED_OBJECT || object == DMU_GROUPUSED_OBJECT ||
	    object == DMU_PROJECTUSED_OBJECT) {
		if (object == DMU_USERUSED_OBJECT)
			dn = DMU_USERUSED_DNODE(os);
		else if (object == DMU_GROUPUSED_OBJECT)
			dn = DMU_GROUPUSED_DNODE(os);
		else
			dn = DMU_PROJECTUSED_DNODE(os);
		if (dn == NULL)
			return (SET_ERROR(ENOENT));
		type = dn->dn_type;
		if ((flag & DNODE_MUST_BE_ALLOCATED) && type == DMU_OT_NONE)
			return (SET_ERROR(ENOENT));
		if ((flag & DNODE_MUST_BE_FREE) && type != DMU_OT_NONE)
			return (SET_ERROR(EEXIST));
		DNODE_VERIFY(dn);
		/* Don't actually hold if dry run, just return 0 */
		if (!(flag & DNODE_DRY_RUN)) {
			(void) zfs_refcount_add(&dn->dn_holds, tag);
			*dnp = dn;
		}
		return (0);
	}

	if (object == 0 || object >= DN_MAX_OBJECT)
		return (SET_ERROR(EINVAL));

	mdn = DMU_META_DNODE(os);
	ASSERT(mdn->dn_object == DMU_META_DNODE_OBJECT);

	DNODE_VERIFY(mdn);

	if (!RW_WRITE_HELD(&mdn->dn_struct_rwlock)) {
		rw_enter(&mdn->dn_struct_rwlock, RW_READER);
		drop_struct_lock = TRUE;
	}

	/*
	 * Hold and read the dbuf of the meta dnode that contains the
	 * requested object's on-disk dnode_phys_t.
	 */
	blk = dbuf_whichblock(mdn, 0, object * sizeof (dnode_phys_t));
	db = dbuf_hold(mdn, blk, FTAG);
	if (drop_struct_lock)
		rw_exit(&mdn->dn_struct_rwlock);
	if (db == NULL) {
		DNODE_STAT_BUMP(dnode_hold_dbuf_hold);
		return (SET_ERROR(EIO));
	}
	/*
	 * We do not need to decrypt to read the dnode so it doesn't matter
	 * if we get the encrypted or decrypted version.
	 */
	err = dbuf_read(db, NULL, DB_RF_CANFAIL | DB_RF_NO_DECRYPT);
	if (err) {
		DNODE_STAT_BUMP(dnode_hold_dbuf_read);
		dbuf_rele(db, FTAG);
		return (err);
	}

	ASSERT3U(db->db.db_size, >=, 1<<DNODE_SHIFT);
	epb = db->db.db_size >> DNODE_SHIFT;	/* dnode slots per block */

	idx = object & (epb - 1);		/* slot index within block */
	dn_block = (dnode_phys_t *)db->db.db_data;

	ASSERT(DB_DNODE(db)->dn_type == DMU_OT_DNODE);
	dnc = dmu_buf_get_user(&db->db);
	dnh = NULL;
	if (dnc == NULL) {
		/*
		 * First hold of this dnode block: build the per-slot
		 * handle array and attach it as the dbuf's user data.
		 * Another thread may race us; the loser destroys its copy
		 * and adopts the winner's.
		 */
		dnode_children_t *winner;
		int skip = 0;

		dnc = kmem_zalloc(sizeof (dnode_children_t) +
		    epb * sizeof (dnode_handle_t), KM_SLEEP);
		dnc->dnc_count = epb;
		dnh = &dnc->dnc_children[0];

		/* Initialize dnode slot status from dnode_phys_t */
		for (int i = 0; i < epb; i++) {
			zrl_init(&dnh[i].dnh_zrlock);

			if (skip) {
				/* Interior slot of a multi-slot dnode. */
				skip--;
				continue;
			}

			if (dn_block[i].dn_type != DMU_OT_NONE) {
				int interior = dn_block[i].dn_extra_slots;

				dnode_set_slots(dnc, i, 1, DN_SLOT_ALLOCATED);
				dnode_set_slots(dnc, i + 1, interior,
				    DN_SLOT_INTERIOR);
				skip = interior;
			} else {
				dnh[i].dnh_dnode = DN_SLOT_FREE;
				skip = 0;
			}
		}

		dmu_buf_init_user(&dnc->dnc_dbu, NULL,
		    dnode_buf_evict_async, NULL);
		winner = dmu_buf_set_user(&db->db, &dnc->dnc_dbu);
		if (winner != NULL) {

			for (int i = 0; i < epb; i++)
				zrl_destroy(&dnh[i].dnh_zrlock);

			kmem_free(dnc, sizeof (dnode_children_t) +
			    epb * sizeof (dnode_handle_t));
			dnc = winner;
		}
	}

	ASSERT(dnc->dnc_count == epb);
	dn = DN_SLOT_UNINIT;

	if (flag & DNODE_MUST_BE_ALLOCATED) {
		slots = 1;

		/*
		 * Loop until we either adopt or create the in-core dnode
		 * for this slot, or discover the slot cannot satisfy the
		 * request.  Retries happen when the slot zrlock is
		 * contended.
		 */
		while (dn == DN_SLOT_UNINIT) {
			dnode_slots_hold(dnc, idx, slots);
			dnh = &dnc->dnc_children[idx];

			if (DN_SLOT_IS_PTR(dnh->dnh_dnode)) {
				dn = dnh->dnh_dnode;
				break;
			} else if (dnh->dnh_dnode == DN_SLOT_INTERIOR) {
				DNODE_STAT_BUMP(dnode_hold_alloc_interior);
				dnode_slots_rele(dnc, idx, slots);
				dbuf_rele(db, FTAG);
				return (SET_ERROR(EEXIST));
			} else if (dnh->dnh_dnode != DN_SLOT_ALLOCATED) {
				DNODE_STAT_BUMP(dnode_hold_alloc_misses);
				dnode_slots_rele(dnc, idx, slots);
				dbuf_rele(db, FTAG);
				return (SET_ERROR(ENOENT));
			}

			dnode_slots_rele(dnc, idx, slots);
			if (!dnode_slots_tryenter(dnc, idx, slots)) {
				DNODE_STAT_BUMP(dnode_hold_alloc_lock_retry);
				continue;
			}

			/*
			 * Someone else won the race and called dnode_create()
			 * after we checked DN_SLOT_IS_PTR() above but before
			 * we acquired the lock.
			 */
			if (DN_SLOT_IS_PTR(dnh->dnh_dnode)) {
				DNODE_STAT_BUMP(dnode_hold_alloc_lock_misses);
				dn = dnh->dnh_dnode;
			} else {
				dn = dnode_create(os, dn_block + idx, db,
				    object, dnh);
			}
		}

		mutex_enter(&dn->dn_mtx);
		if (dn->dn_type == DMU_OT_NONE || dn->dn_free_txg != 0) {
			DNODE_STAT_BUMP(dnode_hold_alloc_type_none);
			mutex_exit(&dn->dn_mtx);
			dnode_slots_rele(dnc, idx, slots);
			dbuf_rele(db, FTAG);
			return (SET_ERROR(ENOENT));
		}

		/* Don't actually hold if dry run, just return 0 */
		if (flag & DNODE_DRY_RUN) {
			mutex_exit(&dn->dn_mtx);
			dnode_slots_rele(dnc, idx, slots);
			dbuf_rele(db, FTAG);
			return (0);
		}

		DNODE_STAT_BUMP(dnode_hold_alloc_hits);
	} else if (flag & DNODE_MUST_BE_FREE) {

		/* A dnode may not span two dnode blocks. */
		if (idx + slots - 1 >= DNODES_PER_BLOCK) {
			DNODE_STAT_BUMP(dnode_hold_free_overflow);
			dbuf_rele(db, FTAG);
			return (SET_ERROR(ENOSPC));
		}

		while (dn == DN_SLOT_UNINIT) {
			dnode_slots_hold(dnc, idx, slots);

			if (!dnode_check_slots_free(dnc, idx, slots)) {
				DNODE_STAT_BUMP(dnode_hold_free_misses);
				dnode_slots_rele(dnc, idx, slots);
				dbuf_rele(db, FTAG);
				return (SET_ERROR(ENOSPC));
			}

			dnode_slots_rele(dnc, idx, slots);
			if (!dnode_slots_tryenter(dnc, idx, slots)) {
				DNODE_STAT_BUMP(dnode_hold_free_lock_retry);
				continue;
			}

			/* Re-check now that the slots are locked. */
			if (!dnode_check_slots_free(dnc, idx, slots)) {
				DNODE_STAT_BUMP(dnode_hold_free_lock_misses);
				dnode_slots_rele(dnc, idx, slots);
				dbuf_rele(db, FTAG);
				return (SET_ERROR(ENOSPC));
			}

			/*
			 * Allocated but otherwise free dnodes which would
			 * be in the interior of a multi-slot dnode need
			 * to be freed. Single slot dnodes can be safely
			 * re-purposed as a performance optimization.
			 */
			if (slots > 1)
				dnode_reclaim_slots(dnc, idx + 1, slots - 1);

			dnh = &dnc->dnc_children[idx];
			if (DN_SLOT_IS_PTR(dnh->dnh_dnode)) {
				dn = dnh->dnh_dnode;
			} else {
				dn = dnode_create(os, dn_block + idx, db,
				    object, dnh);
			}
		}

		mutex_enter(&dn->dn_mtx);
		if (!zfs_refcount_is_zero(&dn->dn_holds) || dn->dn_free_txg) {
			DNODE_STAT_BUMP(dnode_hold_free_refcount);
			mutex_exit(&dn->dn_mtx);
			dnode_slots_rele(dnc, idx, slots);
			dbuf_rele(db, FTAG);
			return (SET_ERROR(EEXIST));
		}

		/* Don't actually hold if dry run, just return 0 */
		if (flag & DNODE_DRY_RUN) {
			mutex_exit(&dn->dn_mtx);
			dnode_slots_rele(dnc, idx, slots);
			dbuf_rele(db, FTAG);
			return (0);
		}

		dnode_set_slots(dnc, idx + 1, slots - 1, DN_SLOT_INTERIOR);
		DNODE_STAT_BUMP(dnode_hold_free_hits);
	} else {
		dbuf_rele(db, FTAG);
		return (SET_ERROR(EINVAL));
	}

	ASSERT0(dn->dn_free_txg);

	/* The first hold on a dnode adds a hold on its containing dbuf. */
	if (zfs_refcount_add(&dn->dn_holds, tag) == 1)
		dbuf_add_ref(db, dnh);

	mutex_exit(&dn->dn_mtx);

	/* Now we can rely on the hold to prevent the dnode from moving. */
	dnode_slots_rele(dnc, idx, slots);

	DNODE_VERIFY(dn);
	ASSERT3P(dn->dn_dbuf, ==, db);
	ASSERT3U(dn->dn_object, ==, object);
	dbuf_rele(db, FTAG);

	*dnp = dn;
	return (0);
}
1566fa9e4066Sahrens
1567fa9e4066Sahrens /*
1568fa9e4066Sahrens * Return held dnode if the object is allocated, NULL if not.
1569fa9e4066Sahrens */
1570ea8dc4b6Seschrock int
dnode_hold(objset_t * os,uint64_t object,void * tag,dnode_t ** dnp)1571503ad85cSMatthew Ahrens dnode_hold(objset_t *os, uint64_t object, void *tag, dnode_t **dnp)
1572fa9e4066Sahrens {
157354811da5SToomas Soome return (dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, 0, tag,
157454811da5SToomas Soome dnp));
1575fa9e4066Sahrens }
1576fa9e4066Sahrens
15771934e92fSmaybee /*
15781934e92fSmaybee * Can only add a reference if there is already at least one
15791934e92fSmaybee * reference on the dnode. Returns FALSE if unable to add a
15801934e92fSmaybee * new reference.
15811934e92fSmaybee */
15821934e92fSmaybee boolean_t
dnode_add_ref(dnode_t * dn,void * tag)1583ea8dc4b6Seschrock dnode_add_ref(dnode_t *dn, void *tag)
1584fa9e4066Sahrens {
15851934e92fSmaybee mutex_enter(&dn->dn_mtx);
1586e914ace2STim Schumacher if (zfs_refcount_is_zero(&dn->dn_holds)) {
15871934e92fSmaybee mutex_exit(&dn->dn_mtx);
15881934e92fSmaybee return (FALSE);
15891934e92fSmaybee }
1590e914ace2STim Schumacher VERIFY(1 < zfs_refcount_add(&dn->dn_holds, tag));
15911934e92fSmaybee mutex_exit(&dn->dn_mtx);
15921934e92fSmaybee return (TRUE);
1593fa9e4066Sahrens }
1594fa9e4066Sahrens
/*
 * Release one hold on the dnode.  dn_mtx is acquired here and dropped
 * inside dnode_rele_and_unlock().
 */
void
dnode_rele(dnode_t *dn, void *tag)
{
	mutex_enter(&dn->dn_mtx);
	dnode_rele_and_unlock(dn, tag, B_FALSE);
}
1601cd485b49SJustin T. Gibbs
/*
 * Release one hold on the dnode with dn_mtx held on entry; dn_mtx is
 * dropped before returning.  "evicting" is passed through to
 * dbuf_rele_and_unlock() when dropping the last hold also releases the
 * hold on the containing dbuf.
 */
void
dnode_rele_and_unlock(dnode_t *dn, void *tag, boolean_t evicting)
{
	uint64_t refs;
	/* Get while the hold prevents the dnode from moving. */
	dmu_buf_impl_t *db = dn->dn_dbuf;
	dnode_handle_t *dnh = dn->dn_handle;

	refs = zfs_refcount_remove(&dn->dn_holds, tag);
	/* Wake any thread waiting on dn_nodnholds for the count to hit 0. */
	if (refs == 0)
		cv_broadcast(&dn->dn_nodnholds);
	mutex_exit(&dn->dn_mtx);
	/* dnode could get destroyed at this point, so don't use it anymore */

	/*
	 * It's unsafe to release the last hold on a dnode by dnode_rele() or
	 * indirectly by dbuf_rele() while relying on the dnode handle to
	 * prevent the dnode from moving, since releasing the last hold could
	 * result in the dnode's parent dbuf evicting its dnode handles. For
	 * that reason anyone calling dnode_rele() or dbuf_rele() without some
	 * other direct or indirect hold on the dnode must first drop the dnode
	 * handle.
	 */
	ASSERT(refs > 0 || dnh->dnh_zrlock.zr_owner != curthread);

	/* NOTE: the DNODE_DNODE does not have a dn_dbuf */
	if (refs == 0 && db != NULL) {
		/*
		 * Another thread could add a hold to the dnode handle in
		 * dnode_hold_impl() while holding the parent dbuf. Since the
		 * hold on the parent dbuf prevents the handle from being
		 * destroyed, the hold on the handle is OK. We can't yet assert
		 * that the handle has zero references, but that will be
		 * asserted anyway when the handle gets destroyed.
		 */
		mutex_enter(&db->db_mtx);
		dbuf_rele_and_unlock(db, dnh, evicting);
	}
}
1641fa9e4066Sahrens
1642d8849d7dSChunwei Chen /*
1643d8849d7dSChunwei Chen * Test whether we can create a dnode at the specified location.
1644d8849d7dSChunwei Chen */
1645d8849d7dSChunwei Chen int
dnode_try_claim(objset_t * os,uint64_t object,int slots)1646d8849d7dSChunwei Chen dnode_try_claim(objset_t *os, uint64_t object, int slots)
1647d8849d7dSChunwei Chen {
1648d8849d7dSChunwei Chen return (dnode_hold_impl(os, object, DNODE_MUST_BE_FREE | DNODE_DRY_RUN,
1649d8849d7dSChunwei Chen slots, NULL, NULL));
1650d8849d7dSChunwei Chen }
1651d8849d7dSChunwei Chen
/*
 * Mark the dnode dirty in the given transaction: link it onto the
 * objset's per-txg dirty-dnode multilist (at most once per txg), take a
 * "dirty hold" tagged with the txg, and dirty the containing dbuf and
 * the dataset.  Special (objset-attached) dnodes only dirty the dataset.
 */
void
dnode_setdirty(dnode_t *dn, dmu_tx_t *tx)
{
	objset_t *os = dn->dn_objset;
	uint64_t txg = tx->tx_txg;

	if (DMU_OBJECT_IS_SPECIAL(dn->dn_object)) {
		dsl_dataset_dirty(os->os_dsl_dataset, tx);
		return;
	}

	DNODE_VERIFY(dn);

#ifdef ZFS_DEBUG
	mutex_enter(&dn->dn_mtx);
	ASSERT(dn->dn_phys->dn_type || dn->dn_allocated_txg);
	ASSERT(dn->dn_free_txg == 0 || dn->dn_free_txg >= txg);
	mutex_exit(&dn->dn_mtx);
#endif

	/*
	 * Determine old uid/gid when necessary
	 */
	dmu_objset_userquota_get_ids(dn, B_TRUE, tx);

	multilist_t *dirtylist = os->os_dirty_dnodes[txg & TXG_MASK];
	multilist_sublist_t *mls = multilist_sublist_lock_obj(dirtylist, dn);

	/*
	 * If we are already marked dirty, we're done.
	 */
	if (multilist_link_active(&dn->dn_dirty_link[txg & TXG_MASK])) {
		multilist_sublist_unlock(mls);
		return;
	}

	ASSERT(!zfs_refcount_is_zero(&dn->dn_holds) ||
	    !avl_is_empty(&dn->dn_dbufs));
	ASSERT(dn->dn_datablksz != 0);
	ASSERT0(dn->dn_next_bonuslen[txg&TXG_MASK]);
	ASSERT0(dn->dn_next_blksz[txg&TXG_MASK]);
	ASSERT0(dn->dn_next_bonustype[txg&TXG_MASK]);

	dprintf_ds(os->os_dsl_dataset, "obj=%llu txg=%llu\n",
	    dn->dn_object, txg);

	multilist_sublist_insert_head(mls, dn);

	multilist_sublist_unlock(mls);

	/*
	 * The dnode maintains a hold on its containing dbuf as
	 * long as there are holds on it. Each instantiated child
	 * dbuf maintains a hold on the dnode. When the last child
	 * drops its hold, the dnode will drop its hold on the
	 * containing dbuf. We add a "dirty hold" here so that the
	 * dnode will hang around after we finish processing its
	 * children.
	 */
	VERIFY(dnode_add_ref(dn, (void *)(uintptr_t)tx->tx_txg));

	(void) dbuf_dirty(dn->dn_dbuf, tx);

	dsl_dataset_dirty(os->os_dsl_dataset, tx);
}
1717fa9e4066Sahrens
1718fa9e4066Sahrens void
dnode_free(dnode_t * dn,dmu_tx_t * tx)1719fa9e4066Sahrens dnode_free(dnode_t *dn, dmu_tx_t *tx)
1720fa9e4066Sahrens {
1721fa9e4066Sahrens mutex_enter(&dn->dn_mtx);
1722fa9e4066Sahrens if (dn->dn_type == DMU_OT_NONE || dn->dn_free_txg) {
1723fa9e4066Sahrens mutex_exit(&dn->dn_mtx);
1724fa9e4066Sahrens return;
1725fa9e4066Sahrens }
1726fa9e4066Sahrens dn->dn_free_txg = tx->tx_txg;
1727fa9e4066Sahrens mutex_exit(&dn->dn_mtx);
1728fa9e4066Sahrens
172994c2d0ebSMatthew Ahrens dnode_setdirty(dn, tx);
1730fa9e4066Sahrens }
1731fa9e4066Sahrens
/*
 * Try to change the block size for the indicated dnode. This can only
 * succeed if there are no blocks allocated or dirty beyond first block.
 *
 * size == 0 means SPA_MINBLOCKSIZE; any other size is rounded up to a
 * multiple of SPA_MINBLOCKSIZE.  A non-zero "ibs" also requests a new
 * indirect block shift, which is only permitted while dn_nlevels == 1.
 * Returns 0 on success, ENOTSUP when the change cannot be made.
 */
int
dnode_set_blksz(dnode_t *dn, uint64_t size, int ibs, dmu_tx_t *tx)
{
	dmu_buf_impl_t *db;
	int err;

	ASSERT3U(size, <=, spa_maxblocksize(dmu_objset_spa(dn->dn_objset)));
	if (size == 0)
		size = SPA_MINBLOCKSIZE;
	else
		size = P2ROUNDUP(size, SPA_MINBLOCKSIZE);

	/* ibs == 0 means "leave the indirect block shift alone". */
	if (ibs == dn->dn_indblkshift)
		ibs = 0;

	if (size >> SPA_MINBLOCKSHIFT == dn->dn_datablkszsec && ibs == 0)
		return (0);

	rw_enter(&dn->dn_struct_rwlock, RW_WRITER);

	/* Check for any allocated blocks beyond the first */
	if (dn->dn_maxblkid != 0)
		goto fail;

	/* Any dbuf other than block 0, bonus, or spill blocks the change. */
	mutex_enter(&dn->dn_dbufs_mtx);
	for (db = avl_first(&dn->dn_dbufs); db != NULL;
	    db = AVL_NEXT(&dn->dn_dbufs, db)) {
		if (db->db_blkid != 0 && db->db_blkid != DMU_BONUS_BLKID &&
		    db->db_blkid != DMU_SPILL_BLKID) {
			mutex_exit(&dn->dn_dbufs_mtx);
			goto fail;
		}
	}
	mutex_exit(&dn->dn_dbufs_mtx);

	if (ibs && dn->dn_nlevels != 1)
		goto fail;

	/* resize the old block */
	err = dbuf_hold_impl(dn, 0, 0, TRUE, FALSE, FTAG, &db);
	if (err == 0) {
		dbuf_new_size(db, size, tx);
	} else if (err != ENOENT) {
		goto fail;
	}

	dnode_setdblksz(dn, size);
	dnode_setdirty(dn, tx);
	dn->dn_next_blksz[tx->tx_txg&TXG_MASK] = size;
	if (ibs) {
		dn->dn_indblkshift = ibs;
		dn->dn_next_indblkshift[tx->tx_txg&TXG_MASK] = ibs;
	}
	/* rele after we have fixed the blocksize in the dnode */
	/*
	 * NOTE(review): on ENOENT the "if (db)" below presumes
	 * dbuf_hold_impl() NULLed out db — confirm against its definition.
	 */
	if (db)
		dbuf_rele(db, FTAG);

	rw_exit(&dn->dn_struct_rwlock);
	return (0);

fail:
	rw_exit(&dn->dn_struct_rwlock);
	return (SET_ERROR(ENOTSUP));
}
1800fa9e4066Sahrens
/*
 * Grow the dnode's indirection to new_nlevels.  Caller holds
 * dn_struct_rwlock as writer and has already verified new_nlevels
 * exceeds the current level count (see dnode_set_nlevels()).  The new
 * top-level indirect block is dirtied and any dirty records that were
 * parented at the old top level are re-parented under it.
 */
static void
dnode_set_nlevels_impl(dnode_t *dn, int new_nlevels, dmu_tx_t *tx)
{
	uint64_t txgoff = tx->tx_txg & TXG_MASK;
	int old_nlevels = dn->dn_nlevels;
	dmu_buf_impl_t *db;
	list_t *list;
	dbuf_dirty_record_t *new, *dr, *dr_next;

	ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));

	dn->dn_nlevels = new_nlevels;

	ASSERT3U(new_nlevels, >, dn->dn_next_nlevels[txgoff]);
	dn->dn_next_nlevels[txgoff] = new_nlevels;

	/* dirty the left indirects */
	db = dbuf_hold_level(dn, old_nlevels, 0, FTAG);
	ASSERT(db != NULL);
	new = dbuf_dirty(db, tx);
	dbuf_rele(db, FTAG);

	/* transfer the dirty records to the new indirect */
	mutex_enter(&dn->dn_mtx);
	mutex_enter(&new->dt.di.dr_mtx);
	list = &dn->dn_dirty_records[txgoff];
	for (dr = list_head(list); dr; dr = dr_next) {
		dr_next = list_next(&dn->dn_dirty_records[txgoff], dr);
		/* Bonus and spill records stay attached to the dnode. */
		if (dr->dr_dbuf->db_level != new_nlevels-1 &&
		    dr->dr_dbuf->db_blkid != DMU_BONUS_BLKID &&
		    dr->dr_dbuf->db_blkid != DMU_SPILL_BLKID) {
			ASSERT(dr->dr_dbuf->db_level == old_nlevels-1);
			list_remove(&dn->dn_dirty_records[txgoff], dr);
			list_insert_tail(&new->dt.di.dr_children, dr);
			dr->dr_parent = new;
		}
	}
	mutex_exit(&new->dt.di.dr_mtx);
	mutex_exit(&dn->dn_mtx);
}
1841eb633035STom Caputi
1842eb633035STom Caputi int
dnode_set_nlevels(dnode_t * dn,int nlevels,dmu_tx_t * tx)1843eb633035STom Caputi dnode_set_nlevels(dnode_t *dn, int nlevels, dmu_tx_t *tx)
1844eb633035STom Caputi {
1845eb633035STom Caputi int ret = 0;
1846eb633035STom Caputi
1847eb633035STom Caputi rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
1848eb633035STom Caputi
1849eb633035STom Caputi if (dn->dn_nlevels == nlevels) {
1850eb633035STom Caputi ret = 0;
1851eb633035STom Caputi goto out;
1852eb633035STom Caputi } else if (nlevels < dn->dn_nlevels) {
1853eb633035STom Caputi ret = SET_ERROR(EINVAL);
1854eb633035STom Caputi goto out;
1855eb633035STom Caputi }
1856eb633035STom Caputi
1857eb633035STom Caputi dnode_set_nlevels_impl(dn, nlevels, tx);
1858eb633035STom Caputi
1859eb633035STom Caputi out:
1860eb633035STom Caputi rw_exit(&dn->dn_struct_rwlock);
1861eb633035STom Caputi return (ret);
1862eb633035STom Caputi }
1863eb633035STom Caputi
/* read-holding callers must not rely on the lock being continuously held */
/*
 * Grow dn_maxblkid (and, if needed, the indirection depth) to cover blkid.
 * With have_read set, the caller holds dn_struct_rwlock as reader; the lock
 * may be dropped and re-taken as writer here, and is downgraded back to
 * reader before returning.  With force set (raw receives), blkid is taken
 * even if it is lower than the current maxblkid.
 */
void
dnode_new_blkid(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx, boolean_t have_read,
    boolean_t force)
{
	int epbs, new_nlevels;
	uint64_t sz;

	ASSERT(blkid != DMU_BONUS_BLKID);

	ASSERT(have_read ?
	    RW_READ_HELD(&dn->dn_struct_rwlock) :
	    RW_WRITE_HELD(&dn->dn_struct_rwlock));

	/*
	 * if we have a read-lock, check to see if we need to do any work
	 * before upgrading to a write-lock.
	 */
	if (have_read) {
		if (blkid <= dn->dn_maxblkid)
			return;

		if (!rw_tryupgrade(&dn->dn_struct_rwlock)) {
			/*
			 * Upgrade failed; drop and re-take as writer.  The
			 * maxblkid check below is repeated because another
			 * thread may have raced in while the lock was free.
			 */
			rw_exit(&dn->dn_struct_rwlock);
			rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
		}
	}

	/*
	 * Raw sends (indicated by the force flag) require that we take the
	 * given blkid even if the value is lower than the current value.
	 */
	if (!force && blkid <= dn->dn_maxblkid)
		goto out;

	/*
	 * We use the (otherwise unused) top bit of dn_next_maxblkid[txgoff]
	 * to indicate that this field is set. This allows us to set the
	 * maxblkid to 0 on an existing object in dnode_sync().
	 */
	dn->dn_maxblkid = blkid;
	dn->dn_next_maxblkid[tx->tx_txg & TXG_MASK] =
	    blkid | DMU_NEXT_MAXBLKID_SET;

	/*
	 * Compute the number of levels necessary to support the new maxblkid.
	 * Raw sends will ensure nlevels is set correctly for us.
	 */
	new_nlevels = 1;
	epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
	/* sz >= dn_nblkptr also stops the loop if sz overflows to 0 */
	for (sz = dn->dn_nblkptr;
	    sz <= blkid && sz >= dn->dn_nblkptr; sz <<= epbs)
		new_nlevels++;

	if (!force) {
		if (new_nlevels > dn->dn_nlevels)
			dnode_set_nlevels_impl(dn, new_nlevels, tx);
	} else {
		ASSERT3U(dn->dn_nlevels, >=, new_nlevels);
	}

out:
	if (have_read)
		rw_downgrade(&dn->dn_struct_rwlock);
}
1929fa9e4066Sahrens
193046e1baa6SMatthew Ahrens static void
dnode_dirty_l1(dnode_t * dn,uint64_t l1blkid,dmu_tx_t * tx)193146e1baa6SMatthew Ahrens dnode_dirty_l1(dnode_t *dn, uint64_t l1blkid, dmu_tx_t *tx)
193246e1baa6SMatthew Ahrens {
193346e1baa6SMatthew Ahrens dmu_buf_impl_t *db = dbuf_hold_level(dn, 1, l1blkid, FTAG);
193446e1baa6SMatthew Ahrens if (db != NULL) {
193546e1baa6SMatthew Ahrens dmu_buf_will_dirty(&db->db, tx);
193646e1baa6SMatthew Ahrens dbuf_rele(db, FTAG);
193746e1baa6SMatthew Ahrens }
193846e1baa6SMatthew Ahrens }
193946e1baa6SMatthew Ahrens
/*
 * Dirty all the in-core level-1 dbufs in the range specified by start_blkid
 * and end_blkid.
 *
 * Note that the scan starts at start_blkid + 1 and stops before end_blkid:
 * the endpoint L1 blocks are dirtied separately by the caller.
 */
static void
dnode_dirty_l1range(dnode_t *dn, uint64_t start_blkid, uint64_t end_blkid,
    dmu_tx_t *tx)
{
	dmu_buf_impl_t db_search;	/* stack search key for dn_dbufs AVL */
	dmu_buf_impl_t *db;
	avl_index_t where;

	mutex_enter(&dn->dn_dbufs_mtx);

	db_search.db_level = 1;
	db_search.db_blkid = start_blkid + 1;
	db_search.db_state = DB_SEARCH;
	for (;;) {

		/* Find the next in-core dbuf at or after db_search's key. */
		db = avl_find(&dn->dn_dbufs, &db_search, &where);
		if (db == NULL)
			db = avl_nearest(&dn->dn_dbufs, where, AVL_AFTER);

		/* Stop once we leave level 1 or pass the end of the range. */
		if (db == NULL || db->db_level != 1 ||
		    db->db_blkid >= end_blkid) {
			break;
		}

		/*
		 * Setup the next blkid we want to search for.
		 */
		db_search.db_blkid = db->db_blkid + 1;
		ASSERT3U(db->db_blkid, >=, start_blkid);

		/*
		 * If the dbuf transitions to DB_EVICTING while we're trying
		 * to dirty it, then we will be unable to discover it in
		 * the dbuf hash table. This will result in a call to
		 * dbuf_create() which needs to acquire the dn_dbufs_mtx
		 * lock. To avoid a deadlock, we drop the lock before
		 * dirtying the level-1 dbuf.
		 */
		mutex_exit(&dn->dn_dbufs_mtx);
		dnode_dirty_l1(dn, db->db_blkid, tx);
		mutex_enter(&dn->dn_dbufs_mtx);
	}

#ifdef ZFS_DEBUG
	/*
	 * Walk all the in-core level-1 dbufs and verify they have been dirtied.
	 */
	db_search.db_level = 1;
	db_search.db_blkid = start_blkid + 1;
	db_search.db_state = DB_SEARCH;
	db = avl_find(&dn->dn_dbufs, &db_search, &where);
	if (db == NULL)
		db = avl_nearest(&dn->dn_dbufs, where, AVL_AFTER);
	for (; db != NULL; db = AVL_NEXT(&dn->dn_dbufs, db)) {
		if (db->db_level != 1 || db->db_blkid >= end_blkid)
			break;
		ASSERT(db->db_dirtycnt > 0);
	}
#endif
	mutex_exit(&dn->dn_dbufs_mtx);
}
2005738e2a3cSPaul Dagnelie
/*
 * Free the byte range [off, off + len) of this dnode in open context.
 * len == DMU_OBJECT_END means "free to end of object" (truncate).
 *
 * Partial blocks at the head and tail of the range are zeroed in place
 * (only if they are already dirty or exist on disk); full blocks in the
 * middle are recorded in dn_free_ranges and actually freed later, in
 * syncing context, by dnode_sync_free_range().
 */
void
dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx)
{
	dmu_buf_impl_t *db;
	uint64_t blkoff, blkid, nblks;
	int blksz, blkshift, head, tail;
	int trunc = FALSE;
	int epbs;

	blksz = dn->dn_datablksz;
	blkshift = dn->dn_datablkshift;
	epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;

	if (len == DMU_OBJECT_END) {
		len = UINT64_MAX - off;
		trunc = TRUE;
	}

	/*
	 * First, block align the region to free:
	 */
	if (ISP2(blksz)) {
		/* multi-block object: compute head fragment and offset */
		head = P2NPHASE(off, blksz);
		blkoff = P2PHASE(off, blksz);
		if ((off >> blkshift) > dn->dn_maxblkid)
			return;
	} else {
		/* non-power-of-2 blksz implies a single-block object */
		ASSERT(dn->dn_maxblkid == 0);
		if (off == 0 && len >= blksz) {
			/*
			 * Freeing the whole block; fast-track this request.
			 */
			blkid = 0;
			nblks = 1;
			if (dn->dn_nlevels > 1) {
				rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
				dnode_dirty_l1(dn, 0, tx);
				rw_exit(&dn->dn_struct_rwlock);
			}
			goto done;
		} else if (off >= blksz) {
			/* Freeing past end-of-data */
			return;
		} else {
			/* Freeing part of the block. */
			head = blksz - off;
			ASSERT3U(head, >, 0);
		}
		blkoff = off;
	}
	/* zero out any partial block data at the start of the range */
	if (head) {
		int res;
		ASSERT3U(blkoff + head, ==, blksz);
		if (len < head)
			head = len;
		rw_enter(&dn->dn_struct_rwlock, RW_READER);
		res = dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, 0, off),
		    TRUE, FALSE, FTAG, &db);
		rw_exit(&dn->dn_struct_rwlock);
		if (res == 0) {
			caddr_t data;
			boolean_t dirty;

			db_lock_type_t dblt = dmu_buf_lock_parent(db, RW_READER,
			    FTAG);
			/* don't dirty if it isn't on disk and isn't dirty */
			dirty = db->db_last_dirty ||
			    (db->db_blkptr && !BP_IS_HOLE(db->db_blkptr));
			dmu_buf_unlock_parent(db, dblt, FTAG);
			if (dirty) {
				dmu_buf_will_dirty(&db->db, tx);
				data = db->db.db_data;
				bzero(data + blkoff, head);
			}
			dbuf_rele(db, FTAG);
		}
		off += head;
		len -= head;
	}

	/* If the range was less than one block, we're done */
	if (len == 0)
		return;

	/* If the remaining range is past end of file, we're done */
	if ((off >> blkshift) > dn->dn_maxblkid)
		return;

	/* off is now block-aligned, so blksz must be a power of 2 */
	ASSERT(ISP2(blksz));
	if (trunc)
		tail = 0;
	else
		tail = P2PHASE(len, blksz);

	ASSERT0(P2PHASE(off, blksz));
	/* zero out any partial block data at the end of the range */
	if (tail) {
		int res;
		if (len < tail)
			tail = len;
		rw_enter(&dn->dn_struct_rwlock, RW_READER);
		res = dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, 0, off+len),
		    TRUE, FALSE, FTAG, &db);
		rw_exit(&dn->dn_struct_rwlock);
		if (res == 0) {
			boolean_t dirty;
			/* don't dirty if not on disk and not dirty */
			db_lock_type_t type = dmu_buf_lock_parent(db, RW_READER,
			    FTAG);
			dirty = db->db_last_dirty ||
			    (db->db_blkptr && !BP_IS_HOLE(db->db_blkptr));
			dmu_buf_unlock_parent(db, type, FTAG);
			if (dirty) {
				dmu_buf_will_dirty(&db->db, tx);
				bzero(db->db.db_data, tail);
			}
			dbuf_rele(db, FTAG);
		}
		len -= tail;
	}

	/* If the range did not include a full block, we are done */
	if (len == 0)
		return;

	ASSERT(IS_P2ALIGNED(off, blksz));
	ASSERT(trunc || IS_P2ALIGNED(len, blksz));
	blkid = off >> blkshift;
	nblks = len >> blkshift;
	if (trunc)
		nblks += 1;

	/*
	 * Dirty all the indirect blocks in this range. Note that only
	 * the first and last indirect blocks can actually be written
	 * (if they were partially freed) -- they must be dirtied, even if
	 * they do not exist on disk yet. The interior blocks will
	 * be freed by free_children(), so they will not actually be written.
	 * Even though these interior blocks will not be written, we
	 * dirty them for two reasons:
	 *
	 * - It ensures that the indirect blocks remain in memory until
	 * syncing context. (They have already been prefetched by
	 * dmu_tx_hold_free(), so we don't have to worry about reading
	 * them serially here.)
	 *
	 * - The dirty space accounting will put pressure on the txg sync
	 * mechanism to begin syncing, and to delay transactions if there
	 * is a large amount of freeing. Even though these indirect
	 * blocks will not be written, we could need to write the same
	 * amount of space if we copy the freed BPs into deadlists.
	 */
	if (dn->dn_nlevels > 1) {
		rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
		uint64_t first, last;

		first = blkid >> epbs;
		dnode_dirty_l1(dn, first, tx);
		if (trunc)
			last = dn->dn_maxblkid >> epbs;
		else
			last = (blkid + nblks - 1) >> epbs;
		if (last != first)
			dnode_dirty_l1(dn, last, tx);

		dnode_dirty_l1range(dn, first, last, tx);

		/* bytes of level-0 data covered by one L1 block */
		int shift = dn->dn_datablkshift + dn->dn_indblkshift -
		    SPA_BLKPTRSHIFT;
		for (uint64_t i = first + 1; i < last; i++) {
			/*
			 * Set i to the blockid of the next non-hole
			 * level-1 indirect block at or after i. Note
			 * that dnode_next_offset() operates in terms of
			 * level-0-equivalent bytes.
			 */
			uint64_t ibyte = i << shift;
			int err = dnode_next_offset(dn, DNODE_FIND_HAVELOCK,
			    &ibyte, 2, 1, 0);
			i = ibyte >> shift;
			if (i >= last)
				break;

			/*
			 * Normally we should not see an error, either
			 * from dnode_next_offset() or dbuf_hold_level()
			 * (except for ESRCH from dnode_next_offset).
			 * If there is an i/o error, then when we read
			 * this block in syncing context, it will use
			 * ZIO_FLAG_MUSTSUCCEED, and thus hang/panic according
			 * to the "failmode" property. dnode_next_offset()
			 * doesn't have a flag to indicate MUSTSUCCEED.
			 */
			if (err != 0)
				break;

			dnode_dirty_l1(dn, i, tx);
		}
		rw_exit(&dn->dn_struct_rwlock);
	}

done:
	/*
	 * Add this range to the dnode range list.
	 * We will finish up this free operation in the syncing phase.
	 */
	mutex_enter(&dn->dn_mtx);
	int txgoff = tx->tx_txg & TXG_MASK;
	if (dn->dn_free_ranges[txgoff] == NULL) {
		dn->dn_free_ranges[txgoff] = range_tree_create(NULL,
		    RANGE_SEG64, NULL, 0, 0);
	}
	range_tree_clear(dn->dn_free_ranges[txgoff], blkid, nblks);
	range_tree_add(dn->dn_free_ranges[txgoff], blkid, nblks);
	dprintf_dnode(dn, "blkid=%llu nblks=%llu txg=%llu\n",
	    blkid, nblks, tx->tx_txg);
	mutex_exit(&dn->dn_mtx);

	/* evict/undirty the in-core dbufs covering the freed blocks */
	dbuf_free_range(dn, blkid, blkid + nblks - 1, tx);
	dnode_setdirty(dn, tx);
}
2228fa9e4066Sahrens
22290a586ceaSMark Shellenbaum static boolean_t
dnode_spill_freed(dnode_t * dn)22300a586ceaSMark Shellenbaum dnode_spill_freed(dnode_t *dn)
22310a586ceaSMark Shellenbaum {
22320a586ceaSMark Shellenbaum int i;
22330a586ceaSMark Shellenbaum
22340a586ceaSMark Shellenbaum mutex_enter(&dn->dn_mtx);
22350a586ceaSMark Shellenbaum for (i = 0; i < TXG_SIZE; i++) {
22360a586ceaSMark Shellenbaum if (dn->dn_rm_spillblk[i] == DN_KILL_SPILLBLK)
22370a586ceaSMark Shellenbaum break;
22380a586ceaSMark Shellenbaum }
22390a586ceaSMark Shellenbaum mutex_exit(&dn->dn_mtx);
22400a586ceaSMark Shellenbaum return (i < TXG_SIZE);
22410a586ceaSMark Shellenbaum }
22420a586ceaSMark Shellenbaum
2243fa9e4066Sahrens /* return TRUE if this blkid was freed in a recent txg, or FALSE if it wasn't */
2244fa9e4066Sahrens uint64_t
dnode_block_freed(dnode_t * dn,uint64_t blkid)2245fa9e4066Sahrens dnode_block_freed(dnode_t *dn, uint64_t blkid)
2246fa9e4066Sahrens {
2247fa9e4066Sahrens void *dp = spa_get_dsl(dn->dn_objset->os_spa);
2248fa9e4066Sahrens int i;
2249fa9e4066Sahrens
22500a586ceaSMark Shellenbaum if (blkid == DMU_BONUS_BLKID)
2251fa9e4066Sahrens return (FALSE);
2252fa9e4066Sahrens
2253fa9e4066Sahrens /*
2254fa9e4066Sahrens * If we're in the process of opening the pool, dp will not be
2255fa9e4066Sahrens * set yet, but there shouldn't be anything dirty.
2256fa9e4066Sahrens */
2257fa9e4066Sahrens if (dp == NULL)
2258fa9e4066Sahrens return (FALSE);
2259fa9e4066Sahrens
2260fa9e4066Sahrens if (dn->dn_free_txg)
2261fa9e4066Sahrens return (TRUE);
2262fa9e4066Sahrens
22630a586ceaSMark Shellenbaum if (blkid == DMU_SPILL_BLKID)
22640a586ceaSMark Shellenbaum return (dnode_spill_freed(dn));
22650a586ceaSMark Shellenbaum
2266fa9e4066Sahrens mutex_enter(&dn->dn_mtx);
2267fa9e4066Sahrens for (i = 0; i < TXG_SIZE; i++) {
2268bf16b11eSMatthew Ahrens if (dn->dn_free_ranges[i] != NULL &&
2269bf16b11eSMatthew Ahrens range_tree_contains(dn->dn_free_ranges[i], blkid, 1))
2270fa9e4066Sahrens break;
2271fa9e4066Sahrens }
2272fa9e4066Sahrens mutex_exit(&dn->dn_mtx);
2273fa9e4066Sahrens return (i < TXG_SIZE);
2274fa9e4066Sahrens }
2275fa9e4066Sahrens
2276fa9e4066Sahrens /* call from syncing context when we actually write/free space for this dnode */
2277fa9e4066Sahrens void
dnode_diduse_space(dnode_t * dn,int64_t delta)227899653d4eSeschrock dnode_diduse_space(dnode_t *dn, int64_t delta)
2279fa9e4066Sahrens {
228099653d4eSeschrock uint64_t space;
228199653d4eSeschrock dprintf_dnode(dn, "dn=%p dnp=%p used=%llu delta=%lld\n",
2282fa9e4066Sahrens dn, dn->dn_phys,
228399653d4eSeschrock (u_longlong_t)dn->dn_phys->dn_used,
228499653d4eSeschrock (longlong_t)delta);
2285fa9e4066Sahrens
2286fa9e4066Sahrens mutex_enter(&dn->dn_mtx);
228799653d4eSeschrock space = DN_USED_BYTES(dn->dn_phys);
228899653d4eSeschrock if (delta > 0) {
228999653d4eSeschrock ASSERT3U(space + delta, >=, space); /* no overflow */
229099653d4eSeschrock } else {
229199653d4eSeschrock ASSERT3U(space, >=, -delta); /* no underflow */
229299653d4eSeschrock }
229399653d4eSeschrock space += delta;
2294e7437265Sahrens if (spa_version(dn->dn_objset->os_spa) < SPA_VERSION_DNODE_BYTES) {
229599653d4eSeschrock ASSERT((dn->dn_phys->dn_flags & DNODE_FLAG_USED_BYTES) == 0);
2296fb09f5aaSMadhav Suresh ASSERT0(P2PHASE(space, 1<<DEV_BSHIFT));
229799653d4eSeschrock dn->dn_phys->dn_used = space >> DEV_BSHIFT;
2298fa9e4066Sahrens } else {
229999653d4eSeschrock dn->dn_phys->dn_used = space;
230099653d4eSeschrock dn->dn_phys->dn_flags |= DNODE_FLAG_USED_BYTES;
2301fa9e4066Sahrens }
2302fa9e4066Sahrens mutex_exit(&dn->dn_mtx);
2303fa9e4066Sahrens }
2304fa9e4066Sahrens
/*
 * Scans a block at the indicated "level" looking for a hole or data,
 * depending on 'flags'.
 *
 * If level > 0, then we are scanning an indirect block looking at its
 * pointers. If level == 0, then we are looking at a block of dnodes.
 *
 * If we don't find what we are looking for in the block, we return ESRCH.
 * Otherwise, return with *offset pointing to the beginning (if searching
 * forwards) or end (if searching backwards) of the range covered by the
 * block pointer we matched on (or dnode).
 *
 * The basic search algorithm used below by dnode_next_offset() is to
 * use this function to search up the block tree (widen the search) until
 * we find something (i.e., we don't return ESRCH) and then search back
 * down the tree (narrow the search) until we reach our original search
 * level.
 */
static int
dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset,
    int lvl, uint64_t blkfill, uint64_t txg)
{
	dmu_buf_impl_t *db = NULL;
	void *data = NULL;
	uint64_t epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
	uint64_t epb = 1ULL << epbs;
	uint64_t minfill, maxfill;
	boolean_t hole;
	int i, inc, error, span;

	dprintf("probing object %llu offset %llx level %d of %u\n",
	    dn->dn_object, *offset, lvl, dn->dn_phys->dn_nlevels);

	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));

	hole = ((flags & DNODE_FIND_HOLE) != 0);
	inc = (flags & DNODE_FIND_BACKWARDS) ? -1 : 1;
	ASSERT(txg == 0 || !hole);

	if (lvl == dn->dn_phys->dn_nlevels) {
		/* at the top: scan the blkptrs embedded in the dnode itself */
		error = 0;
		epb = dn->dn_phys->dn_nblkptr;
		data = dn->dn_phys->dn_blkptr;
	} else {
		uint64_t blkid = dbuf_whichblock(dn, lvl, *offset);
		error = dbuf_hold_impl(dn, lvl, blkid, TRUE, FALSE, FTAG, &db);
		if (error) {
			if (error != ENOENT)
				return (error);
			/* ENOENT means the block is a hole */
			if (hole)
				return (0);
			/*
			 * This can only happen when we are searching up
			 * the block tree for data. We don't really need to
			 * adjust the offset, as we will just end up looking
			 * at the pointer to this block in its parent, and its
			 * going to be unallocated, so we will skip over it.
			 */
			return (SET_ERROR(ESRCH));
		}
		error = dbuf_read(db, NULL,
		    DB_RF_CANFAIL | DB_RF_HAVESTRUCT | DB_RF_NO_DECRYPT);
		if (error) {
			dbuf_rele(db, FTAG);
			return (error);
		}
		data = db->db.db_data;
		/* held (read-locked) until after the scan below */
		rw_enter(&db->db_rwlock, RW_READER);
	}

	if (db != NULL && txg != 0 && (db->db_blkptr == NULL ||
	    db->db_blkptr->blk_birth <= txg ||
	    BP_IS_HOLE(db->db_blkptr))) {
		/*
		 * This can only happen when we are searching up the tree
		 * and these conditions mean that we need to keep climbing.
		 */
		error = SET_ERROR(ESRCH);
	} else if (lvl == 0) {
		/* level 0 of the meta-dnode: scan a block of dnodes */
		dnode_phys_t *dnp = data;

		ASSERT(dn->dn_type == DMU_OT_DNODE);
		ASSERT(!(flags & DNODE_FIND_BACKWARDS));

		/* step by dn_extra_slots+1 to skip over multi-slot dnodes */
		for (i = (*offset >> DNODE_SHIFT) & (blkfill - 1);
		    i < blkfill; i += dnp[i].dn_extra_slots + 1) {
			if ((dnp[i].dn_type == DMU_OT_NONE) == hole)
				break;
		}

		if (i == blkfill)
			error = SET_ERROR(ESRCH);

		*offset = (*offset & ~(DNODE_BLOCK_SIZE - 1)) +
		    (i << DNODE_SHIFT);
	} else {
		/* indirect block (or embedded blkptrs): scan block pointers */
		blkptr_t *bp = data;
		uint64_t start = *offset;
		span = (lvl - 1) * epbs + dn->dn_datablkshift;
		minfill = 0;
		maxfill = blkfill << ((lvl - 1) * epbs);

		/* a hole is fill < max; data is fill > 0 */
		if (hole)
			maxfill--;
		else
			minfill++;

		*offset = *offset >> span;
		for (i = BF64_GET(*offset, 0, epbs);
		    i >= 0 && i < epb; i += inc) {
			if (BP_GET_FILL(&bp[i]) >= minfill &&
			    BP_GET_FILL(&bp[i]) <= maxfill &&
			    (hole || bp[i].blk_birth > txg))
				break;
			if (inc > 0 || *offset > 0)
				*offset += inc;
		}
		*offset = *offset << span;
		if (inc < 0) {
			/* traversing backwards; position offset at the end */
			ASSERT3U(*offset, <=, start);
			*offset = MIN(*offset + (1ULL << span) - 1, start);
		} else if (*offset < start) {
			*offset = start;
		}
		if (i < 0 || i >= epb)
			error = SET_ERROR(ESRCH);
	}

	if (db != NULL) {
		rw_exit(&db->db_rwlock);
		dbuf_rele(db, FTAG);
	}

	return (error);
}
2441fa9e4066Sahrens
/*
 * Find the next hole, data, or sparse region at or after *offset.
 * The value 'blkfill' tells us how many items we expect to find
 * in an L0 data block; this value is 1 for normal objects,
 * DNODES_PER_BLOCK for the meta dnode, and some fraction of
 * DNODES_PER_BLOCK when searching for sparse regions thereof.
 *
 * Examples:
 *
 * dnode_next_offset(dn, flags, offset, 1, 1, 0);
 *	Finds the next/previous hole/data in a file.
 *	Used in dmu_offset_next().
 *
 * dnode_next_offset(mdn, flags, offset, 0, DNODES_PER_BLOCK, txg);
 *	Finds the next free/allocated dnode in an objset's meta-dnode.
 *	Only finds objects that have new contents since txg (ie.
 *	bonus buffer changes and content removal are ignored).
 *	Used in dmu_object_next().
 *
 * dnode_next_offset(mdn, DNODE_FIND_HOLE, offset, 2, DNODES_PER_BLOCK >> 2, 0);
 *	Finds the next L2 meta-dnode bp that's at most 1/4 full.
 *	Used in dmu_object_alloc().
 *
 * Returns 0 and updates *offset on success; ESRCH if no matching
 * hole/data region exists in the direction of the search.
 */
int
dnode_next_offset(dnode_t *dn, int flags, uint64_t *offset,
    int minlvl, uint64_t blkfill, uint64_t txg)
{
	uint64_t initial_offset = *offset;
	int lvl, maxlvl;
	int error = 0;

	/*
	 * Take dn_struct_rwlock as reader unless the caller already
	 * holds it (DNODE_FIND_HAVELOCK); released symmetrically at out:.
	 */
	if (!(flags & DNODE_FIND_HAVELOCK))
		rw_enter(&dn->dn_struct_rwlock, RW_READER);

	/* No levels means no block pointers at all: nothing to find. */
	if (dn->dn_phys->dn_nlevels == 0) {
		error = SET_ERROR(ESRCH);
		goto out;
	}

	/*
	 * dn_datablkshift == 0 means the object has a single
	 * (possibly odd-sized) data block, so the answer is trivial:
	 * the only hole is the virtual one at EOF, and the only data
	 * is block 0 itself.
	 */
	if (dn->dn_datablkshift == 0) {
		if (*offset < dn->dn_datablksz) {
			if (flags & DNODE_FIND_HOLE)
				*offset = dn->dn_datablksz;
		} else {
			error = SET_ERROR(ESRCH);
		}
		goto out;
	}

	maxlvl = dn->dn_phys->dn_nlevels;

	/*
	 * Ascend from minlvl toward the root until some level reports
	 * a hit (or a hard error).  A level returning ESRCH means "not
	 * found within the current block at this level", so we widen
	 * the search by going up one level of indirection.
	 */
	for (lvl = minlvl; lvl <= maxlvl; lvl++) {
		error = dnode_next_offset_level(dn,
		    flags, offset, lvl, blkfill, txg);
		if (error != ESRCH)
			break;
	}

	/*
	 * Found something at level 'lvl'; descend back down, refining
	 * *offset at each lower level until we reach minlvl again.
	 */
	while (error == 0 && --lvl >= minlvl) {
		error = dnode_next_offset_level(dn,
		    flags, offset, lvl, blkfill, txg);
	}

	/*
	 * There's always a "virtual hole" at the end of the object, even
	 * if all BP's which physically exist are non-holes.
	 * (Only applies to a plain forward hole search of an L0 block,
	 * with no txg filter, since the virtual hole has no birth txg.)
	 */
	if ((flags & DNODE_FIND_HOLE) && error == ESRCH && txg == 0 &&
	    minlvl == 1 && blkfill == 1 && !(flags & DNODE_FIND_BACKWARDS)) {
		error = 0;
	}

	/*
	 * If the search moved *offset the wrong way relative to the
	 * starting point, the match is behind us: report no match.
	 */
	if (error == 0 && (flags & DNODE_FIND_BACKWARDS ?
	    initial_offset < *offset : initial_offset > *offset))
		error = SET_ERROR(ESRCH);
out:
	if (!(flags & DNODE_FIND_HAVELOCK))
		rw_exit(&dn->dn_struct_rwlock);

	return (error);
}
2523