/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2013, 2017 by Delphix. All rights reserved.
 * Copyright 2014 HybridCluster. All rights reserved.
 */

#include <sys/dbuf.h>
#include <sys/dmu.h>
#include <sys/dmu_objset.h>
#include <sys/dmu_tx.h>
#include <sys/dnode.h>
#include <sys/zap.h>
#include <sys/zfeature.h>
#include <sys/dsl_dataset.h>

/*
 * Each of the concurrent object allocators will grab
 * 2^dmu_object_alloc_chunk_shift dnode slots at a time.  The default is to
 * grab 128 slots, which is 4 blocks worth.  This was experimentally
 * determined to be the lowest value that eliminates the measurable effect
 * of lock contention from this code path.
 */
int dmu_object_alloc_chunk_shift = 7;

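/*
 * Allocate a new object and return its object number.  Object numbers are
 * handed out from a per-CPU chunk of dnode slots to avoid contention; the
 * global allocator (protected by os_obj_lock) is only consulted when the
 * current chunk is exhausted.
 */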
static uint64_t
dmu_object_alloc_impl(objset_t *os, dmu_object_type_t ot, int blocksize,
    int indirect_blockshift, dmu_object_type_t bonustype, int bonuslen,
    int dnodesize, dmu_tx_t *tx)
{
	uint64_t object;
	uint64_t L1_dnode_count = DNODES_PER_BLOCK <<
	    (DMU_META_DNODE(os)->dn_indblkshift - SPA_BLKPTRSHIFT);
	dnode_t *dn = NULL;
	int dn_slots = dnodesize >> DNODE_SHIFT;
	boolean_t restarted = B_FALSE;
	uint64_t *cpuobj = &os->os_obj_next_percpu[CPU_SEQID %
	    os->os_obj_next_percpu_len];
	int dnodes_per_chunk = 1 << dmu_object_alloc_chunk_shift;
	int error;

	if (dn_slots == 0) {
		dn_slots = DNODE_MIN_SLOTS;
	} else {
		ASSERT3S(dn_slots, >=, DNODE_MIN_SLOTS);
		ASSERT3S(dn_slots, <=, DNODE_MAX_SLOTS);
	}

	/*
	 * The "chunk" of dnodes that is assigned to a CPU-specific
	 * allocator needs to be at least one block's worth, to avoid
	 * lock contention on the dbuf.  It can be at most one L1 block's
	 * worth, so that the "rescan after polishing off a L1's worth"
	 * logic below will be sure to kick in.
	 */
	if (dnodes_per_chunk < DNODES_PER_BLOCK)
		dnodes_per_chunk = DNODES_PER_BLOCK;
	if (dnodes_per_chunk > L1_dnode_count)
		dnodes_per_chunk = L1_dnode_count;

	object = *cpuobj;

	for (;;) {
		/*
		 * If we finished a chunk of dnodes, get a new one from
		 * the global allocator.
		 */
		if ((P2PHASE(object, dnodes_per_chunk) == 0) ||
		    (P2PHASE(object + dn_slots - 1, dnodes_per_chunk) <
		    dn_slots)) {
			DNODE_STAT_BUMP(dnode_alloc_next_chunk);
			mutex_enter(&os->os_obj_lock);
			ASSERT0(P2PHASE(os->os_obj_next_chunk,
			    dnodes_per_chunk));
			object = os->os_obj_next_chunk;

			/*
			 * Each time we polish off a L1 bp worth of dnodes
			 * (2^12 objects), move to another L1 bp that's
			 * still reasonably sparse (at most 1/4 full). Look
			 * from the beginning at most once per txg. If we
			 * still can't allocate from that L1 block, search
			 * for an empty L0 block, which will quickly skip
			 * to the end of the metadnode if no nearby L0
			 * blocks are empty. This fallback avoids a
			 * pathology where full dnode blocks containing
			 * large dnodes appear sparse because they have a
			 * low blk_fill, leading to many failed allocation
			 * attempts. In the long term a better mechanism to
			 * search for sparse metadnode regions, such as
			 * spacemaps, could be implemented.
			 *
			 * os_rescan_dnodes is set during txg sync if enough
			 * objects have been freed since the previous
			 * rescan to justify backfilling again.
			 *
			 * Note that dmu_traverse depends on the behavior
			 * that we use multiple blocks of the dnode object
			 * before going back to reuse objects. Any change
			 * to this algorithm should preserve that property
			 * or find another solution to the issues described
			 * in traverse_visitbp.
			 */
			if (P2PHASE(object, L1_dnode_count) == 0) {
				uint64_t offset;
				uint64_t blkfill;
				int minlvl;
				if (os->os_rescan_dnodes) {
					offset = 0;
					os->os_rescan_dnodes = B_FALSE;
				} else {
					offset = object << DNODE_SHIFT;
				}
				blkfill = restarted ? 1 : DNODES_PER_BLOCK >> 2;
				minlvl = restarted ? 1 : 2;
				restarted = B_TRUE;
				error = dnode_next_offset(DMU_META_DNODE(os),
				    DNODE_FIND_HOLE, &offset, minlvl,
				    blkfill, 0);
				if (error == 0) {
					object = offset >> DNODE_SHIFT;
				}
			}
			/*
			 * Note: if "restarted", we may find a L0 that
			 * is not suitably aligned.
			 */
			os->os_obj_next_chunk =
			    P2ALIGN(object, dnodes_per_chunk) +
			    dnodes_per_chunk;
			(void) atomic_swap_64(cpuobj, object);
			mutex_exit(&os->os_obj_lock);
		}

		/*
		 * The value of (*cpuobj) before adding dn_slots is the object
		 * ID assigned to us.  The value afterwards is the object ID
		 * assigned to whoever wants to do an allocation next.
		 */
		object = atomic_add_64_nv(cpuobj, dn_slots) - dn_slots;

		/*
		 * XXX We should check for an i/o error here and return
		 * up to our caller.  Actually we should pre-read it in
		 * dmu_tx_assign(), but there is currently no mechanism
		 * to do so.
		 */
		error = dnode_hold_impl(os, object, DNODE_MUST_BE_FREE,
		    dn_slots, FTAG, &dn);
		if (error == 0) {
			rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
			/*
			 * Another thread could have allocated it; check
			 * again now that we have the struct lock.
			 */
			if (dn->dn_type == DMU_OT_NONE) {
				dnode_allocate(dn, ot, blocksize, 0,
				    bonustype, bonuslen, dn_slots, tx);
				rw_exit(&dn->dn_struct_rwlock);
				dmu_tx_add_new_object(tx, dn);
				dnode_rele(dn, FTAG);
				return (object);
			}
			rw_exit(&dn->dn_struct_rwlock);
			dnode_rele(dn, FTAG);
			DNODE_STAT_BUMP(dnode_alloc_race);
		}

		/*
		 * Skip to next known valid starting point on error. This
		 * is the start of the next block of dnodes.
		 */
		if (dmu_object_next(os, &object, B_TRUE, 0) != 0) {
			object = P2ROUNDUP(object + 1, DNODES_PER_BLOCK);
			DNODE_STAT_BUMP(dnode_alloc_next_block);
		}
		(void) atomic_swap_64(cpuobj, object);
	}
}

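/*
 * Allocate a new object using the default indirect block shift and the
 * minimum dnode size.
 */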
uint64_t
dmu_object_alloc(objset_t *os, dmu_object_type_t ot, int blocksize,
    dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
{
	return (dmu_object_alloc_impl(os, ot, blocksize, 0, bonustype,
	    bonuslen, 0, tx));
}

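/*
 * As dmu_object_alloc(), but the caller also specifies the indirect block
 * shift for the new object.
 */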
uint64_t
dmu_object_alloc_ibs(objset_t *os, dmu_object_type_t ot, int blocksize,
    int indirect_blockshift, dmu_object_type_t bonustype, int bonuslen,
    dmu_tx_t *tx)
{
	return (dmu_object_alloc_impl(os, ot, blocksize, indirect_blockshift,
	    bonustype, bonuslen, 0, tx));
}

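/*
 * As dmu_object_alloc(), but the caller also specifies the dnode size in
 * bytes (0 selects the minimum).
 */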
uint64_t
dmu_object_alloc_dnsize(objset_t *os, dmu_object_type_t ot, int blocksize,
    dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx)
{
	return (dmu_object_alloc_impl(os, ot, blocksize, 0, bonustype,
	    bonuslen, dnodesize, tx));
}

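/*
 * Allocate a specific (caller-chosen) object number using the minimum
 * dnode size.
 */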
int
dmu_object_claim(objset_t *os, uint64_t object, dmu_object_type_t ot,
    int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
{
	return (dmu_object_claim_dnsize(os, object, ot, blocksize, bonustype,
	    bonuslen, 0, tx));
}

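/*
 * Allocate a specific object number with the given dnode size.  The dnode
 * slots backing the object must currently be free.
 */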
int
dmu_object_claim_dnsize(objset_t *os, uint64_t object, dmu_object_type_t ot,
    int blocksize, dmu_object_type_t bonustype, int bonuslen,
    int dnodesize, dmu_tx_t *tx)
{
	dnode_t *dn;
	int dn_slots = dnodesize >> DNODE_SHIFT;
	int err;

	if (dn_slots == 0)
		dn_slots = DNODE_MIN_SLOTS;
	ASSERT3S(dn_slots, >=, DNODE_MIN_SLOTS);
	ASSERT3S(dn_slots, <=, DNODE_MAX_SLOTS);

	if (object == DMU_META_DNODE_OBJECT && !dmu_tx_private_ok(tx))
		return (SET_ERROR(EBADF));

	err = dnode_hold_impl(os, object, DNODE_MUST_BE_FREE, dn_slots,
	    FTAG, &dn);
	if (err)
		return (err);
	dnode_allocate(dn, ot, blocksize, 0, bonustype, bonuslen, dn_slots, tx);
	dmu_tx_add_new_object(tx, dn);

	dnode_rele(dn, FTAG);

	return (0);
}

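/*
 * Wrapper around dmu_object_reclaim_dnsize() that uses the minimum dnode
 * size and does not request that an existing spill block be kept.
 */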
int
dmu_object_reclaim(objset_t *os, uint64_t object, dmu_object_type_t ot,
    int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
{
	return (dmu_object_reclaim_dnsize(os, object, ot, blocksize, bonustype,
	    bonuslen, DNODE_MIN_SIZE, B_FALSE, tx));
}

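/*
 * Reallocate an existing object in place, changing its type, block size,
 * and bonus buffer while preserving its object number.  The meta-dnode
 * itself cannot be reclaimed.
 */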
int
dmu_object_reclaim_dnsize(objset_t *os, uint64_t object, dmu_object_type_t ot,
    int blocksize, dmu_object_type_t bonustype, int bonuslen, int dnodesize,
    boolean_t keep_spill, dmu_tx_t *tx)
{
	dnode_t *dn;
	int dn_slots = dnodesize >> DNODE_SHIFT;
	int err;

	if (dn_slots == 0)
		dn_slots = DNODE_MIN_SLOTS;

	if (object == DMU_META_DNODE_OBJECT)
		return (SET_ERROR(EBADF));

	err = dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, 0,
	    FTAG, &dn);
	if (err)
		return (err);

	dnode_reallocate(dn, ot, blocksize, bonustype, bonuslen, dn_slots,
	    keep_spill, tx);

	dnode_rele(dn, FTAG);
	return (err);
}

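/*
 * Remove the spill block from the given object, if it has one.
 */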
int
dmu_object_rm_spill(objset_t *os, uint64_t object, dmu_tx_t *tx)
{
	dnode_t *dn;
	int err;

	err = dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, 0,
	    FTAG, &dn);
	if (err)
		return (err);

	rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
	if (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR) {
		dbuf_rm_spill(dn, tx);
		dnode_rm_spill(dn, tx);
	}
	rw_exit(&dn->dn_struct_rwlock);

	dnode_rele(dn, FTAG);
	return (err);
}

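/*
 * Free an allocated object: release all of its storage and free its dnode,
 * making the object number available for reuse.
 */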
int
dmu_object_free(objset_t *os, uint64_t object, dmu_tx_t *tx)
{
	dnode_t *dn;
	int err;

	ASSERT(object != DMU_META_DNODE_OBJECT || dmu_tx_private_ok(tx));

	err = dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, 0,
	    FTAG, &dn);
	if (err)
		return (err);

	ASSERT(dn->dn_type != DMU_OT_NONE);
	/*
	 * If we don't create this free range, we'll leak indirect blocks when
	 * we get to freeing the dnode in syncing context.
	 */
	dnode_free_range(dn, 0, DMU_OBJECT_END, tx);
	dnode_free(dn, tx);
	dnode_rele(dn, FTAG);

	return (0);
}

/*
 * Return (in *objectp) the next object which is allocated (or a hole)
 * after *object, taking into account only objects that may have been modified
 * after the specified txg.
 */
int
dmu_object_next(objset_t *os, uint64_t *objectp, boolean_t hole, uint64_t txg)
{
	uint64_t offset;
	uint64_t start_obj;
	struct dsl_dataset *ds = os->os_dsl_dataset;
	int error;

	if (*objectp == 0) {
		start_obj = 1;
	} else if (ds && ds->ds_feature_inuse[SPA_FEATURE_LARGE_DNODE]) {
		uint64_t i = *objectp + 1;
		uint64_t last_obj = *objectp | (DNODES_PER_BLOCK - 1);
		dmu_object_info_t doi;

		/*
		 * Scan through the remaining meta dnode block. The contents
		 * of each slot in the block are known so it can be quickly
		 * checked. If the block is exhausted without a match then
		 * hand off to dnode_next_offset() for further scanning.
		 */
		while (i <= last_obj) {
			error = dmu_object_info(os, i, &doi);
			if (error == ENOENT) {
				if (hole) {
					*objectp = i;
					return (0);
				} else {
					i++;
				}
			} else if (error == EEXIST) {
				i++;
			} else if (error == 0) {
				if (hole) {
					i += doi.doi_dnodesize >> DNODE_SHIFT;
				} else {
					*objectp = i;
					return (0);
				}
			} else {
				return (error);
			}
		}

		start_obj = i;
	} else {
		start_obj = *objectp + 1;
	}

	offset = start_obj << DNODE_SHIFT;

	error = dnode_next_offset(DMU_META_DNODE(os),
	    (hole ? DNODE_FIND_HOLE : 0), &offset, 0, DNODES_PER_BLOCK, txg);

	*objectp = offset >> DNODE_SHIFT;

	return (error);
}

/*
 * Turn this object from old_type into DMU_OTN_ZAP_METADATA, and bump the
 * refcount on SPA_FEATURE_EXTENSIBLE_DATASET.
 *
 * Only for use from syncing context, on MOS objects.
 */
void
dmu_object_zapify(objset_t *mos, uint64_t object, dmu_object_type_t old_type,
    dmu_tx_t *tx)
{
	dnode_t *dn;

	ASSERT(dmu_tx_is_syncing(tx));

	VERIFY0(dnode_hold(mos, object, FTAG, &dn));
	if (dn->dn_type == DMU_OTN_ZAP_METADATA) {
		dnode_rele(dn, FTAG);
		return;
	}
	ASSERT3U(dn->dn_type, ==, old_type);
	ASSERT0(dn->dn_maxblkid);

	/*
	 * We must initialize the ZAP data before changing the type,
	 * so that concurrent calls to *_is_zapified() can determine if
	 * the object has been completely zapified by checking the type.
	 */
	mzap_create_impl(mos, object, 0, 0, tx);

	dn->dn_next_type[tx->tx_txg & TXG_MASK] = dn->dn_type =
	    DMU_OTN_ZAP_METADATA;
	dnode_setdirty(dn, tx);
	dnode_rele(dn, FTAG);

	spa_feature_incr(dmu_objset_spa(mos),
	    SPA_FEATURE_EXTENSIBLE_DATASET, tx);
}

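/*
 * Free an object that may have been zapified, dropping the refcount on
 * SPA_FEATURE_EXTENSIBLE_DATASET taken by dmu_object_zapify() if it was.
 */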
void
dmu_object_free_zapified(objset_t *mos, uint64_t object, dmu_tx_t *tx)
{
	dnode_t *dn;
	dmu_object_type_t t;

	ASSERT(dmu_tx_is_syncing(tx));

	VERIFY0(dnode_hold(mos, object, FTAG, &dn));
	t = dn->dn_type;
	dnode_rele(dn, FTAG);

	if (t == DMU_OTN_ZAP_METADATA) {
		spa_feature_decr(dmu_objset_spa(mos),
		    SPA_FEATURE_EXTENSIBLE_DATASET, tx);
	}
	VERIFY0(dmu_object_free(mos, object, tx));
}