xref: /illumos-gate/usr/src/uts/common/fs/zfs/dmu_object.c (revision 54811da5)
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2013, 2017 by Delphix. All rights reserved.
 * Copyright 2014 HybridCluster. All rights reserved.
 */

#include <sys/dmu.h>
#include <sys/dmu_objset.h>
#include <sys/dmu_tx.h>
#include <sys/dnode.h>
#include <sys/zap.h>
#include <sys/zfeature.h>
#include <sys/dsl_dataset.h>

/*
 * Each of the concurrent object allocators will grab
 * 2^dmu_object_alloc_chunk_shift dnode slots at a time.  The default is to
 * grab 128 slots, which is 4 blocks worth.  This was experimentally
 * determined to be the lowest value that eliminates the measurable effect
 * of lock contention from this code path.
 */
int dmu_object_alloc_chunk_shift = 7;

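/*
 * Common guts of the dmu_object_alloc*() variants below: find a free
 * object number in the objset, allocate its dnode with the requested
 * type, block size, indirect block shift, bonus layout and dnode size,
 * and return the new object number.  Object numbers are handed out from
 * per-CPU chunks of dnode slots, so concurrent allocators only contend
 * on os_obj_lock when a chunk is exhausted.
 */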
static uint64_t
dmu_object_alloc_impl(objset_t *os, dmu_object_type_t ot, int blocksize,
    int indirect_blockshift, dmu_object_type_t bonustype, int bonuslen,
    int dnodesize, dmu_tx_t *tx)
{
	uint64_t object;
	uint64_t L1_dnode_count = DNODES_PER_BLOCK <<
	    (DMU_META_DNODE(os)->dn_indblkshift - SPA_BLKPTRSHIFT);
	dnode_t *dn = NULL;
	int dn_slots = dnodesize >> DNODE_SHIFT;
	boolean_t restarted = B_FALSE;
	uint64_t *cpuobj = &os->os_obj_next_percpu[CPU_SEQID %
	    os->os_obj_next_percpu_len];
	int dnodes_per_chunk = 1 << dmu_object_alloc_chunk_shift;
	int error;

	if (dn_slots == 0) {
		dn_slots = DNODE_MIN_SLOTS;
	} else {
		ASSERT3S(dn_slots, >=, DNODE_MIN_SLOTS);
		ASSERT3S(dn_slots, <=, DNODE_MAX_SLOTS);
	}

	/*
	 * The "chunk" of dnodes that is assigned to a CPU-specific
	 * allocator needs to be at least one block's worth, to avoid
	 * lock contention on the dbuf.  It can be at most one L1 block's
	 * worth, so that the "rescan after polishing off an L1's worth"
	 * logic below will be sure to kick in.
	 */
	if (dnodes_per_chunk < DNODES_PER_BLOCK)
		dnodes_per_chunk = DNODES_PER_BLOCK;
	if (dnodes_per_chunk > L1_dnode_count)
		dnodes_per_chunk = L1_dnode_count;

	object = *cpuobj;

	for (;;) {
		/*
		 * If we finished a chunk of dnodes, get a new one from
		 * the global allocator.
		 */
		if ((P2PHASE(object, dnodes_per_chunk) == 0) ||
		    (P2PHASE(object + dn_slots - 1, dnodes_per_chunk) <
		    dn_slots)) {
			DNODE_STAT_BUMP(dnode_alloc_next_chunk);
			mutex_enter(&os->os_obj_lock);
			ASSERT0(P2PHASE(os->os_obj_next_chunk,
			    dnodes_per_chunk));
			object = os->os_obj_next_chunk;

			/*
			 * Each time we polish off an L1 bp worth of dnodes
			 * (2^12 objects), move to another L1 bp that's
			 * still reasonably sparse (at most 1/4 full). Look
			 * from the beginning at most once per txg. If we
			 * still can't allocate from that L1 block, search
			 * for an empty L0 block, which will quickly skip
			 * to the end of the metadnode if no nearby L0
			 * blocks are empty. This fallback avoids a
			 * pathology where full dnode blocks containing
			 * large dnodes appear sparse because they have a
			 * low blk_fill, leading to many failed allocation
			 * attempts. In the long term a better mechanism to
			 * search for sparse metadnode regions, such as
			 * spacemaps, could be implemented.
			 *
			 * os_rescan_dnodes is set during txg sync if enough
			 * objects have been freed since the previous
			 * rescan to justify backfilling again.
			 *
			 * Note that dmu_traverse depends on the behavior
			 * that we use multiple blocks of the dnode object
			 * before going back to reuse objects. Any change
			 * to this algorithm should preserve that property
			 * or find another solution to the issues described
			 * in traverse_visitbp.
			 */
			if (P2PHASE(object, L1_dnode_count) == 0) {
				uint64_t offset;
				uint64_t blkfill;
				int minlvl;
				if (os->os_rescan_dnodes) {
					offset = 0;
					os->os_rescan_dnodes = B_FALSE;
				} else {
					offset = object << DNODE_SHIFT;
				}
				blkfill = restarted ? 1 : DNODES_PER_BLOCK >> 2;
				minlvl = restarted ? 1 : 2;
				restarted = B_TRUE;
				error = dnode_next_offset(DMU_META_DNODE(os),
				    DNODE_FIND_HOLE, &offset, minlvl,
				    blkfill, 0);
				if (error == 0) {
					object = offset >> DNODE_SHIFT;
				}
			}
			/*
			 * Note: if "restarted", we may find an L0 that
			 * is not suitably aligned.
			 */
			os->os_obj_next_chunk =
			    P2ALIGN(object, dnodes_per_chunk) +
			    dnodes_per_chunk;
			(void) atomic_swap_64(cpuobj, object);
			mutex_exit(&os->os_obj_lock);
		}

		/*
		 * The value of (*cpuobj) before adding dn_slots is the object
		 * ID assigned to us.  The value afterwards is the object ID
		 * assigned to whoever wants to do an allocation next.
		 */
		object = atomic_add_64_nv(cpuobj, dn_slots) - dn_slots;

		/*
		 * XXX We should check for an i/o error here and return
		 * up to our caller.  Actually we should pre-read it in
		 * dmu_tx_assign(), but there is currently no mechanism
		 * to do so.
		 */
		error = dnode_hold_impl(os, object, DNODE_MUST_BE_FREE,
		    dn_slots, FTAG, &dn);
		if (error == 0) {
			rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
			/*
			 * Another thread could have allocated it; check
			 * again now that we have the struct lock.
			 */
			if (dn->dn_type == DMU_OT_NONE) {
				dnode_allocate(dn, ot, blocksize,
				    indirect_blockshift, bonustype,
				    bonuslen, dn_slots, tx);
				rw_exit(&dn->dn_struct_rwlock);
				dmu_tx_add_new_object(tx, dn);
				dnode_rele(dn, FTAG);
				return (object);
			}
			rw_exit(&dn->dn_struct_rwlock);
			dnode_rele(dn, FTAG);
			DNODE_STAT_BUMP(dnode_alloc_race);
		}

		/*
		 * Skip to next known valid starting point on error. This
		 * is the start of the next block of dnodes.
		 */
		if (dmu_object_next(os, &object, B_TRUE, 0) != 0) {
			object = P2ROUNDUP(object + 1, DNODES_PER_BLOCK);
			DNODE_STAT_BUMP(dnode_alloc_next_block);
		}
		(void) atomic_swap_64(cpuobj, object);
	}
}

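/*
 * Allocate a new object in this objset using the default indirect block
 * shift and dnode size.  The allocation is recorded in the given
 * (already-assigned) transaction.
 *
 * Illustrative calling sequence (a sketch of typical DMU usage, not code
 * taken from this file):
 *
 *	dmu_tx_t *tx = dmu_tx_create(os);
 *	dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
 *	error = dmu_tx_assign(tx, TXG_WAIT);
 *	if (error != 0) {
 *		dmu_tx_abort(tx);
 *		return (error);
 *	}
 *	object = dmu_object_alloc(os, DMU_OT_PLAIN_FILE_CONTENTS, 0,
 *	    DMU_OT_NONE, 0, tx);
 *	dmu_tx_commit(tx);
 */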
uint64_t
dmu_object_alloc(objset_t *os, dmu_object_type_t ot, int blocksize,
    dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
{
	return (dmu_object_alloc_impl(os, ot, blocksize, 0, bonustype,
	    bonuslen, 0, tx));
}

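/*
 * As dmu_object_alloc(), but the caller also chooses the indirect block
 * shift of the new object.
 */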
uint64_t
dmu_object_alloc_ibs(objset_t *os, dmu_object_type_t ot, int blocksize,
    int indirect_blockshift, dmu_object_type_t bonustype, int bonuslen,
    dmu_tx_t *tx)
{
	return (dmu_object_alloc_impl(os, ot, blocksize, indirect_blockshift,
	    bonustype, bonuslen, 0, tx));
}

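/*
 * As dmu_object_alloc(), but the caller also chooses the dnode size (in
 * bytes); a dnodesize of 0 selects the minimum supported dnode size.
 */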
uint64_t
dmu_object_alloc_dnsize(objset_t *os, dmu_object_type_t ot, int blocksize,
    dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx)
{
	return (dmu_object_alloc_impl(os, ot, blocksize, 0, bonustype,
	    bonuslen, dnodesize, tx));
}

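/*
 * Allocate a specific, caller-chosen object number, which must currently
 * be free.  Returns 0 on success, or an error if the object cannot be
 * claimed (for example, because it is already allocated).
 */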
int
dmu_object_claim(objset_t *os, uint64_t object, dmu_object_type_t ot,
    int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
{
	return (dmu_object_claim_dnsize(os, object, ot, blocksize, bonustype,
	    bonuslen, 0, tx));
}

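/*
 * As dmu_object_claim(), but the caller also chooses the dnode size for
 * the claimed object.
 */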
int
dmu_object_claim_dnsize(objset_t *os, uint64_t object, dmu_object_type_t ot,
    int blocksize, dmu_object_type_t bonustype, int bonuslen,
    int dnodesize, dmu_tx_t *tx)
{
	dnode_t *dn;
	int dn_slots = dnodesize >> DNODE_SHIFT;
	int err;

	if (dn_slots == 0)
		dn_slots = DNODE_MIN_SLOTS;
	ASSERT3S(dn_slots, >=, DNODE_MIN_SLOTS);
	ASSERT3S(dn_slots, <=, DNODE_MAX_SLOTS);

	if (object == DMU_META_DNODE_OBJECT && !dmu_tx_private_ok(tx))
		return (SET_ERROR(EBADF));

	err = dnode_hold_impl(os, object, DNODE_MUST_BE_FREE, dn_slots,
	    FTAG, &dn);
	if (err)
		return (err);
	dnode_allocate(dn, ot, blocksize, 0, bonustype, bonuslen, dn_slots, tx);
	dmu_tx_add_new_object(tx, dn);

	dnode_rele(dn, FTAG);

	return (0);
}

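/*
 * Change the type, block size, bonus type and bonus length of an
 * already-allocated object, keeping its object number.
 */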
int
dmu_object_reclaim(objset_t *os, uint64_t object, dmu_object_type_t ot,
    int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
{
	return (dmu_object_reclaim_dnsize(os, object, ot, blocksize, bonustype,
	    bonuslen, 0, tx));
}

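/*
 * As dmu_object_reclaim(), but the caller also chooses the dnode size
 * for the reallocated object.
 */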
int
dmu_object_reclaim_dnsize(objset_t *os, uint64_t object, dmu_object_type_t ot,
    int blocksize, dmu_object_type_t bonustype, int bonuslen, int dnodesize,
    dmu_tx_t *tx)
{
	dnode_t *dn;
	int dn_slots = dnodesize >> DNODE_SHIFT;
	int err;

	if (object == DMU_META_DNODE_OBJECT)
		return (SET_ERROR(EBADF));

	err = dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, 0,
	    FTAG, &dn);
	if (err)
		return (err);

	dnode_reallocate(dn, ot, blocksize, bonustype, bonuslen, dn_slots, tx);

	dnode_rele(dn, FTAG);
	return (err);
}

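/*
 * Free the specified object: create a free range covering all of its
 * data (so no indirect blocks are leaked), then free the dnode itself
 * in the given transaction.
 */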
int
dmu_object_free(objset_t *os, uint64_t object, dmu_tx_t *tx)
{
	dnode_t *dn;
	int err;

	ASSERT(object != DMU_META_DNODE_OBJECT || dmu_tx_private_ok(tx));

	err = dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, 0,
	    FTAG, &dn);
	if (err)
		return (err);

	ASSERT(dn->dn_type != DMU_OT_NONE);
	/*
	 * If we don't create this free range, we'll leak indirect blocks when
	 * we get to freeing the dnode in syncing context.
	 */
	dnode_free_range(dn, 0, DMU_OBJECT_END, tx);
	dnode_free(dn, tx);
	dnode_rele(dn, FTAG);

	return (0);
}

/*
 * Return (in *objectp) the next object which is allocated (or a hole)
 * after *object, taking into account only objects that may have been modified
 * after the specified txg.
 */
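/*
 * Illustrative use (a sketch, not code from this file): callers commonly
 * walk the allocated objects of an objset with a loop of the form
 *
 *	for (obj = 0; err == 0; err = dmu_object_next(os, &obj, B_FALSE, 0))
 *		...
 *
 * starting at object 0 and stopping when no further object is found.
 */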
int
dmu_object_next(objset_t *os, uint64_t *objectp, boolean_t hole, uint64_t txg)
{
	uint64_t offset;
	uint64_t start_obj;
	struct dsl_dataset *ds = os->os_dsl_dataset;
	int error;

	if (*objectp == 0) {
		start_obj = 1;
	} else if (ds && ds->ds_feature_inuse[SPA_FEATURE_LARGE_DNODE]) {
		uint64_t i = *objectp + 1;
		uint64_t last_obj = *objectp | (DNODES_PER_BLOCK - 1);
		dmu_object_info_t doi;

		/*
		 * Scan through the remaining meta dnode block. The contents
		 * of each slot in the block are known so it can be quickly
		 * checked. If the block is exhausted without a match then
		 * hand off to dnode_next_offset() for further scanning.
		 */
		while (i <= last_obj) {
			error = dmu_object_info(os, i, &doi);
			if (error == ENOENT) {
				if (hole) {
					*objectp = i;
					return (0);
				} else {
					i++;
				}
			} else if (error == EEXIST) {
				i++;
			} else if (error == 0) {
				if (hole) {
					i += doi.doi_dnodesize >> DNODE_SHIFT;
				} else {
					*objectp = i;
					return (0);
				}
			} else {
				return (error);
			}
		}

		start_obj = i;
	} else {
		start_obj = *objectp + 1;
	}

	offset = start_obj << DNODE_SHIFT;

	error = dnode_next_offset(DMU_META_DNODE(os),
	    (hole ? DNODE_FIND_HOLE : 0), &offset, 0, DNODES_PER_BLOCK, txg);

	*objectp = offset >> DNODE_SHIFT;

	return (error);
}

/*
 * Turn this object from old_type into DMU_OTN_ZAP_METADATA, and bump the
 * refcount on SPA_FEATURE_EXTENSIBLE_DATASET.
 *
 * Only for use from syncing context, on MOS objects.
 */
void
dmu_object_zapify(objset_t *mos, uint64_t object, dmu_object_type_t old_type,
    dmu_tx_t *tx)
{
	dnode_t *dn;

	ASSERT(dmu_tx_is_syncing(tx));

	VERIFY0(dnode_hold(mos, object, FTAG, &dn));
	if (dn->dn_type == DMU_OTN_ZAP_METADATA) {
		dnode_rele(dn, FTAG);
		return;
	}
	ASSERT3U(dn->dn_type, ==, old_type);
	ASSERT0(dn->dn_maxblkid);

	/*
	 * We must initialize the ZAP data before changing the type,
	 * so that concurrent calls to *_is_zapified() can determine if
	 * the object has been completely zapified by checking the type.
	 */
	mzap_create_impl(mos, object, 0, 0, tx);

	dn->dn_next_type[tx->tx_txg & TXG_MASK] = dn->dn_type =
	    DMU_OTN_ZAP_METADATA;
	dnode_setdirty(dn, tx);
	dnode_rele(dn, FTAG);

	spa_feature_incr(dmu_objset_spa(mos),
	    SPA_FEATURE_EXTENSIBLE_DATASET, tx);
}

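/*
 * Free a MOS object that may have been zapified by dmu_object_zapify();
 * if it was, also drop the SPA_FEATURE_EXTENSIBLE_DATASET refcount taken
 * when it was zapified.  Only for use from syncing context.
 */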
void
dmu_object_free_zapified(objset_t *mos, uint64_t object, dmu_tx_t *tx)
{
	dnode_t *dn;
	dmu_object_type_t t;

	ASSERT(dmu_tx_is_syncing(tx));

	VERIFY0(dnode_hold(mos, object, FTAG, &dn));
	t = dn->dn_type;
	dnode_rele(dn, FTAG);

	if (t == DMU_OTN_ZAP_METADATA) {
		spa_feature_decr(dmu_objset_spa(mos),
		    SPA_FEATURE_EXTENSIBLE_DATASET, tx);
	}
	VERIFY0(dmu_object_free(mos, object, tx));
}
437