1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 23 * Copyright (c) 2013, 2017 by Delphix. All rights reserved. 24 * Copyright 2014 HybridCluster. All rights reserved. 25 */ 26 27 #include <sys/dmu.h> 28 #include <sys/dmu_objset.h> 29 #include <sys/dmu_tx.h> 30 #include <sys/dnode.h> 31 #include <sys/zap.h> 32 #include <sys/zfeature.h> 33 #include <sys/dsl_dataset.h> 34 35 /* 36 * Each of the concurrent object allocators will grab 37 * 2^dmu_object_alloc_chunk_shift dnode slots at a time. The default is to 38 * grab 128 slots, which is 4 blocks worth. This was experimentally 39 * determined to be the lowest value that eliminates the measurable effect 40 * of lock contention from this code path. 
41 */ 42 int dmu_object_alloc_chunk_shift = 7; 43 44 static uint64_t 45 dmu_object_alloc_impl(objset_t *os, dmu_object_type_t ot, int blocksize, 46 int indirect_blockshift, dmu_object_type_t bonustype, int bonuslen, 47 int dnodesize, dmu_tx_t *tx) 48 { 49 uint64_t object; 50 uint64_t L1_dnode_count = DNODES_PER_BLOCK << 51 (DMU_META_DNODE(os)->dn_indblkshift - SPA_BLKPTRSHIFT); 52 dnode_t *dn = NULL; 53 int dn_slots = dnodesize >> DNODE_SHIFT; 54 boolean_t restarted = B_FALSE; 55 uint64_t *cpuobj = &os->os_obj_next_percpu[CPU_SEQID % 56 os->os_obj_next_percpu_len]; 57 int dnodes_per_chunk = 1 << dmu_object_alloc_chunk_shift; 58 int error; 59 60 if (dn_slots == 0) { 61 dn_slots = DNODE_MIN_SLOTS; 62 } else { 63 ASSERT3S(dn_slots, >=, DNODE_MIN_SLOTS); 64 ASSERT3S(dn_slots, <=, DNODE_MAX_SLOTS); 65 } 66 67 /* 68 * The "chunk" of dnodes that is assigned to a CPU-specific 69 * allocator needs to be at least one block's worth, to avoid 70 * lock contention on the dbuf. It can be at most one L1 block's 71 * worth, so that the "rescan after polishing off a L1's worth" 72 * logic below will be sure to kick in. 73 */ 74 if (dnodes_per_chunk < DNODES_PER_BLOCK) 75 dnodes_per_chunk = DNODES_PER_BLOCK; 76 if (dnodes_per_chunk > L1_dnode_count) 77 dnodes_per_chunk = L1_dnode_count; 78 79 object = *cpuobj; 80 81 for (;;) { 82 /* 83 * If we finished a chunk of dnodes, get a new one from 84 * the global allocator. 85 */ 86 if ((P2PHASE(object, dnodes_per_chunk) == 0) || 87 (P2PHASE(object + dn_slots - 1, dnodes_per_chunk) < 88 dn_slots)) { 89 DNODE_STAT_BUMP(dnode_alloc_next_chunk); 90 mutex_enter(&os->os_obj_lock); 91 ASSERT0(P2PHASE(os->os_obj_next_chunk, 92 dnodes_per_chunk)); 93 object = os->os_obj_next_chunk; 94 95 /* 96 * Each time we polish off a L1 bp worth of dnodes 97 * (2^12 objects), move to another L1 bp that's 98 * still reasonably sparse (at most 1/4 full). Look 99 * from the beginning at most once per txg. 
If we 100 * still can't allocate from that L1 block, search 101 * for an empty L0 block, which will quickly skip 102 * to the end of the metadnode if the no nearby L0 103 * blocks are empty. This fallback avoids a 104 * pathology where full dnode blocks containing 105 * large dnodes appear sparse because they have a 106 * low blk_fill, leading to many failed allocation 107 * attempts. In the long term a better mechanism to 108 * search for sparse metadnode regions, such as 109 * spacemaps, could be implemented. 110 * 111 * os_scan_dnodes is set during txg sync if enough 112 * objects have been freed since the previous 113 * rescan to justify backfilling again. 114 * 115 * Note that dmu_traverse depends on the behavior 116 * that we use multiple blocks of the dnode object 117 * before going back to reuse objects. Any change 118 * to this algorithm should preserve that property 119 * or find another solution to the issues described 120 * in traverse_visitbp. 121 */ 122 if (P2PHASE(object, L1_dnode_count) == 0) { 123 uint64_t offset; 124 uint64_t blkfill; 125 int minlvl; 126 if (os->os_rescan_dnodes) { 127 offset = 0; 128 os->os_rescan_dnodes = B_FALSE; 129 } else { 130 offset = object << DNODE_SHIFT; 131 } 132 blkfill = restarted ? 1 : DNODES_PER_BLOCK >> 2; 133 minlvl = restarted ? 1 : 2; 134 restarted = B_TRUE; 135 error = dnode_next_offset(DMU_META_DNODE(os), 136 DNODE_FIND_HOLE, &offset, minlvl, 137 blkfill, 0); 138 if (error == 0) { 139 object = offset >> DNODE_SHIFT; 140 } 141 } 142 /* 143 * Note: if "restarted", we may find a L0 that 144 * is not suitably aligned. 145 */ 146 os->os_obj_next_chunk = 147 P2ALIGN(object, dnodes_per_chunk) + 148 dnodes_per_chunk; 149 (void) atomic_swap_64(cpuobj, object); 150 mutex_exit(&os->os_obj_lock); 151 } 152 153 /* 154 * The value of (*cpuobj) before adding dn_slots is the object 155 * ID assigned to us. The value afterwards is the object ID 156 * assigned to whoever wants to do an allocation next. 
157 */ 158 object = atomic_add_64_nv(cpuobj, dn_slots) - dn_slots; 159 160 /* 161 * XXX We should check for an i/o error here and return 162 * up to our caller. Actually we should pre-read it in 163 * dmu_tx_assign(), but there is currently no mechanism 164 * to do so. 165 */ 166 error = dnode_hold_impl(os, object, DNODE_MUST_BE_FREE, 167 dn_slots, FTAG, &dn); 168 if (error == 0) { 169 rw_enter(&dn->dn_struct_rwlock, RW_WRITER); 170 /* 171 * Another thread could have allocated it; check 172 * again now that we have the struct lock. 173 */ 174 if (dn->dn_type == DMU_OT_NONE) { 175 dnode_allocate(dn, ot, blocksize, 0, 176 bonustype, bonuslen, dn_slots, tx); 177 rw_exit(&dn->dn_struct_rwlock); 178 dmu_tx_add_new_object(tx, dn); 179 dnode_rele(dn, FTAG); 180 return (object); 181 } 182 rw_exit(&dn->dn_struct_rwlock); 183 dnode_rele(dn, FTAG); 184 DNODE_STAT_BUMP(dnode_alloc_race); 185 } 186 187 /* 188 * Skip to next known valid starting point on error. This 189 * is the start of the next block of dnodes. 
190 */ 191 if (dmu_object_next(os, &object, B_TRUE, 0) != 0) { 192 object = P2ROUNDUP(object + 1, DNODES_PER_BLOCK); 193 DNODE_STAT_BUMP(dnode_alloc_next_block); 194 } 195 (void) atomic_swap_64(cpuobj, object); 196 } 197 } 198 199 uint64_t 200 dmu_object_alloc(objset_t *os, dmu_object_type_t ot, int blocksize, 201 dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) 202 { 203 return (dmu_object_alloc_impl(os, ot, blocksize, 0, bonustype, 204 bonuslen, 0, tx)); 205 } 206 207 uint64_t 208 dmu_object_alloc_ibs(objset_t *os, dmu_object_type_t ot, int blocksize, 209 int indirect_blockshift, dmu_object_type_t bonustype, int bonuslen, 210 dmu_tx_t *tx) 211 { 212 return (dmu_object_alloc_impl(os, ot, blocksize, indirect_blockshift, 213 bonustype, bonuslen, 0, tx)); 214 } 215 216 uint64_t 217 dmu_object_alloc_dnsize(objset_t *os, dmu_object_type_t ot, int blocksize, 218 dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx) 219 { 220 return (dmu_object_alloc_impl(os, ot, blocksize, 0, bonustype, 221 bonuslen, dnodesize, tx)); 222 } 223 224 int 225 dmu_object_claim(objset_t *os, uint64_t object, dmu_object_type_t ot, 226 int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) 227 { 228 return (dmu_object_claim_dnsize(os, object, ot, blocksize, bonustype, 229 bonuslen, 0, tx)); 230 } 231 232 int 233 dmu_object_claim_dnsize(objset_t *os, uint64_t object, dmu_object_type_t ot, 234 int blocksize, dmu_object_type_t bonustype, int bonuslen, 235 int dnodesize, dmu_tx_t *tx) 236 { 237 dnode_t *dn; 238 int dn_slots = dnodesize >> DNODE_SHIFT; 239 int err; 240 241 if (dn_slots == 0) 242 dn_slots = DNODE_MIN_SLOTS; 243 ASSERT3S(dn_slots, >=, DNODE_MIN_SLOTS); 244 ASSERT3S(dn_slots, <=, DNODE_MAX_SLOTS); 245 246 if (object == DMU_META_DNODE_OBJECT && !dmu_tx_private_ok(tx)) 247 return (SET_ERROR(EBADF)); 248 249 err = dnode_hold_impl(os, object, DNODE_MUST_BE_FREE, dn_slots, 250 FTAG, &dn); 251 if (err) 252 return (err); 253 dnode_allocate(dn, ot, 
blocksize, 0, bonustype, bonuslen, dn_slots, tx); 254 dmu_tx_add_new_object(tx, dn); 255 256 dnode_rele(dn, FTAG); 257 258 return (0); 259 } 260 261 int 262 dmu_object_reclaim(objset_t *os, uint64_t object, dmu_object_type_t ot, 263 int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) 264 { 265 return (dmu_object_reclaim_dnsize(os, object, ot, blocksize, bonustype, 266 bonuslen, 0, tx)); 267 } 268 269 int 270 dmu_object_reclaim_dnsize(objset_t *os, uint64_t object, dmu_object_type_t ot, 271 int blocksize, dmu_object_type_t bonustype, int bonuslen, int dnodesize, 272 dmu_tx_t *tx) 273 { 274 dnode_t *dn; 275 int dn_slots = dnodesize >> DNODE_SHIFT; 276 int err; 277 278 if (object == DMU_META_DNODE_OBJECT) 279 return (SET_ERROR(EBADF)); 280 281 err = dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, 0, 282 FTAG, &dn); 283 if (err) 284 return (err); 285 286 dnode_reallocate(dn, ot, blocksize, bonustype, bonuslen, dn_slots, tx); 287 288 dnode_rele(dn, FTAG); 289 return (err); 290 } 291 292 int 293 dmu_object_free(objset_t *os, uint64_t object, dmu_tx_t *tx) 294 { 295 dnode_t *dn; 296 int err; 297 298 ASSERT(object != DMU_META_DNODE_OBJECT || dmu_tx_private_ok(tx)); 299 300 err = dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, 0, 301 FTAG, &dn); 302 if (err) 303 return (err); 304 305 ASSERT(dn->dn_type != DMU_OT_NONE); 306 /* 307 * If we don't create this free range, we'll leak indirect blocks when 308 * we get to freeing the dnode in syncing context. 309 */ 310 dnode_free_range(dn, 0, DMU_OBJECT_END, tx); 311 dnode_free(dn, tx); 312 dnode_rele(dn, FTAG); 313 314 return (0); 315 } 316 317 /* 318 * Return (in *objectp) the next object which is allocated (or a hole) 319 * after *object, taking into account only objects that may have been modified 320 * after the specified txg. 
 */
int
dmu_object_next(objset_t *os, uint64_t *objectp, boolean_t hole, uint64_t txg)
{
	uint64_t offset;
	uint64_t start_obj;
	struct dsl_dataset *ds = os->os_dsl_dataset;
	int error;

	if (*objectp == 0) {
		/* Object 0 is the start; begin scanning at object 1. */
		start_obj = 1;
	} else if (ds && ds->ds_feature_inuse[SPA_FEATURE_LARGE_DNODE]) {
		uint64_t i = *objectp + 1;
		uint64_t last_obj = *objectp | (DNODES_PER_BLOCK - 1);
		dmu_object_info_t doi;

		/*
		 * Scan through the remaining meta dnode block.  The contents
		 * of each slot in the block are known so it can be quickly
		 * checked.  If the block is exhausted without a match then
		 * hand off to dnode_next_offset() for further scanning.
		 */
		while (i <= last_obj) {
			error = dmu_object_info(os, i, &doi);
			if (error == ENOENT) {
				/* Slot is free. */
				if (hole) {
					*objectp = i;
					return (0);
				} else {
					i++;
				}
			} else if (error == EEXIST) {
				/*
				 * NOTE(review): presumably an interior slot
				 * of a multi-slot (large) dnode — confirm
				 * against dnode_hold_impl()'s contract.
				 */
				i++;
			} else if (error == 0) {
				/*
				 * Slot holds an allocated dnode; skip all of
				 * its slots when searching for a hole.
				 */
				if (hole) {
					i += doi.doi_dnodesize >> DNODE_SHIFT;
				} else {
					*objectp = i;
					return (0);
				}
			} else {
				/* Unexpected error; propagate to caller. */
				return (error);
			}
		}

		start_obj = i;
	} else {
		start_obj = *objectp + 1;
	}

	offset = start_obj << DNODE_SHIFT;

	error = dnode_next_offset(DMU_META_DNODE(os),
	    (hole ? DNODE_FIND_HOLE : 0), &offset, 0, DNODES_PER_BLOCK, txg);

	*objectp = offset >> DNODE_SHIFT;

	return (error);
}

/*
 * Turn this object from old_type into DMU_OTN_ZAP_METADATA, and bump the
 * refcount on SPA_FEATURE_EXTENSIBLE_DATASET.
 *
 * Only for use from syncing context, on MOS objects.
 */
void
dmu_object_zapify(objset_t *mos, uint64_t object, dmu_object_type_t old_type,
    dmu_tx_t *tx)
{
	dnode_t *dn;

	ASSERT(dmu_tx_is_syncing(tx));

	VERIFY0(dnode_hold(mos, object, FTAG, &dn));
	if (dn->dn_type == DMU_OTN_ZAP_METADATA) {
		/* Already zapified; nothing to do. */
		dnode_rele(dn, FTAG);
		return;
	}
	ASSERT3U(dn->dn_type, ==, old_type);
	ASSERT0(dn->dn_maxblkid);

	/*
	 * We must initialize the ZAP data before changing the type,
	 * so that concurrent calls to *_is_zapified() can determine if
	 * the object has been completely zapified by checking the type.
	 */
	mzap_create_impl(mos, object, 0, 0, tx);

	dn->dn_next_type[tx->tx_txg & TXG_MASK] = dn->dn_type =
	    DMU_OTN_ZAP_METADATA;
	dnode_setdirty(dn, tx);
	dnode_rele(dn, FTAG);

	spa_feature_incr(dmu_objset_spa(mos),
	    SPA_FEATURE_EXTENSIBLE_DATASET, tx);
}

/*
 * Free a MOS object, decrementing SPA_FEATURE_EXTENSIBLE_DATASET first if
 * the object had been zapified by dmu_object_zapify().
 *
 * Only for use from syncing context (asserted below).
 */
void
dmu_object_free_zapified(objset_t *mos, uint64_t object, dmu_tx_t *tx)
{
	dnode_t *dn;
	dmu_object_type_t t;

	ASSERT(dmu_tx_is_syncing(tx));

	VERIFY0(dnode_hold(mos, object, FTAG, &dn));
	t = dn->dn_type;
	dnode_rele(dn, FTAG);

	if (t == DMU_OTN_ZAP_METADATA) {
		spa_feature_decr(dmu_objset_spa(mos),
		    SPA_FEATURE_EXTENSIBLE_DATASET, tx);
	}
	VERIFY0(dmu_object_free(mos, object, tx));
}