/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2012 by Delphix. All rights reserved.
 */

#include <sys/bpobj.h>
#include <sys/zfs_context.h>
#include <sys/refcount.h>
#include <sys/dsl_pool.h>
#include <sys/zfeature.h>
#include <sys/zap.h>

/*
 * Return an empty bpobj, preferably the empty dummy one (dp_empty_bpobj).
 *
 * If the EMPTY_BPOBJ feature is enabled, all callers share the pool-wide
 * dummy object (creating it and recording it in the pool directory ZAP on
 * first use), and the feature's refcount tracks how many references to it
 * are outstanding.  Otherwise a fresh, ordinary bpobj is allocated.
 */
uint64_t
bpobj_alloc_empty(objset_t *os, int blocksize, dmu_tx_t *tx)
{
	zfeature_info_t *empty_bpobj_feat =
	    &spa_feature_table[SPA_FEATURE_EMPTY_BPOBJ];
	spa_t *spa = dmu_objset_spa(os);
	dsl_pool_t *dp = dmu_objset_pool(os);

	if (spa_feature_is_enabled(spa, empty_bpobj_feat)) {
		if (!spa_feature_is_active(spa, empty_bpobj_feat)) {
			/*
			 * First reference: create the shared empty bpobj
			 * and make it findable via the pool directory.
			 */
			ASSERT0(dp->dp_empty_bpobj);
			dp->dp_empty_bpobj =
			    bpobj_alloc(os, SPA_MAXBLOCKSIZE, tx);
			VERIFY(zap_add(os,
			    DMU_POOL_DIRECTORY_OBJECT,
			    DMU_POOL_EMPTY_BPOBJ, sizeof (uint64_t), 1,
			    &dp->dp_empty_bpobj, tx) == 0);
		}
		spa_feature_incr(spa, empty_bpobj_feat, tx);
		ASSERT(dp->dp_empty_bpobj != 0);
		return (dp->dp_empty_bpobj);
	} else {
		return (bpobj_alloc(os, blocksize, tx));
	}
}

/*
 * Release one reference to the shared empty bpobj (dp_empty_bpobj).
 * When the last reference is dropped (feature becomes inactive), the
 * pool-directory ZAP entry and the object itself are destroyed.
 */
void
bpobj_decr_empty(objset_t *os, dmu_tx_t *tx)
{
	zfeature_info_t *empty_bpobj_feat =
	    &spa_feature_table[SPA_FEATURE_EMPTY_BPOBJ];
	dsl_pool_t *dp = dmu_objset_pool(os);

	spa_feature_decr(dmu_objset_spa(os), empty_bpobj_feat, tx);
	if (!spa_feature_is_active(dmu_objset_spa(os), empty_bpobj_feat)) {
		VERIFY3U(0, ==, zap_remove(dp->dp_meta_objset,
		    DMU_POOL_DIRECTORY_OBJECT,
		    DMU_POOL_EMPTY_BPOBJ, tx));
		VERIFY3U(0, ==, dmu_object_free(os, dp->dp_empty_bpobj, tx));
		dp->dp_empty_bpobj = 0;
	}
}

/*
 * Allocate a new bpobj object in 'os' and return its object number.
 * The bonus-buffer (header) size is chosen by pool version, so older
 * pools keep writing the on-disk layout they can read:
 *   < SPA_VERSION_BPOBJ_ACCOUNT  ->  V0 (no comp/uncomp accounting)
 *   < SPA_VERSION_DEADLISTS      ->  V1 (no subobj list)
 *   otherwise                    ->  full bpobj_phys_t
 */
uint64_t
bpobj_alloc(objset_t *os, int blocksize, dmu_tx_t *tx)
{
	int size;

	if (spa_version(dmu_objset_spa(os)) < SPA_VERSION_BPOBJ_ACCOUNT)
		size = BPOBJ_SIZE_V0;
	else if (spa_version(dmu_objset_spa(os)) < SPA_VERSION_DEADLISTS)
		size = BPOBJ_SIZE_V1;
	else
		size = sizeof (bpobj_phys_t);

	return (dmu_object_alloc(os, DMU_OT_BPOBJ, blocksize,
	    DMU_OT_BPOBJ_HDR, size, tx));
}

/*
 * Free bpobj 'obj', recursively freeing any subobj bpobjs it references
 * (walked back-to-front through the subobj array) and then the subobj
 * array object itself.  Must not be called on the shared empty bpobj.
 */
void
bpobj_free(objset_t *os, uint64_t obj, dmu_tx_t *tx)
{
	int64_t i;
	bpobj_t bpo;
	dmu_object_info_t doi;
	int epb;
	dmu_buf_t *dbuf = NULL;

	ASSERT(obj != dmu_objset_pool(os)->dp_empty_bpobj);
	VERIFY3U(0, ==, bpobj_open(&bpo, os, obj));

	mutex_enter(&bpo.bpo_lock);

	if (!bpo.bpo_havesubobj || bpo.bpo_phys->bpo_subobjs == 0)
		goto out;

	VERIFY3U(0, ==, dmu_object_info(os, bpo.bpo_phys->bpo_subobjs, &doi));
	/* Number of subobj entries (uint64_t object numbers) per block. */
	epb = doi.doi_data_block_size / sizeof (uint64_t);

	for (i = bpo.bpo_phys->bpo_num_subobjs - 1; i >= 0; i--) {
		uint64_t *objarray;
		uint64_t offset, blkoff;

		offset = i * sizeof (uint64_t);
		blkoff = P2PHASE(i, epb);

		/*
		 * Iterating in reverse, so a cached dbuf stays valid until
		 * 'offset' drops below its starting offset; then rotate to
		 * the previous block.
		 */
		if (dbuf == NULL || dbuf->db_offset > offset) {
			if (dbuf)
				dmu_buf_rele(dbuf, FTAG);
			VERIFY3U(0, ==, dmu_buf_hold(os,
			    bpo.bpo_phys->bpo_subobjs, offset, FTAG, &dbuf, 0));
		}

		ASSERT3U(offset, >=, dbuf->db_offset);
		ASSERT3U(offset, <, dbuf->db_offset + dbuf->db_size);

		objarray = dbuf->db_data;
		bpobj_free(os, objarray[blkoff], tx);
	}
	if (dbuf) {
		dmu_buf_rele(dbuf, FTAG);
		dbuf = NULL;
	}
	VERIFY3U(0, ==, dmu_object_free(os, bpo.bpo_phys->bpo_subobjs, tx));

out:
	mutex_exit(&bpo.bpo_lock);
	bpobj_close(&bpo);

	VERIFY3U(0, ==, dmu_object_free(os, obj, tx));
}

/*
 * Open bpobj 'object' into the caller-supplied *bpo: hold its bonus
 * buffer (the bpobj_phys_t header) and derive the in-core fields.
 * Returns 0 on success or an errno from dmu_object_info()/
 * dmu_bonus_hold().  Pair with bpobj_close().
 */
int
bpobj_open(bpobj_t *bpo, objset_t *os, uint64_t object)
{
	dmu_object_info_t doi;
	int err;

	err = dmu_object_info(os, object, &doi);
	if (err)
		return (err);

	bzero(bpo, sizeof (*bpo));
	mutex_init(&bpo->bpo_lock, NULL, MUTEX_DEFAULT, NULL);

	ASSERT(bpo->bpo_dbuf == NULL);
	ASSERT(bpo->bpo_phys == NULL);
	ASSERT(object != 0);
	ASSERT3U(doi.doi_type, ==, DMU_OT_BPOBJ);
	ASSERT3U(doi.doi_bonus_type, ==, DMU_OT_BPOBJ_HDR);

	err = dmu_bonus_hold(os, object, bpo, &bpo->bpo_dbuf);
	if (err)
		return (err);

	bpo->bpo_os = os;
	bpo->bpo_object = object;
	/* blkptrs per data block. */
	bpo->bpo_epb = doi.doi_data_block_size >> SPA_BLKPTRSHIFT;
	/* Bonus size encodes the on-disk version (see bpobj_alloc()). */
	bpo->bpo_havecomp = (doi.doi_bonus_size > BPOBJ_SIZE_V0);
	bpo->bpo_havesubobj = (doi.doi_bonus_size > BPOBJ_SIZE_V1);
	bpo->bpo_phys = bpo->bpo_dbuf->db_data;
	return (0);
}

/*
 * Release the holds taken by bpobj_open() (bonus dbuf plus any cached
 * data dbuf from bpobj_enqueue()) and reset *bpo.
 */
void
bpobj_close(bpobj_t *bpo)
{
	/* Lame workaround for closing a bpobj that was never opened. */
	if (bpo->bpo_object == 0)
		return;

	dmu_buf_rele(bpo->bpo_dbuf, bpo);
	if (bpo->bpo_cached_dbuf != NULL)
		dmu_buf_rele(bpo->bpo_cached_dbuf, bpo);
	bpo->bpo_dbuf = NULL;
	bpo->bpo_phys = NULL;
	bpo->bpo_cached_dbuf = NULL;
	bpo->bpo_object = 0;

	mutex_destroy(&bpo->bpo_lock);
}

/*
 * Walk every blkptr in 'bpo', calling func(arg, bp, tx) on each.  The
 * direct blkptr array is walked back-to-front first, then each subobj
 * bpobj is recursed into (also back-to-front).  Iteration stops at the
 * first nonzero return from 'func', which is passed back to the caller.
 *
 * If 'free' is set, each visited entry is removed after its callback
 * returns 0: the header accounting (bytes/comp/uncomp, entry counts) is
 * decremented as we go, the tail of the blkptr array and subobj array is
 * freed with dmu_free_range(), and fully-drained subobjs are destroyed.
 * The reverse iteration order is what makes this resumable: on early
 * exit only the already-processed tail has been freed.
 */
static int
bpobj_iterate_impl(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx,
    boolean_t free)
{
	dmu_object_info_t doi;
	int epb;
	int64_t i;
	int err = 0;
	dmu_buf_t *dbuf = NULL;

	mutex_enter(&bpo->bpo_lock);

	if (free)
		dmu_buf_will_dirty(bpo->bpo_dbuf, tx);

	for (i = bpo->bpo_phys->bpo_num_blkptrs - 1; i >= 0; i--) {
		blkptr_t *bparray;
		blkptr_t *bp;
		uint64_t offset, blkoff;

		offset = i * sizeof (blkptr_t);
		blkoff = P2PHASE(i, bpo->bpo_epb);

		/* Reverse walk: re-hold only when we cross a block edge. */
		if (dbuf == NULL || dbuf->db_offset > offset) {
			if (dbuf)
				dmu_buf_rele(dbuf, FTAG);
			err = dmu_buf_hold(bpo->bpo_os, bpo->bpo_object, offset,
			    FTAG, &dbuf, 0);
			if (err)
				break;
		}

		ASSERT3U(offset, >=, dbuf->db_offset);
		ASSERT3U(offset, <, dbuf->db_offset + dbuf->db_size);

		bparray = dbuf->db_data;
		bp = &bparray[blkoff];
		err = func(arg, bp, tx);
		if (err)
			break;
		if (free) {
			bpo->bpo_phys->bpo_bytes -=
			    bp_get_dsize_sync(dmu_objset_spa(bpo->bpo_os), bp);
			ASSERT3S(bpo->bpo_phys->bpo_bytes, >=, 0);
			if (bpo->bpo_havecomp) {
				bpo->bpo_phys->bpo_comp -= BP_GET_PSIZE(bp);
				bpo->bpo_phys->bpo_uncomp -= BP_GET_UCSIZE(bp);
			}
			bpo->bpo_phys->bpo_num_blkptrs--;
			ASSERT3S(bpo->bpo_phys->bpo_num_blkptrs, >=, 0);
		}
	}
	if (dbuf) {
		dmu_buf_rele(dbuf, FTAG);
		dbuf = NULL;
	}
	if (free) {
		/*
		 * 'i' is the last entry NOT processed (or -1 if all were);
		 * free everything from the next entry onward.
		 */
		i++;
		VERIFY3U(0, ==, dmu_free_range(bpo->bpo_os, bpo->bpo_object,
		    i * sizeof (blkptr_t), -1ULL, tx));
	}
	if (err || !bpo->bpo_havesubobj || bpo->bpo_phys->bpo_subobjs == 0)
		goto out;

	/* Subobjs imply the V2 header, which implies comp accounting. */
	ASSERT(bpo->bpo_havecomp);
	err = dmu_object_info(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs, &doi);
	if (err) {
		mutex_exit(&bpo->bpo_lock);
		return (err);
	}
	epb = doi.doi_data_block_size / sizeof (uint64_t);

	for (i = bpo->bpo_phys->bpo_num_subobjs - 1; i >= 0; i--) {
		uint64_t *objarray;
		uint64_t offset, blkoff;
		bpobj_t sublist;
		uint64_t used_before, comp_before, uncomp_before;
		uint64_t used_after, comp_after, uncomp_after;

		offset = i * sizeof (uint64_t);
		blkoff = P2PHASE(i, epb);

		if (dbuf == NULL || dbuf->db_offset > offset) {
			if (dbuf)
				dmu_buf_rele(dbuf, FTAG);
			err = dmu_buf_hold(bpo->bpo_os,
			    bpo->bpo_phys->bpo_subobjs, offset, FTAG, &dbuf, 0);
			if (err)
				break;
		}

		ASSERT3U(offset, >=, dbuf->db_offset);
		ASSERT3U(offset, <, dbuf->db_offset + dbuf->db_size);

		objarray = dbuf->db_data;
		err = bpobj_open(&sublist, bpo->bpo_os, objarray[blkoff]);
		if (err)
			break;
		if (free) {
			err = bpobj_space(&sublist,
			    &used_before, &comp_before, &uncomp_before);
			if (err)
				break;
		}
		err = bpobj_iterate_impl(&sublist, func, arg, tx, free);
		if (free) {
			/*
			 * Charge our accounting with whatever the recursion
			 * actually removed from the sublist (it may have
			 * stopped early with err != 0).
			 */
			VERIFY3U(0, ==, bpobj_space(&sublist,
			    &used_after, &comp_after, &uncomp_after));
			bpo->bpo_phys->bpo_bytes -= used_before - used_after;
			ASSERT3S(bpo->bpo_phys->bpo_bytes, >=, 0);
			bpo->bpo_phys->bpo_comp -= comp_before - comp_after;
			bpo->bpo_phys->bpo_uncomp -=
			    uncomp_before - uncomp_after;
		}

		bpobj_close(&sublist);
		if (err)
			break;
		if (free) {
			err = dmu_object_free(bpo->bpo_os,
			    objarray[blkoff], tx);
			if (err)
				break;
			bpo->bpo_phys->bpo_num_subobjs--;
			ASSERT3S(bpo->bpo_phys->bpo_num_subobjs, >=, 0);
		}
	}
	if (dbuf) {
		dmu_buf_rele(dbuf, FTAG);
		dbuf = NULL;
	}
	if (free) {
		VERIFY3U(0, ==, dmu_free_range(bpo->bpo_os,
		    bpo->bpo_phys->bpo_subobjs,
		    (i + 1) * sizeof (uint64_t), -1ULL, tx));
	}

out:
	/* If there are no entries, there should be no bytes. */
	ASSERT(bpo->bpo_phys->bpo_num_blkptrs > 0 ||
	    (bpo->bpo_havesubobj && bpo->bpo_phys->bpo_num_subobjs > 0) ||
	    bpo->bpo_phys->bpo_bytes == 0);

	mutex_exit(&bpo->bpo_lock);
	return (err);
}

/*
 * Iterate and remove the entries.  If func returns nonzero, iteration
 * will stop and that entry will not be removed.
 */
int
bpobj_iterate(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx)
{
	return (bpobj_iterate_impl(bpo, func, arg, tx, B_TRUE));
}

/*
 * Iterate the entries.  If func returns nonzero, iteration will stop.
 */
int
bpobj_iterate_nofree(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx)
{
	return (bpobj_iterate_impl(bpo, func, arg, tx, B_FALSE));
}

/*
 * Append bpobj 'subobj' to bpo's subobj list and fold its space
 * accounting into bpo's header.  Special cases:
 *   - the shared empty bpobj just drops a reference (bpobj_decr_empty);
 *   - an empty subobj is simply freed;
 *   - a subobj whose own subobj list fits in one block has that list
 *     spliced directly into bpo's, limiting recursion depth in
 *     bpobj_iterate().
 * Requires a V2 bpobj (havesubobj/havecomp).
 */
void
bpobj_enqueue_subobj(bpobj_t *bpo, uint64_t subobj, dmu_tx_t *tx)
{
	bpobj_t subbpo;
	uint64_t used, comp, uncomp, subsubobjs;

	ASSERT(bpo->bpo_havesubobj);
	ASSERT(bpo->bpo_havecomp);
	ASSERT(bpo->bpo_object != dmu_objset_pool(bpo->bpo_os)->dp_empty_bpobj);

	if (subobj == dmu_objset_pool(bpo->bpo_os)->dp_empty_bpobj) {
		bpobj_decr_empty(bpo->bpo_os, tx);
		return;
	}

	VERIFY3U(0, ==, bpobj_open(&subbpo, bpo->bpo_os, subobj));
	VERIFY3U(0, ==, bpobj_space(&subbpo, &used, &comp, &uncomp));

	if (used == 0) {
		/* No point in having an empty subobj. */
		bpobj_close(&subbpo);
		bpobj_free(bpo->bpo_os, subobj, tx);
		return;
	}

	dmu_buf_will_dirty(bpo->bpo_dbuf, tx);
	if (bpo->bpo_phys->bpo_subobjs == 0) {
		bpo->bpo_phys->bpo_subobjs = dmu_object_alloc(bpo->bpo_os,
		    DMU_OT_BPOBJ_SUBOBJ, SPA_MAXBLOCKSIZE, DMU_OT_NONE, 0, tx);
	}

	dmu_object_info_t doi;
	ASSERT0(dmu_object_info(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs, &doi));
	ASSERT3U(doi.doi_type, ==, DMU_OT_BPOBJ_SUBOBJ);

	mutex_enter(&bpo->bpo_lock);
	dmu_write(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs,
	    bpo->bpo_phys->bpo_num_subobjs * sizeof (subobj),
	    sizeof (subobj), &subobj, tx);
	bpo->bpo_phys->bpo_num_subobjs++;

	/*
	 * If subobj has only one block of subobjs, then move subobj's
	 * subobjs to bpo's subobj list directly.  This reduces
	 * recursion in bpobj_iterate due to nested subobjs.
	 */
	subsubobjs = subbpo.bpo_phys->bpo_subobjs;
	if (subsubobjs != 0) {
		dmu_object_info_t doi;

		VERIFY3U(0, ==, dmu_object_info(bpo->bpo_os, subsubobjs, &doi));
		if (doi.doi_max_offset == doi.doi_data_block_size) {
			dmu_buf_t *subdb;
			uint64_t numsubsub = subbpo.bpo_phys->bpo_num_subobjs;

			VERIFY3U(0, ==, dmu_buf_hold(bpo->bpo_os, subsubobjs,
			    0, FTAG, &subdb, 0));
			dmu_write(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs,
			    bpo->bpo_phys->bpo_num_subobjs * sizeof (subobj),
			    numsubsub * sizeof (subobj), subdb->db_data, tx);
			dmu_buf_rele(subdb, FTAG);
			bpo->bpo_phys->bpo_num_subobjs += numsubsub;

			dmu_buf_will_dirty(subbpo.bpo_dbuf, tx);
			subbpo.bpo_phys->bpo_subobjs = 0;
			VERIFY3U(0, ==, dmu_object_free(bpo->bpo_os,
			    subsubobjs, tx));
		}
	}
	bpo->bpo_phys->bpo_bytes += used;
	bpo->bpo_phys->bpo_comp += comp;
	bpo->bpo_phys->bpo_uncomp += uncomp;
	mutex_exit(&bpo->bpo_lock);

	bpobj_close(&subbpo);
}

/*
 * Append one blkptr to bpo's direct blkptr array, updating the header
 * accounting.  The stored copy is scrubbed (fill count zeroed; checksum
 * zeroed for non-dedup bps, since dedup needs it but nobody else does)
 * so the array compresses better.  Caches the current tail dbuf in
 * bpo_cached_dbuf to avoid a hold/rele per call.
 */
void
bpobj_enqueue(bpobj_t *bpo, const blkptr_t *bp, dmu_tx_t *tx)
{
	blkptr_t stored_bp = *bp;
	uint64_t offset;
	int blkoff;
	blkptr_t *bparray;

	ASSERT(!BP_IS_HOLE(bp));
	ASSERT(bpo->bpo_object != dmu_objset_pool(bpo->bpo_os)->dp_empty_bpobj);

	/* We never need the fill count. */
	stored_bp.blk_fill = 0;

	/* The bpobj will compress better if we can leave off the checksum */
	if (!BP_GET_DEDUP(bp))
		bzero(&stored_bp.blk_cksum, sizeof (stored_bp.blk_cksum));

	mutex_enter(&bpo->bpo_lock);

	offset = bpo->bpo_phys->bpo_num_blkptrs * sizeof (stored_bp);
	blkoff = P2PHASE(bpo->bpo_phys->bpo_num_blkptrs, bpo->bpo_epb);

	if (bpo->bpo_cached_dbuf == NULL ||
	    offset < bpo->bpo_cached_dbuf->db_offset ||
	    offset >= bpo->bpo_cached_dbuf->db_offset +
	    bpo->bpo_cached_dbuf->db_size) {
		if (bpo->bpo_cached_dbuf)
			dmu_buf_rele(bpo->bpo_cached_dbuf, bpo);
		VERIFY3U(0, ==, dmu_buf_hold(bpo->bpo_os, bpo->bpo_object,
		    offset, bpo, &bpo->bpo_cached_dbuf, 0));
	}

	dmu_buf_will_dirty(bpo->bpo_cached_dbuf, tx);
	bparray = bpo->bpo_cached_dbuf->db_data;
	bparray[blkoff] = stored_bp;

	dmu_buf_will_dirty(bpo->bpo_dbuf, tx);
	bpo->bpo_phys->bpo_num_blkptrs++;
	bpo->bpo_phys->bpo_bytes +=
	    bp_get_dsize_sync(dmu_objset_spa(bpo->bpo_os), bp);
	if (bpo->bpo_havecomp) {
		bpo->bpo_phys->bpo_comp += BP_GET_PSIZE(bp);
		bpo->bpo_phys->bpo_uncomp += BP_GET_UCSIZE(bp);
	}
	mutex_exit(&bpo->bpo_lock);
}

/* Accumulator for bpobj_space_range(): sums space for bps born in (mintxg, maxtxg]. */
struct space_range_arg {
	spa_t *spa;
	uint64_t mintxg;
	uint64_t maxtxg;
	uint64_t used;
	uint64_t comp;
	uint64_t uncomp;
};

/* ARGSUSED */
static int
space_range_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
{
	struct space_range_arg *sra = arg;

	if (bp->blk_birth > sra->mintxg && bp->blk_birth <= sra->maxtxg) {
		/*
		 * The deduplicated-size lookup differs depending on
		 * whether we are in syncing context.
		 */
		if (dsl_pool_sync_context(spa_get_dsl(sra->spa)))
			sra->used += bp_get_dsize_sync(sra->spa, bp);
		else
			sra->used += bp_get_dsize(sra->spa, bp);
		sra->comp += BP_GET_PSIZE(bp);
		sra->uncomp += BP_GET_UCSIZE(bp);
	}
	return (0);
}

/*
 * Return the total used/comp/uncomp space in the bpobj.  Uses the
 * header's precomputed totals when available; old (V0) bpobjs lack
 * comp/uncomp accounting and must be summed by iteration.
 */
int
bpobj_space(bpobj_t *bpo, uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
{
	mutex_enter(&bpo->bpo_lock);

	*usedp = bpo->bpo_phys->bpo_bytes;
	if (bpo->bpo_havecomp) {
		*compp = bpo->bpo_phys->bpo_comp;
		*uncompp = bpo->bpo_phys->bpo_uncomp;
		mutex_exit(&bpo->bpo_lock);
		return (0);
	} else {
		mutex_exit(&bpo->bpo_lock);
		return (bpobj_space_range(bpo, 0, UINT64_MAX,
		    usedp, compp, uncompp));
	}
}

/*
 * Return the amount of space in the bpobj which is:
 * mintxg < blk_birth <= maxtxg
 */
int
bpobj_space_range(bpobj_t *bpo, uint64_t mintxg, uint64_t maxtxg,
    uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
{
	struct space_range_arg sra = { 0 };
	int err;

	/*
	 * As an optimization, if they want the whole txg range, just
	 * get bpo_bytes rather than iterating over the bps.
	 */
	if (mintxg < TXG_INITIAL && maxtxg == UINT64_MAX && bpo->bpo_havecomp)
		return (bpobj_space(bpo, usedp, compp, uncompp));

	sra.spa = dmu_objset_spa(bpo->bpo_os);
	sra.mintxg = mintxg;
	sra.maxtxg = maxtxg;

	err = bpobj_iterate_nofree(bpo, space_range_cb, &sra, NULL);
	*usedp = sra.used;
	*compp = sra.comp;
	*uncompp = sra.uncomp;
	return (err);
}