1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. 23 * Copyright (c) 2012, 2015 by Delphix. All rights reserved. 24 * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. 25 * Copyright (c) 2014 Integros [integros.com] 26 */ 27 28 #include <sys/dsl_dataset.h> 29 #include <sys/dmu.h> 30 #include <sys/refcount.h> 31 #include <sys/zap.h> 32 #include <sys/zfs_context.h> 33 #include <sys/dsl_pool.h> 34 35 /* 36 * Deadlist concurrency: 37 * 38 * Deadlists can only be modified from the syncing thread. 39 * 40 * Except for dsl_deadlist_insert(), it can only be modified with the 41 * dp_config_rwlock held with RW_WRITER. 42 * 43 * The accessors (dsl_deadlist_space() and dsl_deadlist_space_range()) can 44 * be called concurrently, from open context, with the dl_config_rwlock held 45 * with RW_READER. 46 * 47 * Therefore, we only need to provide locking between dsl_deadlist_insert() and 48 * the accessors, protecting: 49 * dl_phys->dl_used,comp,uncomp 50 * and protecting the dl_tree from being loaded. 51 * The locking is provided by dl_lock. 
Note that locking on the bpobj_t 52 * provides its own locking, and dl_oldfmt is immutable. 53 */ 54 55 static int 56 dsl_deadlist_compare(const void *arg1, const void *arg2) 57 { 58 const dsl_deadlist_entry_t *dle1 = (const dsl_deadlist_entry_t *)arg1; 59 const dsl_deadlist_entry_t *dle2 = (const dsl_deadlist_entry_t *)arg2; 60 61 return (AVL_CMP(dle1->dle_mintxg, dle2->dle_mintxg)); 62 } 63 64 static void 65 dsl_deadlist_load_tree(dsl_deadlist_t *dl) 66 { 67 zap_cursor_t zc; 68 zap_attribute_t za; 69 70 ASSERT(MUTEX_HELD(&dl->dl_lock)); 71 72 ASSERT(!dl->dl_oldfmt); 73 if (dl->dl_havetree) 74 return; 75 76 avl_create(&dl->dl_tree, dsl_deadlist_compare, 77 sizeof (dsl_deadlist_entry_t), 78 offsetof(dsl_deadlist_entry_t, dle_node)); 79 for (zap_cursor_init(&zc, dl->dl_os, dl->dl_object); 80 zap_cursor_retrieve(&zc, &za) == 0; 81 zap_cursor_advance(&zc)) { 82 dsl_deadlist_entry_t *dle = kmem_alloc(sizeof (*dle), KM_SLEEP); 83 dle->dle_mintxg = zfs_strtonum(za.za_name, NULL); 84 VERIFY3U(0, ==, bpobj_open(&dle->dle_bpobj, dl->dl_os, 85 za.za_first_integer)); 86 avl_add(&dl->dl_tree, dle); 87 } 88 zap_cursor_fini(&zc); 89 dl->dl_havetree = B_TRUE; 90 } 91 92 void 93 dsl_deadlist_open(dsl_deadlist_t *dl, objset_t *os, uint64_t object) 94 { 95 dmu_object_info_t doi; 96 97 ASSERT(!dsl_deadlist_is_open(dl)); 98 99 mutex_init(&dl->dl_lock, NULL, MUTEX_DEFAULT, NULL); 100 dl->dl_os = os; 101 dl->dl_object = object; 102 VERIFY3U(0, ==, dmu_bonus_hold(os, object, dl, &dl->dl_dbuf)); 103 dmu_object_info_from_db(dl->dl_dbuf, &doi); 104 if (doi.doi_type == DMU_OT_BPOBJ) { 105 dmu_buf_rele(dl->dl_dbuf, dl); 106 dl->dl_dbuf = NULL; 107 dl->dl_oldfmt = B_TRUE; 108 VERIFY3U(0, ==, bpobj_open(&dl->dl_bpobj, os, object)); 109 return; 110 } 111 112 dl->dl_oldfmt = B_FALSE; 113 dl->dl_phys = dl->dl_dbuf->db_data; 114 dl->dl_havetree = B_FALSE; 115 } 116 117 boolean_t 118 dsl_deadlist_is_open(dsl_deadlist_t *dl) 119 { 120 return (dl->dl_os != NULL); 121 } 122 123 void 124 
dsl_deadlist_close(dsl_deadlist_t *dl) 125 { 126 void *cookie = NULL; 127 dsl_deadlist_entry_t *dle; 128 129 ASSERT(dsl_deadlist_is_open(dl)); 130 131 if (dl->dl_oldfmt) { 132 dl->dl_oldfmt = B_FALSE; 133 bpobj_close(&dl->dl_bpobj); 134 dl->dl_os = NULL; 135 dl->dl_object = 0; 136 return; 137 } 138 139 if (dl->dl_havetree) { 140 while ((dle = avl_destroy_nodes(&dl->dl_tree, &cookie)) 141 != NULL) { 142 bpobj_close(&dle->dle_bpobj); 143 kmem_free(dle, sizeof (*dle)); 144 } 145 avl_destroy(&dl->dl_tree); 146 } 147 dmu_buf_rele(dl->dl_dbuf, dl); 148 mutex_destroy(&dl->dl_lock); 149 dl->dl_dbuf = NULL; 150 dl->dl_phys = NULL; 151 dl->dl_os = NULL; 152 dl->dl_object = 0; 153 } 154 155 uint64_t 156 dsl_deadlist_alloc(objset_t *os, dmu_tx_t *tx) 157 { 158 if (spa_version(dmu_objset_spa(os)) < SPA_VERSION_DEADLISTS) 159 return (bpobj_alloc(os, SPA_OLD_MAXBLOCKSIZE, tx)); 160 return (zap_create(os, DMU_OT_DEADLIST, DMU_OT_DEADLIST_HDR, 161 sizeof (dsl_deadlist_phys_t), tx)); 162 } 163 164 void 165 dsl_deadlist_free(objset_t *os, uint64_t dlobj, dmu_tx_t *tx) 166 { 167 dmu_object_info_t doi; 168 zap_cursor_t zc; 169 zap_attribute_t za; 170 171 VERIFY3U(0, ==, dmu_object_info(os, dlobj, &doi)); 172 if (doi.doi_type == DMU_OT_BPOBJ) { 173 bpobj_free(os, dlobj, tx); 174 return; 175 } 176 177 for (zap_cursor_init(&zc, os, dlobj); 178 zap_cursor_retrieve(&zc, &za) == 0; 179 zap_cursor_advance(&zc)) { 180 uint64_t obj = za.za_first_integer; 181 if (obj == dmu_objset_pool(os)->dp_empty_bpobj) 182 bpobj_decr_empty(os, tx); 183 else 184 bpobj_free(os, obj, tx); 185 } 186 zap_cursor_fini(&zc); 187 VERIFY3U(0, ==, dmu_object_free(os, dlobj, tx)); 188 } 189 190 static void 191 dle_enqueue(dsl_deadlist_t *dl, dsl_deadlist_entry_t *dle, 192 const blkptr_t *bp, dmu_tx_t *tx) 193 { 194 ASSERT(MUTEX_HELD(&dl->dl_lock)); 195 if (dle->dle_bpobj.bpo_object == 196 dmu_objset_pool(dl->dl_os)->dp_empty_bpobj) { 197 uint64_t obj = bpobj_alloc(dl->dl_os, SPA_OLD_MAXBLOCKSIZE, tx); 198 
bpobj_close(&dle->dle_bpobj); 199 bpobj_decr_empty(dl->dl_os, tx); 200 VERIFY3U(0, ==, bpobj_open(&dle->dle_bpobj, dl->dl_os, obj)); 201 VERIFY3U(0, ==, zap_update_int_key(dl->dl_os, dl->dl_object, 202 dle->dle_mintxg, obj, tx)); 203 } 204 bpobj_enqueue(&dle->dle_bpobj, bp, tx); 205 } 206 207 static void 208 dle_enqueue_subobj(dsl_deadlist_t *dl, dsl_deadlist_entry_t *dle, 209 uint64_t obj, dmu_tx_t *tx) 210 { 211 ASSERT(MUTEX_HELD(&dl->dl_lock)); 212 if (dle->dle_bpobj.bpo_object != 213 dmu_objset_pool(dl->dl_os)->dp_empty_bpobj) { 214 bpobj_enqueue_subobj(&dle->dle_bpobj, obj, tx); 215 } else { 216 bpobj_close(&dle->dle_bpobj); 217 bpobj_decr_empty(dl->dl_os, tx); 218 VERIFY3U(0, ==, bpobj_open(&dle->dle_bpobj, dl->dl_os, obj)); 219 VERIFY3U(0, ==, zap_update_int_key(dl->dl_os, dl->dl_object, 220 dle->dle_mintxg, obj, tx)); 221 } 222 } 223 224 void 225 dsl_deadlist_insert(dsl_deadlist_t *dl, const blkptr_t *bp, dmu_tx_t *tx) 226 { 227 dsl_deadlist_entry_t dle_tofind; 228 dsl_deadlist_entry_t *dle; 229 avl_index_t where; 230 231 if (dl->dl_oldfmt) { 232 bpobj_enqueue(&dl->dl_bpobj, bp, tx); 233 return; 234 } 235 236 mutex_enter(&dl->dl_lock); 237 dsl_deadlist_load_tree(dl); 238 239 dmu_buf_will_dirty(dl->dl_dbuf, tx); 240 dl->dl_phys->dl_used += 241 bp_get_dsize_sync(dmu_objset_spa(dl->dl_os), bp); 242 dl->dl_phys->dl_comp += BP_GET_PSIZE(bp); 243 dl->dl_phys->dl_uncomp += BP_GET_UCSIZE(bp); 244 245 dle_tofind.dle_mintxg = bp->blk_birth; 246 dle = avl_find(&dl->dl_tree, &dle_tofind, &where); 247 if (dle == NULL) 248 dle = avl_nearest(&dl->dl_tree, where, AVL_BEFORE); 249 else 250 dle = AVL_PREV(&dl->dl_tree, dle); 251 dle_enqueue(dl, dle, bp, tx); 252 mutex_exit(&dl->dl_lock); 253 } 254 255 /* 256 * Insert new key in deadlist, which must be > all current entries. 257 * mintxg is not inclusive. 
 */
void
dsl_deadlist_add_key(dsl_deadlist_t *dl, uint64_t mintxg, dmu_tx_t *tx)
{
	uint64_t obj;
	dsl_deadlist_entry_t *dle;

	/* Old-format deadlists have no keys. */
	if (dl->dl_oldfmt)
		return;

	dle = kmem_alloc(sizeof (*dle), KM_SLEEP);
	dle->dle_mintxg = mintxg;

	mutex_enter(&dl->dl_lock);
	dsl_deadlist_load_tree(dl);

	/* A new key starts out pointing at a fresh empty bpobj. */
	obj = bpobj_alloc_empty(dl->dl_os, SPA_OLD_MAXBLOCKSIZE, tx);
	VERIFY3U(0, ==, bpobj_open(&dle->dle_bpobj, dl->dl_os, obj));
	avl_add(&dl->dl_tree, dle);

	/* Record the key in the on-disk ZAP as well. */
	VERIFY3U(0, ==, zap_add_int_key(dl->dl_os, dl->dl_object,
	    mintxg, obj, tx));
	mutex_exit(&dl->dl_lock);
}

/*
 * Remove this key, merging its entries into the previous key.
 */
void
dsl_deadlist_remove_key(dsl_deadlist_t *dl, uint64_t mintxg, dmu_tx_t *tx)
{
	dsl_deadlist_entry_t dle_tofind;
	dsl_deadlist_entry_t *dle, *dle_prev;

	if (dl->dl_oldfmt)
		return;

	mutex_enter(&dl->dl_lock);
	dsl_deadlist_load_tree(dl);

	/*
	 * mintxg must be an existing key: avl_find's result is used
	 * without a NULL check.
	 */
	dle_tofind.dle_mintxg = mintxg;
	dle = avl_find(&dl->dl_tree, &dle_tofind, NULL);
	dle_prev = AVL_PREV(&dl->dl_tree, dle);

	/* Fold this key's bpobj into the previous key's bpobj. */
	dle_enqueue_subobj(dl, dle_prev, dle->dle_bpobj.bpo_object, tx);

	avl_remove(&dl->dl_tree, dle);
	bpobj_close(&dle->dle_bpobj);
	kmem_free(dle, sizeof (*dle));

	VERIFY3U(0, ==, zap_remove_int(dl->dl_os, dl->dl_object, mintxg, tx));
	mutex_exit(&dl->dl_lock);
}

/*
 * Walk ds's snapshots to regenerate the ZAP & AVL.
 */
static void
dsl_deadlist_regenerate(objset_t *os, uint64_t dlobj,
    uint64_t mrs_obj, dmu_tx_t *tx)
{
	dsl_deadlist_t dl = { 0 };
	dsl_pool_t *dp = dmu_objset_pool(os);

	dsl_deadlist_open(&dl, os, dlobj);
	if (dl.dl_oldfmt) {
		/* Old-format deadlists have no keys to regenerate. */
		dsl_deadlist_close(&dl);
		return;
	}

	/*
	 * Walk the snapshot chain backwards via ds_prev_snap_obj, adding
	 * one key per snapshot's previous-snapshot txg.  mrs_obj is
	 * presumably the most-recent-snapshot object -- confirm with
	 * callers.
	 */
	while (mrs_obj != 0) {
		dsl_dataset_t *ds;
		VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, mrs_obj, FTAG, &ds));
		dsl_deadlist_add_key(&dl,
		    dsl_dataset_phys(ds)->ds_prev_snap_txg, tx);
		mrs_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj;
		dsl_dataset_rele(ds, FTAG);
	}
	dsl_deadlist_close(&dl);
}

/*
 * Create a new deadlist object with the same keys as dl (those below
 * maxtxg), each pointing at a new empty bpobj.  Only the key structure
 * is cloned -- no bps are copied.  Returns the new object number.
 */
uint64_t
dsl_deadlist_clone(dsl_deadlist_t *dl, uint64_t maxtxg,
    uint64_t mrs_obj, dmu_tx_t *tx)
{
	dsl_deadlist_entry_t *dle;
	uint64_t newobj;

	newobj = dsl_deadlist_alloc(dl->dl_os, tx);

	if (dl->dl_oldfmt) {
		/* Old format has no keys; rebuild from the snapshot chain. */
		dsl_deadlist_regenerate(dl->dl_os, newobj, mrs_obj, tx);
		return (newobj);
	}

	mutex_enter(&dl->dl_lock);
	dsl_deadlist_load_tree(dl);

	for (dle = avl_first(&dl->dl_tree); dle;
	    dle = AVL_NEXT(&dl->dl_tree, dle)) {
		uint64_t obj;

		/* Tree is ordered by mintxg, so we can stop early. */
		if (dle->dle_mintxg >= maxtxg)
			break;

		obj = bpobj_alloc_empty(dl->dl_os, SPA_OLD_MAXBLOCKSIZE, tx);
		VERIFY3U(0, ==, zap_add_int_key(dl->dl_os, newobj,
		    dle->dle_mintxg, obj, tx));
	}
	mutex_exit(&dl->dl_lock);
	return (newobj);
}

/*
 * Return the deadlist's total used/compressed/uncompressed space: from
 * the cached header for new-format lists, or by asking the bpobj for
 * old-format lists.
 */
void
dsl_deadlist_space(dsl_deadlist_t *dl,
    uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
{
	ASSERT(dsl_deadlist_is_open(dl));
	if (dl->dl_oldfmt) {
		VERIFY3U(0, ==, bpobj_space(&dl->dl_bpobj,
		    usedp, compp, uncompp));
		return;
	}

	mutex_enter(&dl->dl_lock);
	*usedp = dl->dl_phys->dl_used;
	*compp = dl->dl_phys->dl_comp;
	*uncompp = dl->dl_phys->dl_uncomp;
	mutex_exit(&dl->dl_lock);
}

/*
 * return space used in the range (mintxg, maxtxg].
 * Includes maxtxg, does not include mintxg.
 * mintxg and maxtxg must both be keys in the deadlist (unless maxtxg is
 * larger than any bp in the deadlist (eg. UINT64_MAX)).
 */
void
dsl_deadlist_space_range(dsl_deadlist_t *dl, uint64_t mintxg, uint64_t maxtxg,
    uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
{
	dsl_deadlist_entry_t *dle;
	dsl_deadlist_entry_t dle_tofind;
	avl_index_t where;

	if (dl->dl_oldfmt) {
		VERIFY3U(0, ==, bpobj_space_range(&dl->dl_bpobj,
		    mintxg, maxtxg, usedp, compp, uncompp));
		return;
	}

	*usedp = *compp = *uncompp = 0;

	mutex_enter(&dl->dl_lock);
	dsl_deadlist_load_tree(dl);
	dle_tofind.dle_mintxg = mintxg;
	dle = avl_find(&dl->dl_tree, &dle_tofind, &where);
	/*
	 * If we don't find this mintxg, there shouldn't be anything
	 * after it either.
	 */
	ASSERT(dle != NULL ||
	    avl_nearest(&dl->dl_tree, where, AVL_AFTER) == NULL);

	/* Sum the space of every entry with mintxg <= dle_mintxg < maxtxg. */
	for (; dle && dle->dle_mintxg < maxtxg;
	    dle = AVL_NEXT(&dl->dl_tree, dle)) {
		uint64_t used, comp, uncomp;

		VERIFY3U(0, ==, bpobj_space(&dle->dle_bpobj,
		    &used, &comp, &uncomp));

		*usedp += used;
		*compp += comp;
		*uncompp += uncomp;
	}
	mutex_exit(&dl->dl_lock);
}

/*
 * Add the bpobj 'obj' (whose bps were born in txg 'birth') to this
 * deadlist: its space is added to dl_phys and the bpobj is enqueued on
 * the entry with the largest mintxg <= birth.  Caller must hold dl_lock.
 */
static void
dsl_deadlist_insert_bpobj(dsl_deadlist_t *dl, uint64_t obj, uint64_t birth,
    dmu_tx_t *tx)
{
	dsl_deadlist_entry_t dle_tofind;
	dsl_deadlist_entry_t *dle;
	avl_index_t where;
	uint64_t used, comp, uncomp;
	bpobj_t bpo;

	ASSERT(MUTEX_HELD(&dl->dl_lock));

	/* Open the incoming bpobj just long enough to read its totals. */
	VERIFY3U(0, ==, bpobj_open(&bpo, dl->dl_os, obj));
	VERIFY3U(0, ==, bpobj_space(&bpo, &used, &comp, &uncomp));
	bpobj_close(&bpo);

	dsl_deadlist_load_tree(dl);

	dmu_buf_will_dirty(dl->dl_dbuf, tx);
	dl->dl_phys->dl_used += used;
	dl->dl_phys->dl_comp += comp;
	dl->dl_phys->dl_uncomp += uncomp;

	dle_tofind.dle_mintxg = birth;
	dle = avl_find(&dl->dl_tree, &dle_tofind, &where);
	if (dle == NULL)
		dle = avl_nearest(&dl->dl_tree, where, AVL_BEFORE);
	dle_enqueue_subobj(dl, dle, obj, tx);
}

/*
 * bpobj_iterate() callback: insert each bp into the deadlist 'arg'.
 */
static int
dsl_deadlist_insert_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
{
	dsl_deadlist_t *dl = arg;
	dsl_deadlist_insert(dl, bp, tx);
	return (0);
}

/*
 * Merge the deadlist pointed to by 'obj' into dl.  obj will be left as
 * an empty deadlist.
 */
void
dsl_deadlist_merge(dsl_deadlist_t *dl, uint64_t obj, dmu_tx_t *tx)
{
	zap_cursor_t zc;
	zap_attribute_t za;
	dmu_buf_t *bonus;
	dsl_deadlist_phys_t *dlp;
	dmu_object_info_t doi;

	VERIFY3U(0, ==, dmu_object_info(dl->dl_os, obj, &doi));
	if (doi.doi_type == DMU_OT_BPOBJ) {
		/* Old-format source: re-insert each bp individually. */
		bpobj_t bpo;
		VERIFY3U(0, ==, bpobj_open(&bpo, dl->dl_os, obj));
		VERIFY3U(0, ==, bpobj_iterate(&bpo,
		    dsl_deadlist_insert_cb, dl, tx));
		bpobj_close(&bpo);
		return;
	}

	mutex_enter(&dl->dl_lock);
	/* Move each sub-bpobj into dl, emptying obj's ZAP as we go. */
	for (zap_cursor_init(&zc, dl->dl_os, obj);
	    zap_cursor_retrieve(&zc, &za) == 0;
	    zap_cursor_advance(&zc)) {
		uint64_t mintxg = zfs_strtonum(za.za_name, NULL);
		dsl_deadlist_insert_bpobj(dl, za.za_first_integer, mintxg, tx);
		VERIFY3U(0, ==, zap_remove_int(dl->dl_os, obj, mintxg, tx));
	}
	zap_cursor_fini(&zc);

	/* Zero obj's accounting so it reads back as an empty deadlist. */
	VERIFY3U(0, ==, dmu_bonus_hold(dl->dl_os, obj, FTAG, &bonus));
	dlp = bonus->db_data;
	dmu_buf_will_dirty(bonus, tx);
	bzero(dlp, sizeof (*dlp));
	dmu_buf_rele(bonus, FTAG);
	mutex_exit(&dl->dl_lock);
}

/*
 * Remove entries on dl that are >= mintxg, and put them on the bpobj.
 */
void
dsl_deadlist_move_bpobj(dsl_deadlist_t *dl, bpobj_t *bpo, uint64_t mintxg,
    dmu_tx_t *tx)
{
	dsl_deadlist_entry_t dle_tofind;
	dsl_deadlist_entry_t *dle;
	avl_index_t where;

	ASSERT(!dl->dl_oldfmt);

	mutex_enter(&dl->dl_lock);
	dmu_buf_will_dirty(dl->dl_dbuf, tx);
	dsl_deadlist_load_tree(dl);

	/* Start at the first entry with dle_mintxg >= mintxg. */
	dle_tofind.dle_mintxg = mintxg;
	dle = avl_find(&dl->dl_tree, &dle_tofind, &where);
	if (dle == NULL)
		dle = avl_nearest(&dl->dl_tree, where, AVL_AFTER);
	while (dle) {
		uint64_t used, comp, uncomp;
		dsl_deadlist_entry_t *dle_next;

		/* Hand the entry's bpobj over to the destination bpobj. */
		bpobj_enqueue_subobj(bpo, dle->dle_bpobj.bpo_object, tx);

		/* Subtract the moved space from our accounting. */
		VERIFY3U(0, ==, bpobj_space(&dle->dle_bpobj,
		    &used, &comp, &uncomp));
		ASSERT3U(dl->dl_phys->dl_used, >=, used);
		ASSERT3U(dl->dl_phys->dl_comp, >=, comp);
		ASSERT3U(dl->dl_phys->dl_uncomp, >=, uncomp);
		dl->dl_phys->dl_used -= used;
		dl->dl_phys->dl_comp -= comp;
		dl->dl_phys->dl_uncomp -= uncomp;

		/* Drop the key from the on-disk ZAP and the in-core tree. */
		VERIFY3U(0, ==, zap_remove_int(dl->dl_os, dl->dl_object,
		    dle->dle_mintxg, tx));

		dle_next = AVL_NEXT(&dl->dl_tree, dle);
		avl_remove(&dl->dl_tree, dle);
		bpobj_close(&dle->dle_bpobj);
		kmem_free(dle, sizeof (*dle));
		dle = dle_next;
	}
	mutex_exit(&dl->dl_lock);
}