/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2011 by Delphix. All rights reserved.
 * Copyright (c) 2012, Joyent, Inc. All rights reserved.
 */

#include <sys/dmu_objset.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_dir.h>
#include <sys/dsl_prop.h>
#include <sys/dsl_synctask.h>
#include <sys/dmu_traverse.h>
#include <sys/dmu_impl.h>
#include <sys/dmu_tx.h>
#include <sys/arc.h>
#include <sys/zio.h>
#include <sys/zap.h>
#include <sys/unique.h>
#include <sys/zfs_context.h>
#include <sys/zfs_ioctl.h>
#include <sys/spa.h>
#include <sys/zfs_znode.h>
#include <sys/zfs_onexit.h>
#include <sys/zvol.h>
#include <sys/dsl_scan.h>
#include <sys/dsl_deadlist.h>

static char *dsl_reaper = "the grim reaper";

static dsl_checkfunc_t dsl_dataset_destroy_begin_check;
static dsl_syncfunc_t dsl_dataset_destroy_begin_sync;
static dsl_syncfunc_t dsl_dataset_set_reservation_sync;

#define	SWITCH64(x, y) \
{ \
	uint64_t __tmp = (x); \
	(x) = (y); \
	(y) = __tmp; \
}

#define	DS_REF_MAX	(1ULL << 62)

#define	DSL_DEADLIST_BLOCKSIZE	SPA_MAXBLOCKSIZE

#define	DSL_DATASET_IS_DESTROYED(ds)	((ds)->ds_owner == dsl_reaper)

/*
 * Figure out how much of this delta should be propagated to the dsl_dir
 * layer.  If there's a refreservation, that space has already been
 * partially accounted for in our ancestors.
 */
static int64_t
parent_delta(dsl_dataset_t *ds, int64_t delta)
{
	uint64_t old_bytes, new_bytes;

	if (ds->ds_reserved == 0)
		return (delta);

	old_bytes = MAX(ds->ds_phys->ds_unique_bytes, ds->ds_reserved);
	new_bytes = MAX(ds->ds_phys->ds_unique_bytes + delta, ds->ds_reserved);

	ASSERT3U(ABS((int64_t)(new_bytes - old_bytes)), <=, ABS(delta));
	return (new_bytes - old_bytes);
}
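/*
 * For example, with ds_reserved = 100 and ds_unique_bytes = 60, a
 * delta of +30 gives old_bytes = MAX(60, 100) = 100 and new_bytes =
 * MAX(90, 100) = 100, so nothing further is propagated to the parent:
 * the refreservation already accounts for that space.  A delta of +50
 * gives new_bytes = MAX(110, 100) = 110, so only the 10 bytes in
 * excess of the reservation are charged to the ancestors.
 */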
void
dsl_dataset_block_born(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx)
{
	int used = bp_get_dsize_sync(tx->tx_pool->dp_spa, bp);
	int compressed = BP_GET_PSIZE(bp);
	int uncompressed = BP_GET_UCSIZE(bp);
	int64_t delta;

	dprintf_bp(bp, "ds=%p", ds);

	ASSERT(dmu_tx_is_syncing(tx));
	/* It could have been compressed away to nothing */
	if (BP_IS_HOLE(bp))
		return;
	ASSERT(BP_GET_TYPE(bp) != DMU_OT_NONE);
	ASSERT3U(BP_GET_TYPE(bp), <, DMU_OT_NUMTYPES);
	if (ds == NULL) {
		/*
		 * Account for the meta-objset space in its placeholder
		 * dsl_dir.
		 */
		ASSERT3U(compressed, ==, uncompressed); /* it's all metadata */
		dsl_dir_diduse_space(tx->tx_pool->dp_mos_dir, DD_USED_HEAD,
		    used, compressed, uncompressed, tx);
		dsl_dir_dirty(tx->tx_pool->dp_mos_dir, tx);
		return;
	}
	dmu_buf_will_dirty(ds->ds_dbuf, tx);

	mutex_enter(&ds->ds_dir->dd_lock);
	mutex_enter(&ds->ds_lock);
	delta = parent_delta(ds, used);
	ds->ds_phys->ds_used_bytes += used;
	ds->ds_phys->ds_compressed_bytes += compressed;
	ds->ds_phys->ds_uncompressed_bytes += uncompressed;
	ds->ds_phys->ds_unique_bytes += used;
	mutex_exit(&ds->ds_lock);
	dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD, delta,
	    compressed, uncompressed, tx);
	dsl_dir_transfer_space(ds->ds_dir, used - delta,
	    DD_USED_REFRSRV, DD_USED_HEAD, tx);
	mutex_exit(&ds->ds_dir->dd_lock);
}

int
dsl_dataset_block_kill(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx,
    boolean_t async)
{
	if (BP_IS_HOLE(bp))
		return (0);

	ASSERT(dmu_tx_is_syncing(tx));
	ASSERT(bp->blk_birth <= tx->tx_txg);

	int used = bp_get_dsize_sync(tx->tx_pool->dp_spa, bp);
	int compressed = BP_GET_PSIZE(bp);
	int uncompressed = BP_GET_UCSIZE(bp);

	ASSERT(used > 0);
	if (ds == NULL) {
		/*
		 * Account for the meta-objset space in its placeholder
		 * dataset.
		 */
		dsl_free(tx->tx_pool, tx->tx_txg, bp);

		dsl_dir_diduse_space(tx->tx_pool->dp_mos_dir, DD_USED_HEAD,
		    -used, -compressed, -uncompressed, tx);
		dsl_dir_dirty(tx->tx_pool->dp_mos_dir, tx);
		return (used);
	}
	ASSERT3P(tx->tx_pool, ==, ds->ds_dir->dd_pool);

	ASSERT(!dsl_dataset_is_snapshot(ds));
	dmu_buf_will_dirty(ds->ds_dbuf, tx);

	if (bp->blk_birth > ds->ds_phys->ds_prev_snap_txg) {
		int64_t delta;

		dprintf_bp(bp, "freeing ds=%llu", ds->ds_object);
		dsl_free(tx->tx_pool, tx->tx_txg, bp);

		mutex_enter(&ds->ds_dir->dd_lock);
		mutex_enter(&ds->ds_lock);
		ASSERT(ds->ds_phys->ds_unique_bytes >= used ||
		    !DS_UNIQUE_IS_ACCURATE(ds));
		delta = parent_delta(ds, -used);
		ds->ds_phys->ds_unique_bytes -= used;
		mutex_exit(&ds->ds_lock);
		dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD,
		    delta, -compressed, -uncompressed, tx);
		dsl_dir_transfer_space(ds->ds_dir, -used - delta,
		    DD_USED_REFRSRV, DD_USED_HEAD, tx);
		mutex_exit(&ds->ds_dir->dd_lock);
	} else {
		dprintf_bp(bp, "putting on dead list: %s", "");
		if (async) {
			/*
			 * We are here as part of zio's write done callback,
			 * which means we're a zio interrupt thread.  We can't
			 * call dsl_deadlist_insert() now because it may block
			 * waiting for I/O.  Instead, put bp on the deferred
			 * queue and let dsl_pool_sync() finish the job.
			 */
			bplist_append(&ds->ds_pending_deadlist, bp);
		} else {
			dsl_deadlist_insert(&ds->ds_deadlist, bp, tx);
		}
		ASSERT3U(ds->ds_prev->ds_object, ==,
		    ds->ds_phys->ds_prev_snap_obj);
		ASSERT(ds->ds_prev->ds_phys->ds_num_children > 0);
		/* if (bp->blk_birth > prev prev snap txg) prev unique += bs */
		if (ds->ds_prev->ds_phys->ds_next_snap_obj ==
		    ds->ds_object && bp->blk_birth >
		    ds->ds_prev->ds_phys->ds_prev_snap_txg) {
			dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx);
			mutex_enter(&ds->ds_prev->ds_lock);
			ds->ds_prev->ds_phys->ds_unique_bytes += used;
			mutex_exit(&ds->ds_prev->ds_lock);
		}
		if (bp->blk_birth > ds->ds_dir->dd_origin_txg) {
			dsl_dir_transfer_space(ds->ds_dir, used,
			    DD_USED_HEAD, DD_USED_SNAP, tx);
		}
	}
	mutex_enter(&ds->ds_lock);
	ASSERT3U(ds->ds_phys->ds_used_bytes, >=, used);
	ds->ds_phys->ds_used_bytes -= used;
	ASSERT3U(ds->ds_phys->ds_compressed_bytes, >=, compressed);
	ds->ds_phys->ds_compressed_bytes -= compressed;
	ASSERT3U(ds->ds_phys->ds_uncompressed_bytes, >=, uncompressed);
	ds->ds_phys->ds_uncompressed_bytes -= uncompressed;
	mutex_exit(&ds->ds_lock);

	return (used);
}
uint64_t
dsl_dataset_prev_snap_txg(dsl_dataset_t *ds)
{
	uint64_t trysnap = 0;

	if (ds == NULL)
		return (0);
	/*
	 * The snapshot creation could fail, but that would cause an
	 * incorrect FALSE return, which would only result in an
	 * overestimation of the amount of space that an operation would
	 * consume, which is OK.
	 *
	 * There's also a small window where we could miss a pending
	 * snapshot, because we could set the sync task in the quiescing
	 * phase.  So this should only be used as a guess.
	 */
	if (ds->ds_trysnap_txg >
	    spa_last_synced_txg(ds->ds_dir->dd_pool->dp_spa))
		trysnap = ds->ds_trysnap_txg;
	return (MAX(ds->ds_phys->ds_prev_snap_txg, trysnap));
}

boolean_t
dsl_dataset_block_freeable(dsl_dataset_t *ds, const blkptr_t *bp,
    uint64_t blk_birth)
{
	if (blk_birth <= dsl_dataset_prev_snap_txg(ds))
		return (B_FALSE);

	ddt_prefetch(dsl_dataset_get_spa(ds), bp);

	return (B_TRUE);
}

/* ARGSUSED */
static void
dsl_dataset_evict(dmu_buf_t *db, void *dsv)
{
	dsl_dataset_t *ds = dsv;

	ASSERT(ds->ds_owner == NULL || DSL_DATASET_IS_DESTROYED(ds));

	unique_remove(ds->ds_fsid_guid);

	if (ds->ds_objset != NULL)
		dmu_objset_evict(ds->ds_objset);

	if (ds->ds_prev) {
		dsl_dataset_drop_ref(ds->ds_prev, ds);
		ds->ds_prev = NULL;
	}

	bplist_destroy(&ds->ds_pending_deadlist);
	if (db != NULL) {
		dsl_deadlist_close(&ds->ds_deadlist);
	} else {
		ASSERT(ds->ds_deadlist.dl_dbuf == NULL);
		ASSERT(!ds->ds_deadlist.dl_oldfmt);
	}
	if (ds->ds_dir)
		dsl_dir_close(ds->ds_dir, ds);

	ASSERT(!list_link_active(&ds->ds_synced_link));

	mutex_destroy(&ds->ds_lock);
	mutex_destroy(&ds->ds_recvlock);
	mutex_destroy(&ds->ds_opening_lock);
	rw_destroy(&ds->ds_rwlock);
	cv_destroy(&ds->ds_exclusive_cv);

	kmem_free(ds, sizeof (dsl_dataset_t));
}

static int
dsl_dataset_get_snapname(dsl_dataset_t *ds)
{
	dsl_dataset_phys_t *headphys;
	int err;
	dmu_buf_t *headdbuf;
	dsl_pool_t *dp = ds->ds_dir->dd_pool;
	objset_t *mos = dp->dp_meta_objset;

	if (ds->ds_snapname[0])
		return (0);
	if (ds->ds_phys->ds_next_snap_obj == 0)
		return (0);

	err = dmu_bonus_hold(mos, ds->ds_dir->dd_phys->dd_head_dataset_obj,
	    FTAG, &headdbuf);
	if (err)
		return (err);
	headphys = headdbuf->db_data;
	err = zap_value_search(dp->dp_meta_objset,
	    headphys->ds_snapnames_zapobj, ds->ds_object, 0, ds->ds_snapname);
	dmu_buf_rele(headdbuf, FTAG);
	return (err);
}

static int
dsl_dataset_snap_lookup(dsl_dataset_t *ds, const char *name, uint64_t *value)
{
	objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
	uint64_t snapobj = ds->ds_phys->ds_snapnames_zapobj;
	matchtype_t mt;
	int err;

	if (ds->ds_phys->ds_flags & DS_FLAG_CI_DATASET)
		mt = MT_FIRST;
	else
		mt = MT_EXACT;

	err = zap_lookup_norm(mos, snapobj, name, 8, 1,
	    value, mt, NULL, 0, NULL);
	if (err == ENOTSUP && mt == MT_FIRST)
		err = zap_lookup(mos, snapobj, name, 8, 1, value);
	return (err);
}

static int
dsl_dataset_snap_remove(dsl_dataset_t *ds, char *name, dmu_tx_t *tx)
{
	objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
	uint64_t snapobj = ds->ds_phys->ds_snapnames_zapobj;
	matchtype_t mt;
	int err;

	dsl_dir_snap_cmtime_update(ds->ds_dir);

	if (ds->ds_phys->ds_flags & DS_FLAG_CI_DATASET)
		mt = MT_FIRST;
	else
		mt = MT_EXACT;

	err = zap_remove_norm(mos, snapobj, name, mt, tx);
	if (err == ENOTSUP && mt == MT_FIRST)
		err = zap_remove(mos, snapobj, name, tx);
	return (err);
}

static int
dsl_dataset_get_ref(dsl_pool_t *dp, uint64_t dsobj, void *tag,
    dsl_dataset_t **dsp)
{
	objset_t *mos = dp->dp_meta_objset;
	dmu_buf_t *dbuf;
	dsl_dataset_t *ds;
	int err;
	dmu_object_info_t doi;

	ASSERT(RW_LOCK_HELD(&dp->dp_config_rwlock) ||
	    dsl_pool_sync_context(dp));

	err = dmu_bonus_hold(mos, dsobj, tag, &dbuf);
	if (err)
		return (err);

	/* Make sure dsobj has the correct object type. */
	dmu_object_info_from_db(dbuf, &doi);
	if (doi.doi_type != DMU_OT_DSL_DATASET) {
		/* Drop the bonus hold taken above before bailing out. */
		dmu_buf_rele(dbuf, tag);
		return (EINVAL);
	}

	ds = dmu_buf_get_user(dbuf);
	if (ds == NULL) {
		dsl_dataset_t *winner;

		ds = kmem_zalloc(sizeof (dsl_dataset_t), KM_SLEEP);
		ds->ds_dbuf = dbuf;
		ds->ds_object = dsobj;
		ds->ds_phys = dbuf->db_data;

		mutex_init(&ds->ds_lock, NULL, MUTEX_DEFAULT, NULL);
		mutex_init(&ds->ds_recvlock, NULL, MUTEX_DEFAULT, NULL);
		mutex_init(&ds->ds_opening_lock, NULL, MUTEX_DEFAULT, NULL);
		mutex_init(&ds->ds_sendstream_lock, NULL, MUTEX_DEFAULT, NULL);

		rw_init(&ds->ds_rwlock, 0, 0, 0);
		cv_init(&ds->ds_exclusive_cv, NULL, CV_DEFAULT, NULL);

		bplist_create(&ds->ds_pending_deadlist);
		dsl_deadlist_open(&ds->ds_deadlist,
		    mos, ds->ds_phys->ds_deadlist_obj);

		list_create(&ds->ds_sendstreams, sizeof (dmu_sendarg_t),
		    offsetof(dmu_sendarg_t, dsa_link));

		if (err == 0) {
			err = dsl_dir_open_obj(dp,
			    ds->ds_phys->ds_dir_obj, NULL, ds, &ds->ds_dir);
		}
		if (err) {
			mutex_destroy(&ds->ds_lock);
			mutex_destroy(&ds->ds_recvlock);
			mutex_destroy(&ds->ds_opening_lock);
			rw_destroy(&ds->ds_rwlock);
			cv_destroy(&ds->ds_exclusive_cv);
			bplist_destroy(&ds->ds_pending_deadlist);
			dsl_deadlist_close(&ds->ds_deadlist);
			kmem_free(ds, sizeof (dsl_dataset_t));
			dmu_buf_rele(dbuf, tag);
			return (err);
		}

		if (!dsl_dataset_is_snapshot(ds)) {
			ds->ds_snapname[0] = '\0';
			if (ds->ds_phys->ds_prev_snap_obj) {
				err = dsl_dataset_get_ref(dp,
				    ds->ds_phys->ds_prev_snap_obj,
				    ds, &ds->ds_prev);
			}
		} else {
			if (zfs_flags & ZFS_DEBUG_SNAPNAMES)
				err = dsl_dataset_get_snapname(ds);
			if (err == 0 && ds->ds_phys->ds_userrefs_obj != 0) {
				err = zap_count(
				    ds->ds_dir->dd_pool->dp_meta_objset,
				    ds->ds_phys->ds_userrefs_obj,
				    &ds->ds_userrefs);
			}
		}

		if (err == 0 && !dsl_dataset_is_snapshot(ds)) {
			/*
			 * In sync context, we're called with either no lock
			 * or with the write lock.  If we're not syncing,
			 * we're always called with the read lock held.
			 */
			boolean_t need_lock =
			    !RW_WRITE_HELD(&dp->dp_config_rwlock) &&
			    dsl_pool_sync_context(dp);

			if (need_lock)
				rw_enter(&dp->dp_config_rwlock, RW_READER);

			err = dsl_prop_get_ds(ds,
			    "refreservation", sizeof (uint64_t), 1,
			    &ds->ds_reserved, NULL);
			if (err == 0) {
				err = dsl_prop_get_ds(ds,
				    "refquota", sizeof (uint64_t), 1,
				    &ds->ds_quota, NULL);
			}

			if (need_lock)
				rw_exit(&dp->dp_config_rwlock);
		} else {
			ds->ds_reserved = ds->ds_quota = 0;
		}

		if (err == 0) {
			winner = dmu_buf_set_user_ie(dbuf, ds, &ds->ds_phys,
			    dsl_dataset_evict);
		}
		if (err || winner) {
			bplist_destroy(&ds->ds_pending_deadlist);
			dsl_deadlist_close(&ds->ds_deadlist);
			if (ds->ds_prev)
				dsl_dataset_drop_ref(ds->ds_prev, ds);
			dsl_dir_close(ds->ds_dir, ds);
			mutex_destroy(&ds->ds_lock);
			mutex_destroy(&ds->ds_recvlock);
			mutex_destroy(&ds->ds_opening_lock);
			rw_destroy(&ds->ds_rwlock);
			cv_destroy(&ds->ds_exclusive_cv);
			kmem_free(ds, sizeof (dsl_dataset_t));
			if (err) {
				dmu_buf_rele(dbuf, tag);
				return (err);
			}
			/*
			 * Another thread attached its dsl_dataset_t to
			 * this dbuf first; discard ours and use theirs.
			 */
			ds = winner;
		} else {
			ds->ds_fsid_guid =
			    unique_insert(ds->ds_phys->ds_fsid_guid);
		}
	}
	ASSERT3P(ds->ds_dbuf, ==, dbuf);
	ASSERT3P(ds->ds_phys, ==, dbuf->db_data);
	ASSERT(ds->ds_phys->ds_prev_snap_obj != 0 ||
	    spa_version(dp->dp_spa) < SPA_VERSION_ORIGIN ||
	    dp->dp_origin_snap == NULL || ds == dp->dp_origin_snap);
	mutex_enter(&ds->ds_lock);
	if (!dsl_pool_sync_context(dp) && DSL_DATASET_IS_DESTROYED(ds)) {
		mutex_exit(&ds->ds_lock);
		dmu_buf_rele(ds->ds_dbuf, tag);
		return (ENOENT);
	}
	mutex_exit(&ds->ds_lock);
	*dsp = ds;
	return (0);
}

static int
dsl_dataset_hold_ref(dsl_dataset_t *ds, void *tag)
{
	dsl_pool_t *dp = ds->ds_dir->dd_pool;

	/*
	 * In syncing context we don't want the rwlock: there
	 * may be an existing writer waiting for sync phase to
	 * finish.  We don't need to worry about such writers, since
	 * sync phase is single-threaded, so the writer can't be
	 * doing anything while we are active.
	 */
	if (dsl_pool_sync_context(dp)) {
		ASSERT(!DSL_DATASET_IS_DESTROYED(ds));
		return (0);
	}

	/*
	 * Normal users will hold the ds_rwlock as a READER until they
	 * are finished (i.e., call dsl_dataset_rele()).  "Owners" will
	 * drop their READER lock after they set the ds_owner field.
	 *
	 * If the dataset is being destroyed, the destroy thread will
	 * obtain a WRITER lock for exclusive access after it's done its
	 * open-context work and then change the ds_owner to
	 * dsl_reaper once destruction is assured.  So threads
	 * may block here temporarily, until the "destructability" of
	 * the dataset is determined.
	 */
	ASSERT(!RW_WRITE_HELD(&dp->dp_config_rwlock));
	mutex_enter(&ds->ds_lock);
	while (!rw_tryenter(&ds->ds_rwlock, RW_READER)) {
		rw_exit(&dp->dp_config_rwlock);
		cv_wait(&ds->ds_exclusive_cv, &ds->ds_lock);
		if (DSL_DATASET_IS_DESTROYED(ds)) {
			mutex_exit(&ds->ds_lock);
			dsl_dataset_drop_ref(ds, tag);
			rw_enter(&dp->dp_config_rwlock, RW_READER);
			return (ENOENT);
		}
		/*
		 * The dp_config_rwlock lives above the ds_lock.  And
		 * we need to check DSL_DATASET_IS_DESTROYED() while
		 * holding the ds_lock, so we have to drop and reacquire
		 * the ds_lock here.
		 */
		mutex_exit(&ds->ds_lock);
		rw_enter(&dp->dp_config_rwlock, RW_READER);
		mutex_enter(&ds->ds_lock);
	}
	mutex_exit(&ds->ds_lock);
	return (0);
}
682 */ 683 if (!MUTEX_HELD(&ds->ds_lock)) { 684 mutex_enter(&ds->ds_lock); 685 (void) strcat(name, ds->ds_snapname); 686 mutex_exit(&ds->ds_lock); 687 } else { 688 (void) strcat(name, ds->ds_snapname); 689 } 690 } 691 } 692 } 693 694 static int 695 dsl_dataset_namelen(dsl_dataset_t *ds) 696 { 697 int result; 698 699 if (ds == NULL) { 700 result = 3; /* "mos" */ 701 } else { 702 result = dsl_dir_namelen(ds->ds_dir); 703 VERIFY(0 == dsl_dataset_get_snapname(ds)); 704 if (ds->ds_snapname[0]) { 705 ++result; /* adding one for the @-sign */ 706 if (!MUTEX_HELD(&ds->ds_lock)) { 707 mutex_enter(&ds->ds_lock); 708 result += strlen(ds->ds_snapname); 709 mutex_exit(&ds->ds_lock); 710 } else { 711 result += strlen(ds->ds_snapname); 712 } 713 } 714 } 715 716 return (result); 717 } 718 719 void 720 dsl_dataset_drop_ref(dsl_dataset_t *ds, void *tag) 721 { 722 dmu_buf_rele(ds->ds_dbuf, tag); 723 } 724 725 void 726 dsl_dataset_rele(dsl_dataset_t *ds, void *tag) 727 { 728 if (!dsl_pool_sync_context(ds->ds_dir->dd_pool)) { 729 rw_exit(&ds->ds_rwlock); 730 } 731 dsl_dataset_drop_ref(ds, tag); 732 } 733 734 void 735 dsl_dataset_disown(dsl_dataset_t *ds, void *tag) 736 { 737 ASSERT((ds->ds_owner == tag && ds->ds_dbuf) || 738 (DSL_DATASET_IS_DESTROYED(ds) && ds->ds_dbuf == NULL)); 739 740 mutex_enter(&ds->ds_lock); 741 ds->ds_owner = NULL; 742 if (RW_WRITE_HELD(&ds->ds_rwlock)) { 743 rw_exit(&ds->ds_rwlock); 744 cv_broadcast(&ds->ds_exclusive_cv); 745 } 746 mutex_exit(&ds->ds_lock); 747 if (ds->ds_dbuf) 748 dsl_dataset_drop_ref(ds, tag); 749 else 750 dsl_dataset_evict(NULL, ds); 751 } 752 753 boolean_t 754 dsl_dataset_tryown(dsl_dataset_t *ds, boolean_t inconsistentok, void *tag) 755 { 756 boolean_t gotit = FALSE; 757 758 mutex_enter(&ds->ds_lock); 759 if (ds->ds_owner == NULL && 760 (!DS_IS_INCONSISTENT(ds) || inconsistentok)) { 761 ds->ds_owner = tag; 762 if (!dsl_pool_sync_context(ds->ds_dir->dd_pool)) 763 rw_exit(&ds->ds_rwlock); 764 gotit = TRUE; 765 } 766 mutex_exit(&ds->ds_lock); 767 return (gotit); 768 } 769 770 void 771 dsl_dataset_make_exclusive(dsl_dataset_t *ds, void *owner) 772 { 773 ASSERT3P(owner, ==, ds->ds_owner); 774 if (!RW_WRITE_HELD(&ds->ds_rwlock)) 775 rw_enter(&ds->ds_rwlock, RW_WRITER); 776 } 777 778 uint64_t 779 dsl_dataset_create_sync_dd(dsl_dir_t *dd, dsl_dataset_t *origin, 780 uint64_t flags, dmu_tx_t *tx) 781 { 782 dsl_pool_t *dp = dd->dd_pool; 783 dmu_buf_t *dbuf; 784 dsl_dataset_phys_t *dsphys; 785 uint64_t dsobj; 786 objset_t *mos = dp->dp_meta_objset; 787 788 if (origin == NULL) 789 origin = dp->dp_origin_snap; 790 791 ASSERT(origin == NULL || origin->ds_dir->dd_pool == dp); 792 ASSERT(origin == NULL || origin->ds_phys->ds_num_children > 0); 793 ASSERT(dmu_tx_is_syncing(tx)); 794 ASSERT(dd->dd_phys->dd_head_dataset_obj == 0); 795 796 dsobj = dmu_object_alloc(mos, DMU_OT_DSL_DATASET, 0, 797 DMU_OT_DSL_DATASET, sizeof (dsl_dataset_phys_t), tx); 798 VERIFY(0 == dmu_bonus_hold(mos, dsobj, FTAG, &dbuf)); 799 dmu_buf_will_dirty(dbuf, tx); 800 dsphys = dbuf->db_data; 801 bzero(dsphys, sizeof (dsl_dataset_phys_t)); 802 dsphys->ds_dir_obj = dd->dd_object; 803 dsphys->ds_flags = flags; 804 dsphys->ds_fsid_guid = unique_create(); 805 (void) random_get_pseudo_bytes((void*)&dsphys->ds_guid, 806 sizeof (dsphys->ds_guid)); 807 dsphys->ds_snapnames_zapobj = 808 zap_create_norm(mos, U8_TEXTPREP_TOUPPER, DMU_OT_DSL_DS_SNAP_MAP, 809 DMU_OT_NONE, 0, tx); 810 dsphys->ds_creation_time = gethrestime_sec(); 811 dsphys->ds_creation_txg = tx->tx_txg == TXG_INITIAL ? 
1 : tx->tx_txg; 812 813 if (origin == NULL) { 814 dsphys->ds_deadlist_obj = dsl_deadlist_alloc(mos, tx); 815 } else { 816 dsl_dataset_t *ohds; 817 818 dsphys->ds_prev_snap_obj = origin->ds_object; 819 dsphys->ds_prev_snap_txg = 820 origin->ds_phys->ds_creation_txg; 821 dsphys->ds_used_bytes = 822 origin->ds_phys->ds_used_bytes; 823 dsphys->ds_compressed_bytes = 824 origin->ds_phys->ds_compressed_bytes; 825 dsphys->ds_uncompressed_bytes = 826 origin->ds_phys->ds_uncompressed_bytes; 827 dsphys->ds_bp = origin->ds_phys->ds_bp; 828 dsphys->ds_flags |= origin->ds_phys->ds_flags; 829 830 dmu_buf_will_dirty(origin->ds_dbuf, tx); 831 origin->ds_phys->ds_num_children++; 832 833 VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, 834 origin->ds_dir->dd_phys->dd_head_dataset_obj, FTAG, &ohds)); 835 dsphys->ds_deadlist_obj = dsl_deadlist_clone(&ohds->ds_deadlist, 836 dsphys->ds_prev_snap_txg, dsphys->ds_prev_snap_obj, tx); 837 dsl_dataset_rele(ohds, FTAG); 838 839 if (spa_version(dp->dp_spa) >= SPA_VERSION_NEXT_CLONES) { 840 if (origin->ds_phys->ds_next_clones_obj == 0) { 841 origin->ds_phys->ds_next_clones_obj = 842 zap_create(mos, 843 DMU_OT_NEXT_CLONES, DMU_OT_NONE, 0, tx); 844 } 845 VERIFY(0 == zap_add_int(mos, 846 origin->ds_phys->ds_next_clones_obj, 847 dsobj, tx)); 848 } 849 850 dmu_buf_will_dirty(dd->dd_dbuf, tx); 851 dd->dd_phys->dd_origin_obj = origin->ds_object; 852 if (spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) { 853 if (origin->ds_dir->dd_phys->dd_clones == 0) { 854 dmu_buf_will_dirty(origin->ds_dir->dd_dbuf, tx); 855 origin->ds_dir->dd_phys->dd_clones = 856 zap_create(mos, 857 DMU_OT_DSL_CLONES, DMU_OT_NONE, 0, tx); 858 } 859 VERIFY3U(0, ==, zap_add_int(mos, 860 origin->ds_dir->dd_phys->dd_clones, dsobj, tx)); 861 } 862 } 863 864 if (spa_version(dp->dp_spa) >= SPA_VERSION_UNIQUE_ACCURATE) 865 dsphys->ds_flags |= DS_FLAG_UNIQUE_ACCURATE; 866 867 dmu_buf_rele(dbuf, FTAG); 868 869 dmu_buf_will_dirty(dd->dd_dbuf, tx); 870 dd->dd_phys->dd_head_dataset_obj = dsobj; 871 872 return (dsobj); 873 } 874 875 uint64_t 876 dsl_dataset_create_sync(dsl_dir_t *pdd, const char *lastname, 877 dsl_dataset_t *origin, uint64_t flags, cred_t *cr, dmu_tx_t *tx) 878 { 879 dsl_pool_t *dp = pdd->dd_pool; 880 uint64_t dsobj, ddobj; 881 dsl_dir_t *dd; 882 883 ASSERT(lastname[0] != '@'); 884 885 ddobj = dsl_dir_create_sync(dp, pdd, lastname, tx); 886 VERIFY(0 == dsl_dir_open_obj(dp, ddobj, lastname, FTAG, &dd)); 887 888 dsobj = dsl_dataset_create_sync_dd(dd, origin, flags, tx); 889 890 dsl_deleg_set_create_perms(dd, tx, cr); 891 892 dsl_dir_close(dd, FTAG); 893 894 /* 895 * If we are creating a clone, make sure we zero out any stale 896 * data from the origin snapshots zil header. 897 */ 898 if (origin != NULL) { 899 dsl_dataset_t *ds; 900 objset_t *os; 901 902 VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds)); 903 VERIFY3U(0, ==, dmu_objset_from_ds(ds, &os)); 904 bzero(&os->os_zil_header, sizeof (os->os_zil_header)); 905 dsl_dataset_dirty(ds, tx); 906 dsl_dataset_rele(ds, FTAG); 907 } 908 909 return (dsobj); 910 } 911 912 /* 913 * The snapshots must all be in the same pool. 
914 */ 915 int 916 dmu_snapshots_destroy_nvl(nvlist_t *snaps, boolean_t defer, char *failed) 917 { 918 int err; 919 dsl_sync_task_t *dst; 920 spa_t *spa; 921 nvpair_t *pair; 922 dsl_sync_task_group_t *dstg; 923 924 pair = nvlist_next_nvpair(snaps, NULL); 925 if (pair == NULL) 926 return (0); 927 928 err = spa_open(nvpair_name(pair), &spa, FTAG); 929 if (err) 930 return (err); 931 dstg = dsl_sync_task_group_create(spa_get_dsl(spa)); 932 933 for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL; 934 pair = nvlist_next_nvpair(snaps, pair)) { 935 dsl_dataset_t *ds; 936 int err; 937 938 err = dsl_dataset_own(nvpair_name(pair), B_TRUE, dstg, &ds); 939 if (err == 0) { 940 struct dsl_ds_destroyarg *dsda; 941 942 dsl_dataset_make_exclusive(ds, dstg); 943 dsda = kmem_zalloc(sizeof (struct dsl_ds_destroyarg), 944 KM_SLEEP); 945 dsda->ds = ds; 946 dsda->defer = defer; 947 dsl_sync_task_create(dstg, dsl_dataset_destroy_check, 948 dsl_dataset_destroy_sync, dsda, dstg, 0); 949 } else if (err == ENOENT) { 950 err = 0; 951 } else { 952 (void) strcpy(failed, nvpair_name(pair)); 953 break; 954 } 955 } 956 957 if (err == 0) 958 err = dsl_sync_task_group_wait(dstg); 959 960 for (dst = list_head(&dstg->dstg_tasks); dst; 961 dst = list_next(&dstg->dstg_tasks, dst)) { 962 struct dsl_ds_destroyarg *dsda = dst->dst_arg1; 963 dsl_dataset_t *ds = dsda->ds; 964 965 /* 966 * Return the file system name that triggered the error 967 */ 968 if (dst->dst_err) { 969 dsl_dataset_name(ds, failed); 970 } 971 ASSERT3P(dsda->rm_origin, ==, NULL); 972 dsl_dataset_disown(ds, dstg); 973 kmem_free(dsda, sizeof (struct dsl_ds_destroyarg)); 974 } 975 976 dsl_sync_task_group_destroy(dstg); 977 spa_close(spa, FTAG); 978 return (err); 979 980 } 981 982 static boolean_t 983 dsl_dataset_might_destroy_origin(dsl_dataset_t *ds) 984 { 985 boolean_t might_destroy = B_FALSE; 986 987 mutex_enter(&ds->ds_lock); 988 if (ds->ds_phys->ds_num_children == 2 && ds->ds_userrefs == 0 && 989 DS_IS_DEFER_DESTROY(ds)) 990 might_destroy = B_TRUE; 991 mutex_exit(&ds->ds_lock); 992 993 return (might_destroy); 994 } 995 996 /* 997 * If we're removing a clone, and these three conditions are true: 998 * 1) the clone's origin has no other children 999 * 2) the clone's origin has no user references 1000 * 3) the clone's origin has been marked for deferred destruction 1001 * Then, prepare to remove the origin as part of this sync task group. 1002 */ 1003 static int 1004 dsl_dataset_origin_rm_prep(struct dsl_ds_destroyarg *dsda, void *tag) 1005 { 1006 dsl_dataset_t *ds = dsda->ds; 1007 dsl_dataset_t *origin = ds->ds_prev; 1008 1009 if (dsl_dataset_might_destroy_origin(origin)) { 1010 char *name; 1011 int namelen; 1012 int error; 1013 1014 namelen = dsl_dataset_namelen(origin) + 1; 1015 name = kmem_alloc(namelen, KM_SLEEP); 1016 dsl_dataset_name(origin, name); 1017 #ifdef _KERNEL 1018 error = zfs_unmount_snap(name, NULL); 1019 if (error) { 1020 kmem_free(name, namelen); 1021 return (error); 1022 } 1023 #endif 1024 error = dsl_dataset_own(name, B_TRUE, tag, &origin); 1025 kmem_free(name, namelen); 1026 if (error) 1027 return (error); 1028 dsda->rm_origin = origin; 1029 dsl_dataset_make_exclusive(origin, tag); 1030 } 1031 1032 return (0); 1033 } 1034 1035 /* 1036 * ds must be opened as OWNER. On return (whether successful or not), 1037 * ds will be closed and caller can no longer dereference it. 
1038 */ 1039 int 1040 dsl_dataset_destroy(dsl_dataset_t *ds, void *tag, boolean_t defer) 1041 { 1042 int err; 1043 dsl_sync_task_group_t *dstg; 1044 objset_t *os; 1045 dsl_dir_t *dd; 1046 uint64_t obj; 1047 struct dsl_ds_destroyarg dsda = { 0 }; 1048 dsl_dataset_t dummy_ds = { 0 }; 1049 1050 dsda.ds = ds; 1051 1052 if (dsl_dataset_is_snapshot(ds)) { 1053 /* Destroying a snapshot is simpler */ 1054 dsl_dataset_make_exclusive(ds, tag); 1055 1056 dsda.defer = defer; 1057 err = dsl_sync_task_do(ds->ds_dir->dd_pool, 1058 dsl_dataset_destroy_check, dsl_dataset_destroy_sync, 1059 &dsda, tag, 0); 1060 ASSERT3P(dsda.rm_origin, ==, NULL); 1061 goto out; 1062 } else if (defer) { 1063 err = EINVAL; 1064 goto out; 1065 } 1066 1067 dd = ds->ds_dir; 1068 dummy_ds.ds_dir = dd; 1069 dummy_ds.ds_object = ds->ds_object; 1070 1071 /* 1072 * Check for errors and mark this ds as inconsistent, in 1073 * case we crash while freeing the objects. 1074 */ 1075 err = dsl_sync_task_do(dd->dd_pool, dsl_dataset_destroy_begin_check, 1076 dsl_dataset_destroy_begin_sync, ds, NULL, 0); 1077 if (err) 1078 goto out; 1079 1080 err = dmu_objset_from_ds(ds, &os); 1081 if (err) 1082 goto out; 1083 1084 /* 1085 * remove the objects in open context, so that we won't 1086 * have too much to do in syncing context. 1087 */ 1088 for (obj = 0; err == 0; err = dmu_object_next(os, &obj, FALSE, 1089 ds->ds_phys->ds_prev_snap_txg)) { 1090 /* 1091 * Ignore errors, if there is not enough disk space 1092 * we will deal with it in dsl_dataset_destroy_sync(). 1093 */ 1094 (void) dmu_free_object(os, obj); 1095 } 1096 if (err != ESRCH) 1097 goto out; 1098 1099 /* 1100 * Only the ZIL knows how to free log blocks. 1101 */ 1102 zil_destroy(dmu_objset_zil(os), B_FALSE); 1103 1104 /* 1105 * Sync out all in-flight IO. 1106 */ 1107 txg_wait_synced(dd->dd_pool, 0); 1108 1109 /* 1110 * If we managed to free all the objects in open 1111 * context, the user space accounting should be zero. 1112 */ 1113 if (ds->ds_phys->ds_bp.blk_fill == 0 && 1114 dmu_objset_userused_enabled(os)) { 1115 uint64_t count; 1116 1117 ASSERT(zap_count(os, DMU_USERUSED_OBJECT, &count) != 0 || 1118 count == 0); 1119 ASSERT(zap_count(os, DMU_GROUPUSED_OBJECT, &count) != 0 || 1120 count == 0); 1121 } 1122 1123 rw_enter(&dd->dd_pool->dp_config_rwlock, RW_READER); 1124 err = dsl_dir_open_obj(dd->dd_pool, dd->dd_object, NULL, FTAG, &dd); 1125 rw_exit(&dd->dd_pool->dp_config_rwlock); 1126 1127 if (err) 1128 goto out; 1129 1130 /* 1131 * Blow away the dsl_dir + head dataset. 1132 */ 1133 dsl_dataset_make_exclusive(ds, tag); 1134 /* 1135 * If we're removing a clone, we might also need to remove its 1136 * origin. 1137 */ 1138 do { 1139 dsda.need_prep = B_FALSE; 1140 if (dsl_dir_is_clone(dd)) { 1141 err = dsl_dataset_origin_rm_prep(&dsda, tag); 1142 if (err) { 1143 dsl_dir_close(dd, FTAG); 1144 goto out; 1145 } 1146 } 1147 1148 dstg = dsl_sync_task_group_create(ds->ds_dir->dd_pool); 1149 dsl_sync_task_create(dstg, dsl_dataset_destroy_check, 1150 dsl_dataset_destroy_sync, &dsda, tag, 0); 1151 dsl_sync_task_create(dstg, dsl_dir_destroy_check, 1152 dsl_dir_destroy_sync, &dummy_ds, FTAG, 0); 1153 err = dsl_sync_task_group_wait(dstg); 1154 dsl_sync_task_group_destroy(dstg); 1155 1156 /* 1157 * We could be racing against 'zfs release' or 'zfs destroy -d' 1158 * on the origin snap, in which case we can get EBUSY if we 1159 * needed to destroy the origin snap but were not ready to 1160 * do so. 
1161 */ 1162 if (dsda.need_prep) { 1163 ASSERT(err == EBUSY); 1164 ASSERT(dsl_dir_is_clone(dd)); 1165 ASSERT(dsda.rm_origin == NULL); 1166 } 1167 } while (dsda.need_prep); 1168 1169 if (dsda.rm_origin != NULL) 1170 dsl_dataset_disown(dsda.rm_origin, tag); 1171 1172 /* if it is successful, dsl_dir_destroy_sync will close the dd */ 1173 if (err) 1174 dsl_dir_close(dd, FTAG); 1175 out: 1176 dsl_dataset_disown(ds, tag); 1177 return (err); 1178 } 1179 1180 blkptr_t * 1181 dsl_dataset_get_blkptr(dsl_dataset_t *ds) 1182 { 1183 return (&ds->ds_phys->ds_bp); 1184 } 1185 1186 void 1187 dsl_dataset_set_blkptr(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx) 1188 { 1189 ASSERT(dmu_tx_is_syncing(tx)); 1190 /* If it's the meta-objset, set dp_meta_rootbp */ 1191 if (ds == NULL) { 1192 tx->tx_pool->dp_meta_rootbp = *bp; 1193 } else { 1194 dmu_buf_will_dirty(ds->ds_dbuf, tx); 1195 ds->ds_phys->ds_bp = *bp; 1196 } 1197 } 1198 1199 spa_t * 1200 dsl_dataset_get_spa(dsl_dataset_t *ds) 1201 { 1202 return (ds->ds_dir->dd_pool->dp_spa); 1203 } 1204 1205 void 1206 dsl_dataset_dirty(dsl_dataset_t *ds, dmu_tx_t *tx) 1207 { 1208 dsl_pool_t *dp; 1209 1210 if (ds == NULL) /* this is the meta-objset */ 1211 return; 1212 1213 ASSERT(ds->ds_objset != NULL); 1214 1215 if (ds->ds_phys->ds_next_snap_obj != 0) 1216 panic("dirtying snapshot!"); 1217 1218 dp = ds->ds_dir->dd_pool; 1219 1220 if (txg_list_add(&dp->dp_dirty_datasets, ds, tx->tx_txg) == 0) { 1221 /* up the hold count until we can be written out */ 1222 dmu_buf_add_ref(ds->ds_dbuf, ds); 1223 } 1224 } 1225 1226 /* 1227 * The unique space in the head dataset can be calculated by subtracting 1228 * the space used in the most recent snapshot, that is still being used 1229 * in this file system, from the space currently in use. To figure out 1230 * the space in the most recent snapshot still in use, we need to take 1231 * the total space used in the snapshot and subtract out the space that 1232 * has been freed up since the snapshot was taken. 1233 */ 1234 static void 1235 dsl_dataset_recalc_head_uniq(dsl_dataset_t *ds) 1236 { 1237 uint64_t mrs_used; 1238 uint64_t dlused, dlcomp, dluncomp; 1239 1240 ASSERT(!dsl_dataset_is_snapshot(ds)); 1241 1242 if (ds->ds_phys->ds_prev_snap_obj != 0) 1243 mrs_used = ds->ds_prev->ds_phys->ds_used_bytes; 1244 else 1245 mrs_used = 0; 1246 1247 dsl_deadlist_space(&ds->ds_deadlist, &dlused, &dlcomp, &dluncomp); 1248 1249 ASSERT3U(dlused, <=, mrs_used); 1250 ds->ds_phys->ds_unique_bytes = 1251 ds->ds_phys->ds_used_bytes - (mrs_used - dlused); 1252 1253 if (spa_version(ds->ds_dir->dd_pool->dp_spa) >= 1254 SPA_VERSION_UNIQUE_ACCURATE) 1255 ds->ds_phys->ds_flags |= DS_FLAG_UNIQUE_ACCURATE; 1256 } 1257 1258 struct killarg { 1259 dsl_dataset_t *ds; 1260 dmu_tx_t *tx; 1261 }; 1262 1263 /* ARGSUSED */ 1264 static int 1265 kill_blkptr(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, arc_buf_t *pbuf, 1266 const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg) 1267 { 1268 struct killarg *ka = arg; 1269 dmu_tx_t *tx = ka->tx; 1270 1271 if (bp == NULL) 1272 return (0); 1273 1274 if (zb->zb_level == ZB_ZIL_LEVEL) { 1275 ASSERT(zilog != NULL); 1276 /* 1277 * It's a block in the intent log. It has no 1278 * accounting, so just free it. 
1279 */ 1280 dsl_free(ka->tx->tx_pool, ka->tx->tx_txg, bp); 1281 } else { 1282 ASSERT(zilog == NULL); 1283 ASSERT3U(bp->blk_birth, >, ka->ds->ds_phys->ds_prev_snap_txg); 1284 (void) dsl_dataset_block_kill(ka->ds, bp, tx, B_FALSE); 1285 } 1286 1287 return (0); 1288 } 1289 1290 /* ARGSUSED */ 1291 static int 1292 dsl_dataset_destroy_begin_check(void *arg1, void *arg2, dmu_tx_t *tx) 1293 { 1294 dsl_dataset_t *ds = arg1; 1295 objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; 1296 uint64_t count; 1297 int err; 1298 1299 /* 1300 * Can't delete a head dataset if there are snapshots of it. 1301 * (Except if the only snapshots are from the branch we cloned 1302 * from.) 1303 */ 1304 if (ds->ds_prev != NULL && 1305 ds->ds_prev->ds_phys->ds_next_snap_obj == ds->ds_object) 1306 return (EBUSY); 1307 1308 /* 1309 * This is really a dsl_dir thing, but check it here so that 1310 * we'll be less likely to leave this dataset inconsistent & 1311 * nearly destroyed. 1312 */ 1313 err = zap_count(mos, ds->ds_dir->dd_phys->dd_child_dir_zapobj, &count); 1314 if (err) 1315 return (err); 1316 if (count != 0) 1317 return (EEXIST); 1318 1319 return (0); 1320 } 1321 1322 /* ARGSUSED */ 1323 static void 1324 dsl_dataset_destroy_begin_sync(void *arg1, void *arg2, dmu_tx_t *tx) 1325 { 1326 dsl_dataset_t *ds = arg1; 1327 dsl_pool_t *dp = ds->ds_dir->dd_pool; 1328 1329 /* Mark it as inconsistent on-disk, in case we crash */ 1330 dmu_buf_will_dirty(ds->ds_dbuf, tx); 1331 ds->ds_phys->ds_flags |= DS_FLAG_INCONSISTENT; 1332 1333 spa_history_log_internal(LOG_DS_DESTROY_BEGIN, dp->dp_spa, tx, 1334 "dataset = %llu", ds->ds_object); 1335 } 1336 1337 static int 1338 dsl_dataset_origin_check(struct dsl_ds_destroyarg *dsda, void *tag, 1339 dmu_tx_t *tx) 1340 { 1341 dsl_dataset_t *ds = dsda->ds; 1342 dsl_dataset_t *ds_prev = ds->ds_prev; 1343 1344 if (dsl_dataset_might_destroy_origin(ds_prev)) { 1345 struct dsl_ds_destroyarg ndsda = {0}; 1346 1347 /* 1348 * If we're not prepared to remove the origin, don't remove 1349 * the clone either. 1350 */ 1351 if (dsda->rm_origin == NULL) { 1352 dsda->need_prep = B_TRUE; 1353 return (EBUSY); 1354 } 1355 1356 ndsda.ds = ds_prev; 1357 ndsda.is_origin_rm = B_TRUE; 1358 return (dsl_dataset_destroy_check(&ndsda, tag, tx)); 1359 } 1360 1361 /* 1362 * If we're not going to remove the origin after all, 1363 * undo the open context setup. 1364 */ 1365 if (dsda->rm_origin != NULL) { 1366 dsl_dataset_disown(dsda->rm_origin, tag); 1367 dsda->rm_origin = NULL; 1368 } 1369 1370 return (0); 1371 } 1372 1373 /* 1374 * If you add new checks here, you may need to add 1375 * additional checks to the "temporary" case in 1376 * snapshot_check() in dmu_objset.c. 1377 */ 1378 /* ARGSUSED */ 1379 int 1380 dsl_dataset_destroy_check(void *arg1, void *arg2, dmu_tx_t *tx) 1381 { 1382 struct dsl_ds_destroyarg *dsda = arg1; 1383 dsl_dataset_t *ds = dsda->ds; 1384 1385 /* we have an owner hold, so noone else can destroy us */ 1386 ASSERT(!DSL_DATASET_IS_DESTROYED(ds)); 1387 1388 /* 1389 * Only allow deferred destroy on pools that support it. 1390 * NOTE: deferred destroy is only supported on snapshots. 1391 */ 1392 if (dsda->defer) { 1393 if (spa_version(ds->ds_dir->dd_pool->dp_spa) < 1394 SPA_VERSION_USERREFS) 1395 return (ENOTSUP); 1396 ASSERT(dsl_dataset_is_snapshot(ds)); 1397 return (0); 1398 } 1399 1400 /* 1401 * Can't delete a head dataset if there are snapshots of it. 1402 * (Except if the only snapshots are from the branch we cloned 1403 * from.) 
1404 */ 1405 if (ds->ds_prev != NULL && 1406 ds->ds_prev->ds_phys->ds_next_snap_obj == ds->ds_object) 1407 return (EBUSY); 1408 1409 /* 1410 * If we made changes this txg, traverse_dsl_dataset won't find 1411 * them. Try again. 1412 */ 1413 if (ds->ds_phys->ds_bp.blk_birth >= tx->tx_txg) 1414 return (EAGAIN); 1415 1416 if (dsl_dataset_is_snapshot(ds)) { 1417 /* 1418 * If this snapshot has an elevated user reference count, 1419 * we can't destroy it yet. 1420 */ 1421 if (ds->ds_userrefs > 0 && !dsda->releasing) 1422 return (EBUSY); 1423 1424 mutex_enter(&ds->ds_lock); 1425 /* 1426 * Can't delete a branch point. However, if we're destroying 1427 * a clone and removing its origin due to it having a user 1428 * hold count of 0 and having been marked for deferred destroy, 1429 * it's OK for the origin to have a single clone. 1430 */ 1431 if (ds->ds_phys->ds_num_children > 1432 (dsda->is_origin_rm ? 2 : 1)) { 1433 mutex_exit(&ds->ds_lock); 1434 return (EEXIST); 1435 } 1436 mutex_exit(&ds->ds_lock); 1437 } else if (dsl_dir_is_clone(ds->ds_dir)) { 1438 return (dsl_dataset_origin_check(dsda, arg2, tx)); 1439 } 1440 1441 /* XXX we should do some i/o error checking... */ 1442 return (0); 1443 } 1444 1445 struct refsarg { 1446 kmutex_t lock; 1447 boolean_t gone; 1448 kcondvar_t cv; 1449 }; 1450 1451 /* ARGSUSED */ 1452 static void 1453 dsl_dataset_refs_gone(dmu_buf_t *db, void *argv) 1454 { 1455 struct refsarg *arg = argv; 1456 1457 mutex_enter(&arg->lock); 1458 arg->gone = TRUE; 1459 cv_signal(&arg->cv); 1460 mutex_exit(&arg->lock); 1461 } 1462 1463 static void 1464 dsl_dataset_drain_refs(dsl_dataset_t *ds, void *tag) 1465 { 1466 struct refsarg arg; 1467 1468 mutex_init(&arg.lock, NULL, MUTEX_DEFAULT, NULL); 1469 cv_init(&arg.cv, NULL, CV_DEFAULT, NULL); 1470 arg.gone = FALSE; 1471 (void) dmu_buf_update_user(ds->ds_dbuf, ds, &arg, &ds->ds_phys, 1472 dsl_dataset_refs_gone); 1473 dmu_buf_rele(ds->ds_dbuf, tag); 1474 mutex_enter(&arg.lock); 1475 while (!arg.gone) 1476 cv_wait(&arg.cv, &arg.lock); 1477 ASSERT(arg.gone); 1478 mutex_exit(&arg.lock); 1479 ds->ds_dbuf = NULL; 1480 ds->ds_phys = NULL; 1481 mutex_destroy(&arg.lock); 1482 cv_destroy(&arg.cv); 1483 } 1484 1485 static void 1486 remove_from_next_clones(dsl_dataset_t *ds, uint64_t obj, dmu_tx_t *tx) 1487 { 1488 objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; 1489 uint64_t count; 1490 int err; 1491 1492 ASSERT(ds->ds_phys->ds_num_children >= 2); 1493 err = zap_remove_int(mos, ds->ds_phys->ds_next_clones_obj, obj, tx); 1494 /* 1495 * The err should not be ENOENT, but a bug in a previous version 1496 * of the code could cause upgrade_clones_cb() to not set 1497 * ds_next_snap_obj when it should, leading to a missing entry. 1498 * If we knew that the pool was created after 1499 * SPA_VERSION_NEXT_CLONES, we could assert that it isn't 1500 * ENOENT. However, at least we can check that we don't have 1501 * too many entries in the next_clones_obj even after failing to 1502 * remove this one. 
1503 */ 1504 if (err != ENOENT) { 1505 VERIFY3U(err, ==, 0); 1506 } 1507 ASSERT3U(0, ==, zap_count(mos, ds->ds_phys->ds_next_clones_obj, 1508 &count)); 1509 ASSERT3U(count, <=, ds->ds_phys->ds_num_children - 2); 1510 } 1511 1512 static void 1513 dsl_dataset_remove_clones_key(dsl_dataset_t *ds, uint64_t mintxg, dmu_tx_t *tx) 1514 { 1515 objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; 1516 zap_cursor_t zc; 1517 zap_attribute_t za; 1518 1519 /* 1520 * If it is the old version, dd_clones doesn't exist so we can't 1521 * find the clones, but deadlist_remove_key() is a no-op so it 1522 * doesn't matter. 1523 */ 1524 if (ds->ds_dir->dd_phys->dd_clones == 0) 1525 return; 1526 1527 for (zap_cursor_init(&zc, mos, ds->ds_dir->dd_phys->dd_clones); 1528 zap_cursor_retrieve(&zc, &za) == 0; 1529 zap_cursor_advance(&zc)) { 1530 dsl_dataset_t *clone; 1531 1532 VERIFY3U(0, ==, dsl_dataset_hold_obj(ds->ds_dir->dd_pool, 1533 za.za_first_integer, FTAG, &clone)); 1534 if (clone->ds_dir->dd_origin_txg > mintxg) { 1535 dsl_deadlist_remove_key(&clone->ds_deadlist, 1536 mintxg, tx); 1537 dsl_dataset_remove_clones_key(clone, mintxg, tx); 1538 } 1539 dsl_dataset_rele(clone, FTAG); 1540 } 1541 zap_cursor_fini(&zc); 1542 } 1543 1544 struct process_old_arg { 1545 dsl_dataset_t *ds; 1546 dsl_dataset_t *ds_prev; 1547 boolean_t after_branch_point; 1548 zio_t *pio; 1549 uint64_t used, comp, uncomp; 1550 }; 1551 1552 static int 1553 process_old_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) 1554 { 1555 struct process_old_arg *poa = arg; 1556 dsl_pool_t *dp = poa->ds->ds_dir->dd_pool; 1557 1558 if (bp->blk_birth <= poa->ds->ds_phys->ds_prev_snap_txg) { 1559 dsl_deadlist_insert(&poa->ds->ds_deadlist, bp, tx); 1560 if (poa->ds_prev && !poa->after_branch_point && 1561 bp->blk_birth > 1562 poa->ds_prev->ds_phys->ds_prev_snap_txg) { 1563 poa->ds_prev->ds_phys->ds_unique_bytes += 1564 bp_get_dsize_sync(dp->dp_spa, bp); 1565 } 1566 } else { 1567 poa->used += bp_get_dsize_sync(dp->dp_spa, bp); 1568 poa->comp += BP_GET_PSIZE(bp); 1569 poa->uncomp += BP_GET_UCSIZE(bp); 1570 dsl_free_sync(poa->pio, dp, tx->tx_txg, bp); 1571 } 1572 return (0); 1573 } 1574 1575 static void 1576 process_old_deadlist(dsl_dataset_t *ds, dsl_dataset_t *ds_prev, 1577 dsl_dataset_t *ds_next, boolean_t after_branch_point, dmu_tx_t *tx) 1578 { 1579 struct process_old_arg poa = { 0 }; 1580 dsl_pool_t *dp = ds->ds_dir->dd_pool; 1581 objset_t *mos = dp->dp_meta_objset; 1582 1583 ASSERT(ds->ds_deadlist.dl_oldfmt); 1584 ASSERT(ds_next->ds_deadlist.dl_oldfmt); 1585 1586 poa.ds = ds; 1587 poa.ds_prev = ds_prev; 1588 poa.after_branch_point = after_branch_point; 1589 poa.pio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED); 1590 VERIFY3U(0, ==, bpobj_iterate(&ds_next->ds_deadlist.dl_bpobj, 1591 process_old_cb, &poa, tx)); 1592 VERIFY3U(zio_wait(poa.pio), ==, 0); 1593 ASSERT3U(poa.used, ==, ds->ds_phys->ds_unique_bytes); 1594 1595 /* change snapused */ 1596 dsl_dir_diduse_space(ds->ds_dir, DD_USED_SNAP, 1597 -poa.used, -poa.comp, -poa.uncomp, tx); 1598 1599 /* swap next's deadlist to our deadlist */ 1600 dsl_deadlist_close(&ds->ds_deadlist); 1601 dsl_deadlist_close(&ds_next->ds_deadlist); 1602 SWITCH64(ds_next->ds_phys->ds_deadlist_obj, 1603 ds->ds_phys->ds_deadlist_obj); 1604 dsl_deadlist_open(&ds->ds_deadlist, mos, ds->ds_phys->ds_deadlist_obj); 1605 dsl_deadlist_open(&ds_next->ds_deadlist, mos, 1606 ds_next->ds_phys->ds_deadlist_obj); 1607 } 1608 1609 void 1610 dsl_dataset_destroy_sync(void *arg1, void *tag, dmu_tx_t *tx) 1611 { 1612 struct 
void
dsl_dataset_destroy_sync(void *arg1, void *tag, dmu_tx_t *tx)
{
	struct dsl_ds_destroyarg *dsda = arg1;
	dsl_dataset_t *ds = dsda->ds;
	int err;
	int after_branch_point = FALSE;
	dsl_pool_t *dp = ds->ds_dir->dd_pool;
	objset_t *mos = dp->dp_meta_objset;
	dsl_dataset_t *ds_prev = NULL;
	boolean_t wont_destroy;
	uint64_t obj;

	wont_destroy = (dsda->defer &&
	    (ds->ds_userrefs > 0 || ds->ds_phys->ds_num_children > 1));

	ASSERT(ds->ds_owner || wont_destroy);
	ASSERT(dsda->defer || ds->ds_phys->ds_num_children <= 1);
	ASSERT(ds->ds_prev == NULL ||
	    ds->ds_prev->ds_phys->ds_next_snap_obj != ds->ds_object);
	ASSERT3U(ds->ds_phys->ds_bp.blk_birth, <=, tx->tx_txg);

	if (wont_destroy) {
		ASSERT(spa_version(dp->dp_spa) >= SPA_VERSION_USERREFS);
		dmu_buf_will_dirty(ds->ds_dbuf, tx);
		ds->ds_phys->ds_flags |= DS_FLAG_DEFER_DESTROY;
		return;
	}

	/* signal any waiters that this dataset is going away */
	mutex_enter(&ds->ds_lock);
	ds->ds_owner = dsl_reaper;
	cv_broadcast(&ds->ds_exclusive_cv);
	mutex_exit(&ds->ds_lock);

	/* Remove our reservation */
	if (ds->ds_reserved != 0) {
		dsl_prop_setarg_t psa;
		uint64_t value = 0;

		dsl_prop_setarg_init_uint64(&psa, "refreservation",
		    (ZPROP_SRC_NONE | ZPROP_SRC_LOCAL | ZPROP_SRC_RECEIVED),
		    &value);
		psa.psa_effective_value = 0;	/* predict default value */

		dsl_dataset_set_reservation_sync(ds, &psa, tx);
		ASSERT3U(ds->ds_reserved, ==, 0);
	}

	ASSERT(RW_WRITE_HELD(&dp->dp_config_rwlock));

	dsl_scan_ds_destroyed(ds, tx);

	obj = ds->ds_object;

	if (ds->ds_phys->ds_prev_snap_obj != 0) {
		if (ds->ds_prev) {
			ds_prev = ds->ds_prev;
		} else {
			VERIFY(0 == dsl_dataset_hold_obj(dp,
			    ds->ds_phys->ds_prev_snap_obj, FTAG, &ds_prev));
		}
		after_branch_point =
		    (ds_prev->ds_phys->ds_next_snap_obj != obj);

		dmu_buf_will_dirty(ds_prev->ds_dbuf, tx);
		if (after_branch_point &&
		    ds_prev->ds_phys->ds_next_clones_obj != 0) {
			remove_from_next_clones(ds_prev, obj, tx);
			if (ds->ds_phys->ds_next_snap_obj != 0) {
				VERIFY(0 == zap_add_int(mos,
				    ds_prev->ds_phys->ds_next_clones_obj,
				    ds->ds_phys->ds_next_snap_obj, tx));
			}
		}
		if (after_branch_point &&
		    ds->ds_phys->ds_next_snap_obj == 0) {
			/* This clone is toast. */
			ASSERT(ds_prev->ds_phys->ds_num_children > 1);
			ds_prev->ds_phys->ds_num_children--;

			/*
			 * If the clone's origin has no other clones, no
			 * user holds, and has been marked for deferred
			 * deletion, then we should have done the necessary
			 * destroy setup for it.
			 */
			if (ds_prev->ds_phys->ds_num_children == 1 &&
			    ds_prev->ds_userrefs == 0 &&
			    DS_IS_DEFER_DESTROY(ds_prev)) {
				ASSERT3P(dsda->rm_origin, !=, NULL);
			} else {
				ASSERT3P(dsda->rm_origin, ==, NULL);
			}
		} else if (!after_branch_point) {
			ds_prev->ds_phys->ds_next_snap_obj =
			    ds->ds_phys->ds_next_snap_obj;
		}
	}

	if (dsl_dataset_is_snapshot(ds)) {
		dsl_dataset_t *ds_next;
		uint64_t old_unique;
		uint64_t used = 0, comp = 0, uncomp = 0;

		VERIFY(0 == dsl_dataset_hold_obj(dp,
		    ds->ds_phys->ds_next_snap_obj, FTAG, &ds_next));
		ASSERT3U(ds_next->ds_phys->ds_prev_snap_obj, ==, obj);

		old_unique = ds_next->ds_phys->ds_unique_bytes;

		dmu_buf_will_dirty(ds_next->ds_dbuf, tx);
		ds_next->ds_phys->ds_prev_snap_obj =
		    ds->ds_phys->ds_prev_snap_obj;
		ds_next->ds_phys->ds_prev_snap_txg =
		    ds->ds_phys->ds_prev_snap_txg;
		ASSERT3U(ds->ds_phys->ds_prev_snap_txg, ==,
		    ds_prev ? ds_prev->ds_phys->ds_creation_txg : 0);

		if (ds_next->ds_deadlist.dl_oldfmt) {
			process_old_deadlist(ds, ds_prev, ds_next,
			    after_branch_point, tx);
		} else {
			/* Adjust prev's unique space. */
			if (ds_prev && !after_branch_point) {
				dsl_deadlist_space_range(&ds_next->ds_deadlist,
				    ds_prev->ds_phys->ds_prev_snap_txg,
				    ds->ds_phys->ds_prev_snap_txg,
				    &used, &comp, &uncomp);
				ds_prev->ds_phys->ds_unique_bytes += used;
			}

			/* Adjust snapused. */
			dsl_deadlist_space_range(&ds_next->ds_deadlist,
			    ds->ds_phys->ds_prev_snap_txg, UINT64_MAX,
			    &used, &comp, &uncomp);
			dsl_dir_diduse_space(ds->ds_dir, DD_USED_SNAP,
			    -used, -comp, -uncomp, tx);

			/* Move blocks to be freed to pool's free list. */
			dsl_deadlist_move_bpobj(&ds_next->ds_deadlist,
			    &dp->dp_free_bpobj, ds->ds_phys->ds_prev_snap_txg,
			    tx);
			dsl_dir_diduse_space(tx->tx_pool->dp_free_dir,
			    DD_USED_HEAD, used, comp, uncomp, tx);
			dsl_dir_dirty(tx->tx_pool->dp_free_dir, tx);

			/* Merge our deadlist into next's and free it. */
			dsl_deadlist_merge(&ds_next->ds_deadlist,
			    ds->ds_phys->ds_deadlist_obj, tx);
		}
		dsl_deadlist_close(&ds->ds_deadlist);
		dsl_deadlist_free(mos, ds->ds_phys->ds_deadlist_obj, tx);

		/* Collapse range in clone heads */
		dsl_dataset_remove_clones_key(ds,
		    ds->ds_phys->ds_creation_txg, tx);

		if (dsl_dataset_is_snapshot(ds_next)) {
			dsl_dataset_t *ds_nextnext;
			dsl_dataset_t *hds;

			/*
			 * Update next's unique to include blocks which
			 * were previously shared by only this snapshot
			 * and it.  Those blocks will be born after the
			 * prev snap and before this snap, and will have
			 * died after the next snap and before the one
			 * after that (i.e. be on the snap after next's
			 * deadlist).
			 */
			VERIFY(0 == dsl_dataset_hold_obj(dp,
			    ds_next->ds_phys->ds_next_snap_obj,
			    FTAG, &ds_nextnext));
			dsl_deadlist_space_range(&ds_nextnext->ds_deadlist,
			    ds->ds_phys->ds_prev_snap_txg,
			    ds->ds_phys->ds_creation_txg,
			    &used, &comp, &uncomp);
			ds_next->ds_phys->ds_unique_bytes += used;
			dsl_dataset_rele(ds_nextnext, FTAG);
			ASSERT3P(ds_next->ds_prev, ==, NULL);

			/* Collapse range in this head. */
			VERIFY3U(0, ==, dsl_dataset_hold_obj(dp,
			    ds->ds_dir->dd_phys->dd_head_dataset_obj,
			    FTAG, &hds));
			dsl_deadlist_remove_key(&hds->ds_deadlist,
			    ds->ds_phys->ds_creation_txg, tx);
			dsl_dataset_rele(hds, FTAG);
		} else {
			ASSERT3P(ds_next->ds_prev, ==, ds);
			dsl_dataset_drop_ref(ds_next->ds_prev, ds_next);
			ds_next->ds_prev = NULL;
			if (ds_prev) {
				VERIFY(0 == dsl_dataset_get_ref(dp,
				    ds->ds_phys->ds_prev_snap_obj,
				    ds_next, &ds_next->ds_prev));
			}

			dsl_dataset_recalc_head_uniq(ds_next);

			/*
			 * Reduce the amount of our unconsumed refreservation
			 * being charged to our parent by the amount of
			 * new unique data we have gained.
			 */
			if (old_unique < ds_next->ds_reserved) {
				int64_t mrsdelta;
				uint64_t new_unique =
				    ds_next->ds_phys->ds_unique_bytes;

				ASSERT(old_unique <= new_unique);
				mrsdelta = MIN(new_unique - old_unique,
				    ds_next->ds_reserved - old_unique);
				dsl_dir_diduse_space(ds->ds_dir,
				    DD_USED_REFRSRV, -mrsdelta, 0, 0, tx);
			}
		}
		dsl_dataset_rele(ds_next, FTAG);
	} else {
		/*
		 * There's no next snapshot, so this is a head dataset.
		 * Destroy the deadlist.  Unless it's a clone, the
		 * deadlist should be empty.  (If it's a clone, it's
		 * safe to ignore the deadlist contents.)
		 */
		struct killarg ka;

		dsl_deadlist_close(&ds->ds_deadlist);
		dsl_deadlist_free(mos, ds->ds_phys->ds_deadlist_obj, tx);
		ds->ds_phys->ds_deadlist_obj = 0;

		/*
		 * Free everything that we point to (that's born after
		 * the previous snapshot, if we are a clone)
		 *
		 * NB: this should be very quick, because we already
		 * freed all the objects in open context.
		 */
		ka.ds = ds;
		ka.tx = tx;
		err = traverse_dataset(ds, ds->ds_phys->ds_prev_snap_txg,
		    TRAVERSE_POST, kill_blkptr, &ka);
		ASSERT3U(err, ==, 0);
		ASSERT(!DS_UNIQUE_IS_ACCURATE(ds) ||
		    ds->ds_phys->ds_unique_bytes == 0);

		if (ds->ds_prev != NULL) {
			if (spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) {
				VERIFY3U(0, ==, zap_remove_int(mos,
				    ds->ds_prev->ds_dir->dd_phys->dd_clones,
				    ds->ds_object, tx));
			}
			dsl_dataset_rele(ds->ds_prev, ds);
			ds->ds_prev = ds_prev = NULL;
		}
	}

	/*
	 * This must be done after the dsl_traverse(), because it will
	 * re-open the objset.
	 */
	if (ds->ds_objset) {
		dmu_objset_evict(ds->ds_objset);
		ds->ds_objset = NULL;
	}

	if (ds->ds_dir->dd_phys->dd_head_dataset_obj == ds->ds_object) {
		/* Erase the link in the dir */
		dmu_buf_will_dirty(ds->ds_dir->dd_dbuf, tx);
		ds->ds_dir->dd_phys->dd_head_dataset_obj = 0;
		ASSERT(ds->ds_phys->ds_snapnames_zapobj != 0);
		err = zap_destroy(mos, ds->ds_phys->ds_snapnames_zapobj, tx);
		ASSERT(err == 0);
	} else {
		/* remove from snapshot namespace */
		dsl_dataset_t *ds_head;
		ASSERT(ds->ds_phys->ds_snapnames_zapobj == 0);
		VERIFY(0 == dsl_dataset_hold_obj(dp,
		    ds->ds_dir->dd_phys->dd_head_dataset_obj, FTAG, &ds_head));
		VERIFY(0 == dsl_dataset_get_snapname(ds));
#ifdef ZFS_DEBUG
		{
			uint64_t val;

			err = dsl_dataset_snap_lookup(ds_head,
			    ds->ds_snapname, &val);
			ASSERT3U(err, ==, 0);
			ASSERT3U(val, ==, obj);
		}
#endif
		err = dsl_dataset_snap_remove(ds_head, ds->ds_snapname, tx);
		ASSERT(err == 0);
		dsl_dataset_rele(ds_head, FTAG);
	}

	if (ds_prev && ds->ds_prev != ds_prev)
		dsl_dataset_rele(ds_prev, FTAG);

	spa_prop_clear_bootfs(dp->dp_spa, ds->ds_object, tx);
	spa_history_log_internal(LOG_DS_DESTROY, dp->dp_spa, tx,
	    "dataset = %llu", ds->ds_object);

	if (ds->ds_phys->ds_next_clones_obj != 0) {
		uint64_t count;
		ASSERT(0 == zap_count(mos,
		    ds->ds_phys->ds_next_clones_obj, &count) && count == 0);
		VERIFY(0 == dmu_object_free(mos,
		    ds->ds_phys->ds_next_clones_obj, tx));
	}
	if (ds->ds_phys->ds_props_obj != 0)
		VERIFY(0 == zap_destroy(mos, ds->ds_phys->ds_props_obj, tx));
	if (ds->ds_phys->ds_userrefs_obj != 0)
		VERIFY(0 == zap_destroy(mos, ds->ds_phys->ds_userrefs_obj, tx));
	dsl_dir_close(ds->ds_dir, ds);
	ds->ds_dir = NULL;
	dsl_dataset_drain_refs(ds, tag);
	VERIFY(0 == dmu_object_free(mos, obj, tx));

	if (dsda->rm_origin) {
		/*
		 * Remove the origin of the clone we just destroyed.
		 */
		struct dsl_ds_destroyarg ndsda = {0};

		ndsda.ds = dsda->rm_origin;
		dsl_dataset_destroy_sync(&ndsda, tag, tx);
	}
}

static int
dsl_dataset_snapshot_reserve_space(dsl_dataset_t *ds, dmu_tx_t *tx)
{
	uint64_t asize;

	if (!dmu_tx_is_syncing(tx))
		return (0);

	/*
	 * If there's an fs-only reservation, any blocks that might become
	 * owned by the snapshot dataset must be accommodated by space
	 * outside of the reservation.
	 */
	ASSERT(ds->ds_reserved == 0 || DS_UNIQUE_IS_ACCURATE(ds));
	asize = MIN(ds->ds_phys->ds_unique_bytes, ds->ds_reserved);
	if (asize > dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE))
		return (ENOSPC);

	/*
	 * Propagate any reserved space for this snapshot to other
	 * snapshot checks in this sync group.
	 */
	if (asize > 0)
		dsl_dir_willuse_space(ds->ds_dir, asize, tx);

	return (0);
}
1986 */ 1987 err = dsl_dataset_snap_lookup(ds, snapname, &value); 1988 if (err == 0) 1989 return (EEXIST); 1990 if (err != ENOENT) 1991 return (err); 1992 1993 /* 1994 * Check that the snapshot's full name is not too long: it is the 1995 * dataset name's length + 1 for the @-sign + the snapshot name's length. 1996 */ 1997 if (dsl_dataset_namelen(ds) + 1 + strlen(snapname) >= MAXNAMELEN) 1998 return (ENAMETOOLONG); 1999 2000 err = dsl_dataset_snapshot_reserve_space(ds, tx); 2001 if (err) 2002 return (err); 2003 2004 ds->ds_trysnap_txg = tx->tx_txg; 2005 return (0); 2006 } 2007 2008 void 2009 dsl_dataset_snapshot_sync(void *arg1, void *arg2, dmu_tx_t *tx) 2010 { 2011 dsl_dataset_t *ds = arg1; 2012 const char *snapname = arg2; 2013 dsl_pool_t *dp = ds->ds_dir->dd_pool; 2014 dmu_buf_t *dbuf; 2015 dsl_dataset_phys_t *dsphys; 2016 uint64_t dsobj, crtxg; 2017 objset_t *mos = dp->dp_meta_objset; 2018 int err; 2019 2020 ASSERT(RW_WRITE_HELD(&dp->dp_config_rwlock)); 2021 2022 /* 2023 * The origin's ds_creation_txg has to be < TXG_INITIAL 2024 */ 2025 if (strcmp(snapname, ORIGIN_DIR_NAME) == 0) 2026 crtxg = 1; 2027 else 2028 crtxg = tx->tx_txg; 2029 2030 dsobj = dmu_object_alloc(mos, DMU_OT_DSL_DATASET, 0, 2031 DMU_OT_DSL_DATASET, sizeof (dsl_dataset_phys_t), tx); 2032 VERIFY(0 == dmu_bonus_hold(mos, dsobj, FTAG, &dbuf)); 2033 dmu_buf_will_dirty(dbuf, tx); 2034 dsphys = dbuf->db_data; 2035 bzero(dsphys, sizeof (dsl_dataset_phys_t)); 2036 dsphys->ds_dir_obj = ds->ds_dir->dd_object; 2037 dsphys->ds_fsid_guid = unique_create(); 2038 (void) random_get_pseudo_bytes((void*)&dsphys->ds_guid, 2039 sizeof (dsphys->ds_guid)); 2040 dsphys->ds_prev_snap_obj = ds->ds_phys->ds_prev_snap_obj; 2041 dsphys->ds_prev_snap_txg = ds->ds_phys->ds_prev_snap_txg; 2042 dsphys->ds_next_snap_obj = ds->ds_object; 2043 dsphys->ds_num_children = 1; 2044 dsphys->ds_creation_time = gethrestime_sec(); 2045 dsphys->ds_creation_txg = crtxg; 2046 dsphys->ds_deadlist_obj = ds->ds_phys->ds_deadlist_obj; 2047 dsphys->ds_used_bytes = ds->ds_phys->ds_used_bytes; 2048 dsphys->ds_compressed_bytes = ds->ds_phys->ds_compressed_bytes; 2049 dsphys->ds_uncompressed_bytes = ds->ds_phys->ds_uncompressed_bytes; 2050 dsphys->ds_flags = ds->ds_phys->ds_flags; 2051 dsphys->ds_bp = ds->ds_phys->ds_bp; 2052 dmu_buf_rele(dbuf, FTAG); 2053 2054 ASSERT3U(ds->ds_prev != 0, ==, ds->ds_phys->ds_prev_snap_obj != 0); 2055 if (ds->ds_prev) { 2056 uint64_t next_clones_obj = 2057 ds->ds_prev->ds_phys->ds_next_clones_obj; 2058 ASSERT(ds->ds_prev->ds_phys->ds_next_snap_obj == 2059 ds->ds_object || 2060 ds->ds_prev->ds_phys->ds_num_children > 1); 2061 if (ds->ds_prev->ds_phys->ds_next_snap_obj == ds->ds_object) { 2062 dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx); 2063 ASSERT3U(ds->ds_phys->ds_prev_snap_txg, ==, 2064 ds->ds_prev->ds_phys->ds_creation_txg); 2065 ds->ds_prev->ds_phys->ds_next_snap_obj = dsobj; 2066 } else if (next_clones_obj != 0) { 2067 remove_from_next_clones(ds->ds_prev, 2068 dsphys->ds_next_snap_obj, tx); 2069 VERIFY3U(0, ==, zap_add_int(mos, 2070 next_clones_obj, dsobj, tx)); 2071 } 2072 } 2073 2074 /* 2075 * If we have a reference-reservation on this dataset, we will 2076 * need to increase the amount of refreservation being charged 2077 * since our unique space is going to zero.
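 * For example (hypothetical figures): with refreservation = 10M and
 * unique = 4M, only the unconsumed 6M of the reservation was being
 * charged to the parent; once our unique space drops to zero the full
 * 10M must be charged, so delta = MIN(unique, reserved) = 4M is added
 * below.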
2078 */ 2079 if (ds->ds_reserved) { 2080 int64_t delta; 2081 ASSERT(DS_UNIQUE_IS_ACCURATE(ds)); 2082 delta = MIN(ds->ds_phys->ds_unique_bytes, ds->ds_reserved); 2083 dsl_dir_diduse_space(ds->ds_dir, DD_USED_REFRSRV, 2084 delta, 0, 0, tx); 2085 } 2086 2087 dmu_buf_will_dirty(ds->ds_dbuf, tx); 2088 zfs_dbgmsg("taking snapshot %s@%s/%llu; newkey=%llu", 2089 ds->ds_dir->dd_myname, snapname, dsobj, 2090 ds->ds_phys->ds_prev_snap_txg); 2091 ds->ds_phys->ds_deadlist_obj = dsl_deadlist_clone(&ds->ds_deadlist, 2092 UINT64_MAX, ds->ds_phys->ds_prev_snap_obj, tx); 2093 dsl_deadlist_close(&ds->ds_deadlist); 2094 dsl_deadlist_open(&ds->ds_deadlist, mos, ds->ds_phys->ds_deadlist_obj); 2095 dsl_deadlist_add_key(&ds->ds_deadlist, 2096 ds->ds_phys->ds_prev_snap_txg, tx); 2097 2098 ASSERT3U(ds->ds_phys->ds_prev_snap_txg, <, tx->tx_txg); 2099 ds->ds_phys->ds_prev_snap_obj = dsobj; 2100 ds->ds_phys->ds_prev_snap_txg = crtxg; 2101 ds->ds_phys->ds_unique_bytes = 0; 2102 if (spa_version(dp->dp_spa) >= SPA_VERSION_UNIQUE_ACCURATE) 2103 ds->ds_phys->ds_flags |= DS_FLAG_UNIQUE_ACCURATE; 2104 2105 err = zap_add(mos, ds->ds_phys->ds_snapnames_zapobj, 2106 snapname, 8, 1, &dsobj, tx); 2107 ASSERT(err == 0); 2108 2109 if (ds->ds_prev) 2110 dsl_dataset_drop_ref(ds->ds_prev, ds); 2111 VERIFY(0 == dsl_dataset_get_ref(dp, 2112 ds->ds_phys->ds_prev_snap_obj, ds, &ds->ds_prev)); 2113 2114 dsl_scan_ds_snapshotted(ds, tx); 2115 2116 dsl_dir_snap_cmtime_update(ds->ds_dir); 2117 2118 spa_history_log_internal(LOG_DS_SNAPSHOT, dp->dp_spa, tx, 2119 "dataset = %llu", dsobj); 2120 } 2121 2122 void 2123 dsl_dataset_sync(dsl_dataset_t *ds, zio_t *zio, dmu_tx_t *tx) 2124 { 2125 ASSERT(dmu_tx_is_syncing(tx)); 2126 ASSERT(ds->ds_objset != NULL); 2127 ASSERT(ds->ds_phys->ds_next_snap_obj == 0); 2128 2129 /* 2130 * in case we had to change ds_fsid_guid when we opened it, 2131 * sync it out now. 2132 */ 2133 dmu_buf_will_dirty(ds->ds_dbuf, tx); 2134 ds->ds_phys->ds_fsid_guid = ds->ds_fsid_guid; 2135 2136 dsl_dir_dirty(ds->ds_dir, tx); 2137 dmu_objset_sync(ds->ds_objset, zio, tx); 2138 } 2139 2140 static void 2141 get_clones_stat(dsl_dataset_t *ds, nvlist_t *nv) 2142 { 2143 uint64_t count = 0; 2144 objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; 2145 zap_cursor_t zc; 2146 zap_attribute_t za; 2147 nvlist_t *propval; 2148 nvlist_t *val; 2149 2150 rw_enter(&ds->ds_dir->dd_pool->dp_config_rwlock, RW_READER); 2151 VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0); 2152 VERIFY(nvlist_alloc(&val, NV_UNIQUE_NAME, KM_SLEEP) == 0); 2153 2154 /* 2155 * There may be missing entries in ds_next_clones_obj 2156 * due to a bug in a previous version of the code. 2157 * Only trust it if it has the right number of entries.
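 * (ds_num_children is one greater than the number of clones of this
 * snapshot, which is why count is compared against
 * ds_num_children - 1 below.)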
2158 */ 2159 if (ds->ds_phys->ds_next_clones_obj != 0) { 2160 /* VERIFY, not ASSERT, so count is filled in on non-debug builds too */ VERIFY3U(0, ==, zap_count(mos, ds->ds_phys->ds_next_clones_obj, 2161 &count)); 2162 } 2163 if (count != ds->ds_phys->ds_num_children - 1) { 2164 goto fail; 2165 } 2166 for (zap_cursor_init(&zc, mos, ds->ds_phys->ds_next_clones_obj); 2167 zap_cursor_retrieve(&zc, &za) == 0; 2168 zap_cursor_advance(&zc)) { 2169 dsl_dataset_t *clone; 2170 char buf[ZFS_MAXNAMELEN]; 2171 if (dsl_dataset_hold_obj(ds->ds_dir->dd_pool, 2172 za.za_first_integer, FTAG, &clone) != 0) { 2173 zap_cursor_fini(&zc); /* drop the cursor's hold before bailing */ goto fail; 2174 } 2175 dsl_dir_name(clone->ds_dir, buf); 2176 VERIFY(nvlist_add_boolean(val, buf) == 0); 2177 dsl_dataset_rele(clone, FTAG); 2178 } 2179 zap_cursor_fini(&zc); 2180 VERIFY(nvlist_add_nvlist(propval, ZPROP_VALUE, val) == 0); 2181 VERIFY(nvlist_add_nvlist(nv, zfs_prop_to_name(ZFS_PROP_CLONES), 2182 propval) == 0); 2183 fail: 2184 nvlist_free(val); 2185 nvlist_free(propval); 2186 rw_exit(&ds->ds_dir->dd_pool->dp_config_rwlock); 2187 } 2188 2189 void 2190 dsl_dataset_stats(dsl_dataset_t *ds, nvlist_t *nv) 2191 { 2192 uint64_t refd, avail, uobjs, aobjs, ratio; 2193 2194 dsl_dir_stats(ds->ds_dir, nv); 2195 2196 dsl_dataset_space(ds, &refd, &avail, &uobjs, &aobjs); 2197 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_AVAILABLE, avail); 2198 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFERENCED, refd); 2199 2200 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_CREATION, 2201 ds->ds_phys->ds_creation_time); 2202 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_CREATETXG, 2203 ds->ds_phys->ds_creation_txg); 2204 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFQUOTA, 2205 ds->ds_quota); 2206 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFRESERVATION, 2207 ds->ds_reserved); 2208 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_GUID, 2209 ds->ds_phys->ds_guid); 2210 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_UNIQUE, 2211 ds->ds_phys->ds_unique_bytes); 2212 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_OBJSETID, 2213 ds->ds_object); 2214 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USERREFS, 2215 ds->ds_userrefs); 2216 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_DEFER_DESTROY, 2217 DS_IS_DEFER_DESTROY(ds) ? 1 : 0); 2218 2219 if (ds->ds_phys->ds_prev_snap_obj != 0) { 2220 uint64_t written, comp, uncomp; 2221 dsl_pool_t *dp = ds->ds_dir->dd_pool; 2222 dsl_dataset_t *prev; 2223 2224 rw_enter(&dp->dp_config_rwlock, RW_READER); 2225 int err = dsl_dataset_hold_obj(dp, 2226 ds->ds_phys->ds_prev_snap_obj, FTAG, &prev); 2227 rw_exit(&dp->dp_config_rwlock); 2228 if (err == 0) { 2229 err = dsl_dataset_space_written(prev, ds, &written, 2230 &comp, &uncomp); 2231 dsl_dataset_rele(prev, FTAG); 2232 if (err == 0) { 2233 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_WRITTEN, 2234 written); 2235 } 2236 } 2237 } 2238 2239 ratio = ds->ds_phys->ds_compressed_bytes == 0 ? 100 : 2240 (ds->ds_phys->ds_uncompressed_bytes * 100 / 2241 ds->ds_phys->ds_compressed_bytes); 2242 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFRATIO, ratio); 2243 2244 if (ds->ds_phys->ds_next_snap_obj) { 2245 /* 2246 * This is a snapshot; override the dd's space used with 2247 * our unique space and compression ratio.
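 * (The refratio computed above is expressed as a percentage: e.g.
 * data that compresses 2.5x is reported as 250, and 100 means no
 * compression.)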
2248 */ 2249 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USED, 2250 ds->ds_phys->ds_unique_bytes); 2251 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_COMPRESSRATIO, ratio); 2252 2253 get_clones_stat(ds, nv); 2254 } 2255 } 2256 2257 void 2258 dsl_dataset_fast_stat(dsl_dataset_t *ds, dmu_objset_stats_t *stat) 2259 { 2260 stat->dds_creation_txg = ds->ds_phys->ds_creation_txg; 2261 stat->dds_inconsistent = ds->ds_phys->ds_flags & DS_FLAG_INCONSISTENT; 2262 stat->dds_guid = ds->ds_phys->ds_guid; 2263 if (ds->ds_phys->ds_next_snap_obj) { 2264 stat->dds_is_snapshot = B_TRUE; 2265 stat->dds_num_clones = ds->ds_phys->ds_num_children - 1; 2266 } else { 2267 stat->dds_is_snapshot = B_FALSE; 2268 stat->dds_num_clones = 0; 2269 } 2270 2271 /* clone origin is really a dsl_dir thing... */ 2272 rw_enter(&ds->ds_dir->dd_pool->dp_config_rwlock, RW_READER); 2273 if (dsl_dir_is_clone(ds->ds_dir)) { 2274 dsl_dataset_t *ods; 2275 2276 VERIFY(0 == dsl_dataset_get_ref(ds->ds_dir->dd_pool, 2277 ds->ds_dir->dd_phys->dd_origin_obj, FTAG, &ods)); 2278 dsl_dataset_name(ods, stat->dds_origin); 2279 dsl_dataset_drop_ref(ods, FTAG); 2280 } else { 2281 stat->dds_origin[0] = '\0'; 2282 } 2283 rw_exit(&ds->ds_dir->dd_pool->dp_config_rwlock); 2284 } 2285 2286 uint64_t 2287 dsl_dataset_fsid_guid(dsl_dataset_t *ds) 2288 { 2289 return (ds->ds_fsid_guid); 2290 } 2291 2292 void 2293 dsl_dataset_space(dsl_dataset_t *ds, 2294 uint64_t *refdbytesp, uint64_t *availbytesp, 2295 uint64_t *usedobjsp, uint64_t *availobjsp) 2296 { 2297 *refdbytesp = ds->ds_phys->ds_used_bytes; 2298 *availbytesp = dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE); 2299 if (ds->ds_reserved > ds->ds_phys->ds_unique_bytes) 2300 *availbytesp += ds->ds_reserved - ds->ds_phys->ds_unique_bytes; 2301 if (ds->ds_quota != 0) { 2302 /* 2303 * Adjust available bytes according to refquota 2304 */ 2305 if (*refdbytesp < ds->ds_quota) 2306 *availbytesp = MIN(*availbytesp, 2307 ds->ds_quota - *refdbytesp); 2308 else 2309 *availbytesp = 0; 2310 } 2311 *usedobjsp = ds->ds_phys->ds_bp.blk_fill; 2312 *availobjsp = DN_MAX_OBJECT - *usedobjsp; 2313 } 2314 2315 boolean_t 2316 dsl_dataset_modified_since_lastsnap(dsl_dataset_t *ds) 2317 { 2318 dsl_pool_t *dp = ds->ds_dir->dd_pool; 2319 2320 ASSERT(RW_LOCK_HELD(&dp->dp_config_rwlock) || 2321 dsl_pool_sync_context(dp)); 2322 if (ds->ds_prev == NULL) 2323 return (B_FALSE); 2324 if (ds->ds_phys->ds_bp.blk_birth > 2325 ds->ds_prev->ds_phys->ds_creation_txg) { 2326 objset_t *os, *os_prev; 2327 /* 2328 * It may be that only the ZIL differs, because it was 2329 * reset in the head. Don't count that as being 2330 * modified. 
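 * That is why the bcmp() below compares only the os_meta_dnode
 * portion of the two objset_phys_t structures: the ZIL state is kept
 * elsewhere in the objset, so a ZIL-only difference does not affect
 * the comparison.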
2331 */ 2332 if (dmu_objset_from_ds(ds, &os) != 0) 2333 return (B_TRUE); 2334 if (dmu_objset_from_ds(ds->ds_prev, &os_prev) != 0) 2335 return (B_TRUE); 2336 return (bcmp(&os->os_phys->os_meta_dnode, 2337 &os_prev->os_phys->os_meta_dnode, 2338 sizeof (os->os_phys->os_meta_dnode)) != 0); 2339 } 2340 return (B_FALSE); 2341 } 2342 2343 /* ARGSUSED */ 2344 static int 2345 dsl_dataset_snapshot_rename_check(void *arg1, void *arg2, dmu_tx_t *tx) 2346 { 2347 dsl_dataset_t *ds = arg1; 2348 char *newsnapname = arg2; 2349 dsl_dir_t *dd = ds->ds_dir; 2350 dsl_dataset_t *hds; 2351 uint64_t val; 2352 int err; 2353 2354 err = dsl_dataset_hold_obj(dd->dd_pool, 2355 dd->dd_phys->dd_head_dataset_obj, FTAG, &hds); 2356 if (err) 2357 return (err); 2358 2359 /* new name better not be in use */ 2360 err = dsl_dataset_snap_lookup(hds, newsnapname, &val); 2361 dsl_dataset_rele(hds, FTAG); 2362 2363 if (err == 0) 2364 err = EEXIST; 2365 else if (err == ENOENT) 2366 err = 0; 2367 2368 /* dataset name + 1 for the "@" + the new snapshot name must fit */ 2369 if (dsl_dir_namelen(ds->ds_dir) + 1 + strlen(newsnapname) >= MAXNAMELEN) 2370 err = ENAMETOOLONG; 2371 2372 return (err); 2373 } 2374 2375 static void 2376 dsl_dataset_snapshot_rename_sync(void *arg1, void *arg2, dmu_tx_t *tx) 2377 { 2378 dsl_dataset_t *ds = arg1; 2379 const char *newsnapname = arg2; 2380 dsl_dir_t *dd = ds->ds_dir; 2381 objset_t *mos = dd->dd_pool->dp_meta_objset; 2382 dsl_dataset_t *hds; 2383 int err; 2384 2385 ASSERT(ds->ds_phys->ds_next_snap_obj != 0); 2386 2387 VERIFY(0 == dsl_dataset_hold_obj(dd->dd_pool, 2388 dd->dd_phys->dd_head_dataset_obj, FTAG, &hds)); 2389 2390 VERIFY(0 == dsl_dataset_get_snapname(ds)); 2391 err = dsl_dataset_snap_remove(hds, ds->ds_snapname, tx); 2392 ASSERT3U(err, ==, 0); 2393 mutex_enter(&ds->ds_lock); 2394 (void) strcpy(ds->ds_snapname, newsnapname); 2395 mutex_exit(&ds->ds_lock); 2396 err = zap_add(mos, hds->ds_phys->ds_snapnames_zapobj, 2397 ds->ds_snapname, 8, 1, &ds->ds_object, tx); 2398 ASSERT3U(err, ==, 0); 2399 2400 spa_history_log_internal(LOG_DS_RENAME, dd->dd_pool->dp_spa, tx, 2401 "dataset = %llu", ds->ds_object); 2402 dsl_dataset_rele(hds, FTAG); 2403 } 2404 2405 struct renamesnaparg { 2406 dsl_sync_task_group_t *dstg; 2407 char failed[MAXPATHLEN]; 2408 char *oldsnap; 2409 char *newsnap; 2410 }; 2411 2412 static int 2413 dsl_snapshot_rename_one(const char *name, void *arg) 2414 { 2415 struct renamesnaparg *ra = arg; 2416 dsl_dataset_t *ds = NULL; 2417 char *snapname; 2418 int err; 2419 2420 snapname = kmem_asprintf("%s@%s", name, ra->oldsnap); 2421 (void) strlcpy(ra->failed, snapname, sizeof (ra->failed)); 2422 2423 /* 2424 * For recursive snapshot renames the parent won't be changing 2425 * so we just pass name for both the to/from arguments. 2426 */ 2427 err = zfs_secpolicy_rename_perms(snapname, snapname, CRED()); 2428 if (err != 0) { 2429 strfree(snapname); 2430 return (err == ENOENT ? 0 : err); 2431 } 2432 2433 #ifdef _KERNEL 2434 /* 2435 * For each filesystem undergoing rename, we need to unmount its 2436 * snapshot first. 2437 */ (void) zfs_unmount_snap(snapname, NULL); 2438 #endif 2439 err = dsl_dataset_hold(snapname, ra->dstg, &ds); 2440 strfree(snapname); 2441 if (err != 0) 2442 return (err == ENOENT ?
0 : err); 2443 2444 dsl_sync_task_create(ra->dstg, dsl_dataset_snapshot_rename_check, 2445 dsl_dataset_snapshot_rename_sync, ds, ra->newsnap, 0); 2446 2447 return (0); 2448 } 2449 2450 static int 2451 dsl_recursive_rename(char *oldname, const char *newname) 2452 { 2453 int err; 2454 struct renamesnaparg *ra; 2455 dsl_sync_task_t *dst; 2456 spa_t *spa; 2457 char *cp, *fsname = spa_strdup(oldname); 2458 int len = strlen(oldname) + 1; 2459 2460 /* truncate the snapshot name to get the fsname */ 2461 cp = strchr(fsname, '@'); 2462 *cp = '\0'; 2463 2464 err = spa_open(fsname, &spa, FTAG); 2465 if (err) { 2466 kmem_free(fsname, len); 2467 return (err); 2468 } 2469 ra = kmem_alloc(sizeof (struct renamesnaparg), KM_SLEEP); 2470 ra->dstg = dsl_sync_task_group_create(spa_get_dsl(spa)); 2471 2472 ra->oldsnap = strchr(oldname, '@') + 1; 2473 ra->newsnap = strchr(newname, '@') + 1; 2474 *ra->failed = '\0'; 2475 2476 err = dmu_objset_find(fsname, dsl_snapshot_rename_one, ra, 2477 DS_FIND_CHILDREN); 2478 kmem_free(fsname, len); 2479 2480 if (err == 0) { 2481 err = dsl_sync_task_group_wait(ra->dstg); 2482 } 2483 2484 for (dst = list_head(&ra->dstg->dstg_tasks); dst; 2485 dst = list_next(&ra->dstg->dstg_tasks, dst)) { 2486 dsl_dataset_t *ds = dst->dst_arg1; 2487 if (dst->dst_err) { 2488 dsl_dir_name(ds->ds_dir, ra->failed); 2489 (void) strlcat(ra->failed, "@", sizeof (ra->failed)); 2490 (void) strlcat(ra->failed, ra->newsnap, 2491 sizeof (ra->failed)); 2492 } 2493 dsl_dataset_rele(ds, ra->dstg); 2494 } 2495 2496 if (err) 2497 (void) strlcpy(oldname, ra->failed, sizeof (ra->failed)); 2498 2499 dsl_sync_task_group_destroy(ra->dstg); 2500 kmem_free(ra, sizeof (struct renamesnaparg)); 2501 spa_close(spa, FTAG); 2502 return (err); 2503 } 2504 2505 static int 2506 dsl_valid_rename(const char *oldname, void *arg) 2507 { 2508 int delta = *(int *)arg; 2509 2510 if (strlen(oldname) + delta >= MAXNAMELEN) 2511 return (ENAMETOOLONG); 2512 2513 return (0); 2514 } 2515 2516 #pragma weak dmu_objset_rename = dsl_dataset_rename 2517 int 2518 dsl_dataset_rename(char *oldname, const char *newname, boolean_t recursive) 2519 { 2520 dsl_dir_t *dd; 2521 dsl_dataset_t *ds; 2522 const char *tail; 2523 int err; 2524 2525 err = dsl_dir_open(oldname, FTAG, &dd, &tail); 2526 if (err) 2527 return (err); 2528 2529 if (tail == NULL) { 2530 int delta = strlen(newname) - strlen(oldname); 2531 2532 /* if we're growing, validate child name lengths */ 2533 if (delta > 0) 2534 err = dmu_objset_find(oldname, dsl_valid_rename, 2535 &delta, DS_FIND_CHILDREN | DS_FIND_SNAPSHOTS); 2536 2537 if (err == 0) 2538 err = dsl_dir_rename(dd, newname); 2539 dsl_dir_close(dd, FTAG); 2540 return (err); 2541 } 2542 2543 if (tail[0] != '@') { 2544 /* the name ended in a nonexistent component */ 2545 dsl_dir_close(dd, FTAG); 2546 return (ENOENT); 2547 } 2548 2549 dsl_dir_close(dd, FTAG); 2550 2551 /* new name must be snapshot in same filesystem */ 2552 tail = strchr(newname, '@'); 2553 if (tail == NULL) 2554 return (EINVAL); 2555 tail++; 2556 if (strncmp(oldname, newname, tail - newname) != 0) 2557 return (EXDEV); 2558 2559 if (recursive) { 2560 err = dsl_recursive_rename(oldname, newname); 2561 } else { 2562 err = dsl_dataset_hold(oldname, FTAG, &ds); 2563 if (err) 2564 return (err); 2565 2566 err = dsl_sync_task_do(ds->ds_dir->dd_pool, 2567 dsl_dataset_snapshot_rename_check, 2568 dsl_dataset_snapshot_rename_sync, ds, (char *)tail, 1); 2569 2570 dsl_dataset_rele(ds, FTAG); 2571 } 2572 2573 return (err); 2574 } 2575 2576 struct promotenode { 2577 list_node_t 
link; 2578 dsl_dataset_t *ds; 2579 }; 2580 2581 struct promotearg { 2582 list_t shared_snaps, origin_snaps, clone_snaps; 2583 dsl_dataset_t *origin_origin; 2584 uint64_t used, comp, uncomp, unique, cloneusedsnap, originusedsnap; 2585 char *err_ds; 2586 }; 2587 2588 static int snaplist_space(list_t *l, uint64_t mintxg, uint64_t *spacep); 2589 static boolean_t snaplist_unstable(list_t *l); 2590 2591 static int 2592 dsl_dataset_promote_check(void *arg1, void *arg2, dmu_tx_t *tx) 2593 { 2594 dsl_dataset_t *hds = arg1; 2595 struct promotearg *pa = arg2; 2596 struct promotenode *snap = list_head(&pa->shared_snaps); 2597 dsl_dataset_t *origin_ds = snap->ds; 2598 int err; 2599 uint64_t unused; 2600 2601 /* Check that it is a real clone */ 2602 if (!dsl_dir_is_clone(hds->ds_dir)) 2603 return (EINVAL); 2604 2605 /* Since this is so expensive, don't do the preliminary check */ 2606 if (!dmu_tx_is_syncing(tx)) 2607 return (0); 2608 2609 if (hds->ds_phys->ds_flags & DS_FLAG_NOPROMOTE) 2610 return (EXDEV); 2611 2612 /* compute origin's new unique space */ 2613 snap = list_tail(&pa->clone_snaps); 2614 ASSERT3U(snap->ds->ds_phys->ds_prev_snap_obj, ==, origin_ds->ds_object); 2615 dsl_deadlist_space_range(&snap->ds->ds_deadlist, 2616 origin_ds->ds_phys->ds_prev_snap_txg, UINT64_MAX, 2617 &pa->unique, &unused, &unused); 2618 2619 /* 2620 * Walk the snapshots that we are moving 2621 * 2622 * Compute space to transfer. Consider the incremental changes 2623 * to used for each snapshot: 2624 * (my used) = (prev's used) + (blocks born) - (blocks killed) 2625 * So each snapshot gave birth to: 2626 * (blocks born) = (my used) - (prev's used) + (blocks killed) 2627 * So a sequence would look like: 2628 * (uN - u(N-1) + kN) + ... + (u1 - u0 + k1) + (u0 - 0 + k0) 2629 * Which simplifies to: 2630 * uN + kN + kN-1 + ... + k1 + k0 2631 * Note however, if we stop before we reach the ORIGIN we get: 2632 * uN + kN + kN-1 + ... + kM - uM-1 2633 */ 2634 pa->used = origin_ds->ds_phys->ds_used_bytes; 2635 pa->comp = origin_ds->ds_phys->ds_compressed_bytes; 2636 pa->uncomp = origin_ds->ds_phys->ds_uncompressed_bytes; 2637 for (snap = list_head(&pa->shared_snaps); snap; 2638 snap = list_next(&pa->shared_snaps, snap)) { 2639 uint64_t val, dlused, dlcomp, dluncomp; 2640 dsl_dataset_t *ds = snap->ds; 2641 2642 /* Check that the snapshot name does not conflict */ 2643 VERIFY(0 == dsl_dataset_get_snapname(ds)); 2644 err = dsl_dataset_snap_lookup(hds, ds->ds_snapname, &val); 2645 if (err == 0) { 2646 err = EEXIST; 2647 goto out; 2648 } 2649 if (err != ENOENT) 2650 goto out; 2651 2652 /* The very first snapshot does not have a deadlist */ 2653 if (ds->ds_phys->ds_prev_snap_obj == 0) 2654 continue; 2655 2656 dsl_deadlist_space(&ds->ds_deadlist, 2657 &dlused, &dlcomp, &dluncomp); 2658 pa->used += dlused; 2659 pa->comp += dlcomp; 2660 pa->uncomp += dluncomp; 2661 } 2662 2663 /* 2664 * If we are a clone of a clone then we never reached ORIGIN, 2665 * so we need to subtract out the clone origin's used space. 
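 * For example (hypothetical numbers): with three shared snapshots
 * S0, S1, S2 where used = {10M, 12M, 15M} and killed = {1M, 2M, 3M},
 * walking all the way back to ORIGIN transfers
 * u2 + k2 + k1 + k0 = 15M + 3M + 2M + 1M = 21M; stopping at S1
 * because we are a clone of a clone instead gives
 * u2 + k2 + k1 - u0 = 15M + 3M + 2M - 10M = 10M.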
2666 */ 2667 if (pa->origin_origin) { 2668 pa->used -= pa->origin_origin->ds_phys->ds_used_bytes; 2669 pa->comp -= pa->origin_origin->ds_phys->ds_compressed_bytes; 2670 pa->uncomp -= pa->origin_origin->ds_phys->ds_uncompressed_bytes; 2671 } 2672 2673 /* Check that there is enough space here */ 2674 err = dsl_dir_transfer_possible(origin_ds->ds_dir, hds->ds_dir, 2675 pa->used); 2676 if (err) 2677 return (err); 2678 2679 /* 2680 * Compute the amounts of space that will be used by snapshots 2681 * after the promotion (for both origin and clone). For each, 2682 * it is the amount of space that will be on all of their 2683 * deadlists (that was not born before their new origin). 2684 */ 2685 if (hds->ds_dir->dd_phys->dd_flags & DD_FLAG_USED_BREAKDOWN) { 2686 uint64_t space; 2687 2688 /* 2689 * Note, typically this will not be a clone of a clone, 2690 * so dd_origin_txg will be < TXG_INITIAL, so 2691 * these snaplist_space() -> dsl_deadlist_space_range() 2692 * calls will be fast because they do not have to 2693 * iterate over all bps. 2694 */ 2695 snap = list_head(&pa->origin_snaps); 2696 err = snaplist_space(&pa->shared_snaps, 2697 snap->ds->ds_dir->dd_origin_txg, &pa->cloneusedsnap); 2698 if (err) 2699 return (err); 2700 2701 err = snaplist_space(&pa->clone_snaps, 2702 snap->ds->ds_dir->dd_origin_txg, &space); 2703 if (err) 2704 return (err); 2705 pa->cloneusedsnap += space; 2706 } 2707 if (origin_ds->ds_dir->dd_phys->dd_flags & DD_FLAG_USED_BREAKDOWN) { 2708 err = snaplist_space(&pa->origin_snaps, 2709 origin_ds->ds_phys->ds_creation_txg, &pa->originusedsnap); 2710 if (err) 2711 return (err); 2712 } 2713 2714 return (0); 2715 out: 2716 pa->err_ds = snap->ds->ds_snapname; 2717 return (err); 2718 } 2719 2720 static void 2721 dsl_dataset_promote_sync(void *arg1, void *arg2, dmu_tx_t *tx) 2722 { 2723 dsl_dataset_t *hds = arg1; 2724 struct promotearg *pa = arg2; 2725 struct promotenode *snap = list_head(&pa->shared_snaps); 2726 dsl_dataset_t *origin_ds = snap->ds; 2727 dsl_dataset_t *origin_head; 2728 dsl_dir_t *dd = hds->ds_dir; 2729 dsl_pool_t *dp = hds->ds_dir->dd_pool; 2730 dsl_dir_t *odd = NULL; 2731 uint64_t oldnext_obj; 2732 int64_t delta; 2733 2734 ASSERT(0 == (hds->ds_phys->ds_flags & DS_FLAG_NOPROMOTE)); 2735 2736 snap = list_head(&pa->origin_snaps); 2737 origin_head = snap->ds; 2738 2739 /* 2740 * We need to explicitly open odd, since origin_ds's dd will be 2741 * changing. 
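 * (The hold taken here is dropped with dsl_dir_close(odd, FTAG) at
 * the end of this function, once the space accounting has been
 * transferred.)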
2742 */ 2743 VERIFY(0 == dsl_dir_open_obj(dp, origin_ds->ds_dir->dd_object, 2744 NULL, FTAG, &odd)); 2745 2746 /* change origin's next snap */ 2747 dmu_buf_will_dirty(origin_ds->ds_dbuf, tx); 2748 oldnext_obj = origin_ds->ds_phys->ds_next_snap_obj; 2749 snap = list_tail(&pa->clone_snaps); 2750 ASSERT3U(snap->ds->ds_phys->ds_prev_snap_obj, ==, origin_ds->ds_object); 2751 origin_ds->ds_phys->ds_next_snap_obj = snap->ds->ds_object; 2752 2753 /* change the origin's next clone */ 2754 if (origin_ds->ds_phys->ds_next_clones_obj) { 2755 remove_from_next_clones(origin_ds, snap->ds->ds_object, tx); 2756 VERIFY3U(0, ==, zap_add_int(dp->dp_meta_objset, 2757 origin_ds->ds_phys->ds_next_clones_obj, 2758 oldnext_obj, tx)); 2759 } 2760 2761 /* change origin */ 2762 dmu_buf_will_dirty(dd->dd_dbuf, tx); 2763 ASSERT3U(dd->dd_phys->dd_origin_obj, ==, origin_ds->ds_object); 2764 dd->dd_phys->dd_origin_obj = odd->dd_phys->dd_origin_obj; 2765 dd->dd_origin_txg = origin_head->ds_dir->dd_origin_txg; 2766 dmu_buf_will_dirty(odd->dd_dbuf, tx); 2767 odd->dd_phys->dd_origin_obj = origin_ds->ds_object; 2768 origin_head->ds_dir->dd_origin_txg = 2769 origin_ds->ds_phys->ds_creation_txg; 2770 2771 /* change dd_clone entries */ 2772 if (spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) { 2773 VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset, 2774 odd->dd_phys->dd_clones, hds->ds_object, tx)); 2775 VERIFY3U(0, ==, zap_add_int(dp->dp_meta_objset, 2776 pa->origin_origin->ds_dir->dd_phys->dd_clones, 2777 hds->ds_object, tx)); 2778 2779 VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset, 2780 pa->origin_origin->ds_dir->dd_phys->dd_clones, 2781 origin_head->ds_object, tx)); 2782 if (dd->dd_phys->dd_clones == 0) { 2783 dd->dd_phys->dd_clones = zap_create(dp->dp_meta_objset, 2784 DMU_OT_DSL_CLONES, DMU_OT_NONE, 0, tx); 2785 } 2786 VERIFY3U(0, ==, zap_add_int(dp->dp_meta_objset, 2787 dd->dd_phys->dd_clones, origin_head->ds_object, tx)); 2788 2789 } 2790 2791 /* move snapshots to this dir */ 2792 for (snap = list_head(&pa->shared_snaps); snap; 2793 snap = list_next(&pa->shared_snaps, snap)) { 2794 dsl_dataset_t *ds = snap->ds; 2795 2796 /* unregister props as dsl_dir is changing */ 2797 if (ds->ds_objset) { 2798 dmu_objset_evict(ds->ds_objset); 2799 ds->ds_objset = NULL; 2800 } 2801 /* move snap name entry */ 2802 VERIFY(0 == dsl_dataset_get_snapname(ds)); 2803 VERIFY(0 == dsl_dataset_snap_remove(origin_head, 2804 ds->ds_snapname, tx)); 2805 VERIFY(0 == zap_add(dp->dp_meta_objset, 2806 hds->ds_phys->ds_snapnames_zapobj, ds->ds_snapname, 2807 8, 1, &ds->ds_object, tx)); 2808 2809 /* change containing dsl_dir */ 2810 dmu_buf_will_dirty(ds->ds_dbuf, tx); 2811 ASSERT3U(ds->ds_phys->ds_dir_obj, ==, odd->dd_object); 2812 ds->ds_phys->ds_dir_obj = dd->dd_object; 2813 ASSERT3P(ds->ds_dir, ==, odd); 2814 dsl_dir_close(ds->ds_dir, ds); 2815 VERIFY(0 == dsl_dir_open_obj(dp, dd->dd_object, 2816 NULL, ds, &ds->ds_dir)); 2817 2818 /* move any clone references */ 2819 if (ds->ds_phys->ds_next_clones_obj && 2820 spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) { 2821 zap_cursor_t zc; 2822 zap_attribute_t za; 2823 2824 for (zap_cursor_init(&zc, dp->dp_meta_objset, 2825 ds->ds_phys->ds_next_clones_obj); 2826 zap_cursor_retrieve(&zc, &za) == 0; 2827 zap_cursor_advance(&zc)) { 2828 dsl_dataset_t *cnds; 2829 uint64_t o; 2830 2831 if (za.za_first_integer == oldnext_obj) { 2832 /* 2833 * We've already moved the 2834 * origin's reference. 
2835 */ 2836 continue; 2837 } 2838 2839 VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, 2840 za.za_first_integer, FTAG, &cnds)); 2841 o = cnds->ds_dir->dd_phys->dd_head_dataset_obj; 2842 2843 VERIFY3U(zap_remove_int(dp->dp_meta_objset, 2844 odd->dd_phys->dd_clones, o, tx), ==, 0); 2845 VERIFY3U(zap_add_int(dp->dp_meta_objset, 2846 dd->dd_phys->dd_clones, o, tx), ==, 0); 2847 dsl_dataset_rele(cnds, FTAG); 2848 } 2849 zap_cursor_fini(&zc); 2850 } 2851 2852 ASSERT3U(dsl_prop_numcb(ds), ==, 0); 2853 } 2854 2855 /* 2856 * Change space accounting. 2857 * Note, pa->*usedsnap and dd_used_breakdown[SNAP] will either 2858 * both be valid, or both be 0 (resulting in delta == 0). This 2859 * is true for each of {clone,origin} independently. 2860 */ 2861 2862 delta = pa->cloneusedsnap - 2863 dd->dd_phys->dd_used_breakdown[DD_USED_SNAP]; 2864 ASSERT3S(delta, >=, 0); 2865 ASSERT3U(pa->used, >=, delta); 2866 dsl_dir_diduse_space(dd, DD_USED_SNAP, delta, 0, 0, tx); 2867 dsl_dir_diduse_space(dd, DD_USED_HEAD, 2868 pa->used - delta, pa->comp, pa->uncomp, tx); 2869 2870 delta = pa->originusedsnap - 2871 odd->dd_phys->dd_used_breakdown[DD_USED_SNAP]; 2872 ASSERT3S(delta, <=, 0); 2873 ASSERT3U(pa->used, >=, -delta); 2874 dsl_dir_diduse_space(odd, DD_USED_SNAP, delta, 0, 0, tx); 2875 dsl_dir_diduse_space(odd, DD_USED_HEAD, 2876 -pa->used - delta, -pa->comp, -pa->uncomp, tx); 2877 2878 origin_ds->ds_phys->ds_unique_bytes = pa->unique; 2879 2880 /* log history record */ 2881 spa_history_log_internal(LOG_DS_PROMOTE, dd->dd_pool->dp_spa, tx, 2882 "dataset = %llu", hds->ds_object); 2883 2884 dsl_dir_close(odd, FTAG); 2885 } 2886 2887 static char *snaplist_tag = "snaplist"; 2888 /* 2889 * Make a list of dsl_dataset_t's for the snapshots between first_obj 2890 * (exclusive) and last_obj (inclusive). The list will be in reverse 2891 * order (last_obj will be the list_head()). If first_obj == 0, do all 2892 * snapshots back to this dataset's origin. 
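 *
 * A minimal usage sketch (hypothetical caller; "last_obj" is assumed
 * to be a snapshot object number and "dp" a held dsl_pool_t). It
 * mirrors the pattern dsl_dataset_promote() uses below: build the
 * list under the config lock, walk it, and always tear it down with
 * snaplist_destroy(), even on error:
 *
 *	list_t snaps;
 *	struct promotenode *node;
 *	uint64_t n = 0;
 *	int err;
 *
 *	rw_enter(&dp->dp_config_rwlock, RW_READER);
 *	err = snaplist_make(dp, B_FALSE, 0, last_obj, &snaps);
 *	rw_exit(&dp->dp_config_rwlock);
 *	if (err == 0) {
 *		for (node = list_head(&snaps); node != NULL;
 *		    node = list_next(&snaps, node))
 *			n++;	/* visit node->ds here */
 *	}
 *	snaplist_destroy(&snaps, B_FALSE);
 *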
2893 */ 2894 static int 2895 snaplist_make(dsl_pool_t *dp, boolean_t own, 2896 uint64_t first_obj, uint64_t last_obj, list_t *l) 2897 { 2898 uint64_t obj = last_obj; 2899 2900 ASSERT(RW_LOCK_HELD(&dp->dp_config_rwlock)); 2901 2902 list_create(l, sizeof (struct promotenode), 2903 offsetof(struct promotenode, link)); 2904 2905 while (obj != first_obj) { 2906 dsl_dataset_t *ds; 2907 struct promotenode *snap; 2908 int err; 2909 2910 if (own) { 2911 err = dsl_dataset_own_obj(dp, obj, 2912 0, snaplist_tag, &ds); 2913 if (err == 0) 2914 dsl_dataset_make_exclusive(ds, snaplist_tag); 2915 } else { 2916 err = dsl_dataset_hold_obj(dp, obj, snaplist_tag, &ds); 2917 } 2918 if (err == ENOENT) { 2919 /* lost race with snapshot destroy */ 2920 struct promotenode *last = list_tail(l); 2921 ASSERT(obj != last->ds->ds_phys->ds_prev_snap_obj); 2922 obj = last->ds->ds_phys->ds_prev_snap_obj; 2923 continue; 2924 } else if (err) { 2925 return (err); 2926 } 2927 2928 if (first_obj == 0) 2929 first_obj = ds->ds_dir->dd_phys->dd_origin_obj; 2930 2931 snap = kmem_alloc(sizeof (struct promotenode), KM_SLEEP); 2932 snap->ds = ds; 2933 list_insert_tail(l, snap); 2934 obj = ds->ds_phys->ds_prev_snap_obj; 2935 } 2936 2937 return (0); 2938 } 2939 2940 static int 2941 snaplist_space(list_t *l, uint64_t mintxg, uint64_t *spacep) 2942 { 2943 struct promotenode *snap; 2944 2945 *spacep = 0; 2946 for (snap = list_head(l); snap; snap = list_next(l, snap)) { 2947 uint64_t used, comp, uncomp; 2948 dsl_deadlist_space_range(&snap->ds->ds_deadlist, 2949 mintxg, UINT64_MAX, &used, &comp, &uncomp); 2950 *spacep += used; 2951 } 2952 return (0); 2953 } 2954 2955 static void 2956 snaplist_destroy(list_t *l, boolean_t own) 2957 { 2958 struct promotenode *snap; 2959 2960 if (!l || !list_link_active(&l->list_head)) 2961 return; 2962 2963 while ((snap = list_tail(l)) != NULL) { 2964 list_remove(l, snap); 2965 if (own) 2966 dsl_dataset_disown(snap->ds, snaplist_tag); 2967 else 2968 dsl_dataset_rele(snap->ds, snaplist_tag); 2969 kmem_free(snap, sizeof (struct promotenode)); 2970 } 2971 list_destroy(l); 2972 } 2973 2974 /* 2975 * Promote a clone. Nomenclature note: 2976 * "clone" or "cds": the original clone which is being promoted 2977 * "origin" or "ods": the snapshot which is originally the clone's origin 2978 * "origin head" or "ohds": the dataset which is the head 2979 * (filesystem/volume) for the origin 2980 * "origin origin": the origin of the origin's filesystem (typically 2981 * NULL, indicating that the clone is not a clone of a clone). 2982 */ 2983 int 2984 dsl_dataset_promote(const char *name, char *conflsnap) 2985 { 2986 dsl_dataset_t *ds; 2987 dsl_dir_t *dd; 2988 dsl_pool_t *dp; 2989 dmu_object_info_t doi; 2990 struct promotearg pa = { 0 }; 2991 struct promotenode *snap; 2992 int err; 2993 2994 err = dsl_dataset_hold(name, FTAG, &ds); 2995 if (err) 2996 return (err); 2997 dd = ds->ds_dir; 2998 dp = dd->dd_pool; 2999 3000 err = dmu_object_info(dp->dp_meta_objset, 3001 ds->ds_phys->ds_snapnames_zapobj, &doi); 3002 if (err) { 3003 dsl_dataset_rele(ds, FTAG); 3004 return (err); 3005 } 3006 3007 if (dsl_dataset_is_snapshot(ds) || dd->dd_phys->dd_origin_obj == 0) { 3008 dsl_dataset_rele(ds, FTAG); 3009 return (EINVAL); 3010 } 3011 3012 /* 3013 * We are going to inherit all the snapshots taken before our 3014 * origin (i.e., our new origin will be our parent's origin). 3015 * Take ownership of them so that we can rename them into our 3016 * namespace.
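 * (Hence snaplist_make() is invoked with own == B_TRUE only for
 * pa.shared_snaps below; the clone and origin-head snapshot lists are
 * merely held, and snaplist_destroy() is told the same when the lists
 * are torn down.)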
3017 */ 3018 rw_enter(&dp->dp_config_rwlock, RW_READER); 3019 3020 err = snaplist_make(dp, B_TRUE, 0, dd->dd_phys->dd_origin_obj, 3021 &pa.shared_snaps); 3022 if (err != 0) 3023 goto out; 3024 3025 err = snaplist_make(dp, B_FALSE, 0, ds->ds_object, &pa.clone_snaps); 3026 if (err != 0) 3027 goto out; 3028 3029 snap = list_head(&pa.shared_snaps); 3030 ASSERT3U(snap->ds->ds_object, ==, dd->dd_phys->dd_origin_obj); 3031 err = snaplist_make(dp, B_FALSE, dd->dd_phys->dd_origin_obj, 3032 snap->ds->ds_dir->dd_phys->dd_head_dataset_obj, &pa.origin_snaps); 3033 if (err != 0) 3034 goto out; 3035 3036 if (snap->ds->ds_dir->dd_phys->dd_origin_obj != 0) { 3037 err = dsl_dataset_hold_obj(dp, 3038 snap->ds->ds_dir->dd_phys->dd_origin_obj, 3039 FTAG, &pa.origin_origin); 3040 if (err != 0) 3041 goto out; 3042 } 3043 3044 out: 3045 rw_exit(&dp->dp_config_rwlock); 3046 3047 /* 3048 * Add in 128x the snapnames zapobj size, since we will be moving 3049 * a bunch of snapnames to the promoted ds, and dirtying their 3050 * bonus buffers. 3051 */ 3052 if (err == 0) { 3053 err = dsl_sync_task_do(dp, dsl_dataset_promote_check, 3054 dsl_dataset_promote_sync, ds, &pa, 3055 2 + 2 * doi.doi_physical_blocks_512); 3056 if (err && pa.err_ds && conflsnap) 3057 (void) strncpy(conflsnap, pa.err_ds, MAXNAMELEN); 3058 } 3059 3060 snaplist_destroy(&pa.shared_snaps, B_TRUE); 3061 snaplist_destroy(&pa.clone_snaps, B_FALSE); 3062 snaplist_destroy(&pa.origin_snaps, B_FALSE); 3063 if (pa.origin_origin) 3064 dsl_dataset_rele(pa.origin_origin, FTAG); 3065 dsl_dataset_rele(ds, FTAG); 3066 return (err); 3067 } 3068 3069 struct cloneswaparg { 3070 dsl_dataset_t *cds; /* clone dataset */ 3071 dsl_dataset_t *ohds; /* origin's head dataset */ 3072 boolean_t force; 3073 int64_t unused_refres_delta; /* change in unconsumed refreservation */ 3074 }; 3075 3076 /* ARGSUSED */ 3077 static int 3078 dsl_dataset_clone_swap_check(void *arg1, void *arg2, dmu_tx_t *tx) 3079 { 3080 struct cloneswaparg *csa = arg1; 3081 3082 /* they should both be heads */ 3083 if (dsl_dataset_is_snapshot(csa->cds) || 3084 dsl_dataset_is_snapshot(csa->ohds)) 3085 return (EINVAL); 3086 3087 /* the branch point should be just before them */ 3088 if (csa->cds->ds_prev != csa->ohds->ds_prev) 3089 return (EINVAL); 3090 3091 /* cds should be the clone (unless they are unrelated) */ 3092 if (csa->cds->ds_prev != NULL && 3093 csa->cds->ds_prev != csa->cds->ds_dir->dd_pool->dp_origin_snap && 3094 csa->ohds->ds_object != 3095 csa->cds->ds_prev->ds_phys->ds_next_snap_obj) 3096 return (EINVAL); 3097 3098 /* the clone should be a child of the origin */ 3099 if (csa->cds->ds_dir->dd_parent != csa->ohds->ds_dir) 3100 return (EINVAL); 3101 3102 /* ohds shouldn't be modified unless 'force' */ 3103 if (!csa->force && dsl_dataset_modified_since_lastsnap(csa->ohds)) 3104 return (ETXTBSY); 3105 3106 /* adjust amount of any unconsumed refreservation */ 3107 csa->unused_refres_delta = 3108 (int64_t)MIN(csa->ohds->ds_reserved, 3109 csa->ohds->ds_phys->ds_unique_bytes) - 3110 (int64_t)MIN(csa->ohds->ds_reserved, 3111 csa->cds->ds_phys->ds_unique_bytes); 3112 3113 if (csa->unused_refres_delta > 0 && 3114 csa->unused_refres_delta > 3115 dsl_dir_space_available(csa->ohds->ds_dir, NULL, 0, TRUE)) 3116 return (ENOSPC); 3117 3118 if (csa->ohds->ds_quota != 0 && 3119 csa->cds->ds_phys->ds_unique_bytes > csa->ohds->ds_quota) 3120 return (EDQUOT); 3121 3122 return (0); 3123 } 3124 3125 /* ARGSUSED */ 3126 static void 3127 dsl_dataset_clone_swap_sync(void *arg1, void *arg2, dmu_tx_t *tx) 3128 { 3129 struct 
cloneswaparg *csa = arg1; 3130 dsl_pool_t *dp = csa->cds->ds_dir->dd_pool; 3131 3132 ASSERT(csa->cds->ds_reserved == 0); 3133 ASSERT(csa->ohds->ds_quota == 0 || 3134 csa->cds->ds_phys->ds_unique_bytes <= csa->ohds->ds_quota); 3135 3136 dmu_buf_will_dirty(csa->cds->ds_dbuf, tx); 3137 dmu_buf_will_dirty(csa->ohds->ds_dbuf, tx); 3138 3139 if (csa->cds->ds_objset != NULL) { 3140 dmu_objset_evict(csa->cds->ds_objset); 3141 csa->cds->ds_objset = NULL; 3142 } 3143 3144 if (csa->ohds->ds_objset != NULL) { 3145 dmu_objset_evict(csa->ohds->ds_objset); 3146 csa->ohds->ds_objset = NULL; 3147 } 3148 3149 /* 3150 * Reset origin's unique bytes, if it exists. 3151 */ 3152 if (csa->cds->ds_prev) { 3153 dsl_dataset_t *origin = csa->cds->ds_prev; 3154 uint64_t comp, uncomp; 3155 3156 dmu_buf_will_dirty(origin->ds_dbuf, tx); 3157 dsl_deadlist_space_range(&csa->cds->ds_deadlist, 3158 origin->ds_phys->ds_prev_snap_txg, UINT64_MAX, 3159 &origin->ds_phys->ds_unique_bytes, &comp, &uncomp); 3160 } 3161 3162 /* swap blkptrs */ 3163 { 3164 blkptr_t tmp; 3165 tmp = csa->ohds->ds_phys->ds_bp; 3166 csa->ohds->ds_phys->ds_bp = csa->cds->ds_phys->ds_bp; 3167 csa->cds->ds_phys->ds_bp = tmp; 3168 } 3169 3170 /* set dd_*_bytes */ 3171 { 3172 int64_t dused, dcomp, duncomp; 3173 uint64_t cdl_used, cdl_comp, cdl_uncomp; 3174 uint64_t odl_used, odl_comp, odl_uncomp; 3175 3176 ASSERT3U(csa->cds->ds_dir->dd_phys-> 3177 dd_used_breakdown[DD_USED_SNAP], ==, 0); 3178 3179 dsl_deadlist_space(&csa->cds->ds_deadlist, 3180 &cdl_used, &cdl_comp, &cdl_uncomp); 3181 dsl_deadlist_space(&csa->ohds->ds_deadlist, 3182 &odl_used, &odl_comp, &odl_uncomp); 3183 3184 dused = csa->cds->ds_phys->ds_used_bytes + cdl_used - 3185 (csa->ohds->ds_phys->ds_used_bytes + odl_used); 3186 dcomp = csa->cds->ds_phys->ds_compressed_bytes + cdl_comp - 3187 (csa->ohds->ds_phys->ds_compressed_bytes + odl_comp); 3188 duncomp = csa->cds->ds_phys->ds_uncompressed_bytes + 3189 cdl_uncomp - 3190 (csa->ohds->ds_phys->ds_uncompressed_bytes + odl_uncomp); 3191 3192 dsl_dir_diduse_space(csa->ohds->ds_dir, DD_USED_HEAD, 3193 dused, dcomp, duncomp, tx); 3194 dsl_dir_diduse_space(csa->cds->ds_dir, DD_USED_HEAD, 3195 -dused, -dcomp, -duncomp, tx); 3196 3197 /* 3198 * The difference in the space used by snapshots is the 3199 * difference in snapshot space due to the head's 3200 * deadlist (since that's the only thing that's 3201 * changing that affects the snapused). 3202 */ 3203 dsl_deadlist_space_range(&csa->cds->ds_deadlist, 3204 csa->ohds->ds_dir->dd_origin_txg, UINT64_MAX, 3205 &cdl_used, &cdl_comp, &cdl_uncomp); 3206 dsl_deadlist_space_range(&csa->ohds->ds_deadlist, 3207 csa->ohds->ds_dir->dd_origin_txg, UINT64_MAX, 3208 &odl_used, &odl_comp, &odl_uncomp); 3209 dsl_dir_transfer_space(csa->ohds->ds_dir, cdl_used - odl_used, 3210 DD_USED_HEAD, DD_USED_SNAP, tx); 3211 } 3212 3213 /* swap ds_*_bytes */ 3214 SWITCH64(csa->ohds->ds_phys->ds_used_bytes, 3215 csa->cds->ds_phys->ds_used_bytes); 3216 SWITCH64(csa->ohds->ds_phys->ds_compressed_bytes, 3217 csa->cds->ds_phys->ds_compressed_bytes); 3218 SWITCH64(csa->ohds->ds_phys->ds_uncompressed_bytes, 3219 csa->cds->ds_phys->ds_uncompressed_bytes); 3220 SWITCH64(csa->ohds->ds_phys->ds_unique_bytes, 3221 csa->cds->ds_phys->ds_unique_bytes); 3222 3223 /* apply any parent delta for change in unconsumed refreservation */ 3224 dsl_dir_diduse_space(csa->ohds->ds_dir, DD_USED_REFRSRV, 3225 csa->unused_refres_delta, 0, 0, tx); 3226 3227 /* 3228 * Swap deadlists. 
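 * Both in-core deadlists must be closed before their on-disk object
 * numbers are exchanged with SWITCH64(), and then reopened so that
 * the in-core state matches the object each dataset now points to.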
3229 */ 3230 dsl_deadlist_close(&csa->cds->ds_deadlist); 3231 dsl_deadlist_close(&csa->ohds->ds_deadlist); 3232 SWITCH64(csa->ohds->ds_phys->ds_deadlist_obj, 3233 csa->cds->ds_phys->ds_deadlist_obj); 3234 dsl_deadlist_open(&csa->cds->ds_deadlist, dp->dp_meta_objset, 3235 csa->cds->ds_phys->ds_deadlist_obj); 3236 dsl_deadlist_open(&csa->ohds->ds_deadlist, dp->dp_meta_objset, 3237 csa->ohds->ds_phys->ds_deadlist_obj); 3238 3239 dsl_scan_ds_clone_swapped(csa->ohds, csa->cds, tx); 3240 } 3241 3242 /* 3243 * Swap 'clone' with its origin head dataset. Used at the end of "zfs 3244 * recv" into an existing fs to swizzle the file system to the new 3245 * version, and by "zfs rollback". Can also be used to swap two 3246 * independent head datasets if neither has any snapshots. 3247 */ 3248 int 3249 dsl_dataset_clone_swap(dsl_dataset_t *clone, dsl_dataset_t *origin_head, 3250 boolean_t force) 3251 { 3252 struct cloneswaparg csa; 3253 int error; 3254 3255 ASSERT(clone->ds_owner); 3256 ASSERT(origin_head->ds_owner); 3257 retry: 3258 /* 3259 * Need exclusive access for the swap. If we're swapping these 3260 * datasets back after an error, we already hold the locks. 3261 */ 3262 if (!RW_WRITE_HELD(&clone->ds_rwlock)) 3263 rw_enter(&clone->ds_rwlock, RW_WRITER); 3264 if (!RW_WRITE_HELD(&origin_head->ds_rwlock) && 3265 !rw_tryenter(&origin_head->ds_rwlock, RW_WRITER)) { 3266 rw_exit(&clone->ds_rwlock); 3267 rw_enter(&origin_head->ds_rwlock, RW_WRITER); 3268 if (!rw_tryenter(&clone->ds_rwlock, RW_WRITER)) { 3269 rw_exit(&origin_head->ds_rwlock); 3270 goto retry; 3271 } 3272 } 3273 csa.cds = clone; 3274 csa.ohds = origin_head; 3275 csa.force = force; 3276 error = dsl_sync_task_do(clone->ds_dir->dd_pool, 3277 dsl_dataset_clone_swap_check, 3278 dsl_dataset_clone_swap_sync, &csa, NULL, 9); 3279 return (error); 3280 } 3281 3282 /* 3283 * Given a pool name and a dataset object number in that pool, 3284 * return the name of that dataset. 3285 */ 3286 int 3287 dsl_dsobj_to_dsname(char *pname, uint64_t obj, char *buf) 3288 { 3289 spa_t *spa; 3290 dsl_pool_t *dp; 3291 dsl_dataset_t *ds; 3292 int error; 3293 3294 if ((error = spa_open(pname, &spa, FTAG)) != 0) 3295 return (error); 3296 dp = spa_get_dsl(spa); 3297 rw_enter(&dp->dp_config_rwlock, RW_READER); 3298 if ((error = dsl_dataset_hold_obj(dp, obj, FTAG, &ds)) == 0) { 3299 dsl_dataset_name(ds, buf); 3300 dsl_dataset_rele(ds, FTAG); 3301 } 3302 rw_exit(&dp->dp_config_rwlock); 3303 spa_close(spa, FTAG); 3304 3305 return (error); 3306 } 3307 3308 int 3309 dsl_dataset_check_quota(dsl_dataset_t *ds, boolean_t check_quota, 3310 uint64_t asize, uint64_t inflight, uint64_t *used, uint64_t *ref_rsrv) 3311 { 3312 int error = 0; 3313 3314 ASSERT3S(asize, >, 0); 3315 3316 /* 3317 * *ref_rsrv is the portion of asize that will come from any 3318 * unconsumed refreservation space. 3319 */ 3320 *ref_rsrv = 0; 3321 3322 mutex_enter(&ds->ds_lock); 3323 /* 3324 * Make a space adjustment for reserved bytes.
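 * E.g. (hypothetical figures): with ds_reserved = 10M and
 * ds_unique_bytes = 4M, the unconsumed 6M of the refreservation has
 * already been charged to our ancestors, so it is deducted from
 * *used below before any quota comparison.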
3325 */ 3326 if (ds->ds_reserved > ds->ds_phys->ds_unique_bytes) { 3327 ASSERT3U(*used, >=, 3328 ds->ds_reserved - ds->ds_phys->ds_unique_bytes); 3329 *used -= (ds->ds_reserved - ds->ds_phys->ds_unique_bytes); 3330 *ref_rsrv = 3331 asize - MIN(asize, parent_delta(ds, asize + inflight)); 3332 } 3333 3334 if (!check_quota || ds->ds_quota == 0) { 3335 mutex_exit(&ds->ds_lock); 3336 return (0); 3337 } 3338 /* 3339 * If they are requesting more space, and our current estimate 3340 * is over quota, they get to try again unless the actual 3341 * on-disk is over quota and there are no pending changes (which 3342 * may free up space for us). 3343 */ 3344 if (ds->ds_phys->ds_used_bytes + inflight >= ds->ds_quota) { 3345 if (inflight > 0 || ds->ds_phys->ds_used_bytes < ds->ds_quota) 3346 error = ERESTART; 3347 else 3348 error = EDQUOT; 3349 } 3350 mutex_exit(&ds->ds_lock); 3351 3352 return (error); 3353 } 3354 3355 /* ARGSUSED */ 3356 static int 3357 dsl_dataset_set_quota_check(void *arg1, void *arg2, dmu_tx_t *tx) 3358 { 3359 dsl_dataset_t *ds = arg1; 3360 dsl_prop_setarg_t *psa = arg2; 3361 int err; 3362 3363 if (spa_version(ds->ds_dir->dd_pool->dp_spa) < SPA_VERSION_REFQUOTA) 3364 return (ENOTSUP); 3365 3366 if ((err = dsl_prop_predict_sync(ds->ds_dir, psa)) != 0) 3367 return (err); 3368 3369 if (psa->psa_effective_value == 0) 3370 return (0); 3371 3372 if (psa->psa_effective_value < ds->ds_phys->ds_used_bytes || 3373 psa->psa_effective_value < ds->ds_reserved) 3374 return (ENOSPC); 3375 3376 return (0); 3377 } 3378 3379 extern void dsl_prop_set_sync(void *, void *, dmu_tx_t *); 3380 3381 void 3382 dsl_dataset_set_quota_sync(void *arg1, void *arg2, dmu_tx_t *tx) 3383 { 3384 dsl_dataset_t *ds = arg1; 3385 dsl_prop_setarg_t *psa = arg2; 3386 uint64_t effective_value = psa->psa_effective_value; 3387 3388 dsl_prop_set_sync(ds, psa, tx); 3389 DSL_PROP_CHECK_PREDICTION(ds->ds_dir, psa); 3390 3391 if (ds->ds_quota != effective_value) { 3392 dmu_buf_will_dirty(ds->ds_dbuf, tx); 3393 ds->ds_quota = effective_value; 3394 3395 spa_history_log_internal(LOG_DS_REFQUOTA, 3396 ds->ds_dir->dd_pool->dp_spa, tx, "%lld dataset = %llu ", 3397 (longlong_t)ds->ds_quota, ds->ds_object); 3398 } 3399 } 3400 3401 int 3402 dsl_dataset_set_quota(const char *dsname, zprop_source_t source, uint64_t quota) 3403 { 3404 dsl_dataset_t *ds; 3405 dsl_prop_setarg_t psa; 3406 int err; 3407 3408 dsl_prop_setarg_init_uint64(&psa, "refquota", source, "a); 3409 3410 err = dsl_dataset_hold(dsname, FTAG, &ds); 3411 if (err) 3412 return (err); 3413 3414 /* 3415 * If someone removes a file, then tries to set the quota, we 3416 * want to make sure the file freeing takes effect. 
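 * (txg_wait_open() below waits for a new txg to open, so any frees
 * done in an earlier open context will have been assigned to a
 * syncing txg before the sync task evaluates the new quota.)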
3417 */ 3418 txg_wait_open(ds->ds_dir->dd_pool, 0); 3419 3420 err = dsl_sync_task_do(ds->ds_dir->dd_pool, 3421 dsl_dataset_set_quota_check, dsl_dataset_set_quota_sync, 3422 ds, &psa, 0); 3423 3424 dsl_dataset_rele(ds, FTAG); 3425 return (err); 3426 } 3427 3428 static int 3429 dsl_dataset_set_reservation_check(void *arg1, void *arg2, dmu_tx_t *tx) 3430 { 3431 dsl_dataset_t *ds = arg1; 3432 dsl_prop_setarg_t *psa = arg2; 3433 uint64_t effective_value; 3434 uint64_t unique; 3435 int err; 3436 3437 if (spa_version(ds->ds_dir->dd_pool->dp_spa) < 3438 SPA_VERSION_REFRESERVATION) 3439 return (ENOTSUP); 3440 3441 if (dsl_dataset_is_snapshot(ds)) 3442 return (EINVAL); 3443 3444 if ((err = dsl_prop_predict_sync(ds->ds_dir, psa)) != 0) 3445 return (err); 3446 3447 effective_value = psa->psa_effective_value; 3448 3449 /* 3450 * If we are doing the preliminary check in open context, the 3451 * space estimates may be inaccurate. 3452 */ 3453 if (!dmu_tx_is_syncing(tx)) 3454 return (0); 3455 3456 mutex_enter(&ds->ds_lock); 3457 if (!DS_UNIQUE_IS_ACCURATE(ds)) 3458 dsl_dataset_recalc_head_uniq(ds); 3459 unique = ds->ds_phys->ds_unique_bytes; 3460 mutex_exit(&ds->ds_lock); 3461 3462 if (MAX(unique, effective_value) > MAX(unique, ds->ds_reserved)) { 3463 uint64_t delta = MAX(unique, effective_value) - 3464 MAX(unique, ds->ds_reserved); 3465 3466 if (delta > dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE)) 3467 return (ENOSPC); 3468 if (ds->ds_quota > 0 && 3469 effective_value > ds->ds_quota) 3470 return (ENOSPC); 3471 } 3472 3473 return (0); 3474 } 3475 3476 static void 3477 dsl_dataset_set_reservation_sync(void *arg1, void *arg2, dmu_tx_t *tx) 3478 { 3479 dsl_dataset_t *ds = arg1; 3480 dsl_prop_setarg_t *psa = arg2; 3481 uint64_t effective_value = psa->psa_effective_value; 3482 uint64_t unique; 3483 int64_t delta; 3484 3485 dsl_prop_set_sync(ds, psa, tx); 3486 DSL_PROP_CHECK_PREDICTION(ds->ds_dir, psa); 3487 3488 dmu_buf_will_dirty(ds->ds_dbuf, tx); 3489 3490 mutex_enter(&ds->ds_dir->dd_lock); 3491 mutex_enter(&ds->ds_lock); 3492 ASSERT(DS_UNIQUE_IS_ACCURATE(ds)); 3493 unique = ds->ds_phys->ds_unique_bytes; 3494 delta = MAX(0, (int64_t)(effective_value - unique)) - 3495 MAX(0, (int64_t)(ds->ds_reserved - unique)); 3496 ds->ds_reserved = effective_value; 3497 mutex_exit(&ds->ds_lock); 3498 3499 dsl_dir_diduse_space(ds->ds_dir, DD_USED_REFRSRV, delta, 0, 0, tx); 3500 mutex_exit(&ds->ds_dir->dd_lock); 3501 3502 spa_history_log_internal(LOG_DS_REFRESERV, 3503 ds->ds_dir->dd_pool->dp_spa, tx, "%lld dataset = %llu", 3504 (longlong_t)effective_value, ds->ds_object); 3505 } 3506 3507 int 3508 dsl_dataset_set_reservation(const char *dsname, zprop_source_t source, 3509 uint64_t reservation) 3510 { 3511 dsl_dataset_t *ds; 3512 dsl_prop_setarg_t psa; 3513 int err; 3514 3515 dsl_prop_setarg_init_uint64(&psa, "refreservation", source, 3516 &reservation); 3517 3518 err = dsl_dataset_hold(dsname, FTAG, &ds); 3519 if (err) 3520 return (err); 3521 3522 err = dsl_sync_task_do(ds->ds_dir->dd_pool, 3523 dsl_dataset_set_reservation_check, 3524 dsl_dataset_set_reservation_sync, ds, &psa, 0); 3525 3526 dsl_dataset_rele(ds, FTAG); 3527 return (err); 3528 } 3529 3530 typedef struct zfs_hold_cleanup_arg { 3531 dsl_pool_t *dp; 3532 uint64_t dsobj; 3533 char htag[MAXNAMELEN]; 3534 } zfs_hold_cleanup_arg_t; 3535 3536 static void 3537 dsl_dataset_user_release_onexit(void *arg) 3538 { 3539 zfs_hold_cleanup_arg_t *ca = arg; 3540 3541 (void) dsl_dataset_user_release_tmp(ca->dp, ca->dsobj, ca->htag, 3542 B_TRUE); 3543 kmem_free(ca, 
sizeof (zfs_hold_cleanup_arg_t)); 3544 } 3545 3546 void 3547 dsl_register_onexit_hold_cleanup(dsl_dataset_t *ds, const char *htag, 3548 minor_t minor) 3549 { 3550 zfs_hold_cleanup_arg_t *ca; 3551 3552 ca = kmem_alloc(sizeof (zfs_hold_cleanup_arg_t), KM_SLEEP); 3553 ca->dp = ds->ds_dir->dd_pool; 3554 ca->dsobj = ds->ds_object; 3555 (void) strlcpy(ca->htag, htag, sizeof (ca->htag)); 3556 VERIFY3U(0, ==, zfs_onexit_add_cb(minor, 3557 dsl_dataset_user_release_onexit, ca, NULL)); 3558 } 3559 3560 /* 3561 * If you add new checks here, you may need to add 3562 * additional checks to the "temporary" case in 3563 * snapshot_check() in dmu_objset.c. 3564 */ 3565 static int 3566 dsl_dataset_user_hold_check(void *arg1, void *arg2, dmu_tx_t *tx) 3567 { 3568 dsl_dataset_t *ds = arg1; 3569 struct dsl_ds_holdarg *ha = arg2; 3570 char *htag = ha->htag; 3571 objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; 3572 int error = 0; uint64_t tmp; 3573 3574 if (spa_version(ds->ds_dir->dd_pool->dp_spa) < SPA_VERSION_USERREFS) 3575 return (ENOTSUP); 3576 3577 if (!dsl_dataset_is_snapshot(ds)) 3578 return (EINVAL); 3579 3580 /* tags must be unique; read the value (the hold's timestamp) into a scratch buffer, since we only care whether the tag exists */ 3581 mutex_enter(&ds->ds_lock); 3582 if (ds->ds_phys->ds_userrefs_obj) { 3583 error = zap_lookup(mos, ds->ds_phys->ds_userrefs_obj, htag, 3584 8, 1, &tmp); 3585 if (error == 0) 3586 error = EEXIST; 3587 else if (error == ENOENT) 3588 error = 0; 3589 } 3590 mutex_exit(&ds->ds_lock); 3591 3592 if (error == 0 && ha->temphold && 3593 strlen(htag) + MAX_TAG_PREFIX_LEN >= MAXNAMELEN) 3594 error = E2BIG; 3595 3596 return (error); 3597 } 3598 3599 void 3600 dsl_dataset_user_hold_sync(void *arg1, void *arg2, dmu_tx_t *tx) 3601 { 3602 dsl_dataset_t *ds = arg1; 3603 struct dsl_ds_holdarg *ha = arg2; 3604 char *htag = ha->htag; 3605 dsl_pool_t *dp = ds->ds_dir->dd_pool; 3606 objset_t *mos = dp->dp_meta_objset; 3607 uint64_t now = gethrestime_sec(); 3608 uint64_t zapobj; 3609 3610 mutex_enter(&ds->ds_lock); 3611 if (ds->ds_phys->ds_userrefs_obj == 0) { 3612 /* 3613 * This is the first user hold for this dataset. Create 3614 * the userrefs zap object.
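 * The ZAP maps each hold tag to the time (in seconds) the hold was
 * taken; see the zap_add() of 'now' below.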
3615 */ 3616 dmu_buf_will_dirty(ds->ds_dbuf, tx); 3617 zapobj = ds->ds_phys->ds_userrefs_obj = 3618 zap_create(mos, DMU_OT_USERREFS, DMU_OT_NONE, 0, tx); 3619 } else { 3620 zapobj = ds->ds_phys->ds_userrefs_obj; 3621 } 3622 ds->ds_userrefs++; 3623 mutex_exit(&ds->ds_lock); 3624 3625 VERIFY(0 == zap_add(mos, zapobj, htag, 8, 1, &now, tx)); 3626 3627 if (ha->temphold) { 3628 VERIFY(0 == dsl_pool_user_hold(dp, ds->ds_object, 3629 htag, &now, tx)); 3630 } 3631 3632 spa_history_log_internal(LOG_DS_USER_HOLD, 3633 dp->dp_spa, tx, "<%s> temp = %d dataset = %llu", htag, 3634 (int)ha->temphold, ds->ds_object); 3635 } 3636 3637 static int 3638 dsl_dataset_user_hold_one(const char *dsname, void *arg) 3639 { 3640 struct dsl_ds_holdarg *ha = arg; 3641 dsl_dataset_t *ds; 3642 int error; 3643 char *name; 3644 3645 /* alloc a buffer to hold dsname@snapname plus terminating NULL */ 3646 name = kmem_asprintf("%s@%s", dsname, ha->snapname); 3647 error = dsl_dataset_hold(name, ha->dstg, &ds); 3648 strfree(name); 3649 if (error == 0) { 3650 ha->gotone = B_TRUE; 3651 dsl_sync_task_create(ha->dstg, dsl_dataset_user_hold_check, 3652 dsl_dataset_user_hold_sync, ds, ha, 0); 3653 } else if (error == ENOENT && ha->recursive) { 3654 error = 0; 3655 } else { 3656 (void) strlcpy(ha->failed, dsname, sizeof (ha->failed)); 3657 } 3658 return (error); 3659 } 3660 3661 int 3662 dsl_dataset_user_hold_for_send(dsl_dataset_t *ds, char *htag, 3663 boolean_t temphold) 3664 { 3665 struct dsl_ds_holdarg *ha; 3666 int error; 3667 3668 ha = kmem_zalloc(sizeof (struct dsl_ds_holdarg), KM_SLEEP); 3669 ha->htag = htag; 3670 ha->temphold = temphold; 3671 error = dsl_sync_task_do(ds->ds_dir->dd_pool, 3672 dsl_dataset_user_hold_check, dsl_dataset_user_hold_sync, 3673 ds, ha, 0); 3674 kmem_free(ha, sizeof (struct dsl_ds_holdarg)); 3675 3676 return (error); 3677 } 3678 3679 int 3680 dsl_dataset_user_hold(char *dsname, char *snapname, char *htag, 3681 boolean_t recursive, boolean_t temphold, int cleanup_fd) 3682 { 3683 struct dsl_ds_holdarg *ha; 3684 dsl_sync_task_t *dst; 3685 spa_t *spa; 3686 int error; 3687 minor_t minor = 0; 3688 3689 if (cleanup_fd != -1) { 3690 /* Currently we only support cleanup-on-exit of tempholds. */ 3691 if (!temphold) 3692 return (EINVAL); 3693 error = zfs_onexit_fd_hold(cleanup_fd, &minor); 3694 if (error) 3695 return (error); 3696 } 3697 3698 ha = kmem_zalloc(sizeof (struct dsl_ds_holdarg), KM_SLEEP); 3699 3700 (void) strlcpy(ha->failed, dsname, sizeof (ha->failed)); 3701 3702 error = spa_open(dsname, &spa, FTAG); 3703 if (error) { 3704 kmem_free(ha, sizeof (struct dsl_ds_holdarg)); 3705 if (cleanup_fd != -1) 3706 zfs_onexit_fd_rele(cleanup_fd); 3707 return (error); 3708 } 3709 3710 ha->dstg = dsl_sync_task_group_create(spa_get_dsl(spa)); 3711 ha->htag = htag; 3712 ha->snapname = snapname; 3713 ha->recursive = recursive; 3714 ha->temphold = temphold; 3715 3716 if (recursive) { 3717 error = dmu_objset_find(dsname, dsl_dataset_user_hold_one, 3718 ha, DS_FIND_CHILDREN); 3719 } else { 3720 error = dsl_dataset_user_hold_one(dsname, ha); 3721 } 3722 if (error == 0) 3723 error = dsl_sync_task_group_wait(ha->dstg); 3724 3725 for (dst = list_head(&ha->dstg->dstg_tasks); dst; 3726 dst = list_next(&ha->dstg->dstg_tasks, dst)) { 3727 dsl_dataset_t *ds = dst->dst_arg1; 3728 3729 if (dst->dst_err) { 3730 dsl_dataset_name(ds, ha->failed); 3731 *strchr(ha->failed, '@') = '\0'; 3732 } else if (error == 0 && minor != 0 && temphold) { 3733 /* 3734 * If this hold is to be released upon process exit, 3735 * register that action now. 
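 * dsl_register_onexit_hold_cleanup() ties the release to the
 * cleanup_fd's minor via zfs_onexit_add_cb(), so the temphold is
 * dropped at process exit even if userland never releases it
 * explicitly.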
3736 */ 3737 dsl_register_onexit_hold_cleanup(ds, htag, minor); 3738 } 3739 dsl_dataset_rele(ds, ha->dstg); 3740 } 3741 3742 if (error == 0 && recursive && !ha->gotone) 3743 error = ENOENT; 3744 3745 if (error) 3746 (void) strlcpy(dsname, ha->failed, sizeof (ha->failed)); 3747 3748 dsl_sync_task_group_destroy(ha->dstg); 3749 3750 kmem_free(ha, sizeof (struct dsl_ds_holdarg)); 3751 spa_close(spa, FTAG); 3752 if (cleanup_fd != -1) 3753 zfs_onexit_fd_rele(cleanup_fd); 3754 return (error); 3755 } 3756 3757 struct dsl_ds_releasearg { 3758 dsl_dataset_t *ds; 3759 const char *htag; 3760 boolean_t own; /* do we own or just hold ds? */ 3761 }; 3762 3763 static int 3764 dsl_dataset_release_might_destroy(dsl_dataset_t *ds, const char *htag, 3765 boolean_t *might_destroy) 3766 { 3767 objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; 3768 uint64_t zapobj; 3769 uint64_t tmp; 3770 int error; 3771 3772 *might_destroy = B_FALSE; 3773 3774 mutex_enter(&ds->ds_lock); 3775 zapobj = ds->ds_phys->ds_userrefs_obj; 3776 if (zapobj == 0) { 3777 /* The tag can't possibly exist */ 3778 mutex_exit(&ds->ds_lock); 3779 return (ESRCH); 3780 } 3781 3782 /* Make sure the tag exists */ 3783 error = zap_lookup(mos, zapobj, htag, 8, 1, &tmp); 3784 if (error) { 3785 mutex_exit(&ds->ds_lock); 3786 if (error == ENOENT) 3787 error = ESRCH; 3788 return (error); 3789 } 3790 3791 if (ds->ds_userrefs == 1 && ds->ds_phys->ds_num_children == 1 && 3792 DS_IS_DEFER_DESTROY(ds)) 3793 *might_destroy = B_TRUE; 3794 3795 mutex_exit(&ds->ds_lock); 3796 return (0); 3797 } 3798 3799 static int 3800 dsl_dataset_user_release_check(void *arg1, void *tag, dmu_tx_t *tx) 3801 { 3802 struct dsl_ds_releasearg *ra = arg1; 3803 dsl_dataset_t *ds = ra->ds; 3804 boolean_t might_destroy; 3805 int error; 3806 3807 if (spa_version(ds->ds_dir->dd_pool->dp_spa) < SPA_VERSION_USERREFS) 3808 return (ENOTSUP); 3809 3810 error = dsl_dataset_release_might_destroy(ds, ra->htag, &might_destroy); 3811 if (error) 3812 return (error); 3813 3814 if (might_destroy) { 3815 struct dsl_ds_destroyarg dsda = {0}; 3816 3817 if (dmu_tx_is_syncing(tx)) { 3818 /* 3819 * If we're not prepared to remove the snapshot, 3820 * we can't allow the release to happen right now. 
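 * (ra->own is set only when dsl_dataset_user_release_one() managed
 * to take ownership of the snapshot in open context; without
 * ownership we cannot destroy it from the sync task.)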
3821 */ 3822 if (!ra->own) 3823 return (EBUSY); 3824 } 3825 dsda.ds = ds; 3826 dsda.releasing = B_TRUE; 3827 return (dsl_dataset_destroy_check(&dsda, tag, tx)); 3828 } 3829 3830 return (0); 3831 } 3832 3833 static void 3834 dsl_dataset_user_release_sync(void *arg1, void *tag, dmu_tx_t *tx) 3835 { 3836 struct dsl_ds_releasearg *ra = arg1; 3837 dsl_dataset_t *ds = ra->ds; 3838 dsl_pool_t *dp = ds->ds_dir->dd_pool; 3839 objset_t *mos = dp->dp_meta_objset; 3840 uint64_t zapobj; 3841 uint64_t dsobj = ds->ds_object; 3842 uint64_t refs; 3843 int error; 3844 3845 mutex_enter(&ds->ds_lock); 3846 ds->ds_userrefs--; 3847 refs = ds->ds_userrefs; 3848 mutex_exit(&ds->ds_lock); 3849 error = dsl_pool_user_release(dp, ds->ds_object, ra->htag, tx); 3850 VERIFY(error == 0 || error == ENOENT); 3851 zapobj = ds->ds_phys->ds_userrefs_obj; 3852 VERIFY(0 == zap_remove(mos, zapobj, ra->htag, tx)); 3853 if (ds->ds_userrefs == 0 && ds->ds_phys->ds_num_children == 1 && 3854 DS_IS_DEFER_DESTROY(ds)) { 3855 struct dsl_ds_destroyarg dsda = {0}; 3856 3857 ASSERT(ra->own); 3858 dsda.ds = ds; 3859 dsda.releasing = B_TRUE; 3860 /* We already did the destroy_check */ 3861 dsl_dataset_destroy_sync(&dsda, tag, tx); 3862 } 3863 3864 spa_history_log_internal(LOG_DS_USER_RELEASE, 3865 dp->dp_spa, tx, "<%s> %lld dataset = %llu", 3866 ra->htag, (longlong_t)refs, dsobj); 3867 } 3868 3869 static int 3870 dsl_dataset_user_release_one(const char *dsname, void *arg) 3871 { 3872 struct dsl_ds_holdarg *ha = arg; 3873 struct dsl_ds_releasearg *ra; 3874 dsl_dataset_t *ds; 3875 int error; 3876 void *dtag = ha->dstg; 3877 char *name; 3878 boolean_t own = B_FALSE; 3879 boolean_t might_destroy; 3880 3881 /* alloc a buffer to hold dsname@snapname, plus the terminating NULL */ 3882 name = kmem_asprintf("%s@%s", dsname, ha->snapname); 3883 error = dsl_dataset_hold(name, dtag, &ds); 3884 strfree(name); 3885 if (error == ENOENT && ha->recursive) 3886 return (0); 3887 (void) strlcpy(ha->failed, dsname, sizeof (ha->failed)); 3888 if (error) 3889 return (error); 3890 3891 ha->gotone = B_TRUE; 3892 3893 ASSERT(dsl_dataset_is_snapshot(ds)); 3894 3895 error = dsl_dataset_release_might_destroy(ds, ha->htag, &might_destroy); 3896 if (error) { 3897 dsl_dataset_rele(ds, dtag); 3898 return (error); 3899 } 3900 3901 if (might_destroy) { 3902 #ifdef _KERNEL 3903 name = kmem_asprintf("%s@%s", dsname, ha->snapname); 3904 error = zfs_unmount_snap(name, NULL); 3905 strfree(name); 3906 if (error) { 3907 dsl_dataset_rele(ds, dtag); 3908 return (error); 3909 } 3910 #endif 3911 if (!dsl_dataset_tryown(ds, B_TRUE, dtag)) { 3912 dsl_dataset_rele(ds, dtag); 3913 return (EBUSY); 3914 } else { 3915 own = B_TRUE; 3916 dsl_dataset_make_exclusive(ds, dtag); 3917 } 3918 } 3919 3920 ra = kmem_alloc(sizeof (struct dsl_ds_releasearg), KM_SLEEP); 3921 ra->ds = ds; 3922 ra->htag = ha->htag; 3923 ra->own = own; 3924 dsl_sync_task_create(ha->dstg, dsl_dataset_user_release_check, 3925 dsl_dataset_user_release_sync, ra, dtag, 0); 3926 3927 return (0); 3928 } 3929 3930 int 3931 dsl_dataset_user_release(char *dsname, char *snapname, char *htag, 3932 boolean_t recursive) 3933 { 3934 struct dsl_ds_holdarg *ha; 3935 dsl_sync_task_t *dst; 3936 spa_t *spa; 3937 int error; 3938 3939 top: 3940 ha = kmem_zalloc(sizeof (struct dsl_ds_holdarg), KM_SLEEP); 3941 3942 (void) strlcpy(ha->failed, dsname, sizeof (ha->failed)); 3943 3944 error = spa_open(dsname, &spa, FTAG); 3945 if (error) { 3946 kmem_free(ha, sizeof (struct dsl_ds_holdarg)); 3947 return (error); 3948 } 3949 3950 ha->dstg = 

int
dsl_dataset_user_release(char *dsname, char *snapname, char *htag,
    boolean_t recursive)
{
	struct dsl_ds_holdarg *ha;
	dsl_sync_task_t *dst;
	spa_t *spa;
	int error;

top:
	ha = kmem_zalloc(sizeof (struct dsl_ds_holdarg), KM_SLEEP);

	(void) strlcpy(ha->failed, dsname, sizeof (ha->failed));

	error = spa_open(dsname, &spa, FTAG);
	if (error) {
		kmem_free(ha, sizeof (struct dsl_ds_holdarg));
		return (error);
	}

	ha->dstg = dsl_sync_task_group_create(spa_get_dsl(spa));
	ha->htag = htag;
	ha->snapname = snapname;
	ha->recursive = recursive;
	if (recursive) {
		error = dmu_objset_find(dsname, dsl_dataset_user_release_one,
		    ha, DS_FIND_CHILDREN);
	} else {
		error = dsl_dataset_user_release_one(dsname, ha);
	}
	if (error == 0)
		error = dsl_sync_task_group_wait(ha->dstg);

	for (dst = list_head(&ha->dstg->dstg_tasks); dst;
	    dst = list_next(&ha->dstg->dstg_tasks, dst)) {
		struct dsl_ds_releasearg *ra = dst->dst_arg1;
		dsl_dataset_t *ds = ra->ds;

		if (dst->dst_err)
			dsl_dataset_name(ds, ha->failed);

		if (ra->own)
			dsl_dataset_disown(ds, ha->dstg);
		else
			dsl_dataset_rele(ds, ha->dstg);

		kmem_free(ra, sizeof (struct dsl_ds_releasearg));
	}

	if (error == 0 && recursive && !ha->gotone)
		error = ENOENT;

	if (error && error != EBUSY)
		(void) strlcpy(dsname, ha->failed, sizeof (ha->failed));

	dsl_sync_task_group_destroy(ha->dstg);
	kmem_free(ha, sizeof (struct dsl_ds_holdarg));
	spa_close(spa, FTAG);

	/*
	 * We can get EBUSY if we were racing with deferred destroy and
	 * dsl_dataset_user_release_check() hadn't done the necessary
	 * open context setup.  We can also get EBUSY if we're racing
	 * with destroy and that thread is the ds_owner.  Either way
	 * the busy condition should be transient, and we should retry
	 * the release operation.
	 */
	if (error == EBUSY)
		goto top;

	return (error);
}
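
/*
 * Illustrative sketch (not compiled): a typical open-context caller
 * (e.g. the release ioctl path) undoing "zfs hold keep tank/fs@snap".
 * Note that dsname must be a writable buffer: on failure (other than
 * the internally retried EBUSY case) it is overwritten with the name
 * of the dataset that could not be processed.
 *
 *	char dsname[MAXPATHLEN];
 *
 *	(void) strlcpy(dsname, "tank/fs", sizeof (dsname));
 *	error = dsl_dataset_user_release(dsname, "snap", "keep", B_FALSE);
 *	if (error)
 *		... dsname now names the dataset that failed ...
 */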

/*
 * Called at spa_load time (with retry == B_FALSE) to release a stale
 * temporary user hold.  Also called by the onexit code (with retry == B_TRUE).
 */
int
dsl_dataset_user_release_tmp(dsl_pool_t *dp, uint64_t dsobj, char *htag,
    boolean_t retry)
{
	dsl_dataset_t *ds;
	char *snap;
	char *name;
	int namelen;
	int error;

	do {
		rw_enter(&dp->dp_config_rwlock, RW_READER);
		error = dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds);
		rw_exit(&dp->dp_config_rwlock);
		if (error)
			return (error);
		namelen = dsl_dataset_namelen(ds) + 1;
		name = kmem_alloc(namelen, KM_SLEEP);
		dsl_dataset_name(ds, name);
		dsl_dataset_rele(ds, FTAG);

		snap = strchr(name, '@');
		*snap = '\0';
		++snap;
		error = dsl_dataset_user_release(name, snap, htag, B_FALSE);
		kmem_free(name, namelen);

		/*
		 * The object can't have been destroyed because we have a hold,
		 * but it might have been renamed, resulting in ENOENT.  Retry
		 * if we've been requested to do so.
		 *
		 * It would be nice if we could use the dsobj all the way
		 * through and avoid ENOENT entirely.  But we might need to
		 * unmount the snapshot, and there's currently no way to look
		 * up a vfsp using a ZFS object id.
		 */
	} while ((error == ENOENT) && retry);

	return (error);
}

int
dsl_dataset_get_holds(const char *dsname, nvlist_t **nvp)
{
	dsl_dataset_t *ds;
	int err;

	err = dsl_dataset_hold(dsname, FTAG, &ds);
	if (err)
		return (err);

	VERIFY(0 == nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP));
	if (ds->ds_phys->ds_userrefs_obj != 0) {
		zap_attribute_t *za;
		zap_cursor_t zc;

		za = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP);
		for (zap_cursor_init(&zc, ds->ds_dir->dd_pool->dp_meta_objset,
		    ds->ds_phys->ds_userrefs_obj);
		    zap_cursor_retrieve(&zc, za) == 0;
		    zap_cursor_advance(&zc)) {
			VERIFY(0 == nvlist_add_uint64(*nvp, za->za_name,
			    za->za_first_integer));
		}
		zap_cursor_fini(&zc);
		kmem_free(za, sizeof (zap_attribute_t));
	}
	dsl_dataset_rele(ds, FTAG);
	return (0);
}

/*
 * Note, this function is used as the callback for dmu_objset_find().  We
 * always return 0 so that we will continue to find and process
 * inconsistent datasets, even if we encounter an error trying to
 * process one of them.
 */
/* ARGSUSED */
int
dsl_destroy_inconsistent(const char *dsname, void *arg)
{
	dsl_dataset_t *ds;

	if (dsl_dataset_own(dsname, B_TRUE, FTAG, &ds) == 0) {
		if (DS_IS_INCONSISTENT(ds))
			(void) dsl_dataset_destroy(ds, FTAG, B_FALSE);
		else
			dsl_dataset_disown(ds, FTAG);
	}
	return (0);
}
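
/*
 * Illustrative sketch (not compiled): consuming the nvlist built by
 * dsl_dataset_get_holds() above.  Each pair maps a hold tag to the
 * uint64 stored when the hold was created (za_first_integer; in
 * practice a creation timestamp), so a userland consumer of the
 * resulting nvlist (e.g. "zfs holds", reached via the get-holds
 * ioctl) might walk it like this:
 *
 *	nvlist_t *nvl;
 *	nvpair_t *pair;
 *	uint64_t val;
 *
 *	if (dsl_dataset_get_holds("tank/fs@snap", &nvl) == 0) {
 *		for (pair = nvlist_next_nvpair(nvl, NULL); pair != NULL;
 *		    pair = nvlist_next_nvpair(nvl, pair)) {
 *			VERIFY(0 == nvpair_value_uint64(pair, &val));
 *			(void) printf("%s\t%llu\n", nvpair_name(pair),
 *			    (u_longlong_t)val);
 *		}
 *		nvlist_free(nvl);
 *	}
 */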

/*
 * Return (in *usedp) the amount of space written in new that is not
 * present in oldsnap.  New may be a snapshot or the head.  Old must be
 * a snapshot before new, in new's filesystem (or its origin).  If not then
 * fail and return EINVAL.
 *
 * The written space is calculated by considering two components:  First, we
 * ignore any freed space, and calculate the written space as new's used space
 * minus old's used space.  Next, we add in the amount of space that was freed
 * between the two snapshots, thus reducing new's used space relative to old's.
 * Specifically, this is the space that was born before old->ds_creation_txg,
 * and freed before new (ie. on new's deadlist or a previous deadlist).
 *
 *    space freed                      [---------------------]
 *    snapshots       ---O-------O--------O-------O------
 *                            oldsnap             new
 */
int
dsl_dataset_space_written(dsl_dataset_t *oldsnap, dsl_dataset_t *new,
    uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
{
	int err = 0;
	uint64_t snapobj;
	dsl_pool_t *dp = new->ds_dir->dd_pool;

	*usedp = 0;
	*usedp += new->ds_phys->ds_used_bytes;
	*usedp -= oldsnap->ds_phys->ds_used_bytes;

	*compp = 0;
	*compp += new->ds_phys->ds_compressed_bytes;
	*compp -= oldsnap->ds_phys->ds_compressed_bytes;

	*uncompp = 0;
	*uncompp += new->ds_phys->ds_uncompressed_bytes;
	*uncompp -= oldsnap->ds_phys->ds_uncompressed_bytes;

	rw_enter(&dp->dp_config_rwlock, RW_READER);
	snapobj = new->ds_object;
	while (snapobj != oldsnap->ds_object) {
		dsl_dataset_t *snap;
		uint64_t used, comp, uncomp;

		err = dsl_dataset_hold_obj(dp, snapobj, FTAG, &snap);
		if (err != 0)
			break;

		if (snap->ds_phys->ds_prev_snap_txg ==
		    oldsnap->ds_phys->ds_creation_txg) {
			/*
			 * The blocks in the deadlist cannot be born after
			 * ds_prev_snap_txg, so get the whole deadlist space,
			 * which is more efficient (especially for old-format
			 * deadlists).  Unfortunately the deadlist code
			 * doesn't have enough information to make this
			 * optimization itself.
			 */
			dsl_deadlist_space(&snap->ds_deadlist,
			    &used, &comp, &uncomp);
		} else {
			dsl_deadlist_space_range(&snap->ds_deadlist,
			    0, oldsnap->ds_phys->ds_creation_txg,
			    &used, &comp, &uncomp);
		}
		*usedp += used;
		*compp += comp;
		*uncompp += uncomp;

		/*
		 * If we get to the beginning of the chain of snapshots
		 * (ds_prev_snap_obj == 0) before oldsnap, then oldsnap
		 * was not a snapshot of/before new.
		 */
		snapobj = snap->ds_phys->ds_prev_snap_obj;
		dsl_dataset_rele(snap, FTAG);
		if (snapobj == 0) {
			err = EINVAL;
			break;
		}
	}
	rw_exit(&dp->dp_config_rwlock);
	return (err);
}
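
/*
 * Worked example (hypothetical numbers): suppose oldsnap references 100M
 * (ds_used_bytes) and new references 120M, and that 30M of data born
 * before oldsnap's creation txg was freed between the two (so it sits on
 * the deadlists walked above).  Then:
 *
 *	written = new_used - old_used + freed_before_old
 *		= 120M - 100M + 30M = 50M
 *
 * The raw difference (20M) understates what was written because the 30M
 * of frees reduced new's used space without anything having been
 * "unwritten"; the deadlist pass adds it back.
 */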

/*
 * Return (in *usedp) the amount of space that will be reclaimed if firstsnap,
 * lastsnap, and all snapshots in between are deleted.
 *
 *    blocks that would be freed   [---------------------------]
 *    snapshots       ---O-------O--------O-------O--------O
 *                       firstsnap                 lastsnap
 *
 * This is the set of blocks that were born after the snap before firstsnap
 * (birth > firstsnap->prev_snap_txg) and died before the snap after the
 * last snap (ie, is on lastsnap->ds_next->ds_deadlist or an earlier deadlist).
 * We calculate this by iterating over the relevant deadlists (from the snap
 * after lastsnap, backward to the snap after firstsnap), summing up the
 * space on the deadlist that was born after the snap before firstsnap.
 */
int
dsl_dataset_space_wouldfree(dsl_dataset_t *firstsnap,
    dsl_dataset_t *lastsnap,
    uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
{
	int err = 0;
	uint64_t snapobj;
	dsl_pool_t *dp = firstsnap->ds_dir->dd_pool;

	ASSERT(dsl_dataset_is_snapshot(firstsnap));
	ASSERT(dsl_dataset_is_snapshot(lastsnap));

	/*
	 * Check that the snapshots are in the same dsl_dir, and firstsnap
	 * is before lastsnap.
	 */
	if (firstsnap->ds_dir != lastsnap->ds_dir ||
	    firstsnap->ds_phys->ds_creation_txg >
	    lastsnap->ds_phys->ds_creation_txg)
		return (EINVAL);

	*usedp = *compp = *uncompp = 0;

	rw_enter(&dp->dp_config_rwlock, RW_READER);
	snapobj = lastsnap->ds_phys->ds_next_snap_obj;
	while (snapobj != firstsnap->ds_object) {
		dsl_dataset_t *ds;
		uint64_t used, comp, uncomp;

		err = dsl_dataset_hold_obj(dp, snapobj, FTAG, &ds);
		if (err != 0)
			break;

		dsl_deadlist_space_range(&ds->ds_deadlist,
		    firstsnap->ds_phys->ds_prev_snap_txg, UINT64_MAX,
		    &used, &comp, &uncomp);
		*usedp += used;
		*compp += comp;
		*uncompp += uncomp;

		snapobj = ds->ds_phys->ds_prev_snap_obj;
		ASSERT3U(snapobj, !=, 0);
		dsl_dataset_rele(ds, FTAG);
	}
	rw_exit(&dp->dp_config_rwlock);
	return (err);
}
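
/*
 * Worked example (hypothetical numbers): given the snapshot chain
 * A-B-C-D-E, asking how much deleting B through D would free walks the
 * deadlists of E, D, and C (from the snap after lastsnap back to the
 * snap after firstsnap), counting only blocks born after A.  If those
 * deadlists hold 40M, 25M, and 10M of such blocks respectively, *usedp
 * comes back as 75M.  Blocks born before A are excluded because A still
 * references them; blocks still live in E are never on a deadlist at all.
 */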