/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2011, 2016 by Delphix. All rights reserved.
 * Copyright (c) 2013 Steven Hartland. All rights reserved.
 * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
 * Copyright (c) 2014 Integros [integros.com]
 * Copyright 2016 Nexenta Systems, Inc. All rights reserved.
 */

#include <sys/dsl_pool.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_prop.h>
#include <sys/dsl_dir.h>
#include <sys/dsl_synctask.h>
#include <sys/dsl_scan.h>
#include <sys/dnode.h>
#include <sys/dmu_tx.h>
#include <sys/dmu_objset.h>
#include <sys/arc.h>
#include <sys/zap.h>
#include <sys/zio.h>
#include <sys/zfs_context.h>
#include <sys/fs/zfs.h>
#include <sys/zfs_znode.h>
#include <sys/spa_impl.h>
#include <sys/dsl_deadlist.h>
#include <sys/bptree.h>
#include <sys/zfeature.h>
#include <sys/zil_impl.h>
#include <sys/dsl_userhold.h>

/*
 * ZFS Write Throttle
 * ------------------
 *
 * ZFS must limit the rate of incoming writes to the rate at which it is able
 * to sync data modifications to the backend storage.  Throttling too much
 * creates an artificial limit; throttling too little can only be sustained
 * for short periods and would lead to highly lumpy performance.  On a
 * per-pool basis, ZFS tracks the amount of modified (dirty) data.  As
 * operations change data, the amount of dirty data increases; as ZFS syncs
 * out data, the amount of dirty data decreases.  When the amount of dirty
 * data exceeds a predetermined threshold, further modifications are blocked
 * until the amount of dirty data decreases (as data is synced out).
 *
 * The limit on dirty data is tunable, and should be adjusted according to
 * both the IO capacity and available memory of the system.  The larger the
 * window, the more ZFS is able to aggregate and amortize metadata (and data)
 * changes.  However, memory is a limited resource, and allowing for more
 * dirty data comes at the cost of keeping other useful data in memory (for
 * example ZFS data cached by the ARC).
 *
 * Implementation
 *
 * As buffers are modified, dsl_pool_dirty_space() increments both the
 * per-txg (dp_dirty_pertxg[]) and poolwide (dp_dirty_total) accounting of
 * dirty space used; dsl_pool_undirty_space() decrements those values as
 * data is synced out from dsl_pool_sync().  While only the poolwide value
 * is relevant, the per-txg value is useful for debugging.  The tunable
 * zfs_dirty_data_max determines the dirty space limit.
 * Once that value is exceeded, new writes are halted until space frees up.
 *
 * The zfs_dirty_data_sync tunable dictates the threshold at which we
 * ensure that there is a txg syncing (see the comment in txg.c for a full
 * description of transaction group stages).
 *
 * The IO scheduler uses both the dirty space limit and current amount of
 * dirty data as inputs.  Those values affect the number of concurrent IOs
 * ZFS issues.  See the comment in vdev_queue.c for details of the IO
 * scheduler.
 *
 * The delay is also calculated based on the amount of dirty data.  See the
 * comment above dmu_tx_delay() for details.
 */

/*
 * zfs_dirty_data_max will be set to zfs_dirty_data_max_percent% of all memory,
 * capped at zfs_dirty_data_max_max.  It can also be overridden in /etc/system.
 */
uint64_t zfs_dirty_data_max;
uint64_t zfs_dirty_data_max_max = 4ULL * 1024 * 1024 * 1024;
int zfs_dirty_data_max_percent = 10;

/*
 * If there is at least this much dirty data, push out a txg.
 */
uint64_t zfs_dirty_data_sync = 64 * 1024 * 1024;

/*
 * Once there is this amount of dirty data, dmu_tx_delay() will kick in
 * and delay each transaction.
 * This value should be >= zfs_vdev_async_write_active_max_dirty_percent.
 */
int zfs_delay_min_dirty_percent = 60;

/*
 * This controls how quickly the delay approaches infinity.
 * Larger values cause it to delay more for a given amount of dirty data.
 * Therefore larger values will cause there to be less dirty data for a
 * given throughput.
 *
 * For the smoothest delay, this value should be about 1 billion divided
 * by the maximum number of operations per second.  This will smoothly
 * handle between 10x and 1/10th this number.
 *
 * Note: zfs_delay_scale * zfs_dirty_data_max must be < 2^64, due to the
 * multiply in dmu_tx_delay().
 */
uint64_t zfs_delay_scale = 1000 * 1000 * 1000 / 2000;
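
/*
 * Editor's illustrative sketch (not part of the original source): how the
 * tunables above combine.  On a machine with 64 GB of memory and the
 * defaults above, zfs_dirty_data_max = min(10% of memory,
 * zfs_dirty_data_max_max) = 4 GB, and delays begin at
 * zfs_delay_min_dirty_percent (60%) of that, i.e. ~2.4 GB of dirty data.
 * The default zfs_delay_scale of 500,000 (10^9 / 2000) corresponds, per the
 * comment above, to a backend that can sustain on the order of 2,000
 * operations per second; a pool sustaining ~10,000 ops/sec would want a
 * scale of about 10^9 / 10,000 = 100,000.  The function below is a
 * hypothetical, non-compiled sketch of the delay curve described in the
 * comment above dmu_tx_delay(); its name is invented for illustration.
 */
#if 0
static uint64_t
example_tx_delay_ns(uint64_t dirty)
{
	uint64_t delay_min_bytes =
	    zfs_dirty_data_max * zfs_delay_min_dirty_percent / 100;

	if (dirty <= delay_min_bytes)
		return (0);
	/*
	 * Assumes dirty < zfs_dirty_data_max, as the real throttle
	 * guarantees; the delay grows without bound as dirty approaches
	 * zfs_dirty_data_max.
	 */
	return (zfs_delay_scale * (dirty - delay_min_bytes) /
	    (zfs_dirty_data_max - dirty));
}
#endif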

hrtime_t zfs_throttle_delay = MSEC2NSEC(10);
hrtime_t zfs_throttle_resolution = MSEC2NSEC(10);

int
dsl_pool_open_special_dir(dsl_pool_t *dp, const char *name, dsl_dir_t **ddp)
{
	uint64_t obj;
	int err;

	err = zap_lookup(dp->dp_meta_objset,
	    dsl_dir_phys(dp->dp_root_dir)->dd_child_dir_zapobj,
	    name, sizeof (obj), 1, &obj);
	if (err)
		return (err);

	return (dsl_dir_hold_obj(dp, obj, name, dp, ddp));
}

static dsl_pool_t *
dsl_pool_open_impl(spa_t *spa, uint64_t txg)
{
	dsl_pool_t *dp;
	blkptr_t *bp = spa_get_rootblkptr(spa);

	dp = kmem_zalloc(sizeof (dsl_pool_t), KM_SLEEP);
	dp->dp_spa = spa;
	dp->dp_meta_rootbp = *bp;
	rrw_init(&dp->dp_config_rwlock, B_TRUE);
	txg_init(dp, txg);

	txg_list_create(&dp->dp_dirty_datasets,
	    offsetof(dsl_dataset_t, ds_dirty_link));
	txg_list_create(&dp->dp_dirty_zilogs,
	    offsetof(zilog_t, zl_dirty_link));
	txg_list_create(&dp->dp_dirty_dirs,
	    offsetof(dsl_dir_t, dd_dirty_link));
	txg_list_create(&dp->dp_sync_tasks,
	    offsetof(dsl_sync_task_t, dst_node));

	mutex_init(&dp->dp_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&dp->dp_spaceavail_cv, NULL, CV_DEFAULT, NULL);

	dp->dp_vnrele_taskq = taskq_create("zfs_vn_rele_taskq", 1, minclsyspri,
	    1, 4, 0);

	return (dp);
}

int
dsl_pool_init(spa_t *spa, uint64_t txg, dsl_pool_t **dpp)
{
	int err;
	dsl_pool_t *dp = dsl_pool_open_impl(spa, txg);

	err = dmu_objset_open_impl(spa, NULL, &dp->dp_meta_rootbp,
	    &dp->dp_meta_objset);
	if (err != 0)
		dsl_pool_close(dp);
	else
		*dpp = dp;

	return (err);
}

int
dsl_pool_open(dsl_pool_t *dp)
{
	int err;
	dsl_dir_t *dd;
	dsl_dataset_t *ds;
	uint64_t obj;

	rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG);
	err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
	    DMU_POOL_ROOT_DATASET, sizeof (uint64_t), 1,
	    &dp->dp_root_dir_obj);
	if (err)
		goto out;

	err = dsl_dir_hold_obj(dp, dp->dp_root_dir_obj,
	    NULL, dp, &dp->dp_root_dir);
	if (err)
		goto out;

	err = dsl_pool_open_special_dir(dp, MOS_DIR_NAME, &dp->dp_mos_dir);
	if (err)
		goto out;

	if (spa_version(dp->dp_spa) >= SPA_VERSION_ORIGIN) {
		err = dsl_pool_open_special_dir(dp, ORIGIN_DIR_NAME, &dd);
		if (err)
			goto out;
		err = dsl_dataset_hold_obj(dp,
		    dsl_dir_phys(dd)->dd_head_dataset_obj, FTAG, &ds);
		if (err == 0) {
			err = dsl_dataset_hold_obj(dp,
			    dsl_dataset_phys(ds)->ds_prev_snap_obj, dp,
			    &dp->dp_origin_snap);
			dsl_dataset_rele(ds, FTAG);
		}
		dsl_dir_rele(dd, dp);
		if (err)
			goto out;
	}

	if (spa_version(dp->dp_spa) >= SPA_VERSION_DEADLISTS) {
		err = dsl_pool_open_special_dir(dp, FREE_DIR_NAME,
		    &dp->dp_free_dir);
		if (err)
			goto out;

		err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
		    DMU_POOL_FREE_BPOBJ, sizeof (uint64_t), 1, &obj);
		if (err)
			goto out;
		VERIFY0(bpobj_open(&dp->dp_free_bpobj,
		    dp->dp_meta_objset, obj));
	}

	/*
	 * Note: errors ignored, because the leak dir will not exist if we
	 * have not encountered a leak yet.
	 */
	(void) dsl_pool_open_special_dir(dp, LEAK_DIR_NAME,
	    &dp->dp_leak_dir);

	if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_ASYNC_DESTROY)) {
		err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
		    DMU_POOL_BPTREE_OBJ, sizeof (uint64_t), 1,
		    &dp->dp_bptree_obj);
		if (err != 0)
			goto out;
	}

	if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_EMPTY_BPOBJ)) {
		err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
		    DMU_POOL_EMPTY_BPOBJ, sizeof (uint64_t), 1,
		    &dp->dp_empty_bpobj);
		if (err != 0)
			goto out;
	}

	err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
	    DMU_POOL_TMP_USERREFS, sizeof (uint64_t), 1,
	    &dp->dp_tmp_userrefs_obj);
	if (err == ENOENT)
		err = 0;
	if (err)
		goto out;

	err = dsl_scan_init(dp, dp->dp_tx.tx_open_txg);

out:
	rrw_exit(&dp->dp_config_rwlock, FTAG);
	return (err);
}

void
dsl_pool_close(dsl_pool_t *dp)
{
	/*
	 * Drop our references from dsl_pool_open().
	 *
	 * Since we held the origin_snap from "syncing" context (which
	 * includes pool-opening context), it actually only got a "ref"
	 * and not a hold, so just drop that here.
	 */
	if (dp->dp_origin_snap)
		dsl_dataset_rele(dp->dp_origin_snap, dp);
	if (dp->dp_mos_dir)
		dsl_dir_rele(dp->dp_mos_dir, dp);
	if (dp->dp_free_dir)
		dsl_dir_rele(dp->dp_free_dir, dp);
	if (dp->dp_leak_dir)
		dsl_dir_rele(dp->dp_leak_dir, dp);
	if (dp->dp_root_dir)
		dsl_dir_rele(dp->dp_root_dir, dp);

	bpobj_close(&dp->dp_free_bpobj);

	/* undo the dmu_objset_open_impl(mos) from dsl_pool_open() */
	if (dp->dp_meta_objset)
		dmu_objset_evict(dp->dp_meta_objset);

	txg_list_destroy(&dp->dp_dirty_datasets);
	txg_list_destroy(&dp->dp_dirty_zilogs);
	txg_list_destroy(&dp->dp_sync_tasks);
	txg_list_destroy(&dp->dp_dirty_dirs);

	/*
	 * We can't set retry to TRUE since we're explicitly specifying
	 * a spa to flush.  This is good enough; any missed buffers for
	 * this spa won't cause trouble, and they'll eventually fall
	 * out of the ARC just like any other unused buffer.
	 */
	arc_flush(dp->dp_spa, FALSE);

	txg_fini(dp);
	dsl_scan_fini(dp);
	dmu_buf_user_evict_wait();

	rrw_destroy(&dp->dp_config_rwlock);
	mutex_destroy(&dp->dp_lock);
	taskq_destroy(dp->dp_vnrele_taskq);
	if (dp->dp_blkstats)
		kmem_free(dp->dp_blkstats, sizeof (zfs_all_blkstats_t));
	kmem_free(dp, sizeof (dsl_pool_t));
}

dsl_pool_t *
dsl_pool_create(spa_t *spa, nvlist_t *zplprops, uint64_t txg)
{
	int err;
	dsl_pool_t *dp = dsl_pool_open_impl(spa, txg);
	dmu_tx_t *tx = dmu_tx_create_assigned(dp, txg);
	objset_t *os;
	dsl_dataset_t *ds;
	uint64_t obj;

	rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG);

	/* create and open the MOS (meta-objset) */
	dp->dp_meta_objset = dmu_objset_create_impl(spa,
	    NULL, &dp->dp_meta_rootbp, DMU_OST_META, tx);

	/* create the pool directory */
	err = zap_create_claim(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
	    DMU_OT_OBJECT_DIRECTORY, DMU_OT_NONE, 0, tx);
	ASSERT0(err);

	/* Initialize scan structures */
	VERIFY0(dsl_scan_init(dp, txg));

	/* create and open the root dir */
	dp->dp_root_dir_obj = dsl_dir_create_sync(dp, NULL, NULL, tx);
	VERIFY0(dsl_dir_hold_obj(dp, dp->dp_root_dir_obj,
	    NULL, dp, &dp->dp_root_dir));

	/* create and open the meta-objset dir */
	(void) dsl_dir_create_sync(dp, dp->dp_root_dir, MOS_DIR_NAME, tx);
	VERIFY0(dsl_pool_open_special_dir(dp,
	    MOS_DIR_NAME, &dp->dp_mos_dir));

	if (spa_version(spa) >= SPA_VERSION_DEADLISTS) {
		/* create and open the free dir */
		(void) dsl_dir_create_sync(dp, dp->dp_root_dir,
		    FREE_DIR_NAME, tx);
		VERIFY0(dsl_pool_open_special_dir(dp,
		    FREE_DIR_NAME, &dp->dp_free_dir));

		/* create and open the free_bplist */
		obj = bpobj_alloc(dp->dp_meta_objset, SPA_OLD_MAXBLOCKSIZE, tx);
		VERIFY(zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
		    DMU_POOL_FREE_BPOBJ, sizeof (uint64_t), 1, &obj, tx) == 0);
		VERIFY0(bpobj_open(&dp->dp_free_bpobj,
		    dp->dp_meta_objset, obj));
	}

	if (spa_version(spa) >= SPA_VERSION_DSL_SCRUB)
		dsl_pool_create_origin(dp, tx);

	/* create the root dataset */
	obj = dsl_dataset_create_sync_dd(dp->dp_root_dir, NULL, 0, tx);

	/* create the root objset */
	VERIFY0(dsl_dataset_hold_obj(dp, obj, FTAG, &ds));
	rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG);
	os = dmu_objset_create_impl(dp->dp_spa, ds,
	    dsl_dataset_get_blkptr(ds), DMU_OST_ZFS, tx);
	rrw_exit(&ds->ds_bp_rwlock, FTAG);
#ifdef _KERNEL
	zfs_create_fs(os, kcred, zplprops, tx);
#endif
	dsl_dataset_rele(ds, FTAG);

	dmu_tx_commit(tx);

	rrw_exit(&dp->dp_config_rwlock, FTAG);

	return (dp);
}

/*
 * Account for the meta-objset space in its placeholder dsl_dir.
 */
void
dsl_pool_mos_diduse_space(dsl_pool_t *dp,
    int64_t used, int64_t comp, int64_t uncomp)
{
	ASSERT3U(comp, ==, uncomp);	/* it's all metadata */
	mutex_enter(&dp->dp_lock);
	dp->dp_mos_used_delta += used;
	dp->dp_mos_compressed_delta += comp;
	dp->dp_mos_uncompressed_delta += uncomp;
	mutex_exit(&dp->dp_lock);
}

static void
dsl_pool_sync_mos(dsl_pool_t *dp, dmu_tx_t *tx)
{
	zio_t *zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
	dmu_objset_sync(dp->dp_meta_objset, zio, tx);
	VERIFY0(zio_wait(zio));
	dprintf_bp(&dp->dp_meta_rootbp, "meta objset rootbp is %s", "");
	spa_set_rootblkptr(dp->dp_spa, &dp->dp_meta_rootbp);
}

static void
dsl_pool_dirty_delta(dsl_pool_t *dp, int64_t delta)
{
	ASSERT(MUTEX_HELD(&dp->dp_lock));

	if (delta < 0)
		ASSERT3U(-delta, <=, dp->dp_dirty_total);

	dp->dp_dirty_total += delta;

	/*
	 * Note: we signal even when increasing dp_dirty_total.
	 * This ensures forward progress -- each thread wakes the next waiter.
	 */
	if (dp->dp_dirty_total <= zfs_dirty_data_max)
		cv_signal(&dp->dp_spaceavail_cv);
}

void
dsl_pool_sync(dsl_pool_t *dp, uint64_t txg)
{
	zio_t *zio;
	dmu_tx_t *tx;
	dsl_dir_t *dd;
	dsl_dataset_t *ds;
	objset_t *mos = dp->dp_meta_objset;
	list_t synced_datasets;

	list_create(&synced_datasets, sizeof (dsl_dataset_t),
	    offsetof(dsl_dataset_t, ds_synced_link));

	tx = dmu_tx_create_assigned(dp, txg);

	/*
	 * Write out all dirty blocks of dirty datasets.
	 */
	zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
	while ((ds = txg_list_remove(&dp->dp_dirty_datasets, txg)) != NULL) {
		/*
		 * We must not sync any non-MOS datasets twice, because
		 * we may have taken a snapshot of them.  However, we
		 * may sync newly-created datasets on pass 2.
		 */
		ASSERT(!list_link_active(&ds->ds_synced_link));
		list_insert_tail(&synced_datasets, ds);
		dsl_dataset_sync(ds, zio, tx);
	}
	VERIFY0(zio_wait(zio));

	/*
	 * We have written all of the accounted dirty data, so our
	 * dp_space_towrite should now be zero.  However, some seldom-used
	 * code paths do not adhere to this (e.g. dbuf_undirty(), also
	 * rounding error in dbuf_write_physdone).
	 * Shore up the accounting of any dirtied space now.
	 */
	dsl_pool_undirty_space(dp, dp->dp_dirty_pertxg[txg & TXG_MASK], txg);

	/*
	 * Update the long range free counter after
	 * we're done syncing user data
	 */
	mutex_enter(&dp->dp_lock);
	ASSERT(spa_sync_pass(dp->dp_spa) == 1 ||
	    dp->dp_long_free_dirty_pertxg[txg & TXG_MASK] == 0);
	dp->dp_long_free_dirty_pertxg[txg & TXG_MASK] = 0;
	mutex_exit(&dp->dp_lock);

	/*
	 * After the data blocks have been written (ensured by the zio_wait()
	 * above), update the user/group space accounting.
	 */
	for (ds = list_head(&synced_datasets); ds != NULL;
	    ds = list_next(&synced_datasets, ds)) {
		dmu_objset_do_userquota_updates(ds->ds_objset, tx);
	}

	/*
	 * Sync the datasets again to push out the changes due to
	 * userspace updates.  This must be done before we process the
	 * sync tasks, so that any snapshots will have the correct
	 * user accounting information (and we won't get confused
	 * about which blocks are part of the snapshot).
	 */
	zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
	while ((ds = txg_list_remove(&dp->dp_dirty_datasets, txg)) != NULL) {
		ASSERT(list_link_active(&ds->ds_synced_link));
		dmu_buf_rele(ds->ds_dbuf, ds);
		dsl_dataset_sync(ds, zio, tx);
	}
	VERIFY0(zio_wait(zio));

	/*
	 * Now that the datasets have been completely synced, we can
	 * clean up our in-memory structures accumulated while syncing:
	 *
	 *  - move dead blocks from the pending deadlist to the on-disk
	 *    deadlist
	 *  - release hold from dsl_dataset_dirty()
	 */
	while ((ds = list_remove_head(&synced_datasets)) != NULL) {
		dsl_dataset_sync_done(ds, tx);
	}
	while ((dd = txg_list_remove(&dp->dp_dirty_dirs, txg)) != NULL) {
		dsl_dir_sync(dd, tx);
	}

	/*
	 * The MOS's space is accounted for in the pool/$MOS
	 * (dp_mos_dir).  We can't modify the mos while we're syncing
	 * it, so we remember the deltas and apply them here.
	 */
	if (dp->dp_mos_used_delta != 0 || dp->dp_mos_compressed_delta != 0 ||
	    dp->dp_mos_uncompressed_delta != 0) {
		dsl_dir_diduse_space(dp->dp_mos_dir, DD_USED_HEAD,
		    dp->dp_mos_used_delta,
		    dp->dp_mos_compressed_delta,
		    dp->dp_mos_uncompressed_delta, tx);
		dp->dp_mos_used_delta = 0;
		dp->dp_mos_compressed_delta = 0;
		dp->dp_mos_uncompressed_delta = 0;
	}

	if (list_head(&mos->os_dirty_dnodes[txg & TXG_MASK]) != NULL ||
	    list_head(&mos->os_free_dnodes[txg & TXG_MASK]) != NULL) {
		dsl_pool_sync_mos(dp, tx);
	}

	/*
	 * If we modify a dataset in the same txg that we want to destroy it,
	 * its dsl_dir's dd_dbuf will be dirty, and thus have a hold on it.
	 * dsl_dir_destroy_check() will fail if there are unexpected holds.
	 * Therefore, we want to sync the MOS (thus syncing the dd_dbuf
	 * and clearing the hold on it) before we process the sync_tasks.
	 * The MOS data dirtied by the sync_tasks will be synced on the next
	 * pass.
	 */
	if (!txg_list_empty(&dp->dp_sync_tasks, txg)) {
		dsl_sync_task_t *dst;
		/*
		 * No more sync tasks should have been added while we
		 * were syncing.
		 */
		ASSERT3U(spa_sync_pass(dp->dp_spa), ==, 1);
		while ((dst = txg_list_remove(&dp->dp_sync_tasks, txg)) != NULL)
			dsl_sync_task_sync(dst, tx);
	}

	dmu_tx_commit(tx);

	DTRACE_PROBE2(dsl_pool_sync__done, dsl_pool_t *dp, dp, uint64_t, txg);
}

void
dsl_pool_sync_done(dsl_pool_t *dp, uint64_t txg)
{
	zilog_t *zilog;

	while (zilog = txg_list_head(&dp->dp_dirty_zilogs, txg)) {
		dsl_dataset_t *ds = dmu_objset_ds(zilog->zl_os);
		/*
		 * We don't remove the zilog from the dp_dirty_zilogs
		 * list until after we've cleaned it.  This ensures that
		 * callers of zilog_is_dirty() receive an accurate
		 * answer when they are racing with the spa sync thread.
		 */
		zil_clean(zilog, txg);
		(void) txg_list_remove_this(&dp->dp_dirty_zilogs, zilog, txg);
		ASSERT(!dmu_objset_is_dirty(zilog->zl_os, txg));
		dmu_buf_rele(ds->ds_dbuf, zilog);
	}
	ASSERT(!dmu_objset_is_dirty(dp->dp_meta_objset, txg));
}

/*
 * TRUE if the current thread is the tx_sync_thread or if we
 * are being called from SPA context during pool initialization.
 */
int
dsl_pool_sync_context(dsl_pool_t *dp)
{
	return (curthread == dp->dp_tx.tx_sync_thread ||
	    spa_is_initializing(dp->dp_spa));
}

uint64_t
dsl_pool_adjustedsize(dsl_pool_t *dp, boolean_t netfree)
{
	uint64_t space, resv;

	/*
	 * If we're trying to assess whether it's OK to do a free,
	 * cut the reservation in half to allow forward progress
	 * (e.g. make it possible to rm(1) files from a full pool).
	 */
	space = spa_get_dspace(dp->dp_spa);
	resv = spa_get_slop_space(dp->dp_spa);
	if (netfree)
		resv >>= 1;

	return (space - resv);
}
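
/*
 * Editor's worked example for dsl_pool_adjustedsize() above (not part of
 * the original source): suppose spa_get_dspace() reports 100 GB and
 * spa_get_slop_space() reserves roughly 1/32 of that, about 3.1 GB (the
 * exact slop figure depends on spa_get_slop_space() and is an assumption
 * here).  An ordinary allocation may then consume up to ~96.9 GB, while an
 * operation that will ultimately free space (netfree == B_TRUE) may go up
 * to ~98.4 GB, because only half of the reservation is enforced.
 */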

boolean_t
dsl_pool_need_dirty_delay(dsl_pool_t *dp)
{
	uint64_t delay_min_bytes =
	    zfs_dirty_data_max * zfs_delay_min_dirty_percent / 100;
	boolean_t rv;

	mutex_enter(&dp->dp_lock);
	if (dp->dp_dirty_total > zfs_dirty_data_sync)
		txg_kick(dp);
	rv = (dp->dp_dirty_total > delay_min_bytes);
	mutex_exit(&dp->dp_lock);
	return (rv);
}

void
dsl_pool_dirty_space(dsl_pool_t *dp, int64_t space, dmu_tx_t *tx)
{
	if (space > 0) {
		mutex_enter(&dp->dp_lock);
		dp->dp_dirty_pertxg[tx->tx_txg & TXG_MASK] += space;
		dsl_pool_dirty_delta(dp, space);
		mutex_exit(&dp->dp_lock);
	}
}

void
dsl_pool_undirty_space(dsl_pool_t *dp, int64_t space, uint64_t txg)
{
	ASSERT3S(space, >=, 0);
	if (space == 0)
		return;
	mutex_enter(&dp->dp_lock);
	if (dp->dp_dirty_pertxg[txg & TXG_MASK] < space) {
		/* XXX writing something we didn't dirty? */
		space = dp->dp_dirty_pertxg[txg & TXG_MASK];
	}
	ASSERT3U(dp->dp_dirty_pertxg[txg & TXG_MASK], >=, space);
	dp->dp_dirty_pertxg[txg & TXG_MASK] -= space;
	ASSERT3U(dp->dp_dirty_total, >=, space);
	dsl_pool_dirty_delta(dp, -space);
	mutex_exit(&dp->dp_lock);
}

/* ARGSUSED */
static int
upgrade_clones_cb(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg)
{
	dmu_tx_t *tx = arg;
	dsl_dataset_t *ds, *prev = NULL;
	int err;

	err = dsl_dataset_hold_obj(dp, hds->ds_object, FTAG, &ds);
	if (err)
		return (err);

	while (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0) {
		err = dsl_dataset_hold_obj(dp,
		    dsl_dataset_phys(ds)->ds_prev_snap_obj, FTAG, &prev);
		if (err) {
			dsl_dataset_rele(ds, FTAG);
			return (err);
		}

		if (dsl_dataset_phys(prev)->ds_next_snap_obj != ds->ds_object)
			break;
		dsl_dataset_rele(ds, FTAG);
		ds = prev;
		prev = NULL;
	}

	if (prev == NULL) {
		prev = dp->dp_origin_snap;

		/*
		 * The $ORIGIN can't have any data, or the accounting
		 * will be wrong.
		 */
		rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG);
		ASSERT0(dsl_dataset_phys(prev)->ds_bp.blk_birth);
		rrw_exit(&ds->ds_bp_rwlock, FTAG);

		/* The origin doesn't get attached to itself */
		if (ds->ds_object == prev->ds_object) {
			dsl_dataset_rele(ds, FTAG);
			return (0);
		}

		dmu_buf_will_dirty(ds->ds_dbuf, tx);
		dsl_dataset_phys(ds)->ds_prev_snap_obj = prev->ds_object;
		dsl_dataset_phys(ds)->ds_prev_snap_txg =
		    dsl_dataset_phys(prev)->ds_creation_txg;

		dmu_buf_will_dirty(ds->ds_dir->dd_dbuf, tx);
		dsl_dir_phys(ds->ds_dir)->dd_origin_obj = prev->ds_object;

		dmu_buf_will_dirty(prev->ds_dbuf, tx);
		dsl_dataset_phys(prev)->ds_num_children++;

		if (dsl_dataset_phys(ds)->ds_next_snap_obj == 0) {
			ASSERT(ds->ds_prev == NULL);
			VERIFY0(dsl_dataset_hold_obj(dp,
			    dsl_dataset_phys(ds)->ds_prev_snap_obj,
			    ds, &ds->ds_prev));
		}
	}

	ASSERT3U(dsl_dir_phys(ds->ds_dir)->dd_origin_obj, ==, prev->ds_object);
	ASSERT3U(dsl_dataset_phys(ds)->ds_prev_snap_obj, ==, prev->ds_object);

	if (dsl_dataset_phys(prev)->ds_next_clones_obj == 0) {
		dmu_buf_will_dirty(prev->ds_dbuf, tx);
		dsl_dataset_phys(prev)->ds_next_clones_obj =
		    zap_create(dp->dp_meta_objset,
		    DMU_OT_NEXT_CLONES, DMU_OT_NONE, 0, tx);
	}
	VERIFY0(zap_add_int(dp->dp_meta_objset,
	    dsl_dataset_phys(prev)->ds_next_clones_obj, ds->ds_object, tx));

	dsl_dataset_rele(ds, FTAG);
	if (prev != dp->dp_origin_snap)
		dsl_dataset_rele(prev, FTAG);
	return (0);
}

void
dsl_pool_upgrade_clones(dsl_pool_t *dp, dmu_tx_t *tx)
{
	ASSERT(dmu_tx_is_syncing(tx));
	ASSERT(dp->dp_origin_snap != NULL);

	VERIFY0(dmu_objset_find_dp(dp, dp->dp_root_dir_obj, upgrade_clones_cb,
	    tx, DS_FIND_CHILDREN | DS_FIND_SERIALIZE));
}

/* ARGSUSED */
static int
upgrade_dir_clones_cb(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg)
{
	dmu_tx_t *tx = arg;
	objset_t *mos = dp->dp_meta_objset;

	if (dsl_dir_phys(ds->ds_dir)->dd_origin_obj != 0) {
		dsl_dataset_t *origin;

		VERIFY0(dsl_dataset_hold_obj(dp,
		    dsl_dir_phys(ds->ds_dir)->dd_origin_obj, FTAG, &origin));

		if (dsl_dir_phys(origin->ds_dir)->dd_clones == 0) {
			dmu_buf_will_dirty(origin->ds_dir->dd_dbuf, tx);
			dsl_dir_phys(origin->ds_dir)->dd_clones =
			    zap_create(mos, DMU_OT_DSL_CLONES, DMU_OT_NONE,
			    0, tx);
		}

		VERIFY0(zap_add_int(dp->dp_meta_objset,
		    dsl_dir_phys(origin->ds_dir)->dd_clones,
		    ds->ds_object, tx));

		dsl_dataset_rele(origin, FTAG);
	}
	return (0);
}

void
dsl_pool_upgrade_dir_clones(dsl_pool_t *dp, dmu_tx_t *tx)
{
	uint64_t obj;

	ASSERT(dmu_tx_is_syncing(tx));

	(void) dsl_dir_create_sync(dp, dp->dp_root_dir, FREE_DIR_NAME, tx);
	VERIFY0(dsl_pool_open_special_dir(dp,
	    FREE_DIR_NAME, &dp->dp_free_dir));

	/*
	 * We can't use bpobj_alloc(), because spa_version() still
	 * returns the old version, and we need a new-version bpobj with
	 * subobj support.  So call dmu_object_alloc() directly.
	 */
	obj = dmu_object_alloc(dp->dp_meta_objset, DMU_OT_BPOBJ,
	    SPA_OLD_MAXBLOCKSIZE, DMU_OT_BPOBJ_HDR, sizeof (bpobj_phys_t), tx);
	VERIFY0(zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
	    DMU_POOL_FREE_BPOBJ, sizeof (uint64_t), 1, &obj, tx));
	VERIFY0(bpobj_open(&dp->dp_free_bpobj, dp->dp_meta_objset, obj));

	VERIFY0(dmu_objset_find_dp(dp, dp->dp_root_dir_obj,
	    upgrade_dir_clones_cb, tx, DS_FIND_CHILDREN | DS_FIND_SERIALIZE));
}

void
dsl_pool_create_origin(dsl_pool_t *dp, dmu_tx_t *tx)
{
	uint64_t dsobj;
	dsl_dataset_t *ds;

	ASSERT(dmu_tx_is_syncing(tx));
	ASSERT(dp->dp_origin_snap == NULL);
	ASSERT(rrw_held(&dp->dp_config_rwlock, RW_WRITER));

	/* create the origin dir, ds, & snap-ds */
	dsobj = dsl_dataset_create_sync(dp->dp_root_dir, ORIGIN_DIR_NAME,
	    NULL, 0, kcred, tx);
	VERIFY0(dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds));
	dsl_dataset_snapshot_sync_impl(ds, ORIGIN_DIR_NAME, tx);
	VERIFY0(dsl_dataset_hold_obj(dp, dsl_dataset_phys(ds)->ds_prev_snap_obj,
	    dp, &dp->dp_origin_snap));
	dsl_dataset_rele(ds, FTAG);
}

taskq_t *
dsl_pool_vnrele_taskq(dsl_pool_t *dp)
{
	return (dp->dp_vnrele_taskq);
}

/*
 * Walk through the pool-wide zap object of temporary snapshot user holds
 * and release them.
 */
void
dsl_pool_clean_tmp_userrefs(dsl_pool_t *dp)
{
	zap_attribute_t za;
	zap_cursor_t zc;
	objset_t *mos = dp->dp_meta_objset;
	uint64_t zapobj = dp->dp_tmp_userrefs_obj;
	nvlist_t *holds;

	if (zapobj == 0)
		return;
	ASSERT(spa_version(dp->dp_spa) >= SPA_VERSION_USERREFS);

	holds = fnvlist_alloc();

	for (zap_cursor_init(&zc, mos, zapobj);
	    zap_cursor_retrieve(&zc, &za) == 0;
	    zap_cursor_advance(&zc)) {
		char *htag;
		nvlist_t *tags;

		htag = strchr(za.za_name, '-');
		*htag = '\0';
		++htag;
		if (nvlist_lookup_nvlist(holds, za.za_name, &tags) != 0) {
			tags = fnvlist_alloc();
			fnvlist_add_boolean(tags, htag);
			fnvlist_add_nvlist(holds, za.za_name, tags);
			fnvlist_free(tags);
		} else {
			fnvlist_add_boolean(tags, htag);
		}
	}
	dsl_dataset_user_release_tmp(dp, holds);
	fnvlist_free(holds);
	zap_cursor_fini(&zc);
}

/*
 * Create the pool-wide zap object for storing temporary snapshot holds.
 */
void
dsl_pool_user_hold_create_obj(dsl_pool_t *dp, dmu_tx_t *tx)
{
	objset_t *mos = dp->dp_meta_objset;

	ASSERT(dp->dp_tmp_userrefs_obj == 0);
	ASSERT(dmu_tx_is_syncing(tx));

	dp->dp_tmp_userrefs_obj = zap_create_link(mos, DMU_OT_USERREFS,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_TMP_USERREFS, tx);
}
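
/*
 * Editor's note (illustrative, not part of the original source): each
 * temporary hold lives in the ZAP object above as an entry named
 * "<dataset object number in hex>-<tag>".  For example, a hold with the
 * hypothetical tag ".send-1234" on dataset object 0x5c would be stored
 * under the name "5c-.send-1234".  dsl_pool_user_hold_rele_impl() below
 * builds such names with kmem_asprintf("%llx-%s", ...), and
 * dsl_pool_clean_tmp_userrefs() above splits them at the first '-' to
 * recover the object number and the tag.
 */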

static int
dsl_pool_user_hold_rele_impl(dsl_pool_t *dp, uint64_t dsobj,
    const char *tag, uint64_t now, dmu_tx_t *tx, boolean_t holding)
{
	objset_t *mos = dp->dp_meta_objset;
	uint64_t zapobj = dp->dp_tmp_userrefs_obj;
	char *name;
	int error;

	ASSERT(spa_version(dp->dp_spa) >= SPA_VERSION_USERREFS);
	ASSERT(dmu_tx_is_syncing(tx));

	/*
	 * If the pool was created prior to SPA_VERSION_USERREFS, the
	 * zap object for temporary holds might not exist yet.
	 */
	if (zapobj == 0) {
		if (holding) {
			dsl_pool_user_hold_create_obj(dp, tx);
			zapobj = dp->dp_tmp_userrefs_obj;
		} else {
			return (SET_ERROR(ENOENT));
		}
	}

	name = kmem_asprintf("%llx-%s", (u_longlong_t)dsobj, tag);
	if (holding)
		error = zap_add(mos, zapobj, name, 8, 1, &now, tx);
	else
		error = zap_remove(mos, zapobj, name, tx);
	strfree(name);

	return (error);
}

/*
 * Add a temporary hold for the given dataset object and tag.
 */
int
dsl_pool_user_hold(dsl_pool_t *dp, uint64_t dsobj, const char *tag,
    uint64_t now, dmu_tx_t *tx)
{
	return (dsl_pool_user_hold_rele_impl(dp, dsobj, tag, now, tx, B_TRUE));
}

/*
 * Release a temporary hold for the given dataset object and tag.
 */
int
dsl_pool_user_release(dsl_pool_t *dp, uint64_t dsobj, const char *tag,
    dmu_tx_t *tx)
{
	return (dsl_pool_user_hold_rele_impl(dp, dsobj, tag, NULL,
	    tx, B_FALSE));
}

/*
 * DSL Pool Configuration Lock
 *
 * The dp_config_rwlock protects against changes to DSL state (e.g. dataset
 * creation / destruction / rename / property setting).  It must be held for
 * read to hold a dataset or dsl_dir.  I.e. you must call
 * dsl_pool_config_enter() or dsl_pool_hold() before calling
 * dsl_{dataset,dir}_hold{_obj}.  In most circumstances, the dp_config_rwlock
 * must be held continuously until all datasets and dsl_dirs are released.
 *
 * The only exception to this rule is that if a "long hold" is placed on
 * a dataset, then the dp_config_rwlock may be dropped while the dataset
 * is still held.  The long hold will prevent the dataset from being
 * destroyed -- the destroy will fail with EBUSY.  A long hold can be
 * obtained by calling dsl_dataset_long_hold(), or by "owning" a dataset
 * (by calling dsl_{dataset,objset}_{try}own{_obj}).
 *
 * Legitimate long-holders (including owners) should be long-running,
 * cancelable tasks that should cause "zfs destroy" to fail.  This includes
 * DMU consumers (i.e. a ZPL filesystem being mounted or ZVOL being open),
 * "zfs send", and "zfs diff".  There are several other long-holders whose
 * uses are suboptimal (e.g. "zfs promote", and zil_suspend()).
 *
 * The usual formula for long-holding would be:
 *	dsl_pool_hold()
 *	dsl_dataset_hold()
 *	... perform checks ...
 *	dsl_dataset_long_hold()
 *	dsl_pool_rele()
 *	... perform long-running task ...
 *	dsl_dataset_long_rele()
 *	dsl_dataset_rele()
 *
 * Note that when the long hold is released, the dataset is still held but
 * the pool is not held.  The dataset may change arbitrarily during this time
 * (e.g. it could be destroyed).  Therefore you shouldn't do anything to the
 * dataset except release it.
 *
 * User-initiated operations (e.g. ioctls, zfs_ioc_*()) are either read-only
 * or modifying operations.
 *
 * Modifying operations should generally use dsl_sync_task().  The synctask
 * infrastructure enforces proper locking strategy with respect to the
 * dp_config_rwlock.  See the comment above dsl_sync_task() for details.
 *
 * Read-only operations will manually hold the pool, then the dataset, obtain
 * information from the dataset, then release the pool and dataset.
 * dmu_objset_{hold,rele}() are convenience routines that also do the pool
 * hold/rele.
 */
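
/*
 * Editor's illustrative sketch (not part of the original source): the
 * long-hold formula described above, spelled out with the real interfaces.
 * The function name, dataset name, and the work performed are hypothetical,
 * error handling is reduced to the minimum needed to show the pattern, and
 * the block is not compiled.
 */
#if 0
static int
example_long_running_task(const char *dsname)
{
	dsl_pool_t *dp;
	dsl_dataset_t *ds;
	int error;

	/* hold the pool (config lock) and then the dataset */
	error = dsl_pool_hold(dsname, FTAG, &dp);
	if (error != 0)
		return (error);
	error = dsl_dataset_hold(dp, dsname, FTAG, &ds);
	if (error != 0) {
		dsl_pool_rele(dp, FTAG);
		return (error);
	}

	/* ... perform checks while the config lock is still held ... */

	/*
	 * Take the long hold, then drop the pool; "zfs destroy" of this
	 * dataset will now fail with EBUSY until the long hold is released.
	 */
	dsl_dataset_long_hold(ds, FTAG);
	dsl_pool_rele(dp, FTAG);

	/* ... perform the long-running task without the config lock ... */

	dsl_dataset_long_rele(ds, FTAG);
	dsl_dataset_rele(ds, FTAG);
	return (0);
}
#endif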

int
dsl_pool_hold(const char *name, void *tag, dsl_pool_t **dp)
{
	spa_t *spa;
	int error;

	error = spa_open(name, &spa, tag);
	if (error == 0) {
		*dp = spa_get_dsl(spa);
		dsl_pool_config_enter(*dp, tag);
	}
	return (error);
}

void
dsl_pool_rele(dsl_pool_t *dp, void *tag)
{
	dsl_pool_config_exit(dp, tag);
	spa_close(dp->dp_spa, tag);
}

void
dsl_pool_config_enter(dsl_pool_t *dp, void *tag)
{
	/*
	 * We use a "reentrant" reader-writer lock, but not reentrantly.
	 *
	 * The rrwlock can (with the track_all flag) track all reading threads,
	 * which is very useful for debugging which code path failed to release
	 * the lock, and for verifying that the *current* thread does hold
	 * the lock.
	 *
	 * (Unlike a rwlock, which knows that N threads hold it for
	 * read, but not *which* threads, so rw_held(RW_READER) returns TRUE
	 * if any thread holds it for read, even if this thread doesn't).
	 */
	ASSERT(!rrw_held(&dp->dp_config_rwlock, RW_READER));
	rrw_enter(&dp->dp_config_rwlock, RW_READER, tag);
}

void
dsl_pool_config_enter_prio(dsl_pool_t *dp, void *tag)
{
	ASSERT(!rrw_held(&dp->dp_config_rwlock, RW_READER));
	rrw_enter_read_prio(&dp->dp_config_rwlock, tag);
}

void
dsl_pool_config_exit(dsl_pool_t *dp, void *tag)
{
	rrw_exit(&dp->dp_config_rwlock, tag);
}

boolean_t
dsl_pool_config_held(dsl_pool_t *dp)
{
	return (RRW_LOCK_HELD(&dp->dp_config_rwlock));
}

boolean_t
dsl_pool_config_held_writer(dsl_pool_t *dp)
{
	return (RRW_WRITE_HELD(&dp->dp_config_rwlock));
}