1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 24 * Copyright (c) 2011, 2018 by Delphix. All rights reserved. 25 */ 26 27 #include <sys/zfs_context.h> 28 #include <sys/spa_impl.h> 29 #include <sys/dmu.h> 30 #include <sys/dmu_tx.h> 31 #include <sys/zap.h> 32 #include <sys/vdev_impl.h> 33 #include <sys/metaslab.h> 34 #include <sys/metaslab_impl.h> 35 #include <sys/uberblock_impl.h> 36 #include <sys/txg.h> 37 #include <sys/avl.h> 38 #include <sys/bpobj.h> 39 #include <sys/dsl_pool.h> 40 #include <sys/dsl_synctask.h> 41 #include <sys/dsl_dir.h> 42 #include <sys/arc.h> 43 #include <sys/zfeature.h> 44 #include <sys/vdev_indirect_births.h> 45 #include <sys/vdev_indirect_mapping.h> 46 #include <sys/abd.h> 47 #include <sys/vdev_initialize.h> 48 49 /* 50 * This file contains the necessary logic to remove vdevs from a 51 * storage pool. Currently, the only devices that can be removed 52 * are log, cache, and spare devices; and top level vdevs from a pool 53 * w/o raidz. (Note that members of a mirror can also be removed 54 * by the detach operation.) 55 * 56 * Log vdevs are removed by evacuating them and then turning the vdev 57 * into a hole vdev while holding spa config locks. 58 * 59 * Top level vdevs are removed and converted into an indirect vdev via 60 * a multi-step process: 61 * 62 * - Disable allocations from this device (spa_vdev_remove_top). 63 * 64 * - From a new thread (spa_vdev_remove_thread), copy data from 65 * the removing vdev to a different vdev. The copy happens in open 66 * context (spa_vdev_copy_impl) and issues a sync task 67 * (vdev_mapping_sync) so the sync thread can update the partial 68 * indirect mappings in core and on disk. 69 * 70 * - If a free happens during a removal, it is freed from the 71 * removing vdev, and if it has already been copied, from the new 72 * location as well (free_from_removing_vdev). 73 * 74 * - After the removal is completed, the copy thread converts the vdev 75 * into an indirect vdev (vdev_remove_complete) before instructing 76 * the sync thread to destroy the space maps and finish the removal 77 * (spa_finish_removal). 78 */ 79 80 typedef struct vdev_copy_arg { 81 metaslab_t *vca_msp; 82 uint64_t vca_outstanding_bytes; 83 kcondvar_t vca_cv; 84 kmutex_t vca_lock; 85 } vdev_copy_arg_t; 86 87 /* 88 * The maximum amount of memory we can use for outstanding i/o while 89 * doing a device removal. This determines how much i/o we can have 90 * in flight concurrently. 91 */ 92 int zfs_remove_max_copy_bytes = 64 * 1024 * 1024; 93 94 /* 95 * The largest contiguous segment that we will attempt to allocate when 96 * removing a device. This can be no larger than SPA_MAXBLOCKSIZE. If 97 * there is a performance problem with attempting to allocate large blocks, 98 * consider decreasing this. 99 * 100 * Note: we will issue I/Os of up to this size. The mpt driver does not 101 * respond well to I/Os larger than 1MB, so we set this to 1MB. (When 102 * mpt processes an I/O larger than 1MB, it needs to do an allocation of 103 * 2 physically contiguous pages; if this allocation fails, mpt will drop 104 * the I/O and hang the device.) 105 */ 106 int zfs_remove_max_segment = 1024 * 1024; 107 108 /* 109 * This is used by the test suite so that it can ensure that certain 110 * actions happen while in the middle of a removal. 111 */ 112 uint64_t zfs_remove_max_bytes_pause = UINT64_MAX; 113 114 #define VDEV_REMOVAL_ZAP_OBJS "lzap" 115 116 static void spa_vdev_remove_thread(void *arg); 117 118 static void 119 spa_sync_removing_state(spa_t *spa, dmu_tx_t *tx) 120 { 121 VERIFY0(zap_update(spa->spa_dsl_pool->dp_meta_objset, 122 DMU_POOL_DIRECTORY_OBJECT, 123 DMU_POOL_REMOVING, sizeof (uint64_t), 124 sizeof (spa->spa_removing_phys) / sizeof (uint64_t), 125 &spa->spa_removing_phys, tx)); 126 } 127 128 static nvlist_t * 129 spa_nvlist_lookup_by_guid(nvlist_t **nvpp, int count, uint64_t target_guid) 130 { 131 for (int i = 0; i < count; i++) { 132 uint64_t guid = 133 fnvlist_lookup_uint64(nvpp[i], ZPOOL_CONFIG_GUID); 134 135 if (guid == target_guid) 136 return (nvpp[i]); 137 } 138 139 return (NULL); 140 } 141 142 static void 143 spa_vdev_remove_aux(nvlist_t *config, char *name, nvlist_t **dev, int count, 144 nvlist_t *dev_to_remove) 145 { 146 nvlist_t **newdev = NULL; 147 148 if (count > 1) 149 newdev = kmem_alloc((count - 1) * sizeof (void *), KM_SLEEP); 150 151 for (int i = 0, j = 0; i < count; i++) { 152 if (dev[i] == dev_to_remove) 153 continue; 154 VERIFY(nvlist_dup(dev[i], &newdev[j++], KM_SLEEP) == 0); 155 } 156 157 VERIFY(nvlist_remove(config, name, DATA_TYPE_NVLIST_ARRAY) == 0); 158 VERIFY(nvlist_add_nvlist_array(config, name, newdev, count - 1) == 0); 159 160 for (int i = 0; i < count - 1; i++) 161 nvlist_free(newdev[i]); 162 163 if (count > 1) 164 kmem_free(newdev, (count - 1) * sizeof (void *)); 165 } 166 167 static spa_vdev_removal_t * 168 spa_vdev_removal_create(vdev_t *vd) 169 { 170 spa_vdev_removal_t *svr = kmem_zalloc(sizeof (*svr), KM_SLEEP); 171 mutex_init(&svr->svr_lock, NULL, MUTEX_DEFAULT, NULL); 172 cv_init(&svr->svr_cv, NULL, CV_DEFAULT, NULL); 173 svr->svr_allocd_segs = range_tree_create(NULL, NULL); 174 svr->svr_vdev_id = vd->vdev_id; 175 176 for (int i = 0; i < TXG_SIZE; i++) { 177 svr->svr_frees[i] = range_tree_create(NULL, NULL); 178 list_create(&svr->svr_new_segments[i], 179 sizeof (vdev_indirect_mapping_entry_t), 180 offsetof(vdev_indirect_mapping_entry_t, vime_node)); 181 } 182 183 return (svr); 184 } 185 186 void 187 spa_vdev_removal_destroy(spa_vdev_removal_t *svr) 188 { 189 for (int i = 0; i < TXG_SIZE; i++) { 190 ASSERT0(svr->svr_bytes_done[i]); 191 ASSERT0(svr->svr_max_offset_to_sync[i]); 192 range_tree_destroy(svr->svr_frees[i]); 193 list_destroy(&svr->svr_new_segments[i]); 194 } 195 196 range_tree_destroy(svr->svr_allocd_segs); 197 mutex_destroy(&svr->svr_lock); 198 cv_destroy(&svr->svr_cv); 199 kmem_free(svr, sizeof (*svr)); 200 } 201 202 /* 203 * This is called as a synctask in the txg in which we will mark this vdev 204 * as removing (in the config stored in the MOS). 205 * 206 * It begins the evacuation of a toplevel vdev by: 207 * - initializing the spa_removing_phys which tracks this removal 208 * - computing the amount of space to remove for accounting purposes 209 * - dirtying all dbufs in the spa_config_object 210 * - creating the spa_vdev_removal 211 * - starting the spa_vdev_remove_thread 212 */ 213 static void 214 vdev_remove_initiate_sync(void *arg, dmu_tx_t *tx) 215 { 216 int vdev_id = (uintptr_t)arg; 217 spa_t *spa = dmu_tx_pool(tx)->dp_spa; 218 vdev_t *vd = vdev_lookup_top(spa, vdev_id); 219 vdev_indirect_config_t *vic = &vd->vdev_indirect_config; 220 objset_t *mos = spa->spa_dsl_pool->dp_meta_objset; 221 spa_vdev_removal_t *svr = NULL; 222 uint64_t txg = dmu_tx_get_txg(tx); 223 224 ASSERT3P(vd->vdev_ops, !=, &vdev_raidz_ops); 225 svr = spa_vdev_removal_create(vd); 226 227 ASSERT(vd->vdev_removing); 228 ASSERT3P(vd->vdev_indirect_mapping, ==, NULL); 229 230 spa_feature_incr(spa, SPA_FEATURE_DEVICE_REMOVAL, tx); 231 if (spa_feature_is_enabled(spa, SPA_FEATURE_OBSOLETE_COUNTS)) { 232 /* 233 * By activating the OBSOLETE_COUNTS feature, we prevent 234 * the pool from being downgraded and ensure that the 235 * refcounts are precise. 236 */ 237 spa_feature_incr(spa, SPA_FEATURE_OBSOLETE_COUNTS, tx); 238 uint64_t one = 1; 239 VERIFY0(zap_add(spa->spa_meta_objset, vd->vdev_top_zap, 240 VDEV_TOP_ZAP_OBSOLETE_COUNTS_ARE_PRECISE, sizeof (one), 1, 241 &one, tx)); 242 ASSERT3U(vdev_obsolete_counts_are_precise(vd), !=, 0); 243 } 244 245 vic->vic_mapping_object = vdev_indirect_mapping_alloc(mos, tx); 246 vd->vdev_indirect_mapping = 247 vdev_indirect_mapping_open(mos, vic->vic_mapping_object); 248 vic->vic_births_object = vdev_indirect_births_alloc(mos, tx); 249 vd->vdev_indirect_births = 250 vdev_indirect_births_open(mos, vic->vic_births_object); 251 spa->spa_removing_phys.sr_removing_vdev = vd->vdev_id; 252 spa->spa_removing_phys.sr_start_time = gethrestime_sec(); 253 spa->spa_removing_phys.sr_end_time = 0; 254 spa->spa_removing_phys.sr_state = DSS_SCANNING; 255 spa->spa_removing_phys.sr_to_copy = 0; 256 spa->spa_removing_phys.sr_copied = 0; 257 258 /* 259 * Note: We can't use vdev_stat's vs_alloc for sr_to_copy, because 260 * there may be space in the defer tree, which is free, but still 261 * counted in vs_alloc. 262 */ 263 for (uint64_t i = 0; i < vd->vdev_ms_count; i++) { 264 metaslab_t *ms = vd->vdev_ms[i]; 265 if (ms->ms_sm == NULL) 266 continue; 267 268 /* 269 * Sync tasks happen before metaslab_sync(), therefore 270 * smp_alloc and sm_alloc must be the same. 271 */ 272 ASSERT3U(space_map_allocated(ms->ms_sm), ==, 273 ms->ms_sm->sm_phys->smp_alloc); 274 275 spa->spa_removing_phys.sr_to_copy += 276 space_map_allocated(ms->ms_sm); 277 278 /* 279 * Space which we are freeing this txg does not need to 280 * be copied. 281 */ 282 spa->spa_removing_phys.sr_to_copy -= 283 range_tree_space(ms->ms_freeing); 284 285 ASSERT0(range_tree_space(ms->ms_freed)); 286 for (int t = 0; t < TXG_SIZE; t++) 287 ASSERT0(range_tree_space(ms->ms_allocating[t])); 288 } 289 290 /* 291 * Sync tasks are called before metaslab_sync(), so there should 292 * be no already-synced metaslabs in the TXG_CLEAN list. 293 */ 294 ASSERT3P(txg_list_head(&vd->vdev_ms_list, TXG_CLEAN(txg)), ==, NULL); 295 296 spa_sync_removing_state(spa, tx); 297 298 /* 299 * All blocks that we need to read the most recent mapping must be 300 * stored on concrete vdevs. Therefore, we must dirty anything that 301 * is read before spa_remove_init(). Specifically, the 302 * spa_config_object. (Note that although we already modified the 303 * spa_config_object in spa_sync_removing_state, that may not have 304 * modified all blocks of the object.) 305 */ 306 dmu_object_info_t doi; 307 VERIFY0(dmu_object_info(mos, DMU_POOL_DIRECTORY_OBJECT, &doi)); 308 for (uint64_t offset = 0; offset < doi.doi_max_offset; ) { 309 dmu_buf_t *dbuf; 310 VERIFY0(dmu_buf_hold(mos, DMU_POOL_DIRECTORY_OBJECT, 311 offset, FTAG, &dbuf, 0)); 312 dmu_buf_will_dirty(dbuf, tx); 313 offset += dbuf->db_size; 314 dmu_buf_rele(dbuf, FTAG); 315 } 316 317 /* 318 * Now that we've allocated the im_object, dirty the vdev to ensure 319 * that the object gets written to the config on disk. 320 */ 321 vdev_config_dirty(vd); 322 323 zfs_dbgmsg("starting removal thread for vdev %llu (%p) in txg %llu " 324 "im_obj=%llu", vd->vdev_id, vd, dmu_tx_get_txg(tx), 325 vic->vic_mapping_object); 326 327 spa_history_log_internal(spa, "vdev remove started", tx, 328 "%s vdev %llu %s", spa_name(spa), vd->vdev_id, 329 (vd->vdev_path != NULL) ? vd->vdev_path : "-"); 330 /* 331 * Setting spa_vdev_removal causes subsequent frees to call 332 * free_from_removing_vdev(). Note that we don't need any locking 333 * because we are the sync thread, and metaslab_free_impl() is only 334 * called from syncing context (potentially from a zio taskq thread, 335 * but in any case only when there are outstanding free i/os, which 336 * there are not). 337 */ 338 ASSERT3P(spa->spa_vdev_removal, ==, NULL); 339 spa->spa_vdev_removal = svr; 340 svr->svr_thread = thread_create(NULL, 0, 341 spa_vdev_remove_thread, spa, 0, &p0, TS_RUN, minclsyspri); 342 } 343 344 /* 345 * When we are opening a pool, we must read the mapping for each 346 * indirect vdev in order from most recently removed to least 347 * recently removed. We do this because the blocks for the mapping 348 * of older indirect vdevs may be stored on more recently removed vdevs. 349 * In order to read each indirect mapping object, we must have 350 * initialized all more recently removed vdevs. 351 */ 352 int 353 spa_remove_init(spa_t *spa) 354 { 355 int error; 356 357 error = zap_lookup(spa->spa_dsl_pool->dp_meta_objset, 358 DMU_POOL_DIRECTORY_OBJECT, 359 DMU_POOL_REMOVING, sizeof (uint64_t), 360 sizeof (spa->spa_removing_phys) / sizeof (uint64_t), 361 &spa->spa_removing_phys); 362 363 if (error == ENOENT) { 364 spa->spa_removing_phys.sr_state = DSS_NONE; 365 spa->spa_removing_phys.sr_removing_vdev = -1; 366 spa->spa_removing_phys.sr_prev_indirect_vdev = -1; 367 spa->spa_indirect_vdevs_loaded = B_TRUE; 368 return (0); 369 } else if (error != 0) { 370 return (error); 371 } 372 373 if (spa->spa_removing_phys.sr_state == DSS_SCANNING) { 374 /* 375 * We are currently removing a vdev. Create and 376 * initialize a spa_vdev_removal_t from the bonus 377 * buffer of the removing vdevs vdev_im_object, and 378 * initialize its partial mapping. 379 */ 380 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 381 vdev_t *vd = vdev_lookup_top(spa, 382 spa->spa_removing_phys.sr_removing_vdev); 383 384 if (vd == NULL) { 385 spa_config_exit(spa, SCL_STATE, FTAG); 386 return (EINVAL); 387 } 388 389 vdev_indirect_config_t *vic = &vd->vdev_indirect_config; 390 391 ASSERT(vdev_is_concrete(vd)); 392 spa_vdev_removal_t *svr = spa_vdev_removal_create(vd); 393 ASSERT3U(svr->svr_vdev_id, ==, vd->vdev_id); 394 ASSERT(vd->vdev_removing); 395 396 vd->vdev_indirect_mapping = vdev_indirect_mapping_open( 397 spa->spa_meta_objset, vic->vic_mapping_object); 398 vd->vdev_indirect_births = vdev_indirect_births_open( 399 spa->spa_meta_objset, vic->vic_births_object); 400 spa_config_exit(spa, SCL_STATE, FTAG); 401 402 spa->spa_vdev_removal = svr; 403 } 404 405 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 406 uint64_t indirect_vdev_id = 407 spa->spa_removing_phys.sr_prev_indirect_vdev; 408 while (indirect_vdev_id != UINT64_MAX) { 409 vdev_t *vd = vdev_lookup_top(spa, indirect_vdev_id); 410 vdev_indirect_config_t *vic = &vd->vdev_indirect_config; 411 412 ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops); 413 vd->vdev_indirect_mapping = vdev_indirect_mapping_open( 414 spa->spa_meta_objset, vic->vic_mapping_object); 415 vd->vdev_indirect_births = vdev_indirect_births_open( 416 spa->spa_meta_objset, vic->vic_births_object); 417 418 indirect_vdev_id = vic->vic_prev_indirect_vdev; 419 } 420 spa_config_exit(spa, SCL_STATE, FTAG); 421 422 /* 423 * Now that we've loaded all the indirect mappings, we can allow 424 * reads from other blocks (e.g. via predictive prefetch). 425 */ 426 spa->spa_indirect_vdevs_loaded = B_TRUE; 427 return (0); 428 } 429 430 void 431 spa_restart_removal(spa_t *spa) 432 { 433 spa_vdev_removal_t *svr = spa->spa_vdev_removal; 434 435 if (svr == NULL) 436 return; 437 438 /* 439 * In general when this function is called there is no 440 * removal thread running. The only scenario where this 441 * is not true is during spa_import() where this function 442 * is called twice [once from spa_import_impl() and 443 * spa_async_resume()]. Thus, in the scenario where we 444 * import a pool that has an ongoing removal we don't 445 * want to spawn a second thread. 446 */ 447 if (svr->svr_thread != NULL) 448 return; 449 450 if (!spa_writeable(spa)) 451 return; 452 453 zfs_dbgmsg("restarting removal of %llu", svr->svr_vdev_id); 454 svr->svr_thread = thread_create(NULL, 0, spa_vdev_remove_thread, spa, 455 0, &p0, TS_RUN, minclsyspri); 456 } 457 458 /* 459 * Process freeing from a device which is in the middle of being removed. 460 * We must handle this carefully so that we attempt to copy freed data, 461 * and we correctly free already-copied data. 462 */ 463 void 464 free_from_removing_vdev(vdev_t *vd, uint64_t offset, uint64_t size) 465 { 466 spa_t *spa = vd->vdev_spa; 467 spa_vdev_removal_t *svr = spa->spa_vdev_removal; 468 vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; 469 uint64_t txg = spa_syncing_txg(spa); 470 uint64_t max_offset_yet = 0; 471 472 ASSERT(vd->vdev_indirect_config.vic_mapping_object != 0); 473 ASSERT3U(vd->vdev_indirect_config.vic_mapping_object, ==, 474 vdev_indirect_mapping_object(vim)); 475 ASSERT3U(vd->vdev_id, ==, svr->svr_vdev_id); 476 477 mutex_enter(&svr->svr_lock); 478 479 /* 480 * Remove the segment from the removing vdev's spacemap. This 481 * ensures that we will not attempt to copy this space (if the 482 * removal thread has not yet visited it), and also ensures 483 * that we know what is actually allocated on the new vdevs 484 * (needed if we cancel the removal). 485 * 486 * Note: we must do the metaslab_free_concrete() with the svr_lock 487 * held, so that the remove_thread can not load this metaslab and then 488 * visit this offset between the time that we metaslab_free_concrete() 489 * and when we check to see if it has been visited. 490 * 491 * Note: The checkpoint flag is set to false as having/taking 492 * a checkpoint and removing a device can't happen at the same 493 * time. 494 */ 495 ASSERT(!spa_has_checkpoint(spa)); 496 metaslab_free_concrete(vd, offset, size, B_FALSE); 497 498 uint64_t synced_size = 0; 499 uint64_t synced_offset = 0; 500 uint64_t max_offset_synced = vdev_indirect_mapping_max_offset(vim); 501 if (offset < max_offset_synced) { 502 /* 503 * The mapping for this offset is already on disk. 504 * Free from the new location. 505 * 506 * Note that we use svr_max_synced_offset because it is 507 * updated atomically with respect to the in-core mapping. 508 * By contrast, vim_max_offset is not. 509 * 510 * This block may be split between a synced entry and an 511 * in-flight or unvisited entry. Only process the synced 512 * portion of it here. 513 */ 514 synced_size = MIN(size, max_offset_synced - offset); 515 synced_offset = offset; 516 517 ASSERT3U(max_offset_yet, <=, max_offset_synced); 518 max_offset_yet = max_offset_synced; 519 520 DTRACE_PROBE3(remove__free__synced, 521 spa_t *, spa, 522 uint64_t, offset, 523 uint64_t, synced_size); 524 525 size -= synced_size; 526 offset += synced_size; 527 } 528 529 /* 530 * Look at all in-flight txgs starting from the currently syncing one 531 * and see if a section of this free is being copied. By starting from 532 * this txg and iterating forward, we might find that this region 533 * was copied in two different txgs and handle it appropriately. 534 */ 535 for (int i = 0; i < TXG_CONCURRENT_STATES; i++) { 536 int txgoff = (txg + i) & TXG_MASK; 537 if (size > 0 && offset < svr->svr_max_offset_to_sync[txgoff]) { 538 /* 539 * The mapping for this offset is in flight, and 540 * will be synced in txg+i. 541 */ 542 uint64_t inflight_size = MIN(size, 543 svr->svr_max_offset_to_sync[txgoff] - offset); 544 545 DTRACE_PROBE4(remove__free__inflight, 546 spa_t *, spa, 547 uint64_t, offset, 548 uint64_t, inflight_size, 549 uint64_t, txg + i); 550 551 /* 552 * We copy data in order of increasing offset. 553 * Therefore the max_offset_to_sync[] must increase 554 * (or be zero, indicating that nothing is being 555 * copied in that txg). 556 */ 557 if (svr->svr_max_offset_to_sync[txgoff] != 0) { 558 ASSERT3U(svr->svr_max_offset_to_sync[txgoff], 559 >=, max_offset_yet); 560 max_offset_yet = 561 svr->svr_max_offset_to_sync[txgoff]; 562 } 563 564 /* 565 * We've already committed to copying this segment: 566 * we have allocated space elsewhere in the pool for 567 * it and have an IO outstanding to copy the data. We 568 * cannot free the space before the copy has 569 * completed, or else the copy IO might overwrite any 570 * new data. To free that space, we record the 571 * segment in the appropriate svr_frees tree and free 572 * the mapped space later, in the txg where we have 573 * completed the copy and synced the mapping (see 574 * vdev_mapping_sync). 575 */ 576 range_tree_add(svr->svr_frees[txgoff], 577 offset, inflight_size); 578 size -= inflight_size; 579 offset += inflight_size; 580 581 /* 582 * This space is already accounted for as being 583 * done, because it is being copied in txg+i. 584 * However, if i!=0, then it is being copied in 585 * a future txg. If we crash after this txg 586 * syncs but before txg+i syncs, then the space 587 * will be free. Therefore we must account 588 * for the space being done in *this* txg 589 * (when it is freed) rather than the future txg 590 * (when it will be copied). 591 */ 592 ASSERT3U(svr->svr_bytes_done[txgoff], >=, 593 inflight_size); 594 svr->svr_bytes_done[txgoff] -= inflight_size; 595 svr->svr_bytes_done[txg & TXG_MASK] += inflight_size; 596 } 597 } 598 ASSERT0(svr->svr_max_offset_to_sync[TXG_CLEAN(txg) & TXG_MASK]); 599 600 if (size > 0) { 601 /* 602 * The copy thread has not yet visited this offset. Ensure 603 * that it doesn't. 604 */ 605 606 DTRACE_PROBE3(remove__free__unvisited, 607 spa_t *, spa, 608 uint64_t, offset, 609 uint64_t, size); 610 611 if (svr->svr_allocd_segs != NULL) 612 range_tree_clear(svr->svr_allocd_segs, offset, size); 613 614 /* 615 * Since we now do not need to copy this data, for 616 * accounting purposes we have done our job and can count 617 * it as completed. 618 */ 619 svr->svr_bytes_done[txg & TXG_MASK] += size; 620 } 621 mutex_exit(&svr->svr_lock); 622 623 /* 624 * Now that we have dropped svr_lock, process the synced portion 625 * of this free. 626 */ 627 if (synced_size > 0) { 628 vdev_indirect_mark_obsolete(vd, synced_offset, synced_size); 629 630 /* 631 * Note: this can only be called from syncing context, 632 * and the vdev_indirect_mapping is only changed from the 633 * sync thread, so we don't need svr_lock while doing 634 * metaslab_free_impl_cb. 635 */ 636 boolean_t checkpoint = B_FALSE; 637 vdev_indirect_ops.vdev_op_remap(vd, synced_offset, synced_size, 638 metaslab_free_impl_cb, &checkpoint); 639 } 640 } 641 642 /* 643 * Stop an active removal and update the spa_removing phys. 644 */ 645 static void 646 spa_finish_removal(spa_t *spa, dsl_scan_state_t state, dmu_tx_t *tx) 647 { 648 spa_vdev_removal_t *svr = spa->spa_vdev_removal; 649 ASSERT3U(dmu_tx_get_txg(tx), ==, spa_syncing_txg(spa)); 650 651 /* Ensure the removal thread has completed before we free the svr. */ 652 spa_vdev_remove_suspend(spa); 653 654 ASSERT(state == DSS_FINISHED || state == DSS_CANCELED); 655 656 if (state == DSS_FINISHED) { 657 spa_removing_phys_t *srp = &spa->spa_removing_phys; 658 vdev_t *vd = vdev_lookup_top(spa, svr->svr_vdev_id); 659 vdev_indirect_config_t *vic = &vd->vdev_indirect_config; 660 661 if (srp->sr_prev_indirect_vdev != UINT64_MAX) { 662 vdev_t *pvd = vdev_lookup_top(spa, 663 srp->sr_prev_indirect_vdev); 664 ASSERT3P(pvd->vdev_ops, ==, &vdev_indirect_ops); 665 } 666 667 vic->vic_prev_indirect_vdev = srp->sr_prev_indirect_vdev; 668 srp->sr_prev_indirect_vdev = vd->vdev_id; 669 } 670 spa->spa_removing_phys.sr_state = state; 671 spa->spa_removing_phys.sr_end_time = gethrestime_sec(); 672 673 spa->spa_vdev_removal = NULL; 674 spa_vdev_removal_destroy(svr); 675 676 spa_sync_removing_state(spa, tx); 677 678 vdev_config_dirty(spa->spa_root_vdev); 679 } 680 681 static void 682 free_mapped_segment_cb(void *arg, uint64_t offset, uint64_t size) 683 { 684 vdev_t *vd = arg; 685 vdev_indirect_mark_obsolete(vd, offset, size); 686 boolean_t checkpoint = B_FALSE; 687 vdev_indirect_ops.vdev_op_remap(vd, offset, size, 688 metaslab_free_impl_cb, &checkpoint); 689 } 690 691 /* 692 * On behalf of the removal thread, syncs an incremental bit more of 693 * the indirect mapping to disk and updates the in-memory mapping. 694 * Called as a sync task in every txg that the removal thread makes progress. 695 */ 696 static void 697 vdev_mapping_sync(void *arg, dmu_tx_t *tx) 698 { 699 spa_vdev_removal_t *svr = arg; 700 spa_t *spa = dmu_tx_pool(tx)->dp_spa; 701 vdev_t *vd = vdev_lookup_top(spa, svr->svr_vdev_id); 702 vdev_indirect_config_t *vic = &vd->vdev_indirect_config; 703 uint64_t txg = dmu_tx_get_txg(tx); 704 vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; 705 706 ASSERT(vic->vic_mapping_object != 0); 707 ASSERT3U(txg, ==, spa_syncing_txg(spa)); 708 709 vdev_indirect_mapping_add_entries(vim, 710 &svr->svr_new_segments[txg & TXG_MASK], tx); 711 vdev_indirect_births_add_entry(vd->vdev_indirect_births, 712 vdev_indirect_mapping_max_offset(vim), dmu_tx_get_txg(tx), tx); 713 714 /* 715 * Free the copied data for anything that was freed while the 716 * mapping entries were in flight. 717 */ 718 mutex_enter(&svr->svr_lock); 719 range_tree_vacate(svr->svr_frees[txg & TXG_MASK], 720 free_mapped_segment_cb, vd); 721 ASSERT3U(svr->svr_max_offset_to_sync[txg & TXG_MASK], >=, 722 vdev_indirect_mapping_max_offset(vim)); 723 svr->svr_max_offset_to_sync[txg & TXG_MASK] = 0; 724 mutex_exit(&svr->svr_lock); 725 726 spa_sync_removing_state(spa, tx); 727 } 728 729 /* 730 * All reads and writes associated with a call to spa_vdev_copy_segment() 731 * are done. 732 */ 733 static void 734 spa_vdev_copy_nullzio_done(zio_t *zio) 735 { 736 spa_config_exit(zio->io_spa, SCL_STATE, zio->io_spa); 737 } 738 739 /* 740 * The write of the new location is done. 741 */ 742 static void 743 spa_vdev_copy_segment_write_done(zio_t *zio) 744 { 745 vdev_copy_arg_t *vca = zio->io_private; 746 747 abd_free(zio->io_abd); 748 749 mutex_enter(&vca->vca_lock); 750 vca->vca_outstanding_bytes -= zio->io_size; 751 cv_signal(&vca->vca_cv); 752 mutex_exit(&vca->vca_lock); 753 } 754 755 /* 756 * The read of the old location is done. The parent zio is the write to 757 * the new location. Allow it to start. 758 */ 759 static void 760 spa_vdev_copy_segment_read_done(zio_t *zio) 761 { 762 zio_nowait(zio_unique_parent(zio)); 763 } 764 765 /* 766 * If the old and new vdevs are mirrors, we will read both sides of the old 767 * mirror, and write each copy to the corresponding side of the new mirror. 768 * If the old and new vdevs have a different number of children, we will do 769 * this as best as possible. Since we aren't verifying checksums, this 770 * ensures that as long as there's a good copy of the data, we'll have a 771 * good copy after the removal, even if there's silent damage to one side 772 * of the mirror. If we're removing a mirror that has some silent damage, 773 * we'll have exactly the same damage in the new location (assuming that 774 * the new location is also a mirror). 775 * 776 * We accomplish this by creating a tree of zio_t's, with as many writes as 777 * there are "children" of the new vdev (a non-redundant vdev counts as one 778 * child, a 2-way mirror has 2 children, etc). Each write has an associated 779 * read from a child of the old vdev. Typically there will be the same 780 * number of children of the old and new vdevs. However, if there are more 781 * children of the new vdev, some child(ren) of the old vdev will be issued 782 * multiple reads. If there are more children of the old vdev, some copies 783 * will be dropped. 784 * 785 * For example, the tree of zio_t's for a 2-way mirror is: 786 * 787 * null 788 * / \ 789 * write(new vdev, child 0) write(new vdev, child 1) 790 * | | 791 * read(old vdev, child 0) read(old vdev, child 1) 792 * 793 * Child zio's complete before their parents complete. However, zio's 794 * created with zio_vdev_child_io() may be issued before their children 795 * complete. In this case we need to make sure that the children (reads) 796 * complete before the parents (writes) are *issued*. We do this by not 797 * calling zio_nowait() on each write until its corresponding read has 798 * completed. 799 * 800 * The spa_config_lock must be held while zio's created by 801 * zio_vdev_child_io() are in progress, to ensure that the vdev tree does 802 * not change (e.g. due to a concurrent "zpool attach/detach"). The "null" 803 * zio is needed to release the spa_config_lock after all the reads and 804 * writes complete. (Note that we can't grab the config lock for each read, 805 * because it is not reentrant - we could deadlock with a thread waiting 806 * for a write lock.) 807 */ 808 static void 809 spa_vdev_copy_one_child(vdev_copy_arg_t *vca, zio_t *nzio, 810 vdev_t *source_vd, uint64_t source_offset, 811 vdev_t *dest_child_vd, uint64_t dest_offset, int dest_id, uint64_t size) 812 { 813 ASSERT3U(spa_config_held(nzio->io_spa, SCL_ALL, RW_READER), !=, 0); 814 815 mutex_enter(&vca->vca_lock); 816 vca->vca_outstanding_bytes += size; 817 mutex_exit(&vca->vca_lock); 818 819 abd_t *abd = abd_alloc_for_io(size, B_FALSE); 820 821 vdev_t *source_child_vd; 822 if (source_vd->vdev_ops == &vdev_mirror_ops && dest_id != -1) { 823 /* 824 * Source and dest are both mirrors. Copy from the same 825 * child id as we are copying to (wrapping around if there 826 * are more dest children than source children). 827 */ 828 source_child_vd = 829 source_vd->vdev_child[dest_id % source_vd->vdev_children]; 830 } else { 831 source_child_vd = source_vd; 832 } 833 834 zio_t *write_zio = zio_vdev_child_io(nzio, NULL, 835 dest_child_vd, dest_offset, abd, size, 836 ZIO_TYPE_WRITE, ZIO_PRIORITY_REMOVAL, 837 ZIO_FLAG_CANFAIL, 838 spa_vdev_copy_segment_write_done, vca); 839 840 zio_nowait(zio_vdev_child_io(write_zio, NULL, 841 source_child_vd, source_offset, abd, size, 842 ZIO_TYPE_READ, ZIO_PRIORITY_REMOVAL, 843 ZIO_FLAG_CANFAIL, 844 spa_vdev_copy_segment_read_done, vca)); 845 } 846 847 /* 848 * Allocate a new location for this segment, and create the zio_t's to 849 * read from the old location and write to the new location. 850 */ 851 static int 852 spa_vdev_copy_segment(vdev_t *vd, uint64_t start, uint64_t size, uint64_t txg, 853 vdev_copy_arg_t *vca, zio_alloc_list_t *zal) 854 { 855 metaslab_group_t *mg = vd->vdev_mg; 856 spa_t *spa = vd->vdev_spa; 857 spa_vdev_removal_t *svr = spa->spa_vdev_removal; 858 vdev_indirect_mapping_entry_t *entry; 859 dva_t dst = { 0 }; 860 861 ASSERT3U(size, <=, SPA_MAXBLOCKSIZE); 862 863 /* 864 * We use allocator 0 for this I/O because we don't expect device remap 865 * to be the steady state of the system, so parallelizing is not as 866 * critical as it is for other allocation types. We also want to ensure 867 * that the IOs are allocated together as much as possible, to reduce 868 * mapping sizes. 869 */ 870 int error = metaslab_alloc_dva(spa, mg->mg_class, size, 871 &dst, 0, NULL, txg, 0, zal, 0); 872 if (error != 0) 873 return (error); 874 875 /* 876 * We can't have any padding of the allocated size, otherwise we will 877 * misunderstand what's allocated, and the size of the mapping. 878 * The caller ensures this will be true by passing in a size that is 879 * aligned to the worst (highest) ashift in the pool. 880 */ 881 ASSERT3U(DVA_GET_ASIZE(&dst), ==, size); 882 883 entry = kmem_zalloc(sizeof (vdev_indirect_mapping_entry_t), KM_SLEEP); 884 DVA_MAPPING_SET_SRC_OFFSET(&entry->vime_mapping, start); 885 entry->vime_mapping.vimep_dst = dst; 886 887 /* 888 * See comment before spa_vdev_copy_one_child(). 889 */ 890 spa_config_enter(spa, SCL_STATE, spa, RW_READER); 891 zio_t *nzio = zio_null(spa->spa_txg_zio[txg & TXG_MASK], spa, NULL, 892 spa_vdev_copy_nullzio_done, NULL, 0); 893 vdev_t *dest_vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dst)); 894 if (dest_vd->vdev_ops == &vdev_mirror_ops) { 895 for (int i = 0; i < dest_vd->vdev_children; i++) { 896 vdev_t *child = dest_vd->vdev_child[i]; 897 spa_vdev_copy_one_child(vca, nzio, vd, start, 898 child, DVA_GET_OFFSET(&dst), i, size); 899 } 900 } else { 901 spa_vdev_copy_one_child(vca, nzio, vd, start, 902 dest_vd, DVA_GET_OFFSET(&dst), -1, size); 903 } 904 zio_nowait(nzio); 905 906 list_insert_tail(&svr->svr_new_segments[txg & TXG_MASK], entry); 907 ASSERT3U(start + size, <=, vd->vdev_ms_count << vd->vdev_ms_shift); 908 vdev_dirty(vd, 0, NULL, txg); 909 910 return (0); 911 } 912 913 /* 914 * Complete the removal of a toplevel vdev. This is called as a 915 * synctask in the same txg that we will sync out the new config (to the 916 * MOS object) which indicates that this vdev is indirect. 917 */ 918 static void 919 vdev_remove_complete_sync(void *arg, dmu_tx_t *tx) 920 { 921 spa_vdev_removal_t *svr = arg; 922 spa_t *spa = dmu_tx_pool(tx)->dp_spa; 923 vdev_t *vd = vdev_lookup_top(spa, svr->svr_vdev_id); 924 925 ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops); 926 927 for (int i = 0; i < TXG_SIZE; i++) { 928 ASSERT0(svr->svr_bytes_done[i]); 929 } 930 931 ASSERT3U(spa->spa_removing_phys.sr_copied, ==, 932 spa->spa_removing_phys.sr_to_copy); 933 934 vdev_destroy_spacemaps(vd, tx); 935 936 /* destroy leaf zaps, if any */ 937 ASSERT3P(svr->svr_zaplist, !=, NULL); 938 for (nvpair_t *pair = nvlist_next_nvpair(svr->svr_zaplist, NULL); 939 pair != NULL; 940 pair = nvlist_next_nvpair(svr->svr_zaplist, pair)) { 941 vdev_destroy_unlink_zap(vd, fnvpair_value_uint64(pair), tx); 942 } 943 fnvlist_free(svr->svr_zaplist); 944 945 spa_finish_removal(dmu_tx_pool(tx)->dp_spa, DSS_FINISHED, tx); 946 /* vd->vdev_path is not available here */ 947 spa_history_log_internal(spa, "vdev remove completed", tx, 948 "%s vdev %llu", spa_name(spa), vd->vdev_id); 949 } 950 951 static void 952 vdev_remove_enlist_zaps(vdev_t *vd, nvlist_t *zlist) 953 { 954 ASSERT3P(zlist, !=, NULL); 955 ASSERT3P(vd->vdev_ops, !=, &vdev_raidz_ops); 956 957 if (vd->vdev_leaf_zap != 0) { 958 char zkey[32]; 959 (void) snprintf(zkey, sizeof (zkey), "%s-%"PRIu64, 960 VDEV_REMOVAL_ZAP_OBJS, vd->vdev_leaf_zap); 961 fnvlist_add_uint64(zlist, zkey, vd->vdev_leaf_zap); 962 } 963 964 for (uint64_t id = 0; id < vd->vdev_children; id++) { 965 vdev_remove_enlist_zaps(vd->vdev_child[id], zlist); 966 } 967 } 968 969 static void 970 vdev_remove_replace_with_indirect(vdev_t *vd, uint64_t txg) 971 { 972 vdev_t *ivd; 973 dmu_tx_t *tx; 974 spa_t *spa = vd->vdev_spa; 975 spa_vdev_removal_t *svr = spa->spa_vdev_removal; 976 977 /* 978 * First, build a list of leaf zaps to be destroyed. 979 * This is passed to the sync context thread, 980 * which does the actual unlinking. 981 */ 982 svr->svr_zaplist = fnvlist_alloc(); 983 vdev_remove_enlist_zaps(vd, svr->svr_zaplist); 984 985 ivd = vdev_add_parent(vd, &vdev_indirect_ops); 986 ivd->vdev_removing = 0; 987 988 vd->vdev_leaf_zap = 0; 989 990 vdev_remove_child(ivd, vd); 991 vdev_compact_children(ivd); 992 993 ASSERT(!list_link_active(&vd->vdev_state_dirty_node)); 994 995 tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); 996 dsl_sync_task_nowait(spa->spa_dsl_pool, vdev_remove_complete_sync, svr, 997 0, ZFS_SPACE_CHECK_NONE, tx); 998 dmu_tx_commit(tx); 999 1000 /* 1001 * Indicate that this thread has exited. 1002 * After this, we can not use svr. 1003 */ 1004 mutex_enter(&svr->svr_lock); 1005 svr->svr_thread = NULL; 1006 cv_broadcast(&svr->svr_cv); 1007 mutex_exit(&svr->svr_lock); 1008 } 1009 1010 /* 1011 * Complete the removal of a toplevel vdev. This is called in open 1012 * context by the removal thread after we have copied all vdev's data. 1013 */ 1014 static void 1015 vdev_remove_complete(spa_t *spa) 1016 { 1017 uint64_t txg; 1018 1019 /* 1020 * Wait for any deferred frees to be synced before we call 1021 * vdev_metaslab_fini() 1022 */ 1023 txg_wait_synced(spa->spa_dsl_pool, 0); 1024 txg = spa_vdev_enter(spa); 1025 vdev_t *vd = vdev_lookup_top(spa, spa->spa_vdev_removal->svr_vdev_id); 1026 ASSERT3P(vd->vdev_initialize_thread, ==, NULL); 1027 1028 sysevent_t *ev = spa_event_create(spa, vd, NULL, 1029 ESC_ZFS_VDEV_REMOVE_DEV); 1030 1031 zfs_dbgmsg("finishing device removal for vdev %llu in txg %llu", 1032 vd->vdev_id, txg); 1033 1034 /* 1035 * Discard allocation state. 1036 */ 1037 if (vd->vdev_mg != NULL) { 1038 vdev_metaslab_fini(vd); 1039 metaslab_group_destroy(vd->vdev_mg); 1040 vd->vdev_mg = NULL; 1041 } 1042 ASSERT0(vd->vdev_stat.vs_space); 1043 ASSERT0(vd->vdev_stat.vs_dspace); 1044 1045 vdev_remove_replace_with_indirect(vd, txg); 1046 1047 /* 1048 * We now release the locks, allowing spa_sync to run and finish the 1049 * removal via vdev_remove_complete_sync in syncing context. 1050 * 1051 * Note that we hold on to the vdev_t that has been replaced. Since 1052 * it isn't part of the vdev tree any longer, it can't be concurrently 1053 * manipulated, even while we don't have the config lock. 1054 */ 1055 (void) spa_vdev_exit(spa, NULL, txg, 0); 1056 1057 /* 1058 * Top ZAP should have been transferred to the indirect vdev in 1059 * vdev_remove_replace_with_indirect. 1060 */ 1061 ASSERT0(vd->vdev_top_zap); 1062 1063 /* 1064 * Leaf ZAP should have been moved in vdev_remove_replace_with_indirect. 1065 */ 1066 ASSERT0(vd->vdev_leaf_zap); 1067 1068 txg = spa_vdev_enter(spa); 1069 (void) vdev_label_init(vd, 0, VDEV_LABEL_REMOVE); 1070 /* 1071 * Request to update the config and the config cachefile. 1072 */ 1073 vdev_config_dirty(spa->spa_root_vdev); 1074 (void) spa_vdev_exit(spa, vd, txg, 0); 1075 1076 spa_event_post(ev); 1077 } 1078 1079 /* 1080 * Evacuates a segment of size at most max_alloc from the vdev 1081 * via repeated calls to spa_vdev_copy_segment. If an allocation 1082 * fails, the pool is probably too fragmented to handle such a 1083 * large size, so decrease max_alloc so that the caller will not try 1084 * this size again this txg. 1085 */ 1086 static void 1087 spa_vdev_copy_impl(vdev_t *vd, spa_vdev_removal_t *svr, vdev_copy_arg_t *vca, 1088 uint64_t *max_alloc, dmu_tx_t *tx) 1089 { 1090 uint64_t txg = dmu_tx_get_txg(tx); 1091 spa_t *spa = dmu_tx_pool(tx)->dp_spa; 1092 1093 mutex_enter(&svr->svr_lock); 1094 1095 range_seg_t *rs = avl_first(&svr->svr_allocd_segs->rt_root); 1096 if (rs == NULL) { 1097 mutex_exit(&svr->svr_lock); 1098 return; 1099 } 1100 uint64_t offset = rs->rs_start; 1101 uint64_t length = MIN(rs->rs_end - rs->rs_start, *max_alloc); 1102 1103 range_tree_remove(svr->svr_allocd_segs, offset, length); 1104 1105 if (svr->svr_max_offset_to_sync[txg & TXG_MASK] == 0) { 1106 dsl_sync_task_nowait(dmu_tx_pool(tx), vdev_mapping_sync, 1107 svr, 0, ZFS_SPACE_CHECK_NONE, tx); 1108 } 1109 1110 svr->svr_max_offset_to_sync[txg & TXG_MASK] = offset + length; 1111 1112 /* 1113 * Note: this is the amount of *allocated* space 1114 * that we are taking care of each txg. 1115 */ 1116 svr->svr_bytes_done[txg & TXG_MASK] += length; 1117 1118 mutex_exit(&svr->svr_lock); 1119 1120 zio_alloc_list_t zal; 1121 metaslab_trace_init(&zal); 1122 uint64_t thismax = *max_alloc; 1123 while (length > 0) { 1124 uint64_t mylen = MIN(length, thismax); 1125 1126 int error = spa_vdev_copy_segment(vd, 1127 offset, mylen, txg, vca, &zal); 1128 1129 if (error == ENOSPC) { 1130 /* 1131 * Cut our segment in half, and don't try this 1132 * segment size again this txg. Note that the 1133 * allocation size must be aligned to the highest 1134 * ashift in the pool, so that the allocation will 1135 * not be padded out to a multiple of the ashift, 1136 * which could cause us to think that this mapping 1137 * is larger than we intended. 1138 */ 1139 ASSERT3U(spa->spa_max_ashift, >=, SPA_MINBLOCKSHIFT); 1140 ASSERT3U(spa->spa_max_ashift, ==, spa->spa_min_ashift); 1141 thismax = P2ROUNDUP(mylen / 2, 1142 1 << spa->spa_max_ashift); 1143 ASSERT3U(thismax, <, mylen); 1144 /* 1145 * The minimum-size allocation can not fail. 1146 */ 1147 ASSERT3U(mylen, >, 1 << spa->spa_max_ashift); 1148 *max_alloc = mylen - (1 << spa->spa_max_ashift); 1149 } else { 1150 ASSERT0(error); 1151 length -= mylen; 1152 offset += mylen; 1153 1154 /* 1155 * We've performed an allocation, so reset the 1156 * alloc trace list. 1157 */ 1158 metaslab_trace_fini(&zal); 1159 metaslab_trace_init(&zal); 1160 } 1161 } 1162 metaslab_trace_fini(&zal); 1163 } 1164 1165 /* 1166 * The removal thread operates in open context. It iterates over all 1167 * allocated space in the vdev, by loading each metaslab's spacemap. 1168 * For each contiguous segment of allocated space (capping the segment 1169 * size at SPA_MAXBLOCKSIZE), we: 1170 * - Allocate space for it on another vdev. 1171 * - Create a new mapping from the old location to the new location 1172 * (as a record in svr_new_segments). 1173 * - Initiate a logical read zio to get the data off the removing disk. 1174 * - In the read zio's done callback, initiate a logical write zio to 1175 * write it to the new vdev. 1176 * Note that all of this will take effect when a particular TXG syncs. 1177 * The sync thread ensures that all the phys reads and writes for the syncing 1178 * TXG have completed (see spa_txg_zio) and writes the new mappings to disk 1179 * (see vdev_mapping_sync()). 1180 */ 1181 static void 1182 spa_vdev_remove_thread(void *arg) 1183 { 1184 spa_t *spa = arg; 1185 spa_vdev_removal_t *svr = spa->spa_vdev_removal; 1186 vdev_copy_arg_t vca; 1187 uint64_t max_alloc = zfs_remove_max_segment; 1188 uint64_t last_txg = 0; 1189 1190 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 1191 vdev_t *vd = vdev_lookup_top(spa, svr->svr_vdev_id); 1192 vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; 1193 uint64_t start_offset = vdev_indirect_mapping_max_offset(vim); 1194 1195 ASSERT3P(vd->vdev_ops, !=, &vdev_indirect_ops); 1196 ASSERT(vdev_is_concrete(vd)); 1197 ASSERT(vd->vdev_removing); 1198 ASSERT(vd->vdev_indirect_config.vic_mapping_object != 0); 1199 ASSERT(vim != NULL); 1200 1201 mutex_init(&vca.vca_lock, NULL, MUTEX_DEFAULT, NULL); 1202 cv_init(&vca.vca_cv, NULL, CV_DEFAULT, NULL); 1203 vca.vca_outstanding_bytes = 0; 1204 1205 mutex_enter(&svr->svr_lock); 1206 1207 /* 1208 * Start from vim_max_offset so we pick up where we left off 1209 * if we are restarting the removal after opening the pool. 1210 */ 1211 uint64_t msi; 1212 for (msi = start_offset >> vd->vdev_ms_shift; 1213 msi < vd->vdev_ms_count && !svr->svr_thread_exit; msi++) { 1214 metaslab_t *msp = vd->vdev_ms[msi]; 1215 ASSERT3U(msi, <=, vd->vdev_ms_count); 1216 1217 ASSERT0(range_tree_space(svr->svr_allocd_segs)); 1218 1219 mutex_enter(&msp->ms_sync_lock); 1220 mutex_enter(&msp->ms_lock); 1221 1222 /* 1223 * Assert nothing in flight -- ms_*tree is empty. 1224 */ 1225 for (int i = 0; i < TXG_SIZE; i++) { 1226 ASSERT0(range_tree_space(msp->ms_allocating[i])); 1227 } 1228 1229 /* 1230 * If the metaslab has ever been allocated from (ms_sm!=NULL), 1231 * read the allocated segments from the space map object 1232 * into svr_allocd_segs. Since we do this while holding 1233 * svr_lock and ms_sync_lock, concurrent frees (which 1234 * would have modified the space map) will wait for us 1235 * to finish loading the spacemap, and then take the 1236 * appropriate action (see free_from_removing_vdev()). 1237 */ 1238 if (msp->ms_sm != NULL) { 1239 space_map_t *sm = NULL; 1240 1241 /* 1242 * We have to open a new space map here, because 1243 * ms_sm's sm_length and sm_alloc may not reflect 1244 * what's in the object contents, if we are in between 1245 * metaslab_sync() and metaslab_sync_done(). 1246 */ 1247 VERIFY0(space_map_open(&sm, 1248 spa->spa_dsl_pool->dp_meta_objset, 1249 msp->ms_sm->sm_object, msp->ms_sm->sm_start, 1250 msp->ms_sm->sm_size, msp->ms_sm->sm_shift)); 1251 space_map_update(sm); 1252 VERIFY0(space_map_load(sm, svr->svr_allocd_segs, 1253 SM_ALLOC)); 1254 space_map_close(sm); 1255 1256 range_tree_walk(msp->ms_freeing, 1257 range_tree_remove, svr->svr_allocd_segs); 1258 1259 /* 1260 * When we are resuming from a paused removal (i.e. 1261 * when importing a pool with a removal in progress), 1262 * discard any state that we have already processed. 1263 */ 1264 range_tree_clear(svr->svr_allocd_segs, 0, start_offset); 1265 } 1266 mutex_exit(&msp->ms_lock); 1267 mutex_exit(&msp->ms_sync_lock); 1268 1269 vca.vca_msp = msp; 1270 zfs_dbgmsg("copying %llu segments for metaslab %llu", 1271 avl_numnodes(&svr->svr_allocd_segs->rt_root), 1272 msp->ms_id); 1273 1274 while (!svr->svr_thread_exit && 1275 !range_tree_is_empty(svr->svr_allocd_segs)) { 1276 1277 mutex_exit(&svr->svr_lock); 1278 1279 /* 1280 * We need to periodically drop the config lock so that 1281 * writers can get in. Additionally, we can't wait 1282 * for a txg to sync while holding a config lock 1283 * (since a waiting writer could cause a 3-way deadlock 1284 * with the sync thread, which also gets a config 1285 * lock for reader). So we can't hold the config lock 1286 * while calling dmu_tx_assign(). 1287 */ 1288 spa_config_exit(spa, SCL_CONFIG, FTAG); 1289 1290 /* 1291 * This delay will pause the removal around the point 1292 * specified by zfs_remove_max_bytes_pause. We do this 1293 * solely from the test suite or during debugging. 1294 */ 1295 uint64_t bytes_copied = 1296 spa->spa_removing_phys.sr_copied; 1297 for (int i = 0; i < TXG_SIZE; i++) 1298 bytes_copied += svr->svr_bytes_done[i]; 1299 while (zfs_remove_max_bytes_pause <= bytes_copied && 1300 !svr->svr_thread_exit) 1301 delay(hz); 1302 1303 mutex_enter(&vca.vca_lock); 1304 while (vca.vca_outstanding_bytes > 1305 zfs_remove_max_copy_bytes) { 1306 cv_wait(&vca.vca_cv, &vca.vca_lock); 1307 } 1308 mutex_exit(&vca.vca_lock); 1309 1310 dmu_tx_t *tx = 1311 dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); 1312 1313 VERIFY0(dmu_tx_assign(tx, TXG_WAIT)); 1314 uint64_t txg = dmu_tx_get_txg(tx); 1315 1316 /* 1317 * Reacquire the vdev_config lock. The vdev_t 1318 * that we're removing may have changed, e.g. due 1319 * to a vdev_attach or vdev_detach. 1320 */ 1321 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 1322 vd = vdev_lookup_top(spa, svr->svr_vdev_id); 1323 1324 if (txg != last_txg) 1325 max_alloc = zfs_remove_max_segment; 1326 last_txg = txg; 1327 1328 spa_vdev_copy_impl(vd, svr, &vca, &max_alloc, tx); 1329 1330 dmu_tx_commit(tx); 1331 mutex_enter(&svr->svr_lock); 1332 } 1333 } 1334 1335 mutex_exit(&svr->svr_lock); 1336 1337 spa_config_exit(spa, SCL_CONFIG, FTAG); 1338 1339 /* 1340 * Wait for all copies to finish before cleaning up the vca. 1341 */ 1342 txg_wait_synced(spa->spa_dsl_pool, 0); 1343 ASSERT0(vca.vca_outstanding_bytes); 1344 1345 mutex_destroy(&vca.vca_lock); 1346 cv_destroy(&vca.vca_cv); 1347 1348 if (svr->svr_thread_exit) { 1349 mutex_enter(&svr->svr_lock); 1350 range_tree_vacate(svr->svr_allocd_segs, NULL, NULL); 1351 svr->svr_thread = NULL; 1352 cv_broadcast(&svr->svr_cv); 1353 mutex_exit(&svr->svr_lock); 1354 } else { 1355 ASSERT0(range_tree_space(svr->svr_allocd_segs)); 1356 vdev_remove_complete(spa); 1357 } 1358 } 1359 1360 void 1361 spa_vdev_remove_suspend(spa_t *spa) 1362 { 1363 spa_vdev_removal_t *svr = spa->spa_vdev_removal; 1364 1365 if (svr == NULL) 1366 return; 1367 1368 mutex_enter(&svr->svr_lock); 1369 svr->svr_thread_exit = B_TRUE; 1370 while (svr->svr_thread != NULL) 1371 cv_wait(&svr->svr_cv, &svr->svr_lock); 1372 svr->svr_thread_exit = B_FALSE; 1373 mutex_exit(&svr->svr_lock); 1374 } 1375 1376 /* ARGSUSED */ 1377 static int 1378 spa_vdev_remove_cancel_check(void *arg, dmu_tx_t *tx) 1379 { 1380 spa_t *spa = dmu_tx_pool(tx)->dp_spa; 1381 1382 if (spa->spa_vdev_removal == NULL) 1383 return (ENOTACTIVE); 1384 return (0); 1385 } 1386 1387 /* 1388 * Cancel a removal by freeing all entries from the partial mapping 1389 * and marking the vdev as no longer being removing. 1390 */ 1391 /* ARGSUSED */ 1392 static void 1393 spa_vdev_remove_cancel_sync(void *arg, dmu_tx_t *tx) 1394 { 1395 spa_t *spa = dmu_tx_pool(tx)->dp_spa; 1396 spa_vdev_removal_t *svr = spa->spa_vdev_removal; 1397 vdev_t *vd = vdev_lookup_top(spa, svr->svr_vdev_id); 1398 vdev_indirect_config_t *vic = &vd->vdev_indirect_config; 1399 vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; 1400 objset_t *mos = spa->spa_meta_objset; 1401 1402 ASSERT3P(svr->svr_thread, ==, NULL); 1403 1404 spa_feature_decr(spa, SPA_FEATURE_DEVICE_REMOVAL, tx); 1405 if (vdev_obsolete_counts_are_precise(vd)) { 1406 spa_feature_decr(spa, SPA_FEATURE_OBSOLETE_COUNTS, tx); 1407 VERIFY0(zap_remove(spa->spa_meta_objset, vd->vdev_top_zap, 1408 VDEV_TOP_ZAP_OBSOLETE_COUNTS_ARE_PRECISE, tx)); 1409 } 1410 1411 if (vdev_obsolete_sm_object(vd) != 0) { 1412 ASSERT(vd->vdev_obsolete_sm != NULL); 1413 ASSERT3U(vdev_obsolete_sm_object(vd), ==, 1414 space_map_object(vd->vdev_obsolete_sm)); 1415 1416 space_map_free(vd->vdev_obsolete_sm, tx); 1417 VERIFY0(zap_remove(spa->spa_meta_objset, vd->vdev_top_zap, 1418 VDEV_TOP_ZAP_INDIRECT_OBSOLETE_SM, tx)); 1419 space_map_close(vd->vdev_obsolete_sm); 1420 vd->vdev_obsolete_sm = NULL; 1421 spa_feature_decr(spa, SPA_FEATURE_OBSOLETE_COUNTS, tx); 1422 } 1423 for (int i = 0; i < TXG_SIZE; i++) { 1424 ASSERT(list_is_empty(&svr->svr_new_segments[i])); 1425 ASSERT3U(svr->svr_max_offset_to_sync[i], <=, 1426 vdev_indirect_mapping_max_offset(vim)); 1427 } 1428 1429 for (uint64_t msi = 0; msi < vd->vdev_ms_count; msi++) { 1430 metaslab_t *msp = vd->vdev_ms[msi]; 1431 1432 if (msp->ms_start >= vdev_indirect_mapping_max_offset(vim)) 1433 break; 1434 1435 ASSERT0(range_tree_space(svr->svr_allocd_segs)); 1436 1437 mutex_enter(&msp->ms_lock); 1438 1439 /* 1440 * Assert nothing in flight -- ms_*tree is empty. 1441 */ 1442 for (int i = 0; i < TXG_SIZE; i++) 1443 ASSERT0(range_tree_space(msp->ms_allocating[i])); 1444 for (int i = 0; i < TXG_DEFER_SIZE; i++) 1445 ASSERT0(range_tree_space(msp->ms_defer[i])); 1446 ASSERT0(range_tree_space(msp->ms_freed)); 1447 1448 if (msp->ms_sm != NULL) { 1449 /* 1450 * Assert that the in-core spacemap has the same 1451 * length as the on-disk one, so we can use the 1452 * existing in-core spacemap to load it from disk. 1453 */ 1454 ASSERT3U(msp->ms_sm->sm_alloc, ==, 1455 msp->ms_sm->sm_phys->smp_alloc); 1456 ASSERT3U(msp->ms_sm->sm_length, ==, 1457 msp->ms_sm->sm_phys->smp_objsize); 1458 1459 mutex_enter(&svr->svr_lock); 1460 VERIFY0(space_map_load(msp->ms_sm, 1461 svr->svr_allocd_segs, SM_ALLOC)); 1462 range_tree_walk(msp->ms_freeing, 1463 range_tree_remove, svr->svr_allocd_segs); 1464 1465 /* 1466 * Clear everything past what has been synced, 1467 * because we have not allocated mappings for it yet. 1468 */ 1469 uint64_t syncd = vdev_indirect_mapping_max_offset(vim); 1470 uint64_t sm_end = msp->ms_sm->sm_start + 1471 msp->ms_sm->sm_size; 1472 if (sm_end > syncd) 1473 range_tree_clear(svr->svr_allocd_segs, 1474 syncd, sm_end - syncd); 1475 1476 mutex_exit(&svr->svr_lock); 1477 } 1478 mutex_exit(&msp->ms_lock); 1479 1480 mutex_enter(&svr->svr_lock); 1481 range_tree_vacate(svr->svr_allocd_segs, 1482 free_mapped_segment_cb, vd); 1483 mutex_exit(&svr->svr_lock); 1484 } 1485 1486 /* 1487 * Note: this must happen after we invoke free_mapped_segment_cb, 1488 * because it adds to the obsolete_segments. 1489 */ 1490 range_tree_vacate(vd->vdev_obsolete_segments, NULL, NULL); 1491 1492 ASSERT3U(vic->vic_mapping_object, ==, 1493 vdev_indirect_mapping_object(vd->vdev_indirect_mapping)); 1494 vdev_indirect_mapping_close(vd->vdev_indirect_mapping); 1495 vd->vdev_indirect_mapping = NULL; 1496 vdev_indirect_mapping_free(mos, vic->vic_mapping_object, tx); 1497 vic->vic_mapping_object = 0; 1498 1499 ASSERT3U(vic->vic_births_object, ==, 1500 vdev_indirect_births_object(vd->vdev_indirect_births)); 1501 vdev_indirect_births_close(vd->vdev_indirect_births); 1502 vd->vdev_indirect_births = NULL; 1503 vdev_indirect_births_free(mos, vic->vic_births_object, tx); 1504 vic->vic_births_object = 0; 1505 1506 /* 1507 * We may have processed some frees from the removing vdev in this 1508 * txg, thus increasing svr_bytes_done; discard that here to 1509 * satisfy the assertions in spa_vdev_removal_destroy(). 1510 * Note that future txg's can not have any bytes_done, because 1511 * future TXG's are only modified from open context, and we have 1512 * already shut down the copying thread. 1513 */ 1514 svr->svr_bytes_done[dmu_tx_get_txg(tx) & TXG_MASK] = 0; 1515 spa_finish_removal(spa, DSS_CANCELED, tx); 1516 1517 vd->vdev_removing = B_FALSE; 1518 vdev_config_dirty(vd); 1519 1520 zfs_dbgmsg("canceled device removal for vdev %llu in %llu", 1521 vd->vdev_id, dmu_tx_get_txg(tx)); 1522 spa_history_log_internal(spa, "vdev remove canceled", tx, 1523 "%s vdev %llu %s", spa_name(spa), 1524 vd->vdev_id, (vd->vdev_path != NULL) ? vd->vdev_path : "-"); 1525 } 1526 1527 int 1528 spa_vdev_remove_cancel(spa_t *spa) 1529 { 1530 spa_vdev_remove_suspend(spa); 1531 1532 if (spa->spa_vdev_removal == NULL) 1533 return (ENOTACTIVE); 1534 1535 uint64_t vdid = spa->spa_vdev_removal->svr_vdev_id; 1536 1537 int error = dsl_sync_task(spa->spa_name, spa_vdev_remove_cancel_check, 1538 spa_vdev_remove_cancel_sync, NULL, 0, 1539 ZFS_SPACE_CHECK_EXTRA_RESERVED); 1540 1541 if (error == 0) { 1542 spa_config_enter(spa, SCL_ALLOC | SCL_VDEV, FTAG, RW_WRITER); 1543 vdev_t *vd = vdev_lookup_top(spa, vdid); 1544 metaslab_group_activate(vd->vdev_mg); 1545 spa_config_exit(spa, SCL_ALLOC | SCL_VDEV, FTAG); 1546 } 1547 1548 return (error); 1549 } 1550 1551 /* 1552 * Called every sync pass of every txg if there's a svr. 1553 */ 1554 void 1555 svr_sync(spa_t *spa, dmu_tx_t *tx) 1556 { 1557 spa_vdev_removal_t *svr = spa->spa_vdev_removal; 1558 int txgoff = dmu_tx_get_txg(tx) & TXG_MASK; 1559 1560 /* 1561 * This check is necessary so that we do not dirty the 1562 * DIRECTORY_OBJECT via spa_sync_removing_state() when there 1563 * is nothing to do. Dirtying it every time would prevent us 1564 * from syncing-to-convergence. 1565 */ 1566 if (svr->svr_bytes_done[txgoff] == 0) 1567 return; 1568 1569 /* 1570 * Update progress accounting. 1571 */ 1572 spa->spa_removing_phys.sr_copied += svr->svr_bytes_done[txgoff]; 1573 svr->svr_bytes_done[txgoff] = 0; 1574 1575 spa_sync_removing_state(spa, tx); 1576 } 1577 1578 static void 1579 vdev_remove_make_hole_and_free(vdev_t *vd) 1580 { 1581 uint64_t id = vd->vdev_id; 1582 spa_t *spa = vd->vdev_spa; 1583 vdev_t *rvd = spa->spa_root_vdev; 1584 boolean_t last_vdev = (id == (rvd->vdev_children - 1)); 1585 1586 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 1587 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 1588 1589 vdev_free(vd); 1590 1591 if (last_vdev) { 1592 vdev_compact_children(rvd); 1593 } else { 1594 vd = vdev_alloc_common(spa, id, 0, &vdev_hole_ops); 1595 vdev_add_child(rvd, vd); 1596 } 1597 vdev_config_dirty(rvd); 1598 1599 /* 1600 * Reassess the health of our root vdev. 1601 */ 1602 vdev_reopen(rvd); 1603 } 1604 1605 /* 1606 * Remove a log device. The config lock is held for the specified TXG. 1607 */ 1608 static int 1609 spa_vdev_remove_log(vdev_t *vd, uint64_t *txg) 1610 { 1611 metaslab_group_t *mg = vd->vdev_mg; 1612 spa_t *spa = vd->vdev_spa; 1613 int error = 0; 1614 1615 ASSERT(vd->vdev_islog); 1616 ASSERT(vd == vd->vdev_top); 1617 1618 /* 1619 * Stop allocating from this vdev. 1620 */ 1621 metaslab_group_passivate(mg); 1622 1623 /* 1624 * Wait for the youngest allocations and frees to sync, 1625 * and then wait for the deferral of those frees to finish. 1626 */ 1627 spa_vdev_config_exit(spa, NULL, 1628 *txg + TXG_CONCURRENT_STATES + TXG_DEFER_SIZE, 0, FTAG); 1629 1630 /* 1631 * Evacuate the device. We don't hold the config lock as writer 1632 * since we need to do I/O but we do keep the 1633 * spa_namespace_lock held. Once this completes the device 1634 * should no longer have any blocks allocated on it. 1635 */ 1636 if (vd->vdev_islog) { 1637 if (vd->vdev_stat.vs_alloc != 0) 1638 error = spa_reset_logs(spa); 1639 } 1640 1641 *txg = spa_vdev_config_enter(spa); 1642 1643 if (error != 0) { 1644 metaslab_group_activate(mg); 1645 return (error); 1646 } 1647 ASSERT0(vd->vdev_stat.vs_alloc); 1648 1649 /* 1650 * The evacuation succeeded. Remove any remaining MOS metadata 1651 * associated with this vdev, and wait for these changes to sync. 1652 */ 1653 vd->vdev_removing = B_TRUE; 1654 1655 vdev_dirty_leaves(vd, VDD_DTL, *txg); 1656 vdev_config_dirty(vd); 1657 1658 spa_history_log_internal(spa, "vdev remove", NULL, 1659 "%s vdev %llu (log) %s", spa_name(spa), vd->vdev_id, 1660 (vd->vdev_path != NULL) ? vd->vdev_path : "-"); 1661 1662 /* Make sure these changes are sync'ed */ 1663 spa_vdev_config_exit(spa, NULL, *txg, 0, FTAG); 1664 1665 /* Stop initializing */ 1666 (void) vdev_initialize_stop_all(vd, VDEV_INITIALIZE_CANCELED); 1667 1668 *txg = spa_vdev_config_enter(spa); 1669 1670 sysevent_t *ev = spa_event_create(spa, vd, NULL, 1671 ESC_ZFS_VDEV_REMOVE_DEV); 1672 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 1673 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 1674 1675 /* The top ZAP should have been destroyed by vdev_remove_empty. */ 1676 ASSERT0(vd->vdev_top_zap); 1677 /* The leaf ZAP should have been destroyed by vdev_dtl_sync. */ 1678 ASSERT0(vd->vdev_leaf_zap); 1679 1680 (void) vdev_label_init(vd, 0, VDEV_LABEL_REMOVE); 1681 1682 if (list_link_active(&vd->vdev_state_dirty_node)) 1683 vdev_state_clean(vd); 1684 if (list_link_active(&vd->vdev_config_dirty_node)) 1685 vdev_config_clean(vd); 1686 1687 /* 1688 * Clean up the vdev namespace. 1689 */ 1690 vdev_remove_make_hole_and_free(vd); 1691 1692 if (ev != NULL) 1693 spa_event_post(ev); 1694 1695 return (0); 1696 } 1697 1698 static int 1699 spa_vdev_remove_top_check(vdev_t *vd) 1700 { 1701 spa_t *spa = vd->vdev_spa; 1702 1703 if (vd != vd->vdev_top) 1704 return (SET_ERROR(ENOTSUP)); 1705 1706 if (!spa_feature_is_enabled(spa, SPA_FEATURE_DEVICE_REMOVAL)) 1707 return (SET_ERROR(ENOTSUP)); 1708 1709 /* 1710 * There has to be enough free space to remove the 1711 * device and leave double the "slop" space (i.e. we 1712 * must leave at least 3% of the pool free, in addition to 1713 * the normal slop space). 1714 */ 1715 if (dsl_dir_space_available(spa->spa_dsl_pool->dp_root_dir, 1716 NULL, 0, B_TRUE) < 1717 vd->vdev_stat.vs_dspace + spa_get_slop_space(spa)) { 1718 return (SET_ERROR(ENOSPC)); 1719 } 1720 1721 /* 1722 * There can not be a removal in progress. 1723 */ 1724 if (spa->spa_removing_phys.sr_state == DSS_SCANNING) 1725 return (SET_ERROR(EBUSY)); 1726 1727 /* 1728 * The device must have all its data. 1729 */ 1730 if (!vdev_dtl_empty(vd, DTL_MISSING) || 1731 !vdev_dtl_empty(vd, DTL_OUTAGE)) 1732 return (SET_ERROR(EBUSY)); 1733 1734 /* 1735 * The device must be healthy. 1736 */ 1737 if (!vdev_readable(vd)) 1738 return (SET_ERROR(EIO)); 1739 1740 /* 1741 * All vdevs in normal class must have the same ashift. 1742 */ 1743 if (spa->spa_max_ashift != spa->spa_min_ashift) { 1744 return (SET_ERROR(EINVAL)); 1745 } 1746 1747 /* 1748 * All vdevs in normal class must have the same ashift 1749 * and not be raidz. 1750 */ 1751 vdev_t *rvd = spa->spa_root_vdev; 1752 int num_indirect = 0; 1753 for (uint64_t id = 0; id < rvd->vdev_children; id++) { 1754 vdev_t *cvd = rvd->vdev_child[id]; 1755 if (cvd->vdev_ashift != 0 && !cvd->vdev_islog) 1756 ASSERT3U(cvd->vdev_ashift, ==, spa->spa_max_ashift); 1757 if (cvd->vdev_ops == &vdev_indirect_ops) 1758 num_indirect++; 1759 if (!vdev_is_concrete(cvd)) 1760 continue; 1761 if (cvd->vdev_ops == &vdev_raidz_ops) 1762 return (SET_ERROR(EINVAL)); 1763 /* 1764 * Need the mirror to be mirror of leaf vdevs only 1765 */ 1766 if (cvd->vdev_ops == &vdev_mirror_ops) { 1767 for (uint64_t cid = 0; 1768 cid < cvd->vdev_children; cid++) { 1769 vdev_t *tmp = cvd->vdev_child[cid]; 1770 if (!tmp->vdev_ops->vdev_op_leaf) 1771 return (SET_ERROR(EINVAL)); 1772 } 1773 } 1774 } 1775 1776 return (0); 1777 } 1778 1779 /* 1780 * Initiate removal of a top-level vdev, reducing the total space in the pool. 1781 * The config lock is held for the specified TXG. Once initiated, 1782 * evacuation of all allocated space (copying it to other vdevs) happens 1783 * in the background (see spa_vdev_remove_thread()), and can be canceled 1784 * (see spa_vdev_remove_cancel()). If successful, the vdev will 1785 * be transformed to an indirect vdev (see spa_vdev_remove_complete()). 1786 */ 1787 static int 1788 spa_vdev_remove_top(vdev_t *vd, uint64_t *txg) 1789 { 1790 spa_t *spa = vd->vdev_spa; 1791 int error; 1792 1793 /* 1794 * Check for errors up-front, so that we don't waste time 1795 * passivating the metaslab group and clearing the ZIL if there 1796 * are errors. 1797 */ 1798 error = spa_vdev_remove_top_check(vd); 1799 if (error != 0) 1800 return (error); 1801 1802 /* 1803 * Stop allocating from this vdev. Note that we must check 1804 * that this is not the only device in the pool before 1805 * passivating, otherwise we will not be able to make 1806 * progress because we can't allocate from any vdevs. 1807 * The above check for sufficient free space serves this 1808 * purpose. 1809 */ 1810 metaslab_group_t *mg = vd->vdev_mg; 1811 metaslab_group_passivate(mg); 1812 1813 /* 1814 * Wait for the youngest allocations and frees to sync, 1815 * and then wait for the deferral of those frees to finish. 1816 */ 1817 spa_vdev_config_exit(spa, NULL, 1818 *txg + TXG_CONCURRENT_STATES + TXG_DEFER_SIZE, 0, FTAG); 1819 1820 /* 1821 * We must ensure that no "stubby" log blocks are allocated 1822 * on the device to be removed. These blocks could be 1823 * written at any time, including while we are in the middle 1824 * of copying them. 1825 */ 1826 error = spa_reset_logs(spa); 1827 1828 /* 1829 * We stop any initializing that is currently in progress but leave 1830 * the state as "active". This will allow the initializing to resume 1831 * if the removal is canceled sometime later. 1832 */ 1833 vdev_initialize_stop_all(vd, VDEV_INITIALIZE_ACTIVE); 1834 1835 *txg = spa_vdev_config_enter(spa); 1836 1837 /* 1838 * Things might have changed while the config lock was dropped 1839 * (e.g. space usage). Check for errors again. 1840 */ 1841 if (error == 0) 1842 error = spa_vdev_remove_top_check(vd); 1843 1844 if (error != 0) { 1845 metaslab_group_activate(mg); 1846 spa_async_request(spa, SPA_ASYNC_INITIALIZE_RESTART); 1847 return (error); 1848 } 1849 1850 vd->vdev_removing = B_TRUE; 1851 1852 vdev_dirty_leaves(vd, VDD_DTL, *txg); 1853 vdev_config_dirty(vd); 1854 dmu_tx_t *tx = dmu_tx_create_assigned(spa->spa_dsl_pool, *txg); 1855 dsl_sync_task_nowait(spa->spa_dsl_pool, 1856 vdev_remove_initiate_sync, 1857 (void *)(uintptr_t)vd->vdev_id, 0, ZFS_SPACE_CHECK_NONE, tx); 1858 dmu_tx_commit(tx); 1859 1860 return (0); 1861 } 1862 1863 /* 1864 * Remove a device from the pool. 1865 * 1866 * Removing a device from the vdev namespace requires several steps 1867 * and can take a significant amount of time. As a result we use 1868 * the spa_vdev_config_[enter/exit] functions which allow us to 1869 * grab and release the spa_config_lock while still holding the namespace 1870 * lock. During each step the configuration is synced out. 1871 */ 1872 int 1873 spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare) 1874 { 1875 vdev_t *vd; 1876 nvlist_t **spares, **l2cache, *nv; 1877 uint64_t txg = 0; 1878 uint_t nspares, nl2cache; 1879 int error = 0; 1880 boolean_t locked = MUTEX_HELD(&spa_namespace_lock); 1881 sysevent_t *ev = NULL; 1882 1883 ASSERT(spa_writeable(spa)); 1884 1885 if (!locked) 1886 txg = spa_vdev_enter(spa); 1887 1888 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 1889 if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) { 1890 error = (spa_has_checkpoint(spa)) ? 1891 ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT; 1892 1893 if (!locked) 1894 return (spa_vdev_exit(spa, NULL, txg, error)); 1895 1896 return (error); 1897 } 1898 1899 vd = spa_lookup_by_guid(spa, guid, B_FALSE); 1900 1901 if (spa->spa_spares.sav_vdevs != NULL && 1902 nvlist_lookup_nvlist_array(spa->spa_spares.sav_config, 1903 ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0 && 1904 (nv = spa_nvlist_lookup_by_guid(spares, nspares, guid)) != NULL) { 1905 /* 1906 * Only remove the hot spare if it's not currently in use 1907 * in this pool. 1908 */ 1909 if (vd == NULL || unspare) { 1910 char *nvstr = fnvlist_lookup_string(nv, 1911 ZPOOL_CONFIG_PATH); 1912 spa_history_log_internal(spa, "vdev remove", NULL, 1913 "%s vdev (%s) %s", spa_name(spa), 1914 VDEV_TYPE_SPARE, nvstr); 1915 if (vd == NULL) 1916 vd = spa_lookup_by_guid(spa, guid, B_TRUE); 1917 ev = spa_event_create(spa, vd, NULL, 1918 ESC_ZFS_VDEV_REMOVE_AUX); 1919 spa_vdev_remove_aux(spa->spa_spares.sav_config, 1920 ZPOOL_CONFIG_SPARES, spares, nspares, nv); 1921 spa_load_spares(spa); 1922 spa->spa_spares.sav_sync = B_TRUE; 1923 } else { 1924 error = SET_ERROR(EBUSY); 1925 } 1926 } else if (spa->spa_l2cache.sav_vdevs != NULL && 1927 nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config, 1928 ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0 && 1929 (nv = spa_nvlist_lookup_by_guid(l2cache, nl2cache, guid)) != NULL) { 1930 char *nvstr = fnvlist_lookup_string(nv, ZPOOL_CONFIG_PATH); 1931 spa_history_log_internal(spa, "vdev remove", NULL, 1932 "%s vdev (%s) %s", spa_name(spa), VDEV_TYPE_L2CACHE, nvstr); 1933 /* 1934 * Cache devices can always be removed. 1935 */ 1936 vd = spa_lookup_by_guid(spa, guid, B_TRUE); 1937 ev = spa_event_create(spa, vd, NULL, ESC_ZFS_VDEV_REMOVE_AUX); 1938 spa_vdev_remove_aux(spa->spa_l2cache.sav_config, 1939 ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache, nv); 1940 spa_load_l2cache(spa); 1941 spa->spa_l2cache.sav_sync = B_TRUE; 1942 } else if (vd != NULL && vd->vdev_islog) { 1943 ASSERT(!locked); 1944 error = spa_vdev_remove_log(vd, &txg); 1945 } else if (vd != NULL) { 1946 ASSERT(!locked); 1947 error = spa_vdev_remove_top(vd, &txg); 1948 } else { 1949 /* 1950 * There is no vdev of any kind with the specified guid. 1951 */ 1952 error = SET_ERROR(ENOENT); 1953 } 1954 1955 if (!locked) 1956 error = spa_vdev_exit(spa, NULL, txg, error); 1957 1958 if (ev != NULL) { 1959 if (error != 0) { 1960 spa_event_discard(ev); 1961 } else { 1962 spa_event_post(ev); 1963 } 1964 } 1965 1966 return (error); 1967 } 1968 1969 int 1970 spa_removal_get_stats(spa_t *spa, pool_removal_stat_t *prs) 1971 { 1972 prs->prs_state = spa->spa_removing_phys.sr_state; 1973 1974 if (prs->prs_state == DSS_NONE) 1975 return (SET_ERROR(ENOENT)); 1976 1977 prs->prs_removing_vdev = spa->spa_removing_phys.sr_removing_vdev; 1978 prs->prs_start_time = spa->spa_removing_phys.sr_start_time; 1979 prs->prs_end_time = spa->spa_removing_phys.sr_end_time; 1980 prs->prs_to_copy = spa->spa_removing_phys.sr_to_copy; 1981 prs->prs_copied = spa->spa_removing_phys.sr_copied; 1982 1983 if (spa->spa_vdev_removal != NULL) { 1984 for (int i = 0; i < TXG_SIZE; i++) { 1985 prs->prs_copied += 1986 spa->spa_vdev_removal->svr_bytes_done[i]; 1987 } 1988 } 1989 1990 prs->prs_mapping_memory = 0; 1991 uint64_t indirect_vdev_id = 1992 spa->spa_removing_phys.sr_prev_indirect_vdev; 1993 while (indirect_vdev_id != -1) { 1994 vdev_t *vd = spa->spa_root_vdev->vdev_child[indirect_vdev_id]; 1995 vdev_indirect_config_t *vic = &vd->vdev_indirect_config; 1996 vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; 1997 1998 ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops); 1999 prs->prs_mapping_memory += vdev_indirect_mapping_size(vim); 2000 indirect_vdev_id = vic->vic_prev_indirect_vdev; 2001 } 2002 2003 return (0); 2004 } 2005