1094e47e9SGeorge Wilson /* 2094e47e9SGeorge Wilson * CDDL HEADER START 3094e47e9SGeorge Wilson * 4094e47e9SGeorge Wilson * The contents of this file are subject to the terms of the 5094e47e9SGeorge Wilson * Common Development and Distribution License (the "License"). 6094e47e9SGeorge Wilson * You may not use this file except in compliance with the License. 7094e47e9SGeorge Wilson * 8094e47e9SGeorge Wilson * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9094e47e9SGeorge Wilson * or http://www.opensolaris.org/os/licensing. 10094e47e9SGeorge Wilson * See the License for the specific language governing permissions 11094e47e9SGeorge Wilson * and limitations under the License. 12094e47e9SGeorge Wilson * 13094e47e9SGeorge Wilson * When distributing Covered Code, include this CDDL HEADER in each 14094e47e9SGeorge Wilson * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15094e47e9SGeorge Wilson * If applicable, add the following below this CDDL HEADER, with the 16094e47e9SGeorge Wilson * fields enclosed by brackets "[]" replaced with your own identifying 17094e47e9SGeorge Wilson * information: Portions Copyright [yyyy] [name of copyright owner] 18094e47e9SGeorge Wilson * 19094e47e9SGeorge Wilson * CDDL HEADER END 20094e47e9SGeorge Wilson */ 21094e47e9SGeorge Wilson 22094e47e9SGeorge Wilson /* 23094e47e9SGeorge Wilson * Copyright (c) 2016 by Delphix. All rights reserved. 24094e47e9SGeorge Wilson */ 25094e47e9SGeorge Wilson 26094e47e9SGeorge Wilson #include <sys/spa.h> 27094e47e9SGeorge Wilson #include <sys/spa_impl.h> 28094e47e9SGeorge Wilson #include <sys/txg.h> 29094e47e9SGeorge Wilson #include <sys/vdev_impl.h> 30094e47e9SGeorge Wilson #include <sys/refcount.h> 31094e47e9SGeorge Wilson #include <sys/metaslab_impl.h> 32094e47e9SGeorge Wilson #include <sys/dsl_synctask.h> 33094e47e9SGeorge Wilson #include <sys/zap.h> 34094e47e9SGeorge Wilson #include <sys/dmu_tx.h> 35094e47e9SGeorge Wilson 36094e47e9SGeorge Wilson /* 37094e47e9SGeorge Wilson * Maximum number of metaslabs per group that can be initialized 38094e47e9SGeorge Wilson * simultaneously. 39094e47e9SGeorge Wilson */ 40094e47e9SGeorge Wilson int max_initialize_ms = 3; 41094e47e9SGeorge Wilson 42094e47e9SGeorge Wilson /* 43094e47e9SGeorge Wilson * Value that is written to disk during initialization. 44094e47e9SGeorge Wilson */ 45094e47e9SGeorge Wilson uint64_t zfs_initialize_value = 0xdeadbeefdeadbeefULL; 46094e47e9SGeorge Wilson 47094e47e9SGeorge Wilson /* maximum number of I/Os outstanding per leaf vdev */ 48094e47e9SGeorge Wilson int zfs_initialize_limit = 1; 49094e47e9SGeorge Wilson 50094e47e9SGeorge Wilson /* size of initializing writes; default 1MiB, see zfs_remove_max_segment */ 51094e47e9SGeorge Wilson uint64_t zfs_initialize_chunk_size = 1024 * 1024; 52094e47e9SGeorge Wilson 53094e47e9SGeorge Wilson static boolean_t 54094e47e9SGeorge Wilson vdev_initialize_should_stop(vdev_t *vd) 55094e47e9SGeorge Wilson { 56094e47e9SGeorge Wilson return (vd->vdev_initialize_exit_wanted || !vdev_writeable(vd) || 57094e47e9SGeorge Wilson vd->vdev_detached || vd->vdev_top->vdev_removing); 58094e47e9SGeorge Wilson } 59094e47e9SGeorge Wilson 60094e47e9SGeorge Wilson static void 61094e47e9SGeorge Wilson vdev_initialize_zap_update_sync(void *arg, dmu_tx_t *tx) 62094e47e9SGeorge Wilson { 63094e47e9SGeorge Wilson /* 64094e47e9SGeorge Wilson * We pass in the guid instead of the vdev_t since the vdev may 65094e47e9SGeorge Wilson * have been freed prior to the sync task being processed. This 66094e47e9SGeorge Wilson * happens when a vdev is detached as we call spa_config_vdev_exit(), 67094e47e9SGeorge Wilson * stop the intializing thread, schedule the sync task, and free 68094e47e9SGeorge Wilson * the vdev. Later when the scheduled sync task is invoked, it would 69094e47e9SGeorge Wilson * find that the vdev has been freed. 70094e47e9SGeorge Wilson */ 71094e47e9SGeorge Wilson uint64_t guid = *(uint64_t *)arg; 72094e47e9SGeorge Wilson uint64_t txg = dmu_tx_get_txg(tx); 73094e47e9SGeorge Wilson kmem_free(arg, sizeof (uint64_t)); 74094e47e9SGeorge Wilson 75094e47e9SGeorge Wilson vdev_t *vd = spa_lookup_by_guid(tx->tx_pool->dp_spa, guid, B_FALSE); 76094e47e9SGeorge Wilson if (vd == NULL || vd->vdev_top->vdev_removing || !vdev_is_concrete(vd)) 77094e47e9SGeorge Wilson return; 78094e47e9SGeorge Wilson 79094e47e9SGeorge Wilson uint64_t last_offset = vd->vdev_initialize_offset[txg & TXG_MASK]; 80094e47e9SGeorge Wilson vd->vdev_initialize_offset[txg & TXG_MASK] = 0; 81094e47e9SGeorge Wilson 82094e47e9SGeorge Wilson VERIFY(vd->vdev_leaf_zap != 0); 83094e47e9SGeorge Wilson 84094e47e9SGeorge Wilson objset_t *mos = vd->vdev_spa->spa_meta_objset; 85094e47e9SGeorge Wilson 86094e47e9SGeorge Wilson if (last_offset > 0) { 87094e47e9SGeorge Wilson vd->vdev_initialize_last_offset = last_offset; 88094e47e9SGeorge Wilson VERIFY0(zap_update(mos, vd->vdev_leaf_zap, 89094e47e9SGeorge Wilson VDEV_LEAF_ZAP_INITIALIZE_LAST_OFFSET, 90094e47e9SGeorge Wilson sizeof (last_offset), 1, &last_offset, tx)); 91094e47e9SGeorge Wilson } 92094e47e9SGeorge Wilson if (vd->vdev_initialize_action_time > 0) { 93094e47e9SGeorge Wilson uint64_t val = (uint64_t)vd->vdev_initialize_action_time; 94094e47e9SGeorge Wilson VERIFY0(zap_update(mos, vd->vdev_leaf_zap, 95094e47e9SGeorge Wilson VDEV_LEAF_ZAP_INITIALIZE_ACTION_TIME, sizeof (val), 96094e47e9SGeorge Wilson 1, &val, tx)); 97094e47e9SGeorge Wilson } 98094e47e9SGeorge Wilson 99094e47e9SGeorge Wilson uint64_t initialize_state = vd->vdev_initialize_state; 100094e47e9SGeorge Wilson VERIFY0(zap_update(mos, vd->vdev_leaf_zap, 101094e47e9SGeorge Wilson VDEV_LEAF_ZAP_INITIALIZE_STATE, sizeof (initialize_state), 1, 102094e47e9SGeorge Wilson &initialize_state, tx)); 103094e47e9SGeorge Wilson } 104094e47e9SGeorge Wilson 105094e47e9SGeorge Wilson static void 106094e47e9SGeorge Wilson vdev_initialize_change_state(vdev_t *vd, vdev_initializing_state_t new_state) 107094e47e9SGeorge Wilson { 108094e47e9SGeorge Wilson ASSERT(MUTEX_HELD(&vd->vdev_initialize_lock)); 109094e47e9SGeorge Wilson spa_t *spa = vd->vdev_spa; 110094e47e9SGeorge Wilson 111094e47e9SGeorge Wilson if (new_state == vd->vdev_initialize_state) 112094e47e9SGeorge Wilson return; 113094e47e9SGeorge Wilson 114094e47e9SGeorge Wilson /* 115094e47e9SGeorge Wilson * Copy the vd's guid, this will be freed by the sync task. 116094e47e9SGeorge Wilson */ 117094e47e9SGeorge Wilson uint64_t *guid = kmem_zalloc(sizeof (uint64_t), KM_SLEEP); 118094e47e9SGeorge Wilson *guid = vd->vdev_guid; 119094e47e9SGeorge Wilson 120094e47e9SGeorge Wilson /* 121094e47e9SGeorge Wilson * If we're suspending, then preserving the original start time. 122094e47e9SGeorge Wilson */ 123094e47e9SGeorge Wilson if (vd->vdev_initialize_state != VDEV_INITIALIZE_SUSPENDED) { 124094e47e9SGeorge Wilson vd->vdev_initialize_action_time = gethrestime_sec(); 125094e47e9SGeorge Wilson } 126094e47e9SGeorge Wilson vd->vdev_initialize_state = new_state; 127094e47e9SGeorge Wilson 128094e47e9SGeorge Wilson dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); 129094e47e9SGeorge Wilson VERIFY0(dmu_tx_assign(tx, TXG_WAIT)); 130094e47e9SGeorge Wilson dsl_sync_task_nowait(spa_get_dsl(spa), vdev_initialize_zap_update_sync, 131094e47e9SGeorge Wilson guid, 2, ZFS_SPACE_CHECK_RESERVED, tx); 132094e47e9SGeorge Wilson 133094e47e9SGeorge Wilson switch (new_state) { 134094e47e9SGeorge Wilson case VDEV_INITIALIZE_ACTIVE: 135094e47e9SGeorge Wilson spa_history_log_internal(spa, "initialize", tx, 136094e47e9SGeorge Wilson "vdev=%s activated", vd->vdev_path); 137094e47e9SGeorge Wilson break; 138094e47e9SGeorge Wilson case VDEV_INITIALIZE_SUSPENDED: 139094e47e9SGeorge Wilson spa_history_log_internal(spa, "initialize", tx, 140094e47e9SGeorge Wilson "vdev=%s suspended", vd->vdev_path); 141094e47e9SGeorge Wilson break; 142094e47e9SGeorge Wilson case VDEV_INITIALIZE_CANCELED: 143094e47e9SGeorge Wilson spa_history_log_internal(spa, "initialize", tx, 144094e47e9SGeorge Wilson "vdev=%s canceled", vd->vdev_path); 145094e47e9SGeorge Wilson break; 146094e47e9SGeorge Wilson case VDEV_INITIALIZE_COMPLETE: 147094e47e9SGeorge Wilson spa_history_log_internal(spa, "initialize", tx, 148094e47e9SGeorge Wilson "vdev=%s complete", vd->vdev_path); 149094e47e9SGeorge Wilson break; 150094e47e9SGeorge Wilson default: 151094e47e9SGeorge Wilson panic("invalid state %llu", (unsigned long long)new_state); 152094e47e9SGeorge Wilson } 153094e47e9SGeorge Wilson 154094e47e9SGeorge Wilson dmu_tx_commit(tx); 155094e47e9SGeorge Wilson } 156094e47e9SGeorge Wilson 157094e47e9SGeorge Wilson static void 158094e47e9SGeorge Wilson vdev_initialize_cb(zio_t *zio) 159094e47e9SGeorge Wilson { 160094e47e9SGeorge Wilson vdev_t *vd = zio->io_vd; 161094e47e9SGeorge Wilson mutex_enter(&vd->vdev_initialize_io_lock); 162094e47e9SGeorge Wilson if (zio->io_error == ENXIO && !vdev_writeable(vd)) { 163094e47e9SGeorge Wilson /* 164094e47e9SGeorge Wilson * The I/O failed because the vdev was unavailable; roll the 165094e47e9SGeorge Wilson * last offset back. (This works because spa_sync waits on 166094e47e9SGeorge Wilson * spa_txg_zio before it runs sync tasks.) 167094e47e9SGeorge Wilson */ 168094e47e9SGeorge Wilson uint64_t *off = 169094e47e9SGeorge Wilson &vd->vdev_initialize_offset[zio->io_txg & TXG_MASK]; 170094e47e9SGeorge Wilson *off = MIN(*off, zio->io_offset); 171094e47e9SGeorge Wilson } else { 172094e47e9SGeorge Wilson /* 173094e47e9SGeorge Wilson * Since initializing is best-effort, we ignore I/O errors and 174094e47e9SGeorge Wilson * rely on vdev_probe to determine if the errors are more 175094e47e9SGeorge Wilson * critical. 176094e47e9SGeorge Wilson */ 177094e47e9SGeorge Wilson if (zio->io_error != 0) 178094e47e9SGeorge Wilson vd->vdev_stat.vs_initialize_errors++; 179094e47e9SGeorge Wilson 180094e47e9SGeorge Wilson vd->vdev_initialize_bytes_done += zio->io_orig_size; 181094e47e9SGeorge Wilson } 182094e47e9SGeorge Wilson ASSERT3U(vd->vdev_initialize_inflight, >, 0); 183094e47e9SGeorge Wilson vd->vdev_initialize_inflight--; 184094e47e9SGeorge Wilson cv_broadcast(&vd->vdev_initialize_io_cv); 185094e47e9SGeorge Wilson mutex_exit(&vd->vdev_initialize_io_lock); 186094e47e9SGeorge Wilson 187094e47e9SGeorge Wilson spa_config_exit(vd->vdev_spa, SCL_STATE_ALL, vd); 188094e47e9SGeorge Wilson } 189094e47e9SGeorge Wilson 190094e47e9SGeorge Wilson /* Takes care of physical writing and limiting # of concurrent ZIOs. */ 191094e47e9SGeorge Wilson static int 192094e47e9SGeorge Wilson vdev_initialize_write(vdev_t *vd, uint64_t start, uint64_t size, abd_t *data) 193094e47e9SGeorge Wilson { 194094e47e9SGeorge Wilson spa_t *spa = vd->vdev_spa; 195094e47e9SGeorge Wilson 196094e47e9SGeorge Wilson /* Limit inflight initializing I/Os */ 197094e47e9SGeorge Wilson mutex_enter(&vd->vdev_initialize_io_lock); 198094e47e9SGeorge Wilson while (vd->vdev_initialize_inflight >= zfs_initialize_limit) { 199094e47e9SGeorge Wilson cv_wait(&vd->vdev_initialize_io_cv, 200094e47e9SGeorge Wilson &vd->vdev_initialize_io_lock); 201094e47e9SGeorge Wilson } 202094e47e9SGeorge Wilson vd->vdev_initialize_inflight++; 203094e47e9SGeorge Wilson mutex_exit(&vd->vdev_initialize_io_lock); 204094e47e9SGeorge Wilson 205094e47e9SGeorge Wilson dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); 206094e47e9SGeorge Wilson VERIFY0(dmu_tx_assign(tx, TXG_WAIT)); 207094e47e9SGeorge Wilson uint64_t txg = dmu_tx_get_txg(tx); 208094e47e9SGeorge Wilson 209094e47e9SGeorge Wilson spa_config_enter(spa, SCL_STATE_ALL, vd, RW_READER); 210094e47e9SGeorge Wilson mutex_enter(&vd->vdev_initialize_lock); 211094e47e9SGeorge Wilson 212094e47e9SGeorge Wilson if (vd->vdev_initialize_offset[txg & TXG_MASK] == 0) { 213094e47e9SGeorge Wilson uint64_t *guid = kmem_zalloc(sizeof (uint64_t), KM_SLEEP); 214094e47e9SGeorge Wilson *guid = vd->vdev_guid; 215094e47e9SGeorge Wilson 216094e47e9SGeorge Wilson /* This is the first write of this txg. */ 217094e47e9SGeorge Wilson dsl_sync_task_nowait(spa_get_dsl(spa), 218094e47e9SGeorge Wilson vdev_initialize_zap_update_sync, guid, 2, 219094e47e9SGeorge Wilson ZFS_SPACE_CHECK_RESERVED, tx); 220094e47e9SGeorge Wilson } 221094e47e9SGeorge Wilson 222094e47e9SGeorge Wilson /* 223094e47e9SGeorge Wilson * We know the vdev struct will still be around since all 224094e47e9SGeorge Wilson * consumers of vdev_free must stop the initialization first. 225094e47e9SGeorge Wilson */ 226094e47e9SGeorge Wilson if (vdev_initialize_should_stop(vd)) { 227094e47e9SGeorge Wilson mutex_enter(&vd->vdev_initialize_io_lock); 228094e47e9SGeorge Wilson ASSERT3U(vd->vdev_initialize_inflight, >, 0); 229094e47e9SGeorge Wilson vd->vdev_initialize_inflight--; 230094e47e9SGeorge Wilson mutex_exit(&vd->vdev_initialize_io_lock); 231094e47e9SGeorge Wilson spa_config_exit(vd->vdev_spa, SCL_STATE_ALL, vd); 232094e47e9SGeorge Wilson mutex_exit(&vd->vdev_initialize_lock); 233094e47e9SGeorge Wilson dmu_tx_commit(tx); 234094e47e9SGeorge Wilson return (SET_ERROR(EINTR)); 235094e47e9SGeorge Wilson } 236094e47e9SGeorge Wilson mutex_exit(&vd->vdev_initialize_lock); 237094e47e9SGeorge Wilson 238094e47e9SGeorge Wilson vd->vdev_initialize_offset[txg & TXG_MASK] = start + size; 239094e47e9SGeorge Wilson zio_nowait(zio_write_phys(spa->spa_txg_zio[txg & TXG_MASK], vd, start, 240094e47e9SGeorge Wilson size, data, ZIO_CHECKSUM_OFF, vdev_initialize_cb, NULL, 241094e47e9SGeorge Wilson ZIO_PRIORITY_INITIALIZING, ZIO_FLAG_CANFAIL, B_FALSE)); 242094e47e9SGeorge Wilson /* vdev_initialize_cb releases SCL_STATE_ALL */ 243094e47e9SGeorge Wilson 244094e47e9SGeorge Wilson dmu_tx_commit(tx); 245094e47e9SGeorge Wilson 246094e47e9SGeorge Wilson return (0); 247094e47e9SGeorge Wilson } 248094e47e9SGeorge Wilson 249094e47e9SGeorge Wilson /* 250094e47e9SGeorge Wilson * Translate a logical range to the physical range for the specified vdev_t. 251094e47e9SGeorge Wilson * This function is initially called with a leaf vdev and will walk each 252094e47e9SGeorge Wilson * parent vdev until it reaches a top-level vdev. Once the top-level is 253094e47e9SGeorge Wilson * reached the physical range is initialized and the recursive function 254094e47e9SGeorge Wilson * begins to unwind. As it unwinds it calls the parent's vdev specific 255094e47e9SGeorge Wilson * translation function to do the real conversion. 256094e47e9SGeorge Wilson */ 257094e47e9SGeorge Wilson void 258094e47e9SGeorge Wilson vdev_xlate(vdev_t *vd, const range_seg_t *logical_rs, range_seg_t *physical_rs) 259094e47e9SGeorge Wilson { 260094e47e9SGeorge Wilson /* 261094e47e9SGeorge Wilson * Walk up the vdev tree 262094e47e9SGeorge Wilson */ 263094e47e9SGeorge Wilson if (vd != vd->vdev_top) { 264094e47e9SGeorge Wilson vdev_xlate(vd->vdev_parent, logical_rs, physical_rs); 265094e47e9SGeorge Wilson } else { 266094e47e9SGeorge Wilson /* 267094e47e9SGeorge Wilson * We've reached the top-level vdev, initialize the 268094e47e9SGeorge Wilson * physical range to the logical range and start to 269094e47e9SGeorge Wilson * unwind. 270094e47e9SGeorge Wilson */ 271094e47e9SGeorge Wilson physical_rs->rs_start = logical_rs->rs_start; 272094e47e9SGeorge Wilson physical_rs->rs_end = logical_rs->rs_end; 273094e47e9SGeorge Wilson return; 274094e47e9SGeorge Wilson } 275094e47e9SGeorge Wilson 276094e47e9SGeorge Wilson vdev_t *pvd = vd->vdev_parent; 277094e47e9SGeorge Wilson ASSERT3P(pvd, !=, NULL); 278094e47e9SGeorge Wilson ASSERT3P(pvd->vdev_ops->vdev_op_xlate, !=, NULL); 279094e47e9SGeorge Wilson 280094e47e9SGeorge Wilson /* 281094e47e9SGeorge Wilson * As this recursive function unwinds, translate the logical 282094e47e9SGeorge Wilson * range into its physical components by calling the 283094e47e9SGeorge Wilson * vdev specific translate function. 284094e47e9SGeorge Wilson */ 285094e47e9SGeorge Wilson range_seg_t intermediate = { 0 }; 286094e47e9SGeorge Wilson pvd->vdev_ops->vdev_op_xlate(vd, physical_rs, &intermediate); 287094e47e9SGeorge Wilson 288094e47e9SGeorge Wilson physical_rs->rs_start = intermediate.rs_start; 289094e47e9SGeorge Wilson physical_rs->rs_end = intermediate.rs_end; 290094e47e9SGeorge Wilson } 291094e47e9SGeorge Wilson 292094e47e9SGeorge Wilson /* 293094e47e9SGeorge Wilson * Callback to fill each ABD chunk with zfs_initialize_value. len must be 294094e47e9SGeorge Wilson * divisible by sizeof (uint64_t), and buf must be 8-byte aligned. The ABD 295094e47e9SGeorge Wilson * allocation will guarantee these for us. 296094e47e9SGeorge Wilson */ 297094e47e9SGeorge Wilson /* ARGSUSED */ 298094e47e9SGeorge Wilson static int 299094e47e9SGeorge Wilson vdev_initialize_block_fill(void *buf, size_t len, void *unused) 300094e47e9SGeorge Wilson { 301094e47e9SGeorge Wilson ASSERT0(len % sizeof (uint64_t)); 302094e47e9SGeorge Wilson for (uint64_t i = 0; i < len; i += sizeof (uint64_t)) { 303094e47e9SGeorge Wilson *(uint64_t *)((char *)(buf) + i) = zfs_initialize_value; 304094e47e9SGeorge Wilson } 305094e47e9SGeorge Wilson return (0); 306094e47e9SGeorge Wilson } 307094e47e9SGeorge Wilson 308094e47e9SGeorge Wilson static abd_t * 309094e47e9SGeorge Wilson vdev_initialize_block_alloc() 310094e47e9SGeorge Wilson { 311094e47e9SGeorge Wilson /* Allocate ABD for filler data */ 312094e47e9SGeorge Wilson abd_t *data = abd_alloc_for_io(zfs_initialize_chunk_size, B_FALSE); 313094e47e9SGeorge Wilson 314094e47e9SGeorge Wilson ASSERT0(zfs_initialize_chunk_size % sizeof (uint64_t)); 315094e47e9SGeorge Wilson (void) abd_iterate_func(data, 0, zfs_initialize_chunk_size, 316094e47e9SGeorge Wilson vdev_initialize_block_fill, NULL); 317094e47e9SGeorge Wilson 318094e47e9SGeorge Wilson return (data); 319094e47e9SGeorge Wilson } 320094e47e9SGeorge Wilson 321094e47e9SGeorge Wilson static void 322094e47e9SGeorge Wilson vdev_initialize_block_free(abd_t *data) 323094e47e9SGeorge Wilson { 324094e47e9SGeorge Wilson abd_free(data); 325094e47e9SGeorge Wilson } 326094e47e9SGeorge Wilson 327094e47e9SGeorge Wilson static int 328094e47e9SGeorge Wilson vdev_initialize_ranges(vdev_t *vd, abd_t *data) 329094e47e9SGeorge Wilson { 330094e47e9SGeorge Wilson avl_tree_t *rt = &vd->vdev_initialize_tree->rt_root; 331094e47e9SGeorge Wilson 332094e47e9SGeorge Wilson for (range_seg_t *rs = avl_first(rt); rs != NULL; 333094e47e9SGeorge Wilson rs = AVL_NEXT(rt, rs)) { 334094e47e9SGeorge Wilson uint64_t size = rs->rs_end - rs->rs_start; 335094e47e9SGeorge Wilson 336094e47e9SGeorge Wilson /* Split range into legally-sized physical chunks */ 337094e47e9SGeorge Wilson uint64_t writes_required = 338094e47e9SGeorge Wilson ((size - 1) / zfs_initialize_chunk_size) + 1; 339094e47e9SGeorge Wilson 340094e47e9SGeorge Wilson for (uint64_t w = 0; w < writes_required; w++) { 341094e47e9SGeorge Wilson int error; 342094e47e9SGeorge Wilson 343094e47e9SGeorge Wilson error = vdev_initialize_write(vd, 344094e47e9SGeorge Wilson VDEV_LABEL_START_SIZE + rs->rs_start + 345094e47e9SGeorge Wilson (w * zfs_initialize_chunk_size), 346094e47e9SGeorge Wilson MIN(size - (w * zfs_initialize_chunk_size), 347094e47e9SGeorge Wilson zfs_initialize_chunk_size), data); 348094e47e9SGeorge Wilson if (error != 0) 349094e47e9SGeorge Wilson return (error); 350094e47e9SGeorge Wilson } 351094e47e9SGeorge Wilson } 352094e47e9SGeorge Wilson return (0); 353094e47e9SGeorge Wilson } 354094e47e9SGeorge Wilson 355094e47e9SGeorge Wilson static void 356094e47e9SGeorge Wilson vdev_initialize_mg_wait(metaslab_group_t *mg) 357094e47e9SGeorge Wilson { 358094e47e9SGeorge Wilson ASSERT(MUTEX_HELD(&mg->mg_ms_initialize_lock)); 359094e47e9SGeorge Wilson while (mg->mg_initialize_updating) { 360094e47e9SGeorge Wilson cv_wait(&mg->mg_ms_initialize_cv, &mg->mg_ms_initialize_lock); 361094e47e9SGeorge Wilson } 362094e47e9SGeorge Wilson } 363094e47e9SGeorge Wilson 364094e47e9SGeorge Wilson static void 365094e47e9SGeorge Wilson vdev_initialize_mg_mark(metaslab_group_t *mg) 366094e47e9SGeorge Wilson { 367094e47e9SGeorge Wilson ASSERT(MUTEX_HELD(&mg->mg_ms_initialize_lock)); 368094e47e9SGeorge Wilson ASSERT(mg->mg_initialize_updating); 369094e47e9SGeorge Wilson 370094e47e9SGeorge Wilson while (mg->mg_ms_initializing >= max_initialize_ms) { 371094e47e9SGeorge Wilson cv_wait(&mg->mg_ms_initialize_cv, &mg->mg_ms_initialize_lock); 372094e47e9SGeorge Wilson } 373094e47e9SGeorge Wilson mg->mg_ms_initializing++; 374094e47e9SGeorge Wilson ASSERT3U(mg->mg_ms_initializing, <=, max_initialize_ms); 375094e47e9SGeorge Wilson } 376094e47e9SGeorge Wilson 377094e47e9SGeorge Wilson /* 378094e47e9SGeorge Wilson * Mark the metaslab as being initialized to prevent any allocations 379094e47e9SGeorge Wilson * on this metaslab. We must also track how many metaslabs are currently 380094e47e9SGeorge Wilson * being initialized within a metaslab group and limit them to prevent 381094e47e9SGeorge Wilson * allocation failures from occurring because all metaslabs are being 382094e47e9SGeorge Wilson * initialized. 383094e47e9SGeorge Wilson */ 384094e47e9SGeorge Wilson static void 385094e47e9SGeorge Wilson vdev_initialize_ms_mark(metaslab_t *msp) 386094e47e9SGeorge Wilson { 387094e47e9SGeorge Wilson ASSERT(!MUTEX_HELD(&msp->ms_lock)); 388094e47e9SGeorge Wilson metaslab_group_t *mg = msp->ms_group; 389094e47e9SGeorge Wilson 390094e47e9SGeorge Wilson mutex_enter(&mg->mg_ms_initialize_lock); 391094e47e9SGeorge Wilson 392094e47e9SGeorge Wilson /* 393094e47e9SGeorge Wilson * To keep an accurate count of how many threads are initializing 394094e47e9SGeorge Wilson * a specific metaslab group, we only allow one thread to mark 395094e47e9SGeorge Wilson * the metaslab group at a time. This ensures that the value of 396094e47e9SGeorge Wilson * ms_initializing will be accurate when we decide to mark a metaslab 397094e47e9SGeorge Wilson * group as being initialized. To do this we force all other threads 398094e47e9SGeorge Wilson * to wait till the metaslab's mg_initialize_updating flag is no 399094e47e9SGeorge Wilson * longer set. 400094e47e9SGeorge Wilson */ 401094e47e9SGeorge Wilson vdev_initialize_mg_wait(mg); 402094e47e9SGeorge Wilson mg->mg_initialize_updating = B_TRUE; 403094e47e9SGeorge Wilson if (msp->ms_initializing == 0) { 404094e47e9SGeorge Wilson vdev_initialize_mg_mark(mg); 405094e47e9SGeorge Wilson } 406094e47e9SGeorge Wilson mutex_enter(&msp->ms_lock); 407094e47e9SGeorge Wilson msp->ms_initializing++; 408094e47e9SGeorge Wilson mutex_exit(&msp->ms_lock); 409094e47e9SGeorge Wilson 410094e47e9SGeorge Wilson mg->mg_initialize_updating = B_FALSE; 411094e47e9SGeorge Wilson cv_broadcast(&mg->mg_ms_initialize_cv); 412094e47e9SGeorge Wilson mutex_exit(&mg->mg_ms_initialize_lock); 413094e47e9SGeorge Wilson } 414094e47e9SGeorge Wilson 415094e47e9SGeorge Wilson static void 416094e47e9SGeorge Wilson vdev_initialize_ms_unmark(metaslab_t *msp) 417094e47e9SGeorge Wilson { 418094e47e9SGeorge Wilson ASSERT(!MUTEX_HELD(&msp->ms_lock)); 419094e47e9SGeorge Wilson metaslab_group_t *mg = msp->ms_group; 420094e47e9SGeorge Wilson mutex_enter(&mg->mg_ms_initialize_lock); 421094e47e9SGeorge Wilson mutex_enter(&msp->ms_lock); 422094e47e9SGeorge Wilson if (--msp->ms_initializing == 0) { 423094e47e9SGeorge Wilson mg->mg_ms_initializing--; 424094e47e9SGeorge Wilson cv_broadcast(&mg->mg_ms_initialize_cv); 425094e47e9SGeorge Wilson } 426094e47e9SGeorge Wilson mutex_exit(&msp->ms_lock); 427094e47e9SGeorge Wilson mutex_exit(&mg->mg_ms_initialize_lock); 428094e47e9SGeorge Wilson } 429094e47e9SGeorge Wilson 430094e47e9SGeorge Wilson static void 431094e47e9SGeorge Wilson vdev_initialize_calculate_progress(vdev_t *vd) 432094e47e9SGeorge Wilson { 433094e47e9SGeorge Wilson ASSERT(spa_config_held(vd->vdev_spa, SCL_CONFIG, RW_READER) || 434094e47e9SGeorge Wilson spa_config_held(vd->vdev_spa, SCL_CONFIG, RW_WRITER)); 435094e47e9SGeorge Wilson ASSERT(vd->vdev_leaf_zap != 0); 436094e47e9SGeorge Wilson 437094e47e9SGeorge Wilson vd->vdev_initialize_bytes_est = 0; 438094e47e9SGeorge Wilson vd->vdev_initialize_bytes_done = 0; 439094e47e9SGeorge Wilson 440094e47e9SGeorge Wilson for (uint64_t i = 0; i < vd->vdev_top->vdev_ms_count; i++) { 441094e47e9SGeorge Wilson metaslab_t *msp = vd->vdev_top->vdev_ms[i]; 442094e47e9SGeorge Wilson mutex_enter(&msp->ms_lock); 443094e47e9SGeorge Wilson 444094e47e9SGeorge Wilson uint64_t ms_free = msp->ms_size - 445*555d674dSSerapheim Dimitropoulos metaslab_allocated_space(msp); 446094e47e9SGeorge Wilson 447094e47e9SGeorge Wilson if (vd->vdev_top->vdev_ops == &vdev_raidz_ops) 448094e47e9SGeorge Wilson ms_free /= vd->vdev_top->vdev_children; 449094e47e9SGeorge Wilson 450094e47e9SGeorge Wilson /* 451094e47e9SGeorge Wilson * Convert the metaslab range to a physical range 452094e47e9SGeorge Wilson * on our vdev. We use this to determine if we are 453094e47e9SGeorge Wilson * in the middle of this metaslab range. 454094e47e9SGeorge Wilson */ 455094e47e9SGeorge Wilson range_seg_t logical_rs, physical_rs; 456094e47e9SGeorge Wilson logical_rs.rs_start = msp->ms_start; 457094e47e9SGeorge Wilson logical_rs.rs_end = msp->ms_start + msp->ms_size; 458094e47e9SGeorge Wilson vdev_xlate(vd, &logical_rs, &physical_rs); 459094e47e9SGeorge Wilson 460094e47e9SGeorge Wilson if (vd->vdev_initialize_last_offset <= physical_rs.rs_start) { 461094e47e9SGeorge Wilson vd->vdev_initialize_bytes_est += ms_free; 462094e47e9SGeorge Wilson mutex_exit(&msp->ms_lock); 463094e47e9SGeorge Wilson continue; 464094e47e9SGeorge Wilson } else if (vd->vdev_initialize_last_offset > 465094e47e9SGeorge Wilson physical_rs.rs_end) { 466094e47e9SGeorge Wilson vd->vdev_initialize_bytes_done += ms_free; 467094e47e9SGeorge Wilson vd->vdev_initialize_bytes_est += ms_free; 468094e47e9SGeorge Wilson mutex_exit(&msp->ms_lock); 469094e47e9SGeorge Wilson continue; 470094e47e9SGeorge Wilson } 471094e47e9SGeorge Wilson 472094e47e9SGeorge Wilson /* 473094e47e9SGeorge Wilson * If we get here, we're in the middle of initializing this 474094e47e9SGeorge Wilson * metaslab. Load it and walk the free tree for more accurate 475094e47e9SGeorge Wilson * progress estimation. 476094e47e9SGeorge Wilson */ 477a0b03b16SSerapheim Dimitropoulos VERIFY0(metaslab_load(msp)); 478094e47e9SGeorge Wilson 479a0b03b16SSerapheim Dimitropoulos for (range_seg_t *rs = avl_first(&msp->ms_allocatable->rt_root); 480a0b03b16SSerapheim Dimitropoulos rs; rs = AVL_NEXT(&msp->ms_allocatable->rt_root, rs)) { 481094e47e9SGeorge Wilson logical_rs.rs_start = rs->rs_start; 482094e47e9SGeorge Wilson logical_rs.rs_end = rs->rs_end; 483094e47e9SGeorge Wilson vdev_xlate(vd, &logical_rs, &physical_rs); 484094e47e9SGeorge Wilson 485094e47e9SGeorge Wilson uint64_t size = physical_rs.rs_end - 486094e47e9SGeorge Wilson physical_rs.rs_start; 487094e47e9SGeorge Wilson vd->vdev_initialize_bytes_est += size; 488094e47e9SGeorge Wilson if (vd->vdev_initialize_last_offset > 489094e47e9SGeorge Wilson physical_rs.rs_end) { 490094e47e9SGeorge Wilson vd->vdev_initialize_bytes_done += size; 491094e47e9SGeorge Wilson } else if (vd->vdev_initialize_last_offset > 492094e47e9SGeorge Wilson physical_rs.rs_start && 493094e47e9SGeorge Wilson vd->vdev_initialize_last_offset < 494094e47e9SGeorge Wilson physical_rs.rs_end) { 495094e47e9SGeorge Wilson vd->vdev_initialize_bytes_done += 496094e47e9SGeorge Wilson vd->vdev_initialize_last_offset - 497094e47e9SGeorge Wilson physical_rs.rs_start; 498094e47e9SGeorge Wilson } 499094e47e9SGeorge Wilson } 500094e47e9SGeorge Wilson mutex_exit(&msp->ms_lock); 501094e47e9SGeorge Wilson } 502094e47e9SGeorge Wilson } 503094e47e9SGeorge Wilson 504094e47e9SGeorge Wilson static void 505094e47e9SGeorge Wilson vdev_initialize_load(vdev_t *vd) 506094e47e9SGeorge Wilson { 507094e47e9SGeorge Wilson ASSERT(spa_config_held(vd->vdev_spa, SCL_CONFIG, RW_READER) || 508094e47e9SGeorge Wilson spa_config_held(vd->vdev_spa, SCL_CONFIG, RW_WRITER)); 509094e47e9SGeorge Wilson ASSERT(vd->vdev_leaf_zap != 0); 510094e47e9SGeorge Wilson 511094e47e9SGeorge Wilson if (vd->vdev_initialize_state == VDEV_INITIALIZE_ACTIVE || 512094e47e9SGeorge Wilson vd->vdev_initialize_state == VDEV_INITIALIZE_SUSPENDED) { 513094e47e9SGeorge Wilson int err = zap_lookup(vd->vdev_spa->spa_meta_objset, 514094e47e9SGeorge Wilson vd->vdev_leaf_zap, VDEV_LEAF_ZAP_INITIALIZE_LAST_OFFSET, 515094e47e9SGeorge Wilson sizeof (vd->vdev_initialize_last_offset), 1, 516094e47e9SGeorge Wilson &vd->vdev_initialize_last_offset); 517094e47e9SGeorge Wilson ASSERT(err == 0 || err == ENOENT); 518094e47e9SGeorge Wilson } 519094e47e9SGeorge Wilson 520094e47e9SGeorge Wilson vdev_initialize_calculate_progress(vd); 521094e47e9SGeorge Wilson } 522094e47e9SGeorge Wilson 523094e47e9SGeorge Wilson 524094e47e9SGeorge Wilson /* 525094e47e9SGeorge Wilson * Convert the logical range into a physcial range and add it to our 526094e47e9SGeorge Wilson * avl tree. 527094e47e9SGeorge Wilson */ 528094e47e9SGeorge Wilson void 529094e47e9SGeorge Wilson vdev_initialize_range_add(void *arg, uint64_t start, uint64_t size) 530094e47e9SGeorge Wilson { 531094e47e9SGeorge Wilson vdev_t *vd = arg; 532094e47e9SGeorge Wilson range_seg_t logical_rs, physical_rs; 533094e47e9SGeorge Wilson logical_rs.rs_start = start; 534094e47e9SGeorge Wilson logical_rs.rs_end = start + size; 535094e47e9SGeorge Wilson 536094e47e9SGeorge Wilson ASSERT(vd->vdev_ops->vdev_op_leaf); 537094e47e9SGeorge Wilson vdev_xlate(vd, &logical_rs, &physical_rs); 538094e47e9SGeorge Wilson 539094e47e9SGeorge Wilson IMPLY(vd->vdev_top == vd, 540094e47e9SGeorge Wilson logical_rs.rs_start == physical_rs.rs_start); 541094e47e9SGeorge Wilson IMPLY(vd->vdev_top == vd, 542094e47e9SGeorge Wilson logical_rs.rs_end == physical_rs.rs_end); 543094e47e9SGeorge Wilson 544094e47e9SGeorge Wilson /* Only add segments that we have not visited yet */ 545094e47e9SGeorge Wilson if (physical_rs.rs_end <= vd->vdev_initialize_last_offset) 546094e47e9SGeorge Wilson return; 547094e47e9SGeorge Wilson 548094e47e9SGeorge Wilson /* Pick up where we left off mid-range. */ 549094e47e9SGeorge Wilson if (vd->vdev_initialize_last_offset > physical_rs.rs_start) { 550094e47e9SGeorge Wilson zfs_dbgmsg("range write: vd %s changed (%llu, %llu) to " 551094e47e9SGeorge Wilson "(%llu, %llu)", vd->vdev_path, 552094e47e9SGeorge Wilson (u_longlong_t)physical_rs.rs_start, 553094e47e9SGeorge Wilson (u_longlong_t)physical_rs.rs_end, 554094e47e9SGeorge Wilson (u_longlong_t)vd->vdev_initialize_last_offset, 555094e47e9SGeorge Wilson (u_longlong_t)physical_rs.rs_end); 556094e47e9SGeorge Wilson ASSERT3U(physical_rs.rs_end, >, 557094e47e9SGeorge Wilson vd->vdev_initialize_last_offset); 558094e47e9SGeorge Wilson physical_rs.rs_start = vd->vdev_initialize_last_offset; 559094e47e9SGeorge Wilson } 560094e47e9SGeorge Wilson ASSERT3U(physical_rs.rs_end, >=, physical_rs.rs_start); 561094e47e9SGeorge Wilson 562094e47e9SGeorge Wilson /* 563094e47e9SGeorge Wilson * With raidz, it's possible that the logical range does not live on 564094e47e9SGeorge Wilson * this leaf vdev. We only add the physical range to this vdev's if it 565094e47e9SGeorge Wilson * has a length greater than 0. 566094e47e9SGeorge Wilson */ 567094e47e9SGeorge Wilson if (physical_rs.rs_end > physical_rs.rs_start) { 568094e47e9SGeorge Wilson range_tree_add(vd->vdev_initialize_tree, physical_rs.rs_start, 569094e47e9SGeorge Wilson physical_rs.rs_end - physical_rs.rs_start); 570094e47e9SGeorge Wilson } else { 571094e47e9SGeorge Wilson ASSERT3U(physical_rs.rs_end, ==, physical_rs.rs_start); 572094e47e9SGeorge Wilson } 573094e47e9SGeorge Wilson } 574094e47e9SGeorge Wilson 575094e47e9SGeorge Wilson static void 576094e47e9SGeorge Wilson vdev_initialize_thread(void *arg) 577094e47e9SGeorge Wilson { 578094e47e9SGeorge Wilson vdev_t *vd = arg; 579094e47e9SGeorge Wilson spa_t *spa = vd->vdev_spa; 580094e47e9SGeorge Wilson int error = 0; 581094e47e9SGeorge Wilson uint64_t ms_count = 0; 582094e47e9SGeorge Wilson 583094e47e9SGeorge Wilson ASSERT(vdev_is_concrete(vd)); 584094e47e9SGeorge Wilson spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 585094e47e9SGeorge Wilson 586094e47e9SGeorge Wilson vd->vdev_initialize_last_offset = 0; 587094e47e9SGeorge Wilson vdev_initialize_load(vd); 588094e47e9SGeorge Wilson 589094e47e9SGeorge Wilson abd_t *deadbeef = vdev_initialize_block_alloc(); 590094e47e9SGeorge Wilson 591094e47e9SGeorge Wilson vd->vdev_initialize_tree = range_tree_create(NULL, NULL); 592094e47e9SGeorge Wilson 593094e47e9SGeorge Wilson for (uint64_t i = 0; !vd->vdev_detached && 594094e47e9SGeorge Wilson i < vd->vdev_top->vdev_ms_count; i++) { 595094e47e9SGeorge Wilson metaslab_t *msp = vd->vdev_top->vdev_ms[i]; 596094e47e9SGeorge Wilson 597094e47e9SGeorge Wilson /* 598094e47e9SGeorge Wilson * If we've expanded the top-level vdev or it's our 599094e47e9SGeorge Wilson * first pass, calculate our progress. 600094e47e9SGeorge Wilson */ 601094e47e9SGeorge Wilson if (vd->vdev_top->vdev_ms_count != ms_count) { 602094e47e9SGeorge Wilson vdev_initialize_calculate_progress(vd); 603094e47e9SGeorge Wilson ms_count = vd->vdev_top->vdev_ms_count; 604094e47e9SGeorge Wilson } 605094e47e9SGeorge Wilson 606094e47e9SGeorge Wilson vdev_initialize_ms_mark(msp); 607094e47e9SGeorge Wilson mutex_enter(&msp->ms_lock); 608a0b03b16SSerapheim Dimitropoulos VERIFY0(metaslab_load(msp)); 609094e47e9SGeorge Wilson 610094e47e9SGeorge Wilson range_tree_walk(msp->ms_allocatable, vdev_initialize_range_add, 611094e47e9SGeorge Wilson vd); 612094e47e9SGeorge Wilson mutex_exit(&msp->ms_lock); 613094e47e9SGeorge Wilson 614094e47e9SGeorge Wilson spa_config_exit(spa, SCL_CONFIG, FTAG); 615094e47e9SGeorge Wilson error = vdev_initialize_ranges(vd, deadbeef); 616094e47e9SGeorge Wilson vdev_initialize_ms_unmark(msp); 617094e47e9SGeorge Wilson spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 618094e47e9SGeorge Wilson 619094e47e9SGeorge Wilson range_tree_vacate(vd->vdev_initialize_tree, NULL, NULL); 620094e47e9SGeorge Wilson if (error != 0) 621094e47e9SGeorge Wilson break; 622094e47e9SGeorge Wilson } 623094e47e9SGeorge Wilson 624094e47e9SGeorge Wilson spa_config_exit(spa, SCL_CONFIG, FTAG); 625094e47e9SGeorge Wilson mutex_enter(&vd->vdev_initialize_io_lock); 626094e47e9SGeorge Wilson while (vd->vdev_initialize_inflight > 0) { 627094e47e9SGeorge Wilson cv_wait(&vd->vdev_initialize_io_cv, 628094e47e9SGeorge Wilson &vd->vdev_initialize_io_lock); 629094e47e9SGeorge Wilson } 630094e47e9SGeorge Wilson mutex_exit(&vd->vdev_initialize_io_lock); 631094e47e9SGeorge Wilson 632094e47e9SGeorge Wilson range_tree_destroy(vd->vdev_initialize_tree); 633094e47e9SGeorge Wilson vdev_initialize_block_free(deadbeef); 634094e47e9SGeorge Wilson vd->vdev_initialize_tree = NULL; 635094e47e9SGeorge Wilson 636094e47e9SGeorge Wilson mutex_enter(&vd->vdev_initialize_lock); 637094e47e9SGeorge Wilson if (!vd->vdev_initialize_exit_wanted && vdev_writeable(vd)) { 638094e47e9SGeorge Wilson vdev_initialize_change_state(vd, VDEV_INITIALIZE_COMPLETE); 639094e47e9SGeorge Wilson } 640094e47e9SGeorge Wilson ASSERT(vd->vdev_initialize_thread != NULL || 641094e47e9SGeorge Wilson vd->vdev_initialize_inflight == 0); 642094e47e9SGeorge Wilson 643094e47e9SGeorge Wilson /* 644094e47e9SGeorge Wilson * Drop the vdev_initialize_lock while we sync out the 645094e47e9SGeorge Wilson * txg since it's possible that a device might be trying to 646094e47e9SGeorge Wilson * come online and must check to see if it needs to restart an 647094e47e9SGeorge Wilson * initialization. That thread will be holding the spa_config_lock 648094e47e9SGeorge Wilson * which would prevent the txg_wait_synced from completing. 649094e47e9SGeorge Wilson */ 650094e47e9SGeorge Wilson mutex_exit(&vd->vdev_initialize_lock); 651094e47e9SGeorge Wilson txg_wait_synced(spa_get_dsl(spa), 0); 652094e47e9SGeorge Wilson mutex_enter(&vd->vdev_initialize_lock); 653094e47e9SGeorge Wilson 654094e47e9SGeorge Wilson vd->vdev_initialize_thread = NULL; 655094e47e9SGeorge Wilson cv_broadcast(&vd->vdev_initialize_cv); 656094e47e9SGeorge Wilson mutex_exit(&vd->vdev_initialize_lock); 657094e47e9SGeorge Wilson } 658094e47e9SGeorge Wilson 659094e47e9SGeorge Wilson /* 660094e47e9SGeorge Wilson * Initiates a device. Caller must hold vdev_initialize_lock. 661094e47e9SGeorge Wilson * Device must be a leaf and not already be initializing. 662094e47e9SGeorge Wilson */ 663094e47e9SGeorge Wilson void 664094e47e9SGeorge Wilson vdev_initialize(vdev_t *vd) 665094e47e9SGeorge Wilson { 666094e47e9SGeorge Wilson ASSERT(MUTEX_HELD(&vd->vdev_initialize_lock)); 667094e47e9SGeorge Wilson ASSERT(vd->vdev_ops->vdev_op_leaf); 668094e47e9SGeorge Wilson ASSERT(vdev_is_concrete(vd)); 669094e47e9SGeorge Wilson ASSERT3P(vd->vdev_initialize_thread, ==, NULL); 670094e47e9SGeorge Wilson ASSERT(!vd->vdev_detached); 671094e47e9SGeorge Wilson ASSERT(!vd->vdev_initialize_exit_wanted); 672094e47e9SGeorge Wilson ASSERT(!vd->vdev_top->vdev_removing); 673094e47e9SGeorge Wilson 674094e47e9SGeorge Wilson vdev_initialize_change_state(vd, VDEV_INITIALIZE_ACTIVE); 675094e47e9SGeorge Wilson vd->vdev_initialize_thread = thread_create(NULL, 0, 676094e47e9SGeorge Wilson vdev_initialize_thread, vd, 0, &p0, TS_RUN, maxclsyspri); 677094e47e9SGeorge Wilson } 678094e47e9SGeorge Wilson 679094e47e9SGeorge Wilson /* 680094e47e9SGeorge Wilson * Stop initializng a device, with the resultant initialing state being 681094e47e9SGeorge Wilson * tgt_state. Blocks until the initializing thread has exited. 682094e47e9SGeorge Wilson * Caller must hold vdev_initialize_lock and must not be writing to the spa 683094e47e9SGeorge Wilson * config, as the initializing thread may try to enter the config as a reader 684094e47e9SGeorge Wilson * before exiting. 685094e47e9SGeorge Wilson */ 686094e47e9SGeorge Wilson void 687094e47e9SGeorge Wilson vdev_initialize_stop(vdev_t *vd, vdev_initializing_state_t tgt_state) 688094e47e9SGeorge Wilson { 689094e47e9SGeorge Wilson spa_t *spa = vd->vdev_spa; 690094e47e9SGeorge Wilson ASSERT(!spa_config_held(spa, SCL_CONFIG | SCL_STATE, RW_WRITER)); 691094e47e9SGeorge Wilson 692094e47e9SGeorge Wilson ASSERT(MUTEX_HELD(&vd->vdev_initialize_lock)); 693094e47e9SGeorge Wilson ASSERT(vd->vdev_ops->vdev_op_leaf); 694094e47e9SGeorge Wilson ASSERT(vdev_is_concrete(vd)); 695094e47e9SGeorge Wilson 696094e47e9SGeorge Wilson /* 697094e47e9SGeorge Wilson * Allow cancel requests to proceed even if the initialize thread 698094e47e9SGeorge Wilson * has stopped. 699094e47e9SGeorge Wilson */ 700094e47e9SGeorge Wilson if (vd->vdev_initialize_thread == NULL && 701094e47e9SGeorge Wilson tgt_state != VDEV_INITIALIZE_CANCELED) { 702094e47e9SGeorge Wilson return; 703094e47e9SGeorge Wilson } 704094e47e9SGeorge Wilson 705094e47e9SGeorge Wilson vdev_initialize_change_state(vd, tgt_state); 706094e47e9SGeorge Wilson vd->vdev_initialize_exit_wanted = B_TRUE; 707094e47e9SGeorge Wilson while (vd->vdev_initialize_thread != NULL) 708094e47e9SGeorge Wilson cv_wait(&vd->vdev_initialize_cv, &vd->vdev_initialize_lock); 709094e47e9SGeorge Wilson 710094e47e9SGeorge Wilson ASSERT3P(vd->vdev_initialize_thread, ==, NULL); 711094e47e9SGeorge Wilson vd->vdev_initialize_exit_wanted = B_FALSE; 712094e47e9SGeorge Wilson } 713094e47e9SGeorge Wilson 714094e47e9SGeorge Wilson static void 715094e47e9SGeorge Wilson vdev_initialize_stop_all_impl(vdev_t *vd, vdev_initializing_state_t tgt_state) 716094e47e9SGeorge Wilson { 717094e47e9SGeorge Wilson if (vd->vdev_ops->vdev_op_leaf && vdev_is_concrete(vd)) { 718094e47e9SGeorge Wilson mutex_enter(&vd->vdev_initialize_lock); 719094e47e9SGeorge Wilson vdev_initialize_stop(vd, tgt_state); 720094e47e9SGeorge Wilson mutex_exit(&vd->vdev_initialize_lock); 721094e47e9SGeorge Wilson return; 722094e47e9SGeorge Wilson } 723094e47e9SGeorge Wilson 724094e47e9SGeorge Wilson for (uint64_t i = 0; i < vd->vdev_children; i++) { 725094e47e9SGeorge Wilson vdev_initialize_stop_all_impl(vd->vdev_child[i], tgt_state); 726094e47e9SGeorge Wilson } 727094e47e9SGeorge Wilson } 728094e47e9SGeorge Wilson 729094e47e9SGeorge Wilson /* 730094e47e9SGeorge Wilson * Convenience function to stop initializing of a vdev tree and set all 731094e47e9SGeorge Wilson * initialize thread pointers to NULL. 732094e47e9SGeorge Wilson */ 733094e47e9SGeorge Wilson void 734094e47e9SGeorge Wilson vdev_initialize_stop_all(vdev_t *vd, vdev_initializing_state_t tgt_state) 735094e47e9SGeorge Wilson { 736094e47e9SGeorge Wilson vdev_initialize_stop_all_impl(vd, tgt_state); 737094e47e9SGeorge Wilson 738094e47e9SGeorge Wilson if (vd->vdev_spa->spa_sync_on) { 739094e47e9SGeorge Wilson /* Make sure that our state has been synced to disk */ 740094e47e9SGeorge Wilson txg_wait_synced(spa_get_dsl(vd->vdev_spa), 0); 741094e47e9SGeorge Wilson } 742094e47e9SGeorge Wilson } 743094e47e9SGeorge Wilson 744094e47e9SGeorge Wilson void 745094e47e9SGeorge Wilson vdev_initialize_restart(vdev_t *vd) 746094e47e9SGeorge Wilson { 747094e47e9SGeorge Wilson ASSERT(MUTEX_HELD(&spa_namespace_lock)); 748094e47e9SGeorge Wilson ASSERT(!spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER)); 749094e47e9SGeorge Wilson 750094e47e9SGeorge Wilson if (vd->vdev_leaf_zap != 0) { 751094e47e9SGeorge Wilson mutex_enter(&vd->vdev_initialize_lock); 752094e47e9SGeorge Wilson uint64_t initialize_state = VDEV_INITIALIZE_NONE; 753094e47e9SGeorge Wilson int err = zap_lookup(vd->vdev_spa->spa_meta_objset, 754094e47e9SGeorge Wilson vd->vdev_leaf_zap, VDEV_LEAF_ZAP_INITIALIZE_STATE, 755094e47e9SGeorge Wilson sizeof (initialize_state), 1, &initialize_state); 756094e47e9SGeorge Wilson ASSERT(err == 0 || err == ENOENT); 757094e47e9SGeorge Wilson vd->vdev_initialize_state = initialize_state; 758094e47e9SGeorge Wilson 759094e47e9SGeorge Wilson uint64_t timestamp = 0; 760094e47e9SGeorge Wilson err = zap_lookup(vd->vdev_spa->spa_meta_objset, 761094e47e9SGeorge Wilson vd->vdev_leaf_zap, VDEV_LEAF_ZAP_INITIALIZE_ACTION_TIME, 762094e47e9SGeorge Wilson sizeof (timestamp), 1, ×tamp); 763094e47e9SGeorge Wilson ASSERT(err == 0 || err == ENOENT); 764094e47e9SGeorge Wilson vd->vdev_initialize_action_time = (time_t)timestamp; 765094e47e9SGeorge Wilson 766094e47e9SGeorge Wilson if (vd->vdev_initialize_state == VDEV_INITIALIZE_SUSPENDED || 767094e47e9SGeorge Wilson vd->vdev_offline) { 768094e47e9SGeorge Wilson /* load progress for reporting, but don't resume */ 769094e47e9SGeorge Wilson vdev_initialize_load(vd); 770094e47e9SGeorge Wilson } else if (vd->vdev_initialize_state == 771094e47e9SGeorge Wilson VDEV_INITIALIZE_ACTIVE && vdev_writeable(vd)) { 772094e47e9SGeorge Wilson vdev_initialize(vd); 773094e47e9SGeorge Wilson } 774094e47e9SGeorge Wilson 775094e47e9SGeorge Wilson mutex_exit(&vd->vdev_initialize_lock); 776094e47e9SGeorge Wilson } 777094e47e9SGeorge Wilson 778094e47e9SGeorge Wilson for (uint64_t i = 0; i < vd->vdev_children; i++) { 779094e47e9SGeorge Wilson vdev_initialize_restart(vd->vdev_child[i]); 780094e47e9SGeorge Wilson } 781094e47e9SGeorge Wilson } 782