xref: /illumos-gate/usr/src/uts/common/fs/zfs/vdev_initialize.c (revision 555d674d5d4b8191dc83723188349d28278b2431)
1094e47e9SGeorge Wilson /*
2094e47e9SGeorge Wilson  * CDDL HEADER START
3094e47e9SGeorge Wilson  *
4094e47e9SGeorge Wilson  * The contents of this file are subject to the terms of the
5094e47e9SGeorge Wilson  * Common Development and Distribution License (the "License").
6094e47e9SGeorge Wilson  * You may not use this file except in compliance with the License.
7094e47e9SGeorge Wilson  *
8094e47e9SGeorge Wilson  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9094e47e9SGeorge Wilson  * or http://www.opensolaris.org/os/licensing.
10094e47e9SGeorge Wilson  * See the License for the specific language governing permissions
11094e47e9SGeorge Wilson  * and limitations under the License.
12094e47e9SGeorge Wilson  *
13094e47e9SGeorge Wilson  * When distributing Covered Code, include this CDDL HEADER in each
14094e47e9SGeorge Wilson  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15094e47e9SGeorge Wilson  * If applicable, add the following below this CDDL HEADER, with the
16094e47e9SGeorge Wilson  * fields enclosed by brackets "[]" replaced with your own identifying
17094e47e9SGeorge Wilson  * information: Portions Copyright [yyyy] [name of copyright owner]
18094e47e9SGeorge Wilson  *
19094e47e9SGeorge Wilson  * CDDL HEADER END
20094e47e9SGeorge Wilson  */
21094e47e9SGeorge Wilson 
22094e47e9SGeorge Wilson /*
23094e47e9SGeorge Wilson  * Copyright (c) 2016 by Delphix. All rights reserved.
24094e47e9SGeorge Wilson  */
25094e47e9SGeorge Wilson 
26094e47e9SGeorge Wilson #include <sys/spa.h>
27094e47e9SGeorge Wilson #include <sys/spa_impl.h>
28094e47e9SGeorge Wilson #include <sys/txg.h>
29094e47e9SGeorge Wilson #include <sys/vdev_impl.h>
30094e47e9SGeorge Wilson #include <sys/refcount.h>
31094e47e9SGeorge Wilson #include <sys/metaslab_impl.h>
32094e47e9SGeorge Wilson #include <sys/dsl_synctask.h>
33094e47e9SGeorge Wilson #include <sys/zap.h>
34094e47e9SGeorge Wilson #include <sys/dmu_tx.h>
35094e47e9SGeorge Wilson 
/*
 * Maximum number of metaslabs per group that can be initialized
 * simultaneously. vdev_initialize_mg_mark() blocks while the group's
 * mg_ms_initializing count is at or above this value.
 */
int max_initialize_ms = 3;

/*
 * Value that is written to disk during initialization. Every 64-bit word
 * of the fill buffer is set to this pattern by
 * vdev_initialize_block_fill().
 */
uint64_t zfs_initialize_value = 0xdeadbeefdeadbeefULL;

/*
 * Maximum number of I/Os outstanding per leaf vdev.
 * vdev_initialize_write() waits while vdev_initialize_inflight is at or
 * above this value before issuing another write.
 */
int zfs_initialize_limit = 1;

/*
 * Size of initializing writes; default 1MiB, see zfs_remove_max_segment.
 * vdev_initialize_block_alloc() asserts that this is a multiple of
 * sizeof (uint64_t).
 */
uint64_t zfs_initialize_chunk_size = 1024 * 1024;
52094e47e9SGeorge Wilson 
53094e47e9SGeorge Wilson static boolean_t
54094e47e9SGeorge Wilson vdev_initialize_should_stop(vdev_t *vd)
55094e47e9SGeorge Wilson {
56094e47e9SGeorge Wilson 	return (vd->vdev_initialize_exit_wanted || !vdev_writeable(vd) ||
57094e47e9SGeorge Wilson 	    vd->vdev_detached || vd->vdev_top->vdev_removing);
58094e47e9SGeorge Wilson }
59094e47e9SGeorge Wilson 
60094e47e9SGeorge Wilson static void
61094e47e9SGeorge Wilson vdev_initialize_zap_update_sync(void *arg, dmu_tx_t *tx)
62094e47e9SGeorge Wilson {
63094e47e9SGeorge Wilson 	/*
64094e47e9SGeorge Wilson 	 * We pass in the guid instead of the vdev_t since the vdev may
65094e47e9SGeorge Wilson 	 * have been freed prior to the sync task being processed. This
66094e47e9SGeorge Wilson 	 * happens when a vdev is detached as we call spa_config_vdev_exit(),
67094e47e9SGeorge Wilson 	 * stop the intializing thread, schedule the sync task, and free
68094e47e9SGeorge Wilson 	 * the vdev. Later when the scheduled sync task is invoked, it would
69094e47e9SGeorge Wilson 	 * find that the vdev has been freed.
70094e47e9SGeorge Wilson 	 */
71094e47e9SGeorge Wilson 	uint64_t guid = *(uint64_t *)arg;
72094e47e9SGeorge Wilson 	uint64_t txg = dmu_tx_get_txg(tx);
73094e47e9SGeorge Wilson 	kmem_free(arg, sizeof (uint64_t));
74094e47e9SGeorge Wilson 
75094e47e9SGeorge Wilson 	vdev_t *vd = spa_lookup_by_guid(tx->tx_pool->dp_spa, guid, B_FALSE);
76094e47e9SGeorge Wilson 	if (vd == NULL || vd->vdev_top->vdev_removing || !vdev_is_concrete(vd))
77094e47e9SGeorge Wilson 		return;
78094e47e9SGeorge Wilson 
79094e47e9SGeorge Wilson 	uint64_t last_offset = vd->vdev_initialize_offset[txg & TXG_MASK];
80094e47e9SGeorge Wilson 	vd->vdev_initialize_offset[txg & TXG_MASK] = 0;
81094e47e9SGeorge Wilson 
82094e47e9SGeorge Wilson 	VERIFY(vd->vdev_leaf_zap != 0);
83094e47e9SGeorge Wilson 
84094e47e9SGeorge Wilson 	objset_t *mos = vd->vdev_spa->spa_meta_objset;
85094e47e9SGeorge Wilson 
86094e47e9SGeorge Wilson 	if (last_offset > 0) {
87094e47e9SGeorge Wilson 		vd->vdev_initialize_last_offset = last_offset;
88094e47e9SGeorge Wilson 		VERIFY0(zap_update(mos, vd->vdev_leaf_zap,
89094e47e9SGeorge Wilson 		    VDEV_LEAF_ZAP_INITIALIZE_LAST_OFFSET,
90094e47e9SGeorge Wilson 		    sizeof (last_offset), 1, &last_offset, tx));
91094e47e9SGeorge Wilson 	}
92094e47e9SGeorge Wilson 	if (vd->vdev_initialize_action_time > 0) {
93094e47e9SGeorge Wilson 		uint64_t val = (uint64_t)vd->vdev_initialize_action_time;
94094e47e9SGeorge Wilson 		VERIFY0(zap_update(mos, vd->vdev_leaf_zap,
95094e47e9SGeorge Wilson 		    VDEV_LEAF_ZAP_INITIALIZE_ACTION_TIME, sizeof (val),
96094e47e9SGeorge Wilson 		    1, &val, tx));
97094e47e9SGeorge Wilson 	}
98094e47e9SGeorge Wilson 
99094e47e9SGeorge Wilson 	uint64_t initialize_state = vd->vdev_initialize_state;
100094e47e9SGeorge Wilson 	VERIFY0(zap_update(mos, vd->vdev_leaf_zap,
101094e47e9SGeorge Wilson 	    VDEV_LEAF_ZAP_INITIALIZE_STATE, sizeof (initialize_state), 1,
102094e47e9SGeorge Wilson 	    &initialize_state, tx));
103094e47e9SGeorge Wilson }
104094e47e9SGeorge Wilson 
105094e47e9SGeorge Wilson static void
106094e47e9SGeorge Wilson vdev_initialize_change_state(vdev_t *vd, vdev_initializing_state_t new_state)
107094e47e9SGeorge Wilson {
108094e47e9SGeorge Wilson 	ASSERT(MUTEX_HELD(&vd->vdev_initialize_lock));
109094e47e9SGeorge Wilson 	spa_t *spa = vd->vdev_spa;
110094e47e9SGeorge Wilson 
111094e47e9SGeorge Wilson 	if (new_state == vd->vdev_initialize_state)
112094e47e9SGeorge Wilson 		return;
113094e47e9SGeorge Wilson 
114094e47e9SGeorge Wilson 	/*
115094e47e9SGeorge Wilson 	 * Copy the vd's guid, this will be freed by the sync task.
116094e47e9SGeorge Wilson 	 */
117094e47e9SGeorge Wilson 	uint64_t *guid = kmem_zalloc(sizeof (uint64_t), KM_SLEEP);
118094e47e9SGeorge Wilson 	*guid = vd->vdev_guid;
119094e47e9SGeorge Wilson 
120094e47e9SGeorge Wilson 	/*
121094e47e9SGeorge Wilson 	 * If we're suspending, then preserving the original start time.
122094e47e9SGeorge Wilson 	 */
123094e47e9SGeorge Wilson 	if (vd->vdev_initialize_state != VDEV_INITIALIZE_SUSPENDED) {
124094e47e9SGeorge Wilson 		vd->vdev_initialize_action_time = gethrestime_sec();
125094e47e9SGeorge Wilson 	}
126094e47e9SGeorge Wilson 	vd->vdev_initialize_state = new_state;
127094e47e9SGeorge Wilson 
128094e47e9SGeorge Wilson 	dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
129094e47e9SGeorge Wilson 	VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
130094e47e9SGeorge Wilson 	dsl_sync_task_nowait(spa_get_dsl(spa), vdev_initialize_zap_update_sync,
131094e47e9SGeorge Wilson 	    guid, 2, ZFS_SPACE_CHECK_RESERVED, tx);
132094e47e9SGeorge Wilson 
133094e47e9SGeorge Wilson 	switch (new_state) {
134094e47e9SGeorge Wilson 	case VDEV_INITIALIZE_ACTIVE:
135094e47e9SGeorge Wilson 		spa_history_log_internal(spa, "initialize", tx,
136094e47e9SGeorge Wilson 		    "vdev=%s activated", vd->vdev_path);
137094e47e9SGeorge Wilson 		break;
138094e47e9SGeorge Wilson 	case VDEV_INITIALIZE_SUSPENDED:
139094e47e9SGeorge Wilson 		spa_history_log_internal(spa, "initialize", tx,
140094e47e9SGeorge Wilson 		    "vdev=%s suspended", vd->vdev_path);
141094e47e9SGeorge Wilson 		break;
142094e47e9SGeorge Wilson 	case VDEV_INITIALIZE_CANCELED:
143094e47e9SGeorge Wilson 		spa_history_log_internal(spa, "initialize", tx,
144094e47e9SGeorge Wilson 		    "vdev=%s canceled", vd->vdev_path);
145094e47e9SGeorge Wilson 		break;
146094e47e9SGeorge Wilson 	case VDEV_INITIALIZE_COMPLETE:
147094e47e9SGeorge Wilson 		spa_history_log_internal(spa, "initialize", tx,
148094e47e9SGeorge Wilson 		    "vdev=%s complete", vd->vdev_path);
149094e47e9SGeorge Wilson 		break;
150094e47e9SGeorge Wilson 	default:
151094e47e9SGeorge Wilson 		panic("invalid state %llu", (unsigned long long)new_state);
152094e47e9SGeorge Wilson 	}
153094e47e9SGeorge Wilson 
154094e47e9SGeorge Wilson 	dmu_tx_commit(tx);
155094e47e9SGeorge Wilson }
156094e47e9SGeorge Wilson 
157094e47e9SGeorge Wilson static void
158094e47e9SGeorge Wilson vdev_initialize_cb(zio_t *zio)
159094e47e9SGeorge Wilson {
160094e47e9SGeorge Wilson 	vdev_t *vd = zio->io_vd;
161094e47e9SGeorge Wilson 	mutex_enter(&vd->vdev_initialize_io_lock);
162094e47e9SGeorge Wilson 	if (zio->io_error == ENXIO && !vdev_writeable(vd)) {
163094e47e9SGeorge Wilson 		/*
164094e47e9SGeorge Wilson 		 * The I/O failed because the vdev was unavailable; roll the
165094e47e9SGeorge Wilson 		 * last offset back. (This works because spa_sync waits on
166094e47e9SGeorge Wilson 		 * spa_txg_zio before it runs sync tasks.)
167094e47e9SGeorge Wilson 		 */
168094e47e9SGeorge Wilson 		uint64_t *off =
169094e47e9SGeorge Wilson 		    &vd->vdev_initialize_offset[zio->io_txg & TXG_MASK];
170094e47e9SGeorge Wilson 		*off = MIN(*off, zio->io_offset);
171094e47e9SGeorge Wilson 	} else {
172094e47e9SGeorge Wilson 		/*
173094e47e9SGeorge Wilson 		 * Since initializing is best-effort, we ignore I/O errors and
174094e47e9SGeorge Wilson 		 * rely on vdev_probe to determine if the errors are more
175094e47e9SGeorge Wilson 		 * critical.
176094e47e9SGeorge Wilson 		 */
177094e47e9SGeorge Wilson 		if (zio->io_error != 0)
178094e47e9SGeorge Wilson 			vd->vdev_stat.vs_initialize_errors++;
179094e47e9SGeorge Wilson 
180094e47e9SGeorge Wilson 		vd->vdev_initialize_bytes_done += zio->io_orig_size;
181094e47e9SGeorge Wilson 	}
182094e47e9SGeorge Wilson 	ASSERT3U(vd->vdev_initialize_inflight, >, 0);
183094e47e9SGeorge Wilson 	vd->vdev_initialize_inflight--;
184094e47e9SGeorge Wilson 	cv_broadcast(&vd->vdev_initialize_io_cv);
185094e47e9SGeorge Wilson 	mutex_exit(&vd->vdev_initialize_io_lock);
186094e47e9SGeorge Wilson 
187094e47e9SGeorge Wilson 	spa_config_exit(vd->vdev_spa, SCL_STATE_ALL, vd);
188094e47e9SGeorge Wilson }
189094e47e9SGeorge Wilson 
/*
 * Takes care of physical writing and limiting # of concurrent ZIOs.
 *
 * Issues one asynchronous physical write of `size` bytes from `data` at
 * offset `start` on `vd`. Blocks until fewer than zfs_initialize_limit
 * initializing writes are in flight on this vdev.
 *
 * Returns 0 once the write has been issued (its completion is handled by
 * vdev_initialize_cb()), or EINTR if vdev_initialize_should_stop()
 * reports that initialization must stop, in which case no write is issued.
 */
static int
vdev_initialize_write(vdev_t *vd, uint64_t start, uint64_t size, abd_t *data)
{
	spa_t *spa = vd->vdev_spa;

	/* Limit inflight initializing I/Os */
	mutex_enter(&vd->vdev_initialize_io_lock);
	while (vd->vdev_initialize_inflight >= zfs_initialize_limit) {
		cv_wait(&vd->vdev_initialize_io_cv,
		    &vd->vdev_initialize_io_lock);
	}
	/* Reserve our slot; released by the callback or the abort path. */
	vd->vdev_initialize_inflight++;
	mutex_exit(&vd->vdev_initialize_io_lock);

	/* The tx pins the txg whose offset slot and root zio we use below. */
	dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
	VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
	uint64_t txg = dmu_tx_get_txg(tx);

	/*
	 * Held with the vdev as tag; dropped either in the abort path
	 * below or by vdev_initialize_cb() when the write completes.
	 */
	spa_config_enter(spa, SCL_STATE_ALL, vd, RW_READER);
	mutex_enter(&vd->vdev_initialize_lock);

	if (vd->vdev_initialize_offset[txg & TXG_MASK] == 0) {
		/* The sync task frees this guid copy. */
		uint64_t *guid = kmem_zalloc(sizeof (uint64_t), KM_SLEEP);
		*guid = vd->vdev_guid;

		/* This is the first write of this txg. */
		dsl_sync_task_nowait(spa_get_dsl(spa),
		    vdev_initialize_zap_update_sync, guid, 2,
		    ZFS_SPACE_CHECK_RESERVED, tx);
	}

	/*
	 * We know the vdev struct will still be around since all
	 * consumers of vdev_free must stop the initialization first.
	 */
	if (vdev_initialize_should_stop(vd)) {
		/* Undo the inflight reservation and release everything. */
		mutex_enter(&vd->vdev_initialize_io_lock);
		ASSERT3U(vd->vdev_initialize_inflight, >, 0);
		vd->vdev_initialize_inflight--;
		mutex_exit(&vd->vdev_initialize_io_lock);
		spa_config_exit(vd->vdev_spa, SCL_STATE_ALL, vd);
		mutex_exit(&vd->vdev_initialize_lock);
		dmu_tx_commit(tx);
		return (SET_ERROR(EINTR));
	}
	mutex_exit(&vd->vdev_initialize_lock);

	/* Record how far this txg's writes reach (end of this write). */
	vd->vdev_initialize_offset[txg & TXG_MASK] = start + size;
	/* Issue as a child of this txg's root zio; errors are tolerated. */
	zio_nowait(zio_write_phys(spa->spa_txg_zio[txg & TXG_MASK], vd, start,
	    size, data, ZIO_CHECKSUM_OFF, vdev_initialize_cb, NULL,
	    ZIO_PRIORITY_INITIALIZING, ZIO_FLAG_CANFAIL, B_FALSE));
	/* vdev_initialize_cb releases SCL_STATE_ALL */

	dmu_tx_commit(tx);

	return (0);
}
248094e47e9SGeorge Wilson 
249094e47e9SGeorge Wilson /*
250094e47e9SGeorge Wilson  * Translate a logical range to the physical range for the specified vdev_t.
251094e47e9SGeorge Wilson  * This function is initially called with a leaf vdev and will walk each
252094e47e9SGeorge Wilson  * parent vdev until it reaches a top-level vdev. Once the top-level is
253094e47e9SGeorge Wilson  * reached the physical range is initialized and the recursive function
254094e47e9SGeorge Wilson  * begins to unwind. As it unwinds it calls the parent's vdev specific
255094e47e9SGeorge Wilson  * translation function to do the real conversion.
256094e47e9SGeorge Wilson  */
257094e47e9SGeorge Wilson void
258094e47e9SGeorge Wilson vdev_xlate(vdev_t *vd, const range_seg_t *logical_rs, range_seg_t *physical_rs)
259094e47e9SGeorge Wilson {
260094e47e9SGeorge Wilson 	/*
261094e47e9SGeorge Wilson 	 * Walk up the vdev tree
262094e47e9SGeorge Wilson 	 */
263094e47e9SGeorge Wilson 	if (vd != vd->vdev_top) {
264094e47e9SGeorge Wilson 		vdev_xlate(vd->vdev_parent, logical_rs, physical_rs);
265094e47e9SGeorge Wilson 	} else {
266094e47e9SGeorge Wilson 		/*
267094e47e9SGeorge Wilson 		 * We've reached the top-level vdev, initialize the
268094e47e9SGeorge Wilson 		 * physical range to the logical range and start to
269094e47e9SGeorge Wilson 		 * unwind.
270094e47e9SGeorge Wilson 		 */
271094e47e9SGeorge Wilson 		physical_rs->rs_start = logical_rs->rs_start;
272094e47e9SGeorge Wilson 		physical_rs->rs_end = logical_rs->rs_end;
273094e47e9SGeorge Wilson 		return;
274094e47e9SGeorge Wilson 	}
275094e47e9SGeorge Wilson 
276094e47e9SGeorge Wilson 	vdev_t *pvd = vd->vdev_parent;
277094e47e9SGeorge Wilson 	ASSERT3P(pvd, !=, NULL);
278094e47e9SGeorge Wilson 	ASSERT3P(pvd->vdev_ops->vdev_op_xlate, !=, NULL);
279094e47e9SGeorge Wilson 
280094e47e9SGeorge Wilson 	/*
281094e47e9SGeorge Wilson 	 * As this recursive function unwinds, translate the logical
282094e47e9SGeorge Wilson 	 * range into its physical components by calling the
283094e47e9SGeorge Wilson 	 * vdev specific translate function.
284094e47e9SGeorge Wilson 	 */
285094e47e9SGeorge Wilson 	range_seg_t intermediate = { 0 };
286094e47e9SGeorge Wilson 	pvd->vdev_ops->vdev_op_xlate(vd, physical_rs, &intermediate);
287094e47e9SGeorge Wilson 
288094e47e9SGeorge Wilson 	physical_rs->rs_start = intermediate.rs_start;
289094e47e9SGeorge Wilson 	physical_rs->rs_end = intermediate.rs_end;
290094e47e9SGeorge Wilson }
291094e47e9SGeorge Wilson 
292094e47e9SGeorge Wilson /*
293094e47e9SGeorge Wilson  * Callback to fill each ABD chunk with zfs_initialize_value. len must be
294094e47e9SGeorge Wilson  * divisible by sizeof (uint64_t), and buf must be 8-byte aligned. The ABD
295094e47e9SGeorge Wilson  * allocation will guarantee these for us.
296094e47e9SGeorge Wilson  */
297094e47e9SGeorge Wilson /* ARGSUSED */
298094e47e9SGeorge Wilson static int
299094e47e9SGeorge Wilson vdev_initialize_block_fill(void *buf, size_t len, void *unused)
300094e47e9SGeorge Wilson {
301094e47e9SGeorge Wilson 	ASSERT0(len % sizeof (uint64_t));
302094e47e9SGeorge Wilson 	for (uint64_t i = 0; i < len; i += sizeof (uint64_t)) {
303094e47e9SGeorge Wilson 		*(uint64_t *)((char *)(buf) + i) = zfs_initialize_value;
304094e47e9SGeorge Wilson 	}
305094e47e9SGeorge Wilson 	return (0);
306094e47e9SGeorge Wilson }
307094e47e9SGeorge Wilson 
308094e47e9SGeorge Wilson static abd_t *
309094e47e9SGeorge Wilson vdev_initialize_block_alloc()
310094e47e9SGeorge Wilson {
311094e47e9SGeorge Wilson 	/* Allocate ABD for filler data */
312094e47e9SGeorge Wilson 	abd_t *data = abd_alloc_for_io(zfs_initialize_chunk_size, B_FALSE);
313094e47e9SGeorge Wilson 
314094e47e9SGeorge Wilson 	ASSERT0(zfs_initialize_chunk_size % sizeof (uint64_t));
315094e47e9SGeorge Wilson 	(void) abd_iterate_func(data, 0, zfs_initialize_chunk_size,
316094e47e9SGeorge Wilson 	    vdev_initialize_block_fill, NULL);
317094e47e9SGeorge Wilson 
318094e47e9SGeorge Wilson 	return (data);
319094e47e9SGeorge Wilson }
320094e47e9SGeorge Wilson 
321094e47e9SGeorge Wilson static void
322094e47e9SGeorge Wilson vdev_initialize_block_free(abd_t *data)
323094e47e9SGeorge Wilson {
324094e47e9SGeorge Wilson 	abd_free(data);
325094e47e9SGeorge Wilson }
326094e47e9SGeorge Wilson 
327094e47e9SGeorge Wilson static int
328094e47e9SGeorge Wilson vdev_initialize_ranges(vdev_t *vd, abd_t *data)
329094e47e9SGeorge Wilson {
330094e47e9SGeorge Wilson 	avl_tree_t *rt = &vd->vdev_initialize_tree->rt_root;
331094e47e9SGeorge Wilson 
332094e47e9SGeorge Wilson 	for (range_seg_t *rs = avl_first(rt); rs != NULL;
333094e47e9SGeorge Wilson 	    rs = AVL_NEXT(rt, rs)) {
334094e47e9SGeorge Wilson 		uint64_t size = rs->rs_end - rs->rs_start;
335094e47e9SGeorge Wilson 
336094e47e9SGeorge Wilson 		/* Split range into legally-sized physical chunks */
337094e47e9SGeorge Wilson 		uint64_t writes_required =
338094e47e9SGeorge Wilson 		    ((size - 1) / zfs_initialize_chunk_size) + 1;
339094e47e9SGeorge Wilson 
340094e47e9SGeorge Wilson 		for (uint64_t w = 0; w < writes_required; w++) {
341094e47e9SGeorge Wilson 			int error;
342094e47e9SGeorge Wilson 
343094e47e9SGeorge Wilson 			error = vdev_initialize_write(vd,
344094e47e9SGeorge Wilson 			    VDEV_LABEL_START_SIZE + rs->rs_start +
345094e47e9SGeorge Wilson 			    (w * zfs_initialize_chunk_size),
346094e47e9SGeorge Wilson 			    MIN(size - (w * zfs_initialize_chunk_size),
347094e47e9SGeorge Wilson 			    zfs_initialize_chunk_size), data);
348094e47e9SGeorge Wilson 			if (error != 0)
349094e47e9SGeorge Wilson 				return (error);
350094e47e9SGeorge Wilson 		}
351094e47e9SGeorge Wilson 	}
352094e47e9SGeorge Wilson 	return (0);
353094e47e9SGeorge Wilson }
354094e47e9SGeorge Wilson 
355094e47e9SGeorge Wilson static void
356094e47e9SGeorge Wilson vdev_initialize_mg_wait(metaslab_group_t *mg)
357094e47e9SGeorge Wilson {
358094e47e9SGeorge Wilson 	ASSERT(MUTEX_HELD(&mg->mg_ms_initialize_lock));
359094e47e9SGeorge Wilson 	while (mg->mg_initialize_updating) {
360094e47e9SGeorge Wilson 		cv_wait(&mg->mg_ms_initialize_cv, &mg->mg_ms_initialize_lock);
361094e47e9SGeorge Wilson 	}
362094e47e9SGeorge Wilson }
363094e47e9SGeorge Wilson 
364094e47e9SGeorge Wilson static void
365094e47e9SGeorge Wilson vdev_initialize_mg_mark(metaslab_group_t *mg)
366094e47e9SGeorge Wilson {
367094e47e9SGeorge Wilson 	ASSERT(MUTEX_HELD(&mg->mg_ms_initialize_lock));
368094e47e9SGeorge Wilson 	ASSERT(mg->mg_initialize_updating);
369094e47e9SGeorge Wilson 
370094e47e9SGeorge Wilson 	while (mg->mg_ms_initializing >= max_initialize_ms) {
371094e47e9SGeorge Wilson 		cv_wait(&mg->mg_ms_initialize_cv, &mg->mg_ms_initialize_lock);
372094e47e9SGeorge Wilson 	}
373094e47e9SGeorge Wilson 	mg->mg_ms_initializing++;
374094e47e9SGeorge Wilson 	ASSERT3U(mg->mg_ms_initializing, <=, max_initialize_ms);
375094e47e9SGeorge Wilson }
376094e47e9SGeorge Wilson 
377094e47e9SGeorge Wilson /*
378094e47e9SGeorge Wilson  * Mark the metaslab as being initialized to prevent any allocations
379094e47e9SGeorge Wilson  * on this metaslab. We must also track how many metaslabs are currently
380094e47e9SGeorge Wilson  * being initialized within a metaslab group and limit them to prevent
381094e47e9SGeorge Wilson  * allocation failures from occurring because all metaslabs are being
382094e47e9SGeorge Wilson  * initialized.
383094e47e9SGeorge Wilson  */
384094e47e9SGeorge Wilson static void
385094e47e9SGeorge Wilson vdev_initialize_ms_mark(metaslab_t *msp)
386094e47e9SGeorge Wilson {
387094e47e9SGeorge Wilson 	ASSERT(!MUTEX_HELD(&msp->ms_lock));
388094e47e9SGeorge Wilson 	metaslab_group_t *mg = msp->ms_group;
389094e47e9SGeorge Wilson 
390094e47e9SGeorge Wilson 	mutex_enter(&mg->mg_ms_initialize_lock);
391094e47e9SGeorge Wilson 
392094e47e9SGeorge Wilson 	/*
393094e47e9SGeorge Wilson 	 * To keep an accurate count of how many threads are initializing
394094e47e9SGeorge Wilson 	 * a specific metaslab group, we only allow one thread to mark
395094e47e9SGeorge Wilson 	 * the metaslab group at a time. This ensures that the value of
396094e47e9SGeorge Wilson 	 * ms_initializing will be accurate when we decide to mark a metaslab
397094e47e9SGeorge Wilson 	 * group as being initialized. To do this we force all other threads
398094e47e9SGeorge Wilson 	 * to wait till the metaslab's mg_initialize_updating flag is no
399094e47e9SGeorge Wilson 	 * longer set.
400094e47e9SGeorge Wilson 	 */
401094e47e9SGeorge Wilson 	vdev_initialize_mg_wait(mg);
402094e47e9SGeorge Wilson 	mg->mg_initialize_updating = B_TRUE;
403094e47e9SGeorge Wilson 	if (msp->ms_initializing == 0) {
404094e47e9SGeorge Wilson 		vdev_initialize_mg_mark(mg);
405094e47e9SGeorge Wilson 	}
406094e47e9SGeorge Wilson 	mutex_enter(&msp->ms_lock);
407094e47e9SGeorge Wilson 	msp->ms_initializing++;
408094e47e9SGeorge Wilson 	mutex_exit(&msp->ms_lock);
409094e47e9SGeorge Wilson 
410094e47e9SGeorge Wilson 	mg->mg_initialize_updating = B_FALSE;
411094e47e9SGeorge Wilson 	cv_broadcast(&mg->mg_ms_initialize_cv);
412094e47e9SGeorge Wilson 	mutex_exit(&mg->mg_ms_initialize_lock);
413094e47e9SGeorge Wilson }
414094e47e9SGeorge Wilson 
415094e47e9SGeorge Wilson static void
416094e47e9SGeorge Wilson vdev_initialize_ms_unmark(metaslab_t *msp)
417094e47e9SGeorge Wilson {
418094e47e9SGeorge Wilson 	ASSERT(!MUTEX_HELD(&msp->ms_lock));
419094e47e9SGeorge Wilson 	metaslab_group_t *mg = msp->ms_group;
420094e47e9SGeorge Wilson 	mutex_enter(&mg->mg_ms_initialize_lock);
421094e47e9SGeorge Wilson 	mutex_enter(&msp->ms_lock);
422094e47e9SGeorge Wilson 	if (--msp->ms_initializing == 0) {
423094e47e9SGeorge Wilson 		mg->mg_ms_initializing--;
424094e47e9SGeorge Wilson 		cv_broadcast(&mg->mg_ms_initialize_cv);
425094e47e9SGeorge Wilson 	}
426094e47e9SGeorge Wilson 	mutex_exit(&msp->ms_lock);
427094e47e9SGeorge Wilson 	mutex_exit(&mg->mg_ms_initialize_lock);
428094e47e9SGeorge Wilson }
429094e47e9SGeorge Wilson 
430094e47e9SGeorge Wilson static void
431094e47e9SGeorge Wilson vdev_initialize_calculate_progress(vdev_t *vd)
432094e47e9SGeorge Wilson {
433094e47e9SGeorge Wilson 	ASSERT(spa_config_held(vd->vdev_spa, SCL_CONFIG, RW_READER) ||
434094e47e9SGeorge Wilson 	    spa_config_held(vd->vdev_spa, SCL_CONFIG, RW_WRITER));
435094e47e9SGeorge Wilson 	ASSERT(vd->vdev_leaf_zap != 0);
436094e47e9SGeorge Wilson 
437094e47e9SGeorge Wilson 	vd->vdev_initialize_bytes_est = 0;
438094e47e9SGeorge Wilson 	vd->vdev_initialize_bytes_done = 0;
439094e47e9SGeorge Wilson 
440094e47e9SGeorge Wilson 	for (uint64_t i = 0; i < vd->vdev_top->vdev_ms_count; i++) {
441094e47e9SGeorge Wilson 		metaslab_t *msp = vd->vdev_top->vdev_ms[i];
442094e47e9SGeorge Wilson 		mutex_enter(&msp->ms_lock);
443094e47e9SGeorge Wilson 
444094e47e9SGeorge Wilson 		uint64_t ms_free = msp->ms_size -
445*555d674dSSerapheim Dimitropoulos 		    metaslab_allocated_space(msp);
446094e47e9SGeorge Wilson 
447094e47e9SGeorge Wilson 		if (vd->vdev_top->vdev_ops == &vdev_raidz_ops)
448094e47e9SGeorge Wilson 			ms_free /= vd->vdev_top->vdev_children;
449094e47e9SGeorge Wilson 
450094e47e9SGeorge Wilson 		/*
451094e47e9SGeorge Wilson 		 * Convert the metaslab range to a physical range
452094e47e9SGeorge Wilson 		 * on our vdev. We use this to determine if we are
453094e47e9SGeorge Wilson 		 * in the middle of this metaslab range.
454094e47e9SGeorge Wilson 		 */
455094e47e9SGeorge Wilson 		range_seg_t logical_rs, physical_rs;
456094e47e9SGeorge Wilson 		logical_rs.rs_start = msp->ms_start;
457094e47e9SGeorge Wilson 		logical_rs.rs_end = msp->ms_start + msp->ms_size;
458094e47e9SGeorge Wilson 		vdev_xlate(vd, &logical_rs, &physical_rs);
459094e47e9SGeorge Wilson 
460094e47e9SGeorge Wilson 		if (vd->vdev_initialize_last_offset <= physical_rs.rs_start) {
461094e47e9SGeorge Wilson 			vd->vdev_initialize_bytes_est += ms_free;
462094e47e9SGeorge Wilson 			mutex_exit(&msp->ms_lock);
463094e47e9SGeorge Wilson 			continue;
464094e47e9SGeorge Wilson 		} else if (vd->vdev_initialize_last_offset >
465094e47e9SGeorge Wilson 		    physical_rs.rs_end) {
466094e47e9SGeorge Wilson 			vd->vdev_initialize_bytes_done += ms_free;
467094e47e9SGeorge Wilson 			vd->vdev_initialize_bytes_est += ms_free;
468094e47e9SGeorge Wilson 			mutex_exit(&msp->ms_lock);
469094e47e9SGeorge Wilson 			continue;
470094e47e9SGeorge Wilson 		}
471094e47e9SGeorge Wilson 
472094e47e9SGeorge Wilson 		/*
473094e47e9SGeorge Wilson 		 * If we get here, we're in the middle of initializing this
474094e47e9SGeorge Wilson 		 * metaslab. Load it and walk the free tree for more accurate
475094e47e9SGeorge Wilson 		 * progress estimation.
476094e47e9SGeorge Wilson 		 */
477a0b03b16SSerapheim Dimitropoulos 		VERIFY0(metaslab_load(msp));
478094e47e9SGeorge Wilson 
479a0b03b16SSerapheim Dimitropoulos 		for (range_seg_t *rs = avl_first(&msp->ms_allocatable->rt_root);
480a0b03b16SSerapheim Dimitropoulos 		    rs; rs = AVL_NEXT(&msp->ms_allocatable->rt_root, rs)) {
481094e47e9SGeorge Wilson 			logical_rs.rs_start = rs->rs_start;
482094e47e9SGeorge Wilson 			logical_rs.rs_end = rs->rs_end;
483094e47e9SGeorge Wilson 			vdev_xlate(vd, &logical_rs, &physical_rs);
484094e47e9SGeorge Wilson 
485094e47e9SGeorge Wilson 			uint64_t size = physical_rs.rs_end -
486094e47e9SGeorge Wilson 			    physical_rs.rs_start;
487094e47e9SGeorge Wilson 			vd->vdev_initialize_bytes_est += size;
488094e47e9SGeorge Wilson 			if (vd->vdev_initialize_last_offset >
489094e47e9SGeorge Wilson 			    physical_rs.rs_end) {
490094e47e9SGeorge Wilson 				vd->vdev_initialize_bytes_done += size;
491094e47e9SGeorge Wilson 			} else if (vd->vdev_initialize_last_offset >
492094e47e9SGeorge Wilson 			    physical_rs.rs_start &&
493094e47e9SGeorge Wilson 			    vd->vdev_initialize_last_offset <
494094e47e9SGeorge Wilson 			    physical_rs.rs_end) {
495094e47e9SGeorge Wilson 				vd->vdev_initialize_bytes_done +=
496094e47e9SGeorge Wilson 				    vd->vdev_initialize_last_offset -
497094e47e9SGeorge Wilson 				    physical_rs.rs_start;
498094e47e9SGeorge Wilson 			}
499094e47e9SGeorge Wilson 		}
500094e47e9SGeorge Wilson 		mutex_exit(&msp->ms_lock);
501094e47e9SGeorge Wilson 	}
502094e47e9SGeorge Wilson }
503094e47e9SGeorge Wilson 
504094e47e9SGeorge Wilson static void
505094e47e9SGeorge Wilson vdev_initialize_load(vdev_t *vd)
506094e47e9SGeorge Wilson {
507094e47e9SGeorge Wilson 	ASSERT(spa_config_held(vd->vdev_spa, SCL_CONFIG, RW_READER) ||
508094e47e9SGeorge Wilson 	    spa_config_held(vd->vdev_spa, SCL_CONFIG, RW_WRITER));
509094e47e9SGeorge Wilson 	ASSERT(vd->vdev_leaf_zap != 0);
510094e47e9SGeorge Wilson 
511094e47e9SGeorge Wilson 	if (vd->vdev_initialize_state == VDEV_INITIALIZE_ACTIVE ||
512094e47e9SGeorge Wilson 	    vd->vdev_initialize_state == VDEV_INITIALIZE_SUSPENDED) {
513094e47e9SGeorge Wilson 		int err = zap_lookup(vd->vdev_spa->spa_meta_objset,
514094e47e9SGeorge Wilson 		    vd->vdev_leaf_zap, VDEV_LEAF_ZAP_INITIALIZE_LAST_OFFSET,
515094e47e9SGeorge Wilson 		    sizeof (vd->vdev_initialize_last_offset), 1,
516094e47e9SGeorge Wilson 		    &vd->vdev_initialize_last_offset);
517094e47e9SGeorge Wilson 		ASSERT(err == 0 || err == ENOENT);
518094e47e9SGeorge Wilson 	}
519094e47e9SGeorge Wilson 
520094e47e9SGeorge Wilson 	vdev_initialize_calculate_progress(vd);
521094e47e9SGeorge Wilson }
522094e47e9SGeorge Wilson 
523094e47e9SGeorge Wilson 
524094e47e9SGeorge Wilson /*
525094e47e9SGeorge Wilson  * Convert the logical range into a physcial range and add it to our
526094e47e9SGeorge Wilson  * avl tree.
527094e47e9SGeorge Wilson  */
528094e47e9SGeorge Wilson void
529094e47e9SGeorge Wilson vdev_initialize_range_add(void *arg, uint64_t start, uint64_t size)
530094e47e9SGeorge Wilson {
531094e47e9SGeorge Wilson 	vdev_t *vd = arg;
532094e47e9SGeorge Wilson 	range_seg_t logical_rs, physical_rs;
533094e47e9SGeorge Wilson 	logical_rs.rs_start = start;
534094e47e9SGeorge Wilson 	logical_rs.rs_end = start + size;
535094e47e9SGeorge Wilson 
536094e47e9SGeorge Wilson 	ASSERT(vd->vdev_ops->vdev_op_leaf);
537094e47e9SGeorge Wilson 	vdev_xlate(vd, &logical_rs, &physical_rs);
538094e47e9SGeorge Wilson 
539094e47e9SGeorge Wilson 	IMPLY(vd->vdev_top == vd,
540094e47e9SGeorge Wilson 	    logical_rs.rs_start == physical_rs.rs_start);
541094e47e9SGeorge Wilson 	IMPLY(vd->vdev_top == vd,
542094e47e9SGeorge Wilson 	    logical_rs.rs_end == physical_rs.rs_end);
543094e47e9SGeorge Wilson 
544094e47e9SGeorge Wilson 	/* Only add segments that we have not visited yet */
545094e47e9SGeorge Wilson 	if (physical_rs.rs_end <= vd->vdev_initialize_last_offset)
546094e47e9SGeorge Wilson 		return;
547094e47e9SGeorge Wilson 
548094e47e9SGeorge Wilson 	/* Pick up where we left off mid-range. */
549094e47e9SGeorge Wilson 	if (vd->vdev_initialize_last_offset > physical_rs.rs_start) {
550094e47e9SGeorge Wilson 		zfs_dbgmsg("range write: vd %s changed (%llu, %llu) to "
551094e47e9SGeorge Wilson 		    "(%llu, %llu)", vd->vdev_path,
552094e47e9SGeorge Wilson 		    (u_longlong_t)physical_rs.rs_start,
553094e47e9SGeorge Wilson 		    (u_longlong_t)physical_rs.rs_end,
554094e47e9SGeorge Wilson 		    (u_longlong_t)vd->vdev_initialize_last_offset,
555094e47e9SGeorge Wilson 		    (u_longlong_t)physical_rs.rs_end);
556094e47e9SGeorge Wilson 		ASSERT3U(physical_rs.rs_end, >,
557094e47e9SGeorge Wilson 		    vd->vdev_initialize_last_offset);
558094e47e9SGeorge Wilson 		physical_rs.rs_start = vd->vdev_initialize_last_offset;
559094e47e9SGeorge Wilson 	}
560094e47e9SGeorge Wilson 	ASSERT3U(physical_rs.rs_end, >=, physical_rs.rs_start);
561094e47e9SGeorge Wilson 
562094e47e9SGeorge Wilson 	/*
563094e47e9SGeorge Wilson 	 * With raidz, it's possible that the logical range does not live on
564094e47e9SGeorge Wilson 	 * this leaf vdev. We only add the physical range to this vdev's if it
565094e47e9SGeorge Wilson 	 * has a length greater than 0.
566094e47e9SGeorge Wilson 	 */
567094e47e9SGeorge Wilson 	if (physical_rs.rs_end > physical_rs.rs_start) {
568094e47e9SGeorge Wilson 		range_tree_add(vd->vdev_initialize_tree, physical_rs.rs_start,
569094e47e9SGeorge Wilson 		    physical_rs.rs_end - physical_rs.rs_start);
570094e47e9SGeorge Wilson 	} else {
571094e47e9SGeorge Wilson 		ASSERT3U(physical_rs.rs_end, ==, physical_rs.rs_start);
572094e47e9SGeorge Wilson 	}
573094e47e9SGeorge Wilson }
574094e47e9SGeorge Wilson 
575094e47e9SGeorge Wilson static void
576094e47e9SGeorge Wilson vdev_initialize_thread(void *arg)
577094e47e9SGeorge Wilson {
578094e47e9SGeorge Wilson 	vdev_t *vd = arg;
579094e47e9SGeorge Wilson 	spa_t *spa = vd->vdev_spa;
580094e47e9SGeorge Wilson 	int error = 0;
581094e47e9SGeorge Wilson 	uint64_t ms_count = 0;
582094e47e9SGeorge Wilson 
583094e47e9SGeorge Wilson 	ASSERT(vdev_is_concrete(vd));
584094e47e9SGeorge Wilson 	spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
585094e47e9SGeorge Wilson 
586094e47e9SGeorge Wilson 	vd->vdev_initialize_last_offset = 0;
587094e47e9SGeorge Wilson 	vdev_initialize_load(vd);
588094e47e9SGeorge Wilson 
589094e47e9SGeorge Wilson 	abd_t *deadbeef = vdev_initialize_block_alloc();
590094e47e9SGeorge Wilson 
591094e47e9SGeorge Wilson 	vd->vdev_initialize_tree = range_tree_create(NULL, NULL);
592094e47e9SGeorge Wilson 
593094e47e9SGeorge Wilson 	for (uint64_t i = 0; !vd->vdev_detached &&
594094e47e9SGeorge Wilson 	    i < vd->vdev_top->vdev_ms_count; i++) {
595094e47e9SGeorge Wilson 		metaslab_t *msp = vd->vdev_top->vdev_ms[i];
596094e47e9SGeorge Wilson 
597094e47e9SGeorge Wilson 		/*
598094e47e9SGeorge Wilson 		 * If we've expanded the top-level vdev or it's our
599094e47e9SGeorge Wilson 		 * first pass, calculate our progress.
600094e47e9SGeorge Wilson 		 */
601094e47e9SGeorge Wilson 		if (vd->vdev_top->vdev_ms_count != ms_count) {
602094e47e9SGeorge Wilson 			vdev_initialize_calculate_progress(vd);
603094e47e9SGeorge Wilson 			ms_count = vd->vdev_top->vdev_ms_count;
604094e47e9SGeorge Wilson 		}
605094e47e9SGeorge Wilson 
606094e47e9SGeorge Wilson 		vdev_initialize_ms_mark(msp);
607094e47e9SGeorge Wilson 		mutex_enter(&msp->ms_lock);
608a0b03b16SSerapheim Dimitropoulos 		VERIFY0(metaslab_load(msp));
609094e47e9SGeorge Wilson 
610094e47e9SGeorge Wilson 		range_tree_walk(msp->ms_allocatable, vdev_initialize_range_add,
611094e47e9SGeorge Wilson 		    vd);
612094e47e9SGeorge Wilson 		mutex_exit(&msp->ms_lock);
613094e47e9SGeorge Wilson 
614094e47e9SGeorge Wilson 		spa_config_exit(spa, SCL_CONFIG, FTAG);
615094e47e9SGeorge Wilson 		error = vdev_initialize_ranges(vd, deadbeef);
616094e47e9SGeorge Wilson 		vdev_initialize_ms_unmark(msp);
617094e47e9SGeorge Wilson 		spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
618094e47e9SGeorge Wilson 
619094e47e9SGeorge Wilson 		range_tree_vacate(vd->vdev_initialize_tree, NULL, NULL);
620094e47e9SGeorge Wilson 		if (error != 0)
621094e47e9SGeorge Wilson 			break;
622094e47e9SGeorge Wilson 	}
623094e47e9SGeorge Wilson 
624094e47e9SGeorge Wilson 	spa_config_exit(spa, SCL_CONFIG, FTAG);
625094e47e9SGeorge Wilson 	mutex_enter(&vd->vdev_initialize_io_lock);
626094e47e9SGeorge Wilson 	while (vd->vdev_initialize_inflight > 0) {
627094e47e9SGeorge Wilson 		cv_wait(&vd->vdev_initialize_io_cv,
628094e47e9SGeorge Wilson 		    &vd->vdev_initialize_io_lock);
629094e47e9SGeorge Wilson 	}
630094e47e9SGeorge Wilson 	mutex_exit(&vd->vdev_initialize_io_lock);
631094e47e9SGeorge Wilson 
632094e47e9SGeorge Wilson 	range_tree_destroy(vd->vdev_initialize_tree);
633094e47e9SGeorge Wilson 	vdev_initialize_block_free(deadbeef);
634094e47e9SGeorge Wilson 	vd->vdev_initialize_tree = NULL;
635094e47e9SGeorge Wilson 
636094e47e9SGeorge Wilson 	mutex_enter(&vd->vdev_initialize_lock);
637094e47e9SGeorge Wilson 	if (!vd->vdev_initialize_exit_wanted && vdev_writeable(vd)) {
638094e47e9SGeorge Wilson 		vdev_initialize_change_state(vd, VDEV_INITIALIZE_COMPLETE);
639094e47e9SGeorge Wilson 	}
640094e47e9SGeorge Wilson 	ASSERT(vd->vdev_initialize_thread != NULL ||
641094e47e9SGeorge Wilson 	    vd->vdev_initialize_inflight == 0);
642094e47e9SGeorge Wilson 
643094e47e9SGeorge Wilson 	/*
644094e47e9SGeorge Wilson 	 * Drop the vdev_initialize_lock while we sync out the
645094e47e9SGeorge Wilson 	 * txg since it's possible that a device might be trying to
646094e47e9SGeorge Wilson 	 * come online and must check to see if it needs to restart an
647094e47e9SGeorge Wilson 	 * initialization. That thread will be holding the spa_config_lock
648094e47e9SGeorge Wilson 	 * which would prevent the txg_wait_synced from completing.
649094e47e9SGeorge Wilson 	 */
650094e47e9SGeorge Wilson 	mutex_exit(&vd->vdev_initialize_lock);
651094e47e9SGeorge Wilson 	txg_wait_synced(spa_get_dsl(spa), 0);
652094e47e9SGeorge Wilson 	mutex_enter(&vd->vdev_initialize_lock);
653094e47e9SGeorge Wilson 
654094e47e9SGeorge Wilson 	vd->vdev_initialize_thread = NULL;
655094e47e9SGeorge Wilson 	cv_broadcast(&vd->vdev_initialize_cv);
656094e47e9SGeorge Wilson 	mutex_exit(&vd->vdev_initialize_lock);
657094e47e9SGeorge Wilson }
658094e47e9SGeorge Wilson 
659094e47e9SGeorge Wilson /*
660094e47e9SGeorge Wilson  * Initiates a device. Caller must hold vdev_initialize_lock.
661094e47e9SGeorge Wilson  * Device must be a leaf and not already be initializing.
662094e47e9SGeorge Wilson  */
663094e47e9SGeorge Wilson void
664094e47e9SGeorge Wilson vdev_initialize(vdev_t *vd)
665094e47e9SGeorge Wilson {
666094e47e9SGeorge Wilson 	ASSERT(MUTEX_HELD(&vd->vdev_initialize_lock));
667094e47e9SGeorge Wilson 	ASSERT(vd->vdev_ops->vdev_op_leaf);
668094e47e9SGeorge Wilson 	ASSERT(vdev_is_concrete(vd));
669094e47e9SGeorge Wilson 	ASSERT3P(vd->vdev_initialize_thread, ==, NULL);
670094e47e9SGeorge Wilson 	ASSERT(!vd->vdev_detached);
671094e47e9SGeorge Wilson 	ASSERT(!vd->vdev_initialize_exit_wanted);
672094e47e9SGeorge Wilson 	ASSERT(!vd->vdev_top->vdev_removing);
673094e47e9SGeorge Wilson 
674094e47e9SGeorge Wilson 	vdev_initialize_change_state(vd, VDEV_INITIALIZE_ACTIVE);
675094e47e9SGeorge Wilson 	vd->vdev_initialize_thread = thread_create(NULL, 0,
676094e47e9SGeorge Wilson 	    vdev_initialize_thread, vd, 0, &p0, TS_RUN, maxclsyspri);
677094e47e9SGeorge Wilson }
678094e47e9SGeorge Wilson 
679094e47e9SGeorge Wilson /*
680094e47e9SGeorge Wilson  * Stop initializng a device, with the resultant initialing state being
681094e47e9SGeorge Wilson  * tgt_state. Blocks until the initializing thread has exited.
682094e47e9SGeorge Wilson  * Caller must hold vdev_initialize_lock and must not be writing to the spa
683094e47e9SGeorge Wilson  * config, as the initializing thread may try to enter the config as a reader
684094e47e9SGeorge Wilson  * before exiting.
685094e47e9SGeorge Wilson  */
686094e47e9SGeorge Wilson void
687094e47e9SGeorge Wilson vdev_initialize_stop(vdev_t *vd, vdev_initializing_state_t tgt_state)
688094e47e9SGeorge Wilson {
689094e47e9SGeorge Wilson 	spa_t *spa = vd->vdev_spa;
690094e47e9SGeorge Wilson 	ASSERT(!spa_config_held(spa, SCL_CONFIG | SCL_STATE, RW_WRITER));
691094e47e9SGeorge Wilson 
692094e47e9SGeorge Wilson 	ASSERT(MUTEX_HELD(&vd->vdev_initialize_lock));
693094e47e9SGeorge Wilson 	ASSERT(vd->vdev_ops->vdev_op_leaf);
694094e47e9SGeorge Wilson 	ASSERT(vdev_is_concrete(vd));
695094e47e9SGeorge Wilson 
696094e47e9SGeorge Wilson 	/*
697094e47e9SGeorge Wilson 	 * Allow cancel requests to proceed even if the initialize thread
698094e47e9SGeorge Wilson 	 * has stopped.
699094e47e9SGeorge Wilson 	 */
700094e47e9SGeorge Wilson 	if (vd->vdev_initialize_thread == NULL &&
701094e47e9SGeorge Wilson 	    tgt_state != VDEV_INITIALIZE_CANCELED) {
702094e47e9SGeorge Wilson 		return;
703094e47e9SGeorge Wilson 	}
704094e47e9SGeorge Wilson 
705094e47e9SGeorge Wilson 	vdev_initialize_change_state(vd, tgt_state);
706094e47e9SGeorge Wilson 	vd->vdev_initialize_exit_wanted = B_TRUE;
707094e47e9SGeorge Wilson 	while (vd->vdev_initialize_thread != NULL)
708094e47e9SGeorge Wilson 		cv_wait(&vd->vdev_initialize_cv, &vd->vdev_initialize_lock);
709094e47e9SGeorge Wilson 
710094e47e9SGeorge Wilson 	ASSERT3P(vd->vdev_initialize_thread, ==, NULL);
711094e47e9SGeorge Wilson 	vd->vdev_initialize_exit_wanted = B_FALSE;
712094e47e9SGeorge Wilson }
713094e47e9SGeorge Wilson 
714094e47e9SGeorge Wilson static void
715094e47e9SGeorge Wilson vdev_initialize_stop_all_impl(vdev_t *vd, vdev_initializing_state_t tgt_state)
716094e47e9SGeorge Wilson {
717094e47e9SGeorge Wilson 	if (vd->vdev_ops->vdev_op_leaf && vdev_is_concrete(vd)) {
718094e47e9SGeorge Wilson 		mutex_enter(&vd->vdev_initialize_lock);
719094e47e9SGeorge Wilson 		vdev_initialize_stop(vd, tgt_state);
720094e47e9SGeorge Wilson 		mutex_exit(&vd->vdev_initialize_lock);
721094e47e9SGeorge Wilson 		return;
722094e47e9SGeorge Wilson 	}
723094e47e9SGeorge Wilson 
724094e47e9SGeorge Wilson 	for (uint64_t i = 0; i < vd->vdev_children; i++) {
725094e47e9SGeorge Wilson 		vdev_initialize_stop_all_impl(vd->vdev_child[i], tgt_state);
726094e47e9SGeorge Wilson 	}
727094e47e9SGeorge Wilson }
728094e47e9SGeorge Wilson 
729094e47e9SGeorge Wilson /*
730094e47e9SGeorge Wilson  * Convenience function to stop initializing of a vdev tree and set all
731094e47e9SGeorge Wilson  * initialize thread pointers to NULL.
732094e47e9SGeorge Wilson  */
733094e47e9SGeorge Wilson void
734094e47e9SGeorge Wilson vdev_initialize_stop_all(vdev_t *vd, vdev_initializing_state_t tgt_state)
735094e47e9SGeorge Wilson {
736094e47e9SGeorge Wilson 	vdev_initialize_stop_all_impl(vd, tgt_state);
737094e47e9SGeorge Wilson 
738094e47e9SGeorge Wilson 	if (vd->vdev_spa->spa_sync_on) {
739094e47e9SGeorge Wilson 		/* Make sure that our state has been synced to disk */
740094e47e9SGeorge Wilson 		txg_wait_synced(spa_get_dsl(vd->vdev_spa), 0);
741094e47e9SGeorge Wilson 	}
742094e47e9SGeorge Wilson }
743094e47e9SGeorge Wilson 
744094e47e9SGeorge Wilson void
745094e47e9SGeorge Wilson vdev_initialize_restart(vdev_t *vd)
746094e47e9SGeorge Wilson {
747094e47e9SGeorge Wilson 	ASSERT(MUTEX_HELD(&spa_namespace_lock));
748094e47e9SGeorge Wilson 	ASSERT(!spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER));
749094e47e9SGeorge Wilson 
750094e47e9SGeorge Wilson 	if (vd->vdev_leaf_zap != 0) {
751094e47e9SGeorge Wilson 		mutex_enter(&vd->vdev_initialize_lock);
752094e47e9SGeorge Wilson 		uint64_t initialize_state = VDEV_INITIALIZE_NONE;
753094e47e9SGeorge Wilson 		int err = zap_lookup(vd->vdev_spa->spa_meta_objset,
754094e47e9SGeorge Wilson 		    vd->vdev_leaf_zap, VDEV_LEAF_ZAP_INITIALIZE_STATE,
755094e47e9SGeorge Wilson 		    sizeof (initialize_state), 1, &initialize_state);
756094e47e9SGeorge Wilson 		ASSERT(err == 0 || err == ENOENT);
757094e47e9SGeorge Wilson 		vd->vdev_initialize_state = initialize_state;
758094e47e9SGeorge Wilson 
759094e47e9SGeorge Wilson 		uint64_t timestamp = 0;
760094e47e9SGeorge Wilson 		err = zap_lookup(vd->vdev_spa->spa_meta_objset,
761094e47e9SGeorge Wilson 		    vd->vdev_leaf_zap, VDEV_LEAF_ZAP_INITIALIZE_ACTION_TIME,
762094e47e9SGeorge Wilson 		    sizeof (timestamp), 1, &timestamp);
763094e47e9SGeorge Wilson 		ASSERT(err == 0 || err == ENOENT);
764094e47e9SGeorge Wilson 		vd->vdev_initialize_action_time = (time_t)timestamp;
765094e47e9SGeorge Wilson 
766094e47e9SGeorge Wilson 		if (vd->vdev_initialize_state == VDEV_INITIALIZE_SUSPENDED ||
767094e47e9SGeorge Wilson 		    vd->vdev_offline) {
768094e47e9SGeorge Wilson 			/* load progress for reporting, but don't resume */
769094e47e9SGeorge Wilson 			vdev_initialize_load(vd);
770094e47e9SGeorge Wilson 		} else if (vd->vdev_initialize_state ==
771094e47e9SGeorge Wilson 		    VDEV_INITIALIZE_ACTIVE && vdev_writeable(vd)) {
772094e47e9SGeorge Wilson 			vdev_initialize(vd);
773094e47e9SGeorge Wilson 		}
774094e47e9SGeorge Wilson 
775094e47e9SGeorge Wilson 		mutex_exit(&vd->vdev_initialize_lock);
776094e47e9SGeorge Wilson 	}
777094e47e9SGeorge Wilson 
778094e47e9SGeorge Wilson 	for (uint64_t i = 0; i < vd->vdev_children; i++) {
779094e47e9SGeorge Wilson 		vdev_initialize_restart(vd->vdev_child[i]);
780094e47e9SGeorge Wilson 	}
781094e47e9SGeorge Wilson }
782