xref: /illumos-gate/usr/src/uts/common/fs/zfs/vdev_indirect.c (revision 667ec66f1b4f491d5e839644e0912cad1c9e7122)
1 /*
2  * CDDL HEADER START
3  *
4  * This file and its contents are supplied under the terms of the
5  * Common Development and Distribution License ("CDDL"), version 1.0.
6  * You may only use this file in accordance with the terms of version
7  * 1.0 of the CDDL.
8  *
9  * A full copy of the text of the CDDL should have accompanied this
10  * source.  A copy of the CDDL is also available via the Internet at
11  * http://www.illumos.org/license/CDDL.
12  *
13  * CDDL HEADER END
14  */
15 
16 /*
17  * Copyright (c) 2014, 2015 by Delphix. All rights reserved.
18  */
19 
20 #include <sys/zfs_context.h>
21 #include <sys/spa.h>
22 #include <sys/spa_impl.h>
23 #include <sys/vdev_impl.h>
24 #include <sys/fs/zfs.h>
25 #include <sys/zio.h>
26 #include <sys/metaslab.h>
27 #include <sys/refcount.h>
28 #include <sys/dmu.h>
29 #include <sys/vdev_indirect_mapping.h>
30 #include <sys/dmu_tx.h>
31 #include <sys/dsl_synctask.h>
32 #include <sys/zap.h>
33 #include <sys/abd.h>
34 #include <sys/zthr.h>
35 
36 /*
37  * An indirect vdev corresponds to a vdev that has been removed.  Since
38  * we cannot rewrite block pointers of snapshots, etc., we keep a
39  * mapping from old location on the removed device to the new location
40  * on another device in the pool and use this mapping whenever we need
41  * to access the DVA.  Unfortunately, this mapping did not respect
42  * logical block boundaries when it was first created, and so a DVA on
43  * this indirect vdev may be "split" into multiple sections that each
44  * map to a different location.  As a consequence, not all DVAs can be
45  * translated to an equivalent new DVA.  Instead we must provide a
46  * "vdev_remap" operation that executes a callback on each contiguous
47  * segment of the new location.  This function is used in multiple ways:
48  *
49  *  - reads and repair writes to this device use the callback to create
50  *    a child io for each mapped segment.
51  *
52  *  - frees and claims to this device use the callback to free or claim
53  *    each mapped segment.  (Note that we don't actually need to claim
54  *    log blocks on indirect vdevs, because we don't allocate to
55  *    removing vdevs.  However, zdb uses zio_claim() for its leak
56  *    detection.)
57  */
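
/*
 * Illustrative sketch (editorial addition, not part of the original source):
 * one way a caller could use the "vdev_remap" operation described above,
 * assuming the spa config lock is held as the implementation asserts.  The
 * callback skips indirect vdevs and tallies the bytes that land on concrete
 * vdevs.  The callback name, the "dva" variable, and the surrounding caller
 * are hypothetical; only vdev_indirect_remap() and its callback signature
 * come from this file.
 *
 *	static void
 *	count_concrete_bytes_cb(uint64_t split_offset, vdev_t *vd,
 *	    uint64_t offset, uint64_t size, void *arg)
 *	{
 *		uint64_t *totalp = arg;
 *
 *		if (vd->vdev_ops != &vdev_indirect_ops)
 *			*totalp += size;
 *	}
 *
 *	uint64_t total = 0;
 *	vdev_indirect_remap(vd, DVA_GET_OFFSET(dva), DVA_GET_ASIZE(dva),
 *	    count_concrete_bytes_cb, &total);
 */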
58 
59 /*
60  * "Big theory statement" for how we mark blocks obsolete.
61  *
62  * When a block on an indirect vdev is freed or remapped, a section of
63  * that vdev's mapping may no longer be referenced (aka "obsolete").  We
64  * keep track of how much of each mapping entry is obsolete.  When
65  * an entry becomes completely obsolete, we can remove it, thus reducing
66  * the memory used by the mapping.  The complete picture of obsolescence
67  * is given by the following data structures, described below:
68  *  - the entry-specific obsolete count
69  *  - the vdev-specific obsolete spacemap
70  *  - the pool-specific obsolete bpobj
71  *
72  * == On disk data structures used ==
73  *
74  * We track the obsolete space for the pool using several objects.  Each
75  * of these objects is created on demand and freed when no longer
76  * needed, and is assumed to be empty if it does not exist.
77  * SPA_FEATURE_OBSOLETE_COUNTS includes the count of these objects.
78  *
79  *  - Each vic_mapping_object (associated with an indirect vdev) can
80  *    have a vimp_counts_object.  This is an array of uint32_t's
81  *    with the same number of entries as the vic_mapping_object.  When
82  *    the mapping is condensed, entries from the vic_obsolete_sm_object
83  *    (see below) are folded into the counts.  Therefore, each
84  *    obsolete_counts entry tells us the number of bytes in the
85  *    corresponding mapping entry that were not referenced when the
86  *    mapping was last condensed.
87  *
88  *  - Each indirect or removing vdev can have a vic_obsolete_sm_object.
89  *    This is a space map containing an alloc entry for every DVA that
90  *    has been obsoleted since the last time this indirect vdev was
91  *    condensed.  We use this object in order to improve performance
92  *    when marking a DVA as obsolete.  Instead of modifying an arbitrary
93  *    offset of the vimp_counts_object, we only need to append an entry
94  *    to the end of this object.  When a DVA becomes obsolete, it is
95  *    added to the obsolete space map.  This happens when the DVA is
96  *    freed, remapped and not referenced by a snapshot, or the last
97  *    snapshot referencing it is destroyed.
98  *
99  *  - Each dataset can have a ds_remap_deadlist object.  This is a
100  *    deadlist object containing all blocks that were remapped in this
101  *    dataset but referenced in a previous snapshot.  Blocks can *only*
102  *    appear on this list if they were remapped (dsl_dataset_block_remapped);
103  *    blocks that were killed in a head dataset are put on the normal
104  *    ds_deadlist and marked obsolete when they are freed.
105  *
106  *  - The pool can have a dp_obsolete_bpobj.  This is a list of blocks
107  *    in the pool that need to be marked obsolete.  When a snapshot is
108  *    destroyed, we move some of the ds_remap_deadlist to the obsolete
109  *    bpobj (see dsl_destroy_snapshot_handle_remaps()).  We then
110  *    asynchronously process the obsolete bpobj, moving its entries to
111  *    the specific vdevs' obsolete space maps.
112  *
113  * == Summary of how we mark blocks as obsolete ==
114  *
115  * - When freeing a block: if any DVA is on an indirect vdev, append to
116  *   vic_obsolete_sm_object.
117  * - When remapping a block: add the DVA to ds_remap_deadlist if the prev
118  *   snapshot references it; otherwise append to vic_obsolete_sm_object.
119  * - When freeing a snapshot: move parts of ds_remap_deadlist to
120  *   dp_obsolete_bpobj (same algorithm as ds_deadlist).
121  * - When syncing the spa: process dp_obsolete_bpobj, moving ranges to
122  *   individual vdev's vic_obsolete_sm_object.
123  */
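
/*
 * Illustrative sketch (editorial addition, not part of the original source):
 * how the "when freeing a block" rule above might look for a block whose
 * first DVA is on an indirect vdev.  The function name and the bp/tx passed
 * in from the surrounding free path (in syncing context) are hypothetical;
 * vdev_indirect_mark_obsolete() is the real entry point that records the
 * segment so it is later written to the vdev's obsolete space map.
 *
 *	static void
 *	example_mark_dva_obsolete(spa_t *spa, const blkptr_t *bp, dmu_tx_t *tx)
 *	{
 *		const dva_t *dva = &bp->blk_dva[0];
 *		vdev_t *vd = vdev_lookup_top(spa, DVA_GET_VDEV(dva));
 *
 *		if (vd->vdev_ops == &vdev_indirect_ops) {
 *			vdev_indirect_mark_obsolete(vd, DVA_GET_OFFSET(dva),
 *			    DVA_GET_ASIZE(dva), dmu_tx_get_txg(tx));
 *		}
 *	}
 */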
124 
125 /*
126  * "Big theory statement" for how we condense indirect vdevs.
127  *
128  * Condensing an indirect vdev's mapping is the process of determining
129  * the precise counts of obsolete space for each mapping entry (by
130  * integrating the obsolete spacemap into the obsolete counts) and
131  * writing out a new mapping that contains only referenced entries.
132  *
133  * We condense a vdev when we expect the mapping to shrink (see
134  * vdev_indirect_should_condense()), but only perform one condense at a
135  * time to limit the memory usage.  In addition, we use a separate
136  * open-context thread (spa_condense_indirect_thread) to incrementally
137  * create the new mapping object in a way that minimizes the impact on
138  * the rest of the system.
139  *
140  * == Generating a new mapping ==
141  *
142  * To generate a new mapping, we follow these steps:
143  *
144  * 1. Save the old obsolete space map and create a new mapping object
145  *    (see spa_condense_indirect_start_sync()).  This initializes the
146  *    spa_condensing_indirect_phys with the "previous obsolete space map",
147  *    which is now read only.  Newly obsolete DVAs will be added to a
148  *    new (initially empty) obsolete space map, and will not be
149  *    considered as part of this condense operation.
150  *
151  * 2. Construct in memory the precise counts of obsolete space for each
152  *    mapping entry, by incorporating the obsolete space map into the
153  *    counts.  (See vdev_indirect_mapping_load_obsolete_{counts,spacemap}().)
154  *
155  * 3. Iterate through each mapping entry, writing to the new mapping any
156  *    entries that are not completely obsolete (i.e. which don't have
157  *    obsolete count == mapping length).  (See
158  *    spa_condense_indirect_generate_new_mapping().)
159  *
160  * 4. Destroy the old mapping object and switch over to the new one
161  *    (spa_condense_indirect_complete_sync).
162  *
163  * == Restarting from failure ==
164  *
165  * To restart the condense when we import/open the pool, we must start
166  * at the 2nd step above: reconstruct the precise counts in memory,
167  * based on the space map + counts.  Then in the 3rd step, we start
168  * iterating where we left off: at vimp_max_offset of the new mapping
169  * object.
170  */
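
/*
 * Illustrative sketch (editorial addition, not part of the original source):
 * how the numbered steps above map onto the functions in this file.  The
 * wrapper below and its syncing-context caller are hypothetical; the
 * functions it names are the real entry points.
 *
 *	static boolean_t
 *	example_maybe_condense(vdev_t *vd, dmu_tx_t *tx)
 *	{
 *		if (!vdev_indirect_should_condense(vd))
 *			return (B_FALSE);
 *
 *		spa_condense_indirect_start_sync(vd, tx);
 *		return (B_TRUE);
 *	}
 *
 * spa_condense_indirect_start_sync() performs step 1.  The zthr then wakes
 * up and runs spa_condense_indirect_thread() in open context (steps 2 and
 * 3), which finishes by issuing spa_condense_indirect_complete_sync() as a
 * sync task (step 4).
 */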
171 
172 boolean_t zfs_condense_indirect_vdevs_enable = B_TRUE;
173 
174 /*
175  * Condense if at least this percent of the bytes in the mapping is
176  * obsolete.  With the default of 25%, the amount of space mapped
177  * will be reduced to 1% of its original size after at most 16
178  * condenses.  Higher values will condense less often (causing less
179  * i/o); lower values will reduce the mapping size more quickly.
180  */
181 int zfs_indirect_condense_obsolete_pct = 25;
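
/*
 * Editorial note: the "1% after at most 16 condenses" figure above follows
 * directly from the threshold: each condense triggered by this rule removes
 * at least 25% of the mapped bytes, leaving at most 75%, and
 * 0.75^16 ~= 0.010, i.e. about 1% of the original mapping.
 */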
182 
183 /*
184  * Condense if the obsolete space map takes up more than this amount of
185  * space on disk (logically).  This limits the amount of disk space
186  * consumed by the obsolete space map; the default of 1GB is small enough
187  * that we typically don't mind "wasting" it.
188  */
189 uint64_t zfs_condense_max_obsolete_bytes = 1024 * 1024 * 1024;
190 
191 /*
192  * Don't bother condensing if the mapping uses less than this amount of
193  * memory.  The default of 128KB is considered a "trivial" amount of
194  * memory and not worth reducing.
195  */
196 uint64_t zfs_condense_min_mapping_bytes = 128 * 1024;
197 
198 /*
199  * This is used by the test suite so that it can ensure that certain
200  * actions happen while in the middle of a condense (which might otherwise
201  * complete too quickly).  If used to reduce the performance impact of
202  * condensing in production, a maximum value of 1 should be sufficient.
203  */
204 int zfs_condense_indirect_commit_entry_delay_ticks = 0;
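
/*
 * Editorial note: on illumos a tunable like this would normally be set at
 * boot from /etc/system (or patched live with mdb -kw).  The line below is
 * offered as an example of that convention, not taken from this file:
 *
 *	set zfs:zfs_condense_indirect_commit_entry_delay_ticks = 1
 */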
205 
206 /*
207  * Mark the given offset and size as being obsolete in the given txg.
208  */
209 void
210 vdev_indirect_mark_obsolete(vdev_t *vd, uint64_t offset, uint64_t size,
211     uint64_t txg)
212 {
213 	spa_t *spa = vd->vdev_spa;
214 	ASSERT3U(spa_syncing_txg(spa), ==, txg);
215 	ASSERT3U(vd->vdev_indirect_config.vic_mapping_object, !=, 0);
216 	ASSERT(vd->vdev_removing || vd->vdev_ops == &vdev_indirect_ops);
217 	ASSERT(size > 0);
218 	VERIFY(vdev_indirect_mapping_entry_for_offset(
219 	    vd->vdev_indirect_mapping, offset) != NULL);
220 
221 	if (spa_feature_is_enabled(spa, SPA_FEATURE_OBSOLETE_COUNTS)) {
222 		mutex_enter(&vd->vdev_obsolete_lock);
223 		range_tree_add(vd->vdev_obsolete_segments, offset, size);
224 		mutex_exit(&vd->vdev_obsolete_lock);
225 		vdev_dirty(vd, 0, NULL, txg);
226 	}
227 }
228 
229 /*
230  * Mark the DVA vdev_id:offset:size as being obsolete in the given tx. This
231  * wrapper is provided because the DMU does not know about vdev_t's and
232  * cannot directly call vdev_indirect_mark_obsolete.
233  */
234 void
235 spa_vdev_indirect_mark_obsolete(spa_t *spa, uint64_t vdev_id, uint64_t offset,
236     uint64_t size, dmu_tx_t *tx)
237 {
238 	vdev_t *vd = vdev_lookup_top(spa, vdev_id);
239 	ASSERT(dmu_tx_is_syncing(tx));
240 
241 	/* The DMU can only remap indirect vdevs. */
242 	ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops);
243 	vdev_indirect_mark_obsolete(vd, offset, size, dmu_tx_get_txg(tx));
244 }
245 
246 static spa_condensing_indirect_t *
247 spa_condensing_indirect_create(spa_t *spa)
248 {
249 	spa_condensing_indirect_phys_t *scip =
250 	    &spa->spa_condensing_indirect_phys;
251 	spa_condensing_indirect_t *sci = kmem_zalloc(sizeof (*sci), KM_SLEEP);
252 	objset_t *mos = spa->spa_meta_objset;
253 
254 	for (int i = 0; i < TXG_SIZE; i++) {
255 		list_create(&sci->sci_new_mapping_entries[i],
256 		    sizeof (vdev_indirect_mapping_entry_t),
257 		    offsetof(vdev_indirect_mapping_entry_t, vime_node));
258 	}
259 
260 	sci->sci_new_mapping =
261 	    vdev_indirect_mapping_open(mos, scip->scip_next_mapping_object);
262 
263 	return (sci);
264 }
265 
266 static void
267 spa_condensing_indirect_destroy(spa_condensing_indirect_t *sci)
268 {
269 	for (int i = 0; i < TXG_SIZE; i++)
270 		list_destroy(&sci->sci_new_mapping_entries[i]);
271 
272 	if (sci->sci_new_mapping != NULL)
273 		vdev_indirect_mapping_close(sci->sci_new_mapping);
274 
275 	kmem_free(sci, sizeof (*sci));
276 }
277 
278 boolean_t
279 vdev_indirect_should_condense(vdev_t *vd)
280 {
281 	vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
282 	spa_t *spa = vd->vdev_spa;
283 
284 	ASSERT(dsl_pool_sync_context(spa->spa_dsl_pool));
285 
286 	if (!zfs_condense_indirect_vdevs_enable)
287 		return (B_FALSE);
288 
289 	/*
290 	 * We can only condense one indirect vdev at a time.
291 	 */
292 	if (spa->spa_condensing_indirect != NULL)
293 		return (B_FALSE);
294 
295 	if (spa_shutting_down(spa))
296 		return (B_FALSE);
297 
298 	/*
299 	 * The mapping object size must not change while we are
300 	 * condensing, so we can only condense indirect vdevs
301 	 * (not vdevs that are still in the middle of being removed).
302 	 */
303 	if (vd->vdev_ops != &vdev_indirect_ops)
304 		return (B_FALSE);
305 
306 	/*
307 	 * If nothing new has been marked obsolete, there is no
308 	 * point in condensing.
309 	 */
310 	if (vd->vdev_obsolete_sm == NULL) {
311 		ASSERT0(vdev_obsolete_sm_object(vd));
312 		return (B_FALSE);
313 	}
314 
315 	ASSERT(vd->vdev_obsolete_sm != NULL);
316 
317 	ASSERT3U(vdev_obsolete_sm_object(vd), ==,
318 	    space_map_object(vd->vdev_obsolete_sm));
319 
320 	uint64_t bytes_mapped = vdev_indirect_mapping_bytes_mapped(vim);
321 	uint64_t bytes_obsolete = space_map_allocated(vd->vdev_obsolete_sm);
322 	uint64_t mapping_size = vdev_indirect_mapping_size(vim);
323 	uint64_t obsolete_sm_size = space_map_length(vd->vdev_obsolete_sm);
324 
325 	ASSERT3U(bytes_obsolete, <=, bytes_mapped);
326 
327 	/*
328 	 * If a high percentage of the bytes that are mapped have become
329 	 * obsolete, condense (unless the mapping is already small enough).
330 	 * This has a good chance of reducing the amount of memory used
331 	 * by the mapping.
332 	 */
333 	if (bytes_obsolete * 100 / bytes_mapped >=
334 	    zfs_indirect_condense_obsolete_pct &&
335 	    mapping_size > zfs_condense_min_mapping_bytes) {
336 		zfs_dbgmsg("should condense vdev %llu because obsolete "
337 		    "spacemap covers %d%% of %lluMB mapping",
338 		    (u_longlong_t)vd->vdev_id,
339 		    (int)(bytes_obsolete * 100 / bytes_mapped),
340 		    (u_longlong_t)bytes_mapped / 1024 / 1024);
341 		return (B_TRUE);
342 	}
343 
344 	/*
345 	 * If the obsolete space map takes up too much space on disk,
346 	 * condense in order to free up this disk space.
347 	 */
348 	if (obsolete_sm_size >= zfs_condense_max_obsolete_bytes) {
349 		zfs_dbgmsg("should condense vdev %llu because obsolete sm "
350 		    "length %lluMB >= max size %lluMB",
351 		    (u_longlong_t)vd->vdev_id,
352 		    (u_longlong_t)obsolete_sm_size / 1024 / 1024,
353 		    (u_longlong_t)zfs_condense_max_obsolete_bytes /
354 		    1024 / 1024);
355 		return (B_TRUE);
356 	}
357 
358 	return (B_FALSE);
359 }
360 
361 /*
362  * This sync task completes (finishes) a condense, deleting the old
363  * mapping and replacing it with the new one.
364  */
365 static void
366 spa_condense_indirect_complete_sync(void *arg, dmu_tx_t *tx)
367 {
368 	spa_condensing_indirect_t *sci = arg;
369 	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
370 	spa_condensing_indirect_phys_t *scip =
371 	    &spa->spa_condensing_indirect_phys;
372 	vdev_t *vd = vdev_lookup_top(spa, scip->scip_vdev);
373 	vdev_indirect_config_t *vic = &vd->vdev_indirect_config;
374 	objset_t *mos = spa->spa_meta_objset;
375 	vdev_indirect_mapping_t *old_mapping = vd->vdev_indirect_mapping;
376 	uint64_t old_count = vdev_indirect_mapping_num_entries(old_mapping);
377 	uint64_t new_count =
378 	    vdev_indirect_mapping_num_entries(sci->sci_new_mapping);
379 
380 	ASSERT(dmu_tx_is_syncing(tx));
381 	ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops);
382 	ASSERT3P(sci, ==, spa->spa_condensing_indirect);
383 	for (int i = 0; i < TXG_SIZE; i++) {
384 		ASSERT(list_is_empty(&sci->sci_new_mapping_entries[i]));
385 	}
386 	ASSERT(vic->vic_mapping_object != 0);
387 	ASSERT3U(vd->vdev_id, ==, scip->scip_vdev);
388 	ASSERT(scip->scip_next_mapping_object != 0);
389 	ASSERT(scip->scip_prev_obsolete_sm_object != 0);
390 
391 	/*
392 	 * Reset vdev_indirect_mapping to refer to the new object.
393 	 */
394 	rw_enter(&vd->vdev_indirect_rwlock, RW_WRITER);
395 	vdev_indirect_mapping_close(vd->vdev_indirect_mapping);
396 	vd->vdev_indirect_mapping = sci->sci_new_mapping;
397 	rw_exit(&vd->vdev_indirect_rwlock);
398 
399 	sci->sci_new_mapping = NULL;
400 	vdev_indirect_mapping_free(mos, vic->vic_mapping_object, tx);
401 	vic->vic_mapping_object = scip->scip_next_mapping_object;
402 	scip->scip_next_mapping_object = 0;
403 
404 	space_map_free_obj(mos, scip->scip_prev_obsolete_sm_object, tx);
405 	spa_feature_decr(spa, SPA_FEATURE_OBSOLETE_COUNTS, tx);
406 	scip->scip_prev_obsolete_sm_object = 0;
407 
408 	scip->scip_vdev = 0;
409 
410 	VERIFY0(zap_remove(mos, DMU_POOL_DIRECTORY_OBJECT,
411 	    DMU_POOL_CONDENSING_INDIRECT, tx));
412 	spa_condensing_indirect_destroy(spa->spa_condensing_indirect);
413 	spa->spa_condensing_indirect = NULL;
414 
415 	zfs_dbgmsg("finished condense of vdev %llu in txg %llu: "
416 	    "new mapping object %llu has %llu entries "
417 	    "(was %llu entries)",
418 	    vd->vdev_id, dmu_tx_get_txg(tx), vic->vic_mapping_object,
419 	    new_count, old_count);
420 
421 	vdev_config_dirty(spa->spa_root_vdev);
422 }
423 
424 /*
425  * This sync task appends entries to the new mapping object.
426  */
427 static void
428 spa_condense_indirect_commit_sync(void *arg, dmu_tx_t *tx)
429 {
430 	spa_condensing_indirect_t *sci = arg;
431 	uint64_t txg = dmu_tx_get_txg(tx);
432 	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
433 
434 	ASSERT(dmu_tx_is_syncing(tx));
435 	ASSERT3P(sci, ==, spa->spa_condensing_indirect);
436 
437 	vdev_indirect_mapping_add_entries(sci->sci_new_mapping,
438 	    &sci->sci_new_mapping_entries[txg & TXG_MASK], tx);
439 	ASSERT(list_is_empty(&sci->sci_new_mapping_entries[txg & TXG_MASK]));
440 }
441 
442 /*
443  * Open-context function to add one entry to the new mapping.  The new
444  * entry will be remembered and written from syncing context.
445  */
446 static void
447 spa_condense_indirect_commit_entry(spa_t *spa,
448     vdev_indirect_mapping_entry_phys_t *vimep, uint32_t count)
449 {
450 	spa_condensing_indirect_t *sci = spa->spa_condensing_indirect;
451 
452 	ASSERT3U(count, <, DVA_GET_ASIZE(&vimep->vimep_dst));
453 
454 	dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
455 	dmu_tx_hold_space(tx, sizeof (*vimep) + sizeof (count));
456 	VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
457 	int txgoff = dmu_tx_get_txg(tx) & TXG_MASK;
458 
459 	/*
460 	 * If we are the first entry committed this txg, kick off the sync
461 	 * task to write to the MOS on our behalf.
462 	 */
463 	if (list_is_empty(&sci->sci_new_mapping_entries[txgoff])) {
464 		dsl_sync_task_nowait(dmu_tx_pool(tx),
465 		    spa_condense_indirect_commit_sync, sci,
466 		    0, ZFS_SPACE_CHECK_NONE, tx);
467 	}
468 
469 	vdev_indirect_mapping_entry_t *vime =
470 	    kmem_alloc(sizeof (*vime), KM_SLEEP);
471 	vime->vime_mapping = *vimep;
472 	vime->vime_obsolete_count = count;
473 	list_insert_tail(&sci->sci_new_mapping_entries[txgoff], vime);
474 
475 	dmu_tx_commit(tx);
476 }
477 
478 static void
479 spa_condense_indirect_generate_new_mapping(vdev_t *vd,
480     uint32_t *obsolete_counts, uint64_t start_index, zthr_t *zthr)
481 {
482 	spa_t *spa = vd->vdev_spa;
483 	uint64_t mapi = start_index;
484 	vdev_indirect_mapping_t *old_mapping = vd->vdev_indirect_mapping;
485 	uint64_t old_num_entries =
486 	    vdev_indirect_mapping_num_entries(old_mapping);
487 
488 	ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops);
489 	ASSERT3U(vd->vdev_id, ==, spa->spa_condensing_indirect_phys.scip_vdev);
490 
491 	zfs_dbgmsg("starting condense of vdev %llu from index %llu",
492 	    (u_longlong_t)vd->vdev_id,
493 	    (u_longlong_t)mapi);
494 
495 	while (mapi < old_num_entries) {
496 
497 		if (zthr_iscancelled(zthr)) {
498 			zfs_dbgmsg("pausing condense of vdev %llu "
499 			    "at index %llu", (u_longlong_t)vd->vdev_id,
500 			    (u_longlong_t)mapi);
501 			break;
502 		}
503 
504 		vdev_indirect_mapping_entry_phys_t *entry =
505 		    &old_mapping->vim_entries[mapi];
506 		uint64_t entry_size = DVA_GET_ASIZE(&entry->vimep_dst);
507 		ASSERT3U(obsolete_counts[mapi], <=, entry_size);
508 		if (obsolete_counts[mapi] < entry_size) {
509 			spa_condense_indirect_commit_entry(spa, entry,
510 			    obsolete_counts[mapi]);
511 
512 			/*
513 			 * This delay may be requested for testing, debugging,
514 			 * or performance reasons.
515 			 */
516 			delay(zfs_condense_indirect_commit_entry_delay_ticks);
517 		}
518 
519 		mapi++;
520 	}
521 }
522 
523 /* ARGSUSED */
524 static boolean_t
525 spa_condense_indirect_thread_check(void *arg, zthr_t *zthr)
526 {
527 	spa_t *spa = arg;
528 
529 	return (spa->spa_condensing_indirect != NULL);
530 }
531 
532 /* ARGSUSED */
533 static int
534 spa_condense_indirect_thread(void *arg, zthr_t *zthr)
535 {
536 	spa_t *spa = arg;
537 	vdev_t *vd;
538 
539 	ASSERT3P(spa->spa_condensing_indirect, !=, NULL);
540 	spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
541 	vd = vdev_lookup_top(spa, spa->spa_condensing_indirect_phys.scip_vdev);
542 	ASSERT3P(vd, !=, NULL);
543 	spa_config_exit(spa, SCL_VDEV, FTAG);
544 
545 	spa_condensing_indirect_t *sci = spa->spa_condensing_indirect;
546 	spa_condensing_indirect_phys_t *scip =
547 	    &spa->spa_condensing_indirect_phys;
548 	uint32_t *counts;
549 	uint64_t start_index;
550 	vdev_indirect_mapping_t *old_mapping = vd->vdev_indirect_mapping;
551 	space_map_t *prev_obsolete_sm = NULL;
552 
553 	ASSERT3U(vd->vdev_id, ==, scip->scip_vdev);
554 	ASSERT(scip->scip_next_mapping_object != 0);
555 	ASSERT(scip->scip_prev_obsolete_sm_object != 0);
556 	ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops);
557 
558 	for (int i = 0; i < TXG_SIZE; i++) {
559 		/*
560 		 * The list must start out empty in order for the
561 		 * _commit_sync() sync task to be properly registered
562 		 * on the first call to _commit_entry(); so it's wise
563 		 * to double check and ensure we actually are starting
564 		 * with empty lists.
565 		 */
566 		ASSERT(list_is_empty(&sci->sci_new_mapping_entries[i]));
567 	}
568 
569 	VERIFY0(space_map_open(&prev_obsolete_sm, spa->spa_meta_objset,
570 	    scip->scip_prev_obsolete_sm_object, 0, vd->vdev_asize, 0));
571 	space_map_update(prev_obsolete_sm);
572 	counts = vdev_indirect_mapping_load_obsolete_counts(old_mapping);
573 	if (prev_obsolete_sm != NULL) {
574 		vdev_indirect_mapping_load_obsolete_spacemap(old_mapping,
575 		    counts, prev_obsolete_sm);
576 	}
577 	space_map_close(prev_obsolete_sm);
578 
579 	/*
580 	 * Generate new mapping.  Determine what index to continue from
581 	 * based on the max offset that we've already written in the
582 	 * new mapping.
583 	 */
584 	uint64_t max_offset =
585 	    vdev_indirect_mapping_max_offset(sci->sci_new_mapping);
586 	if (max_offset == 0) {
587 		/* We haven't written anything to the new mapping yet. */
588 		start_index = 0;
589 	} else {
590 		/*
591 		 * Pick up from where we left off.
592 		 * _entry_for_offset_or_next() returns a pointer into the
593 		 * vim_entries array.  If max_offset is greater than any of
594 		 * the mappings contained in the table, NULL will be
595 		 * returned, indicating that we have exhausted our iteration
596 		 * of the old_mapping.
597 		 */
598 
599 		vdev_indirect_mapping_entry_phys_t *entry =
600 		    vdev_indirect_mapping_entry_for_offset_or_next(old_mapping,
601 		    max_offset);
602 
603 		if (entry == NULL) {
604 			/*
605 			 * We've already written the whole new mapping.
606 			 * This special value will cause us to skip the
607 			 * generate_new_mapping step and just do the sync
608 			 * task to complete the condense.
609 			 */
610 			start_index = UINT64_MAX;
611 		} else {
612 			start_index = entry - old_mapping->vim_entries;
613 			ASSERT3U(start_index, <,
614 			    vdev_indirect_mapping_num_entries(old_mapping));
615 		}
616 	}
617 
618 	spa_condense_indirect_generate_new_mapping(vd, counts,
619 	    start_index, zthr);
620 
621 	vdev_indirect_mapping_free_obsolete_counts(old_mapping, counts);
622 
623 	/*
624 	 * If the zthr has received a cancellation signal while running
625 	 * in generate_new_mapping() or at any point after that, then bail
626 	 * early. We don't want to complete the condense if the spa is
627 	 * shutting down.
628 	 */
629 	if (zthr_iscancelled(zthr))
630 		return (0);
631 
632 	VERIFY0(dsl_sync_task(spa_name(spa), NULL,
633 	    spa_condense_indirect_complete_sync, sci, 0, ZFS_SPACE_CHECK_NONE));
634 
635 	return (0);
636 }
637 
638 /*
639  * Sync task to begin the condensing process.
640  */
641 void
642 spa_condense_indirect_start_sync(vdev_t *vd, dmu_tx_t *tx)
643 {
644 	spa_t *spa = vd->vdev_spa;
645 	spa_condensing_indirect_phys_t *scip =
646 	    &spa->spa_condensing_indirect_phys;
647 
648 	ASSERT0(scip->scip_next_mapping_object);
649 	ASSERT0(scip->scip_prev_obsolete_sm_object);
650 	ASSERT0(scip->scip_vdev);
651 	ASSERT(dmu_tx_is_syncing(tx));
652 	ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops);
653 	ASSERT(spa_feature_is_active(spa, SPA_FEATURE_OBSOLETE_COUNTS));
654 	ASSERT(vdev_indirect_mapping_num_entries(vd->vdev_indirect_mapping));
655 
656 	uint64_t obsolete_sm_obj = vdev_obsolete_sm_object(vd);
657 	ASSERT(obsolete_sm_obj != 0);
658 
659 	scip->scip_vdev = vd->vdev_id;
660 	scip->scip_next_mapping_object =
661 	    vdev_indirect_mapping_alloc(spa->spa_meta_objset, tx);
662 
663 	scip->scip_prev_obsolete_sm_object = obsolete_sm_obj;
664 
665 	/*
666 	 * We don't need to allocate a new space map object, since
667 	 * vdev_indirect_sync_obsolete will allocate one when needed.
668 	 */
669 	space_map_close(vd->vdev_obsolete_sm);
670 	vd->vdev_obsolete_sm = NULL;
671 	VERIFY0(zap_remove(spa->spa_meta_objset, vd->vdev_top_zap,
672 	    VDEV_TOP_ZAP_INDIRECT_OBSOLETE_SM, tx));
673 
674 	VERIFY0(zap_add(spa->spa_dsl_pool->dp_meta_objset,
675 	    DMU_POOL_DIRECTORY_OBJECT,
676 	    DMU_POOL_CONDENSING_INDIRECT, sizeof (uint64_t),
677 	    sizeof (*scip) / sizeof (uint64_t), scip, tx));
678 
679 	ASSERT3P(spa->spa_condensing_indirect, ==, NULL);
680 	spa->spa_condensing_indirect = spa_condensing_indirect_create(spa);
681 
682 	zfs_dbgmsg("starting condense of vdev %llu in txg %llu: "
683 	    "posm=%llu nm=%llu",
684 	    vd->vdev_id, dmu_tx_get_txg(tx),
685 	    (u_longlong_t)scip->scip_prev_obsolete_sm_object,
686 	    (u_longlong_t)scip->scip_next_mapping_object);
687 
688 	zthr_wakeup(spa->spa_condense_zthr);
689 }
690 
691 /*
692  * Sync to the given vdev's obsolete space map any segments that are no longer
693  * referenced as of the given txg.
694  *
695  * If the obsolete space map doesn't exist yet, create and open it.
696  */
697 void
698 vdev_indirect_sync_obsolete(vdev_t *vd, dmu_tx_t *tx)
699 {
700 	spa_t *spa = vd->vdev_spa;
701 	vdev_indirect_config_t *vic = &vd->vdev_indirect_config;
702 
703 	ASSERT3U(vic->vic_mapping_object, !=, 0);
704 	ASSERT(range_tree_space(vd->vdev_obsolete_segments) > 0);
705 	ASSERT(vd->vdev_removing || vd->vdev_ops == &vdev_indirect_ops);
706 	ASSERT(spa_feature_is_enabled(spa, SPA_FEATURE_OBSOLETE_COUNTS));
707 
708 	if (vdev_obsolete_sm_object(vd) == 0) {
709 		uint64_t obsolete_sm_object =
710 		    space_map_alloc(spa->spa_meta_objset, tx);
711 
712 		ASSERT(vd->vdev_top_zap != 0);
713 		VERIFY0(zap_add(vd->vdev_spa->spa_meta_objset, vd->vdev_top_zap,
714 		    VDEV_TOP_ZAP_INDIRECT_OBSOLETE_SM,
715 		    sizeof (obsolete_sm_object), 1, &obsolete_sm_object, tx));
716 		ASSERT3U(vdev_obsolete_sm_object(vd), !=, 0);
717 
718 		spa_feature_incr(spa, SPA_FEATURE_OBSOLETE_COUNTS, tx);
719 		VERIFY0(space_map_open(&vd->vdev_obsolete_sm,
720 		    spa->spa_meta_objset, obsolete_sm_object,
721 		    0, vd->vdev_asize, 0));
722 		space_map_update(vd->vdev_obsolete_sm);
723 	}
724 
725 	ASSERT(vd->vdev_obsolete_sm != NULL);
726 	ASSERT3U(vdev_obsolete_sm_object(vd), ==,
727 	    space_map_object(vd->vdev_obsolete_sm));
728 
729 	space_map_write(vd->vdev_obsolete_sm,
730 	    vd->vdev_obsolete_segments, SM_ALLOC, tx);
731 	space_map_update(vd->vdev_obsolete_sm);
732 	range_tree_vacate(vd->vdev_obsolete_segments, NULL, NULL);
733 }
734 
735 int
736 spa_condense_init(spa_t *spa)
737 {
738 	int error = zap_lookup(spa->spa_meta_objset,
739 	    DMU_POOL_DIRECTORY_OBJECT,
740 	    DMU_POOL_CONDENSING_INDIRECT, sizeof (uint64_t),
741 	    sizeof (spa->spa_condensing_indirect_phys) / sizeof (uint64_t),
742 	    &spa->spa_condensing_indirect_phys);
743 	if (error == 0) {
744 		if (spa_writeable(spa)) {
745 			spa->spa_condensing_indirect =
746 			    spa_condensing_indirect_create(spa);
747 		}
748 		return (0);
749 	} else if (error == ENOENT) {
750 		return (0);
751 	} else {
752 		return (error);
753 	}
754 }
755 
756 void
757 spa_condense_fini(spa_t *spa)
758 {
759 	if (spa->spa_condensing_indirect != NULL) {
760 		spa_condensing_indirect_destroy(spa->spa_condensing_indirect);
761 		spa->spa_condensing_indirect = NULL;
762 	}
763 }
764 
765 void
766 spa_start_indirect_condensing_thread(spa_t *spa)
767 {
768 	ASSERT3P(spa->spa_condense_zthr, ==, NULL);
769 	spa->spa_condense_zthr = zthr_create(spa_condense_indirect_thread_check,
770 	    spa_condense_indirect_thread, spa);
771 }
772 
773 /*
774  * Gets the obsolete spacemap object from the vdev's ZAP.
775  * Returns the spacemap object, or 0 if it wasn't in the ZAP or the ZAP doesn't
776  * exist yet.
777  */
778 uint64_t
779 vdev_obsolete_sm_object(vdev_t *vd)
780 {
781 	ASSERT0(spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER));
782 	if (vd->vdev_top_zap == 0) {
783 		return (0);
784 	}
785 
786 	uint64_t sm_obj = 0;
787 	int err = zap_lookup(vd->vdev_spa->spa_meta_objset, vd->vdev_top_zap,
788 	    VDEV_TOP_ZAP_INDIRECT_OBSOLETE_SM, sizeof (sm_obj), 1, &sm_obj);
789 
790 	ASSERT(err == 0 || err == ENOENT);
791 
792 	return (sm_obj);
793 }
794 
795 boolean_t
796 vdev_obsolete_counts_are_precise(vdev_t *vd)
797 {
798 	ASSERT0(spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER));
799 	if (vd->vdev_top_zap == 0) {
800 		return (B_FALSE);
801 	}
802 
803 	uint64_t val = 0;
804 	int err = zap_lookup(vd->vdev_spa->spa_meta_objset, vd->vdev_top_zap,
805 	    VDEV_TOP_ZAP_OBSOLETE_COUNTS_ARE_PRECISE, sizeof (val), 1, &val);
806 
807 	ASSERT(err == 0 || err == ENOENT);
808 
809 	return (val != 0);
810 }
811 
812 /* ARGSUSED */
813 static void
814 vdev_indirect_close(vdev_t *vd)
815 {
816 }
817 
818 /* ARGSUSED */
819 static void
820 vdev_indirect_io_done(zio_t *zio)
821 {
822 }
823 
824 /* ARGSUSED */
825 static int
826 vdev_indirect_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize,
827     uint64_t *ashift)
828 {
829 	*psize = *max_psize = vd->vdev_asize +
830 	    VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE;
831 	*ashift = vd->vdev_ashift;
832 	return (0);
833 }
834 
835 typedef struct remap_segment {
836 	vdev_t *rs_vd;
837 	uint64_t rs_offset;
838 	uint64_t rs_asize;
839 	uint64_t rs_split_offset;
840 	list_node_t rs_node;
841 } remap_segment_t;
842 
843 remap_segment_t *
844 rs_alloc(vdev_t *vd, uint64_t offset, uint64_t asize, uint64_t split_offset)
845 {
846 	remap_segment_t *rs = kmem_alloc(sizeof (remap_segment_t), KM_SLEEP);
847 	rs->rs_vd = vd;
848 	rs->rs_offset = offset;
849 	rs->rs_asize = asize;
850 	rs->rs_split_offset = split_offset;
851 	return (rs);
852 }
853 
854 /*
855  * Goes through the relevant indirect mappings until it hits a concrete vdev
856  * and issues the callback. On the way to the concrete vdev, if any other
857  * indirect vdevs are encountered, then the callback will also be called on
858  * each of those indirect vdevs. For example, if the segment is mapped to
859  * segment A on indirect vdev 1, and then segment A on indirect vdev 1 is
860  * mapped to segment B on concrete vdev 2, then the callback will be called on
861  * both vdev 1 and vdev 2.
862  *
863  * While the callback passed to vdev_indirect_remap() is called on every vdev
864  * the function encounters, certain callbacks only care about concrete vdevs.
865  * These types of callbacks should return immediately and explicitly when they
866  * are called on an indirect vdev.
867  *
868  * Because there is a possibility that a DVA section in the indirect device
869  * has been split into multiple sections in our mapping, we keep track
870  * of the relevant contiguous segments of the new location (remap_segment_t)
871  * in a stack. This way we can call the callback for each of the new sections
872  * created by a single section of the indirect device. Note though, that in
873  * this scenario the callbacks in each split block won't occur in-order in
874  * terms of offset, so callers should not make any assumptions about that.
875  *
876  * For callbacks that don't handle split blocks and immediately return when
877  * they encounter them (as is the case for remap_blkptr_cb), the caller can
878  * assume that its callback will be applied from the first indirect vdev
879  * encountered to the last one and then the concrete vdev, in that order.
880  */
881 static void
882 vdev_indirect_remap(vdev_t *vd, uint64_t offset, uint64_t asize,
883     void (*func)(uint64_t, vdev_t *, uint64_t, uint64_t, void *), void *arg)
884 {
885 	list_t stack;
886 	spa_t *spa = vd->vdev_spa;
887 
888 	list_create(&stack, sizeof (remap_segment_t),
889 	    offsetof(remap_segment_t, rs_node));
890 
891 	for (remap_segment_t *rs = rs_alloc(vd, offset, asize, 0);
892 	    rs != NULL; rs = list_remove_head(&stack)) {
893 		vdev_t *v = rs->rs_vd;
894 
895 		/*
896 		 * Note: this can be called from open context
897 		 * (e.g. zio_read()), so we need the rwlock to prevent
898 		 * the mapping from being changed by condensing.
899 		 */
900 		rw_enter(&v->vdev_indirect_rwlock, RW_READER);
901 		vdev_indirect_mapping_t *vim = v->vdev_indirect_mapping;
902 		ASSERT3P(vim, !=, NULL);
903 
904 		ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0);
905 		ASSERT(rs->rs_asize > 0);
906 
907 		vdev_indirect_mapping_entry_phys_t *mapping =
908 		    vdev_indirect_mapping_entry_for_offset(vim, rs->rs_offset);
909 		ASSERT3P(mapping, !=, NULL);
910 
911 		while (rs->rs_asize > 0) {
912 			/*
913 			 * Note: the vdev_indirect_mapping cannot change
914 			 * while we are running.  It only changes while the
915 			 * removal is in progress, and then only from syncing
916 			 * context. While a removal is in progress, this
917 			 * function is only called for frees, which also only
918 			 * happen from syncing context.
919 			 */
920 
921 			uint64_t size = DVA_GET_ASIZE(&mapping->vimep_dst);
922 			uint64_t dst_offset =
923 			    DVA_GET_OFFSET(&mapping->vimep_dst);
924 			uint64_t dst_vdev = DVA_GET_VDEV(&mapping->vimep_dst);
925 
926 			ASSERT3U(rs->rs_offset, >=,
927 			    DVA_MAPPING_GET_SRC_OFFSET(mapping));
928 			ASSERT3U(rs->rs_offset, <,
929 			    DVA_MAPPING_GET_SRC_OFFSET(mapping) + size);
930 			ASSERT3U(dst_vdev, !=, v->vdev_id);
931 
932 			uint64_t inner_offset = rs->rs_offset -
933 			    DVA_MAPPING_GET_SRC_OFFSET(mapping);
934 			uint64_t inner_size =
935 			    MIN(rs->rs_asize, size - inner_offset);
936 
937 			vdev_t *dst_v = vdev_lookup_top(spa, dst_vdev);
938 			ASSERT3P(dst_v, !=, NULL);
939 
940 			if (dst_v->vdev_ops == &vdev_indirect_ops) {
941 				list_insert_head(&stack,
942 				    rs_alloc(dst_v, dst_offset + inner_offset,
943 				    inner_size, rs->rs_split_offset));
944 
945 			}
946 
947 			if ((zfs_flags & ZFS_DEBUG_INDIRECT_REMAP) &&
948 			    IS_P2ALIGNED(inner_size, 2 * SPA_MINBLOCKSIZE)) {
949 				/*
950 				 * Note: This clause exists only solely for
951 				 * Note: This clause exists solely for
952 				 * split blocks work and that the callbacks
953 				 * using them yield the same result if issued
954 				 * in reverse order.
955 				 */
956 				uint64_t inner_half = inner_size / 2;
957 
958 				func(rs->rs_split_offset + inner_half, dst_v,
959 				    dst_offset + inner_offset + inner_half,
960 				    inner_half, arg);
961 
962 				func(rs->rs_split_offset, dst_v,
963 				    dst_offset + inner_offset,
964 				    inner_half, arg);
965 			} else {
966 				func(rs->rs_split_offset, dst_v,
967 				    dst_offset + inner_offset,
968 				    inner_size, arg);
969 			}
970 
971 			rs->rs_offset += inner_size;
972 			rs->rs_asize -= inner_size;
973 			rs->rs_split_offset += inner_size;
974 			mapping++;
975 		}
976 
977 		rw_exit(&v->vdev_indirect_rwlock);
978 		kmem_free(rs, sizeof (remap_segment_t));
979 	}
980 	list_destroy(&stack);
981 }
982 
983 static void
984 vdev_indirect_child_io_done(zio_t *zio)
985 {
986 	zio_t *pio = zio->io_private;
987 
988 	mutex_enter(&pio->io_lock);
989 	pio->io_error = zio_worst_error(pio->io_error, zio->io_error);
990 	mutex_exit(&pio->io_lock);
991 
992 	abd_put(zio->io_abd);
993 }
994 
995 static void
996 vdev_indirect_io_start_cb(uint64_t split_offset, vdev_t *vd, uint64_t offset,
997     uint64_t size, void *arg)
998 {
999 	zio_t *zio = arg;
1000 
1001 	ASSERT3P(vd, !=, NULL);
1002 
1003 	if (vd->vdev_ops == &vdev_indirect_ops)
1004 		return;
1005 
1006 	zio_nowait(zio_vdev_child_io(zio, NULL, vd, offset,
1007 	    abd_get_offset(zio->io_abd, split_offset),
1008 	    size, zio->io_type, zio->io_priority,
1009 	    0, vdev_indirect_child_io_done, zio));
1010 }
1011 
1012 static void
1013 vdev_indirect_io_start(zio_t *zio)
1014 {
1015 	spa_t *spa = zio->io_spa;
1016 
1017 	ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0);
1018 	if (zio->io_type != ZIO_TYPE_READ) {
1019 		ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE);
1020 		ASSERT((zio->io_flags &
1021 		    (ZIO_FLAG_SELF_HEAL | ZIO_FLAG_INDUCE_DAMAGE)) != 0);
1022 	}
1023 
1024 	vdev_indirect_remap(zio->io_vd, zio->io_offset, zio->io_size,
1025 	    vdev_indirect_io_start_cb, zio);
1026 
1027 	zio_execute(zio);
1028 }
1029 
1030 vdev_ops_t vdev_indirect_ops = {
1031 	vdev_indirect_open,
1032 	vdev_indirect_close,
1033 	vdev_default_asize,
1034 	vdev_indirect_io_start,
1035 	vdev_indirect_io_done,
1036 	NULL,
1037 	NULL,
1038 	NULL,
1039 	vdev_indirect_remap,
1040 	VDEV_TYPE_INDIRECT,	/* name of this vdev type */
1041 	B_FALSE			/* not a leaf vdev */
1042 };
1043