1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
23 * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
24 * Copyright 2016 Gary Mills
25 * Copyright (c) 2011, 2017 by Delphix. All rights reserved.
26 * Copyright 2019 Joyent, Inc.
27 * Copyright (c) 2017, 2019, Datto Inc. All rights reserved.
28 */
29
30 #include <sys/dsl_scan.h>
31 #include <sys/dsl_pool.h>
32 #include <sys/dsl_dataset.h>
33 #include <sys/dsl_prop.h>
34 #include <sys/dsl_dir.h>
35 #include <sys/dsl_synctask.h>
36 #include <sys/dnode.h>
37 #include <sys/dmu_tx.h>
38 #include <sys/dmu_objset.h>
39 #include <sys/arc.h>
40 #include <sys/zap.h>
41 #include <sys/zio.h>
42 #include <sys/zfs_context.h>
43 #include <sys/fs/zfs.h>
44 #include <sys/zfs_znode.h>
45 #include <sys/spa_impl.h>
46 #include <sys/vdev_impl.h>
47 #include <sys/zil_impl.h>
48 #include <sys/zio_checksum.h>
49 #include <sys/ddt.h>
50 #include <sys/sa.h>
51 #include <sys/sa_impl.h>
52 #include <sys/zfeature.h>
53 #include <sys/abd.h>
54 #include <sys/range_tree.h>
55 #ifdef _KERNEL
56 #include <sys/zfs_vfsops.h>
57 #endif
58
59 /*
60 * Grand theory statement on scan queue sorting
61 *
62 * Scanning is implemented by recursively traversing all indirection levels
63 * in an object and reading all blocks referenced from said objects. This
64 * results in us approximately traversing the object from lowest logical
65 * offset to the highest. For best performance, we would want the logical
66 * blocks to be physically contiguous. However, this is frequently not the
67 * case with pools given the allocation patterns of copy-on-write filesystems.
68 * So instead, we put the I/Os into a reordering queue and issue them in a
69 * way that will most benefit physical disks (LBA-order).
70 *
71 * Queue management:
72 *
73 * Ideally, we would want to scan all metadata and queue up all block I/O
74 * prior to starting to issue it, because that allows us to do an optimal
75 * sorting job. This can however consume large amounts of memory. Therefore
76 * we continuously monitor the size of the queues and constrain them to 5%
77 * (zfs_scan_mem_lim_fact) of physmem. If the queues grow larger than this
78 * limit, we clear out a few of the largest extents at the head of the queues
79 * to make room for more scanning. Hopefully, these extents will be fairly
80 * large and contiguous, allowing us to approach sequential I/O throughput
81 * even without a fully sorted tree.
82 *
83 * Metadata scanning takes place in dsl_scan_visit(), which is called from
84 * dsl_scan_sync() every spa_sync(). If we have either fully scanned all
85 * metadata on the pool, or we need to make room in memory because our
86 * queues are too large, dsl_scan_visit() is postponed and
87 * scan_io_queues_run() is called from dsl_scan_sync() instead. This implies
88 * that metadata scanning and queued I/O issuing are mutually exclusive. This
89 * allows us to provide maximum sequential I/O throughput for the majority of
90 * I/Os issued, since sequential I/O performance is significantly negatively
91 * impacted if it is interleaved with random I/O.
92 *
93 * Implementation Notes
94 *
95 * One side effect of the queued scanning algorithm is that the scanning code
96 * needs to be notified whenever a block is freed. This is needed to allow
97 * the scanning code to remove these I/Os from the issuing queue. Additionally,
98 * we do not attempt to queue gang blocks to be issued sequentially since this
99 * is very hard to do and would have an extremely limited performance benefit.
100 * Instead, we simply issue gang I/Os as soon as we find them using the legacy
101 * algorithm.
102 *
103 * Backwards compatibility
104 *
105 * This new algorithm is backwards compatible with the legacy on-disk data
106 * structures (and therefore does not require a new feature flag).
107 * Periodically during scanning (see zfs_scan_checkpoint_intval), the scan
108 * will stop scanning metadata (in logical order) and wait for all outstanding
109 * sorted I/O to complete. Once this is done, we write out a checkpoint
110 * bookmark, indicating that we have scanned everything logically before it.
111 * If the pool is imported on a machine without the new sorting algorithm,
112 * the scan simply resumes from the last checkpoint using the legacy algorithm.
113 */
114
115 typedef int (scan_cb_t)(dsl_pool_t *, const blkptr_t *,
116 const zbookmark_phys_t *);
117
118 static scan_cb_t dsl_scan_scrub_cb;
119
120 static int scan_ds_queue_compare(const void *a, const void *b);
121 static int scan_prefetch_queue_compare(const void *a, const void *b);
122 static void scan_ds_queue_clear(dsl_scan_t *scn);
123 static boolean_t scan_ds_queue_contains(dsl_scan_t *scn, uint64_t dsobj,
124 uint64_t *txg);
125 static void scan_ds_queue_insert(dsl_scan_t *scn, uint64_t dsobj, uint64_t txg);
126 static void scan_ds_queue_remove(dsl_scan_t *scn, uint64_t dsobj);
127 static void scan_ds_queue_sync(dsl_scan_t *scn, dmu_tx_t *tx);
128
129 extern int zfs_vdev_async_write_active_min_dirty_percent;
130
131 /*
132 * By default zfs will check to ensure it is not over the hard memory
133 * limit before each txg. If finer-grained control of this is needed
134 * this value can be set to 1 to enable checking before scanning each
135 * block.
136 */
137 int zfs_scan_strict_mem_lim = B_FALSE;
138
139 /*
140 * Maximum number of concurrently executing I/Os per top-level vdev.
141 * Tune with care. Very high settings (hundreds) are known to trigger
142 * some firmware bugs and resets on certain SSDs.
143 */
144 int zfs_top_maxinflight = 32; /* maximum I/Os per top-level */
145 unsigned int zfs_resilver_delay = 2; /* number of ticks to delay resilver */
146 unsigned int zfs_scrub_delay = 4; /* number of ticks to delay scrub */
147 unsigned int zfs_scan_idle = 50; /* idle window in clock ticks */
148
149 /*
150 * Maximum number of concurrently issued bytes per leaf vdev. We attempt
151 * to strike a balance here between keeping the vdev queues full of I/Os
152 * at all times and not overflowing the queues to cause long latency,
153 * which would cause long txg sync times. No matter what, we will not
154 * overload the drives with I/O, since that is protected by
155 * zfs_vdev_scrub_max_active.
156 */
157 unsigned long zfs_scan_vdev_limit = 4 << 20;
158
159 int zfs_scan_issue_strategy = 0;
160 int zfs_scan_legacy = B_FALSE; /* don't queue & sort zios, go direct */
161 uint64_t zfs_scan_max_ext_gap = 2 << 20; /* in bytes */
162
163 unsigned int zfs_scan_checkpoint_intval = 7200; /* seconds */
164 #define ZFS_SCAN_CHECKPOINT_INTVAL SEC_TO_TICK(zfs_scan_checkpoint_intval)
165
166 /*
167 * fill_weight is non-tunable at runtime, so we copy it at module init from
168 * zfs_scan_fill_weight. Runtime adjustments to zfs_scan_fill_weight would
169 * break queue sorting.
170 */
171 uint64_t zfs_scan_fill_weight = 3;
172 static uint64_t fill_weight;
173
174 /* See dsl_scan_should_clear() for details on the memory limit tunables */
175 uint64_t zfs_scan_mem_lim_min = 16 << 20; /* bytes */
176 uint64_t zfs_scan_mem_lim_soft_max = 128 << 20; /* bytes */
177 int zfs_scan_mem_lim_fact = 20; /* fraction of physmem */
178 int zfs_scan_mem_lim_soft_fact = 20; /* fraction of mem lim above */
179
180 unsigned int zfs_scrub_min_time_ms = 1000; /* min millisecs to scrub per txg */
181 unsigned int zfs_free_min_time_ms = 1000; /* min millisecs to free per txg */
182 /* min millisecs to obsolete per txg */
183 unsigned int zfs_obsolete_min_time_ms = 500;
184 /* min millisecs to resilver per txg */
185 unsigned int zfs_resilver_min_time_ms = 3000;
186 int zfs_scan_suspend_progress = 0; /* set to prevent scans from progressing */
187 boolean_t zfs_no_scrub_io = B_FALSE; /* set to disable scrub i/o */
188 boolean_t zfs_no_scrub_prefetch = B_FALSE; /* set to disable scrub prefetch */
189 enum ddt_class zfs_scrub_ddt_class_max = DDT_CLASS_DUPLICATE;
190 /* max number of blocks to free in a single TXG */
191 uint64_t zfs_async_block_max_blocks = UINT64_MAX;
192
193 int zfs_resilver_disable_defer = 0; /* set to disable resilver deferring */
194
195 /*
196 * We wait a few txgs after importing a pool to begin scanning so that
197 * the import / mounting code isn't held up by scrub / resilver IO.
198 * Unfortunately, it is a bit difficult to determine exactly how long
199 * this will take since userspace will trigger fs mounts asynchronously
200 * and the kernel will create zvol minors asynchronously. As a result,
201 * the value provided here is a bit arbitrary, but represents a
202 * reasonable estimate of how many txgs it will take to finish fully
203 * importing a pool.
204 */
205 #define SCAN_IMPORT_WAIT_TXGS 5
206
207
208 #define DSL_SCAN_IS_SCRUB_RESILVER(scn) \
209 ((scn)->scn_phys.scn_func == POOL_SCAN_SCRUB || \
210 (scn)->scn_phys.scn_func == POOL_SCAN_RESILVER)
211
212 extern int zfs_txg_timeout;
213
214 /*
215 * Enable/disable the processing of the free_bpobj object.
216 */
217 boolean_t zfs_free_bpobj_enabled = B_TRUE;
218
219 /* the order has to match pool_scan_type */
220 static scan_cb_t *scan_funcs[POOL_SCAN_FUNCS] = {
221 NULL,
222 dsl_scan_scrub_cb, /* POOL_SCAN_SCRUB */
223 dsl_scan_scrub_cb, /* POOL_SCAN_RESILVER */
224 };
225
226 /* In core node for the scn->scn_queue. Represents a dataset to be scanned */
227 typedef struct {
228 uint64_t sds_dsobj;
229 uint64_t sds_txg;
230 avl_node_t sds_node;
231 } scan_ds_t;
232
233 /*
234 * This controls what conditions are placed on dsl_scan_sync_state():
235 * SYNC_OPTIONAL) write out scn_phys iff scn_bytes_pending == 0
236 * SYNC_MANDATORY) write out scn_phys always. scn_bytes_pending must be 0.
237 * SYNC_CACHED) if scn_bytes_pending == 0, write out scn_phys. Otherwise
238 * write out the scn_phys_cached version.
239 * See dsl_scan_sync_state for details.
240 */
241 typedef enum {
242 SYNC_OPTIONAL,
243 SYNC_MANDATORY,
244 SYNC_CACHED
245 } state_sync_type_t;
246
247 /*
248 * This struct represents the minimum information needed to reconstruct a
249 * zio for sequential scanning. This is useful because many of these will
250 * accumulate in the sequential IO queues before being issued, so saving
251 * memory matters here.
252 */
253 typedef struct scan_io {
254 /* fields from blkptr_t */
255 uint64_t sio_blk_prop;
256 uint64_t sio_phys_birth;
257 uint64_t sio_birth;
258 zio_cksum_t sio_cksum;
259 uint32_t sio_nr_dvas;
260
261 /* fields from zio_t */
262 uint32_t sio_flags;
263 zbookmark_phys_t sio_zb;
264
265 /* members for queue sorting */
266 union {
267 avl_node_t sio_addr_node; /* link into issuing queue */
268 list_node_t sio_list_node; /* link for issuing to disk */
269 } sio_nodes;
270
271 /*
272 * There may be up to SPA_DVAS_PER_BP DVAs here from the bp,
273 * depending on how many were in the original bp. Only the
274 * first DVA is really used for sorting and issuing purposes.
275 * The other DVAs (if provided) simply exist so that the zio
276 * layer can find additional copies to repair from in the
277 * event of an error. This array must go at the end of the
278 * struct to allow this for the variable number of elements.
279 */
280 dva_t sio_dva[0];
281 } scan_io_t;
282
283 #define SIO_SET_OFFSET(sio, x) DVA_SET_OFFSET(&(sio)->sio_dva[0], x)
284 #define SIO_SET_ASIZE(sio, x) DVA_SET_ASIZE(&(sio)->sio_dva[0], x)
285 #define SIO_GET_OFFSET(sio) DVA_GET_OFFSET(&(sio)->sio_dva[0])
286 #define SIO_GET_ASIZE(sio) DVA_GET_ASIZE(&(sio)->sio_dva[0])
287 #define SIO_GET_END_OFFSET(sio) \
288 (SIO_GET_OFFSET(sio) + SIO_GET_ASIZE(sio))
289 #define SIO_GET_MUSED(sio) \
290 (sizeof (scan_io_t) + ((sio)->sio_nr_dvas * sizeof (dva_t)))
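/*
 * For example, SIO_GET_MUSED() on a scan_io_t carrying all three DVAs of
 * a block evaluates to sizeof (scan_io_t) + 3 * sizeof (dva_t); this is
 * the amount accounted in q_sio_memused below.
 */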
291
292 struct dsl_scan_io_queue {
293 dsl_scan_t *q_scn; /* associated dsl_scan_t */
294 vdev_t *q_vd; /* top-level vdev that this queue represents */
295
296 /* trees used for sorting I/Os and extents of I/Os */
297 range_tree_t *q_exts_by_addr;
298 zfs_btree_t q_exts_by_size;
299 avl_tree_t q_sios_by_addr;
300 uint64_t q_sio_memused;
301
302 /* members for zio rate limiting */
303 uint64_t q_maxinflight_bytes;
304 uint64_t q_inflight_bytes;
305 kcondvar_t q_zio_cv; /* used under vd->vdev_scan_io_queue_lock */
306
307 /* per txg statistics */
308 uint64_t q_total_seg_size_this_txg;
309 uint64_t q_segs_this_txg;
310 uint64_t q_total_zio_size_this_txg;
311 uint64_t q_zios_this_txg;
312 };
313
314 /* private data for dsl_scan_prefetch_cb() */
315 typedef struct scan_prefetch_ctx {
316 zfs_refcount_t spc_refcnt; /* refcount for memory management */
317 dsl_scan_t *spc_scn; /* dsl_scan_t for the pool */
318 boolean_t spc_root; /* is this prefetch for an objset? */
319 uint8_t spc_indblkshift; /* dn_indblkshift of current dnode */
320 uint16_t spc_datablkszsec; /* dn_idatablkszsec of current dnode */
321 } scan_prefetch_ctx_t;
322
323 /* private data for dsl_scan_prefetch() */
324 typedef struct scan_prefetch_issue_ctx {
325 avl_node_t spic_avl_node; /* link into scn->scn_prefetch_queue */
326 scan_prefetch_ctx_t *spic_spc; /* spc for the callback */
327 blkptr_t spic_bp; /* bp to prefetch */
328 zbookmark_phys_t spic_zb; /* bookmark to prefetch */
329 } scan_prefetch_issue_ctx_t;
330
331 static void scan_exec_io(dsl_pool_t *dp, const blkptr_t *bp, int zio_flags,
332 const zbookmark_phys_t *zb, dsl_scan_io_queue_t *queue);
333 static void scan_io_queue_insert_impl(dsl_scan_io_queue_t *queue,
334 scan_io_t *sio);
335
336 static dsl_scan_io_queue_t *scan_io_queue_create(vdev_t *vd);
337 static void scan_io_queues_destroy(dsl_scan_t *scn);
338
339 static kmem_cache_t *sio_cache[SPA_DVAS_PER_BP];
340
341 /* sio->sio_nr_dvas must be set so we know which cache to free from */
342 static void
343 sio_free(scan_io_t *sio)
344 {
345 ASSERT3U(sio->sio_nr_dvas, >, 0);
346 ASSERT3U(sio->sio_nr_dvas, <=, SPA_DVAS_PER_BP);
347
348 kmem_cache_free(sio_cache[sio->sio_nr_dvas - 1], sio);
349 }
350
351 /* It is up to the caller to set sio->sio_nr_dvas for freeing */
352 static scan_io_t *
353 sio_alloc(unsigned short nr_dvas)
354 {
355 ASSERT3U(nr_dvas, >, 0);
356 ASSERT3U(nr_dvas, <=, SPA_DVAS_PER_BP);
357
358 return (kmem_cache_alloc(sio_cache[nr_dvas - 1], KM_SLEEP));
359 }
360
361 void
362 scan_init(void)
363 {
364 /*
365 * This is used in ext_size_compare() to weight segments
366 * based on how sparse they are. This cannot be changed
367 * mid-scan and the tree comparison functions don't currently
368 * have a mechanism for passing additional context to the
369 * compare functions. Thus we store this value globally and
370 * we only allow it to be set at module initialization time.
371 */
372 fill_weight = zfs_scan_fill_weight;
373
374 for (int i = 0; i < SPA_DVAS_PER_BP; i++) {
375 char name[36];
376
377 (void) sprintf(name, "sio_cache_%d", i);
378 sio_cache[i] = kmem_cache_create(name,
379 (sizeof (scan_io_t) + ((i + 1) * sizeof (dva_t))),
380 0, NULL, NULL, NULL, NULL, NULL, 0);
381 }
382 }
383
384 void
385 scan_fini(void)
386 {
387 for (int i = 0; i < SPA_DVAS_PER_BP; i++) {
388 kmem_cache_destroy(sio_cache[i]);
389 }
390 }
391
392 static inline boolean_t
393 dsl_scan_is_running(const dsl_scan_t *scn)
394 {
395 return (scn->scn_phys.scn_state == DSS_SCANNING);
396 }
397
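/*
 * Returns B_TRUE if the currently running scan is a resilver (as opposed
 * to a scrub).
 */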
398 boolean_t
399 dsl_scan_resilvering(dsl_pool_t *dp)
400 {
401 return (dsl_scan_is_running(dp->dp_scan) &&
402 dp->dp_scan->scn_phys.scn_func == POOL_SCAN_RESILVER);
403 }
404
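/* Reconstruct a blkptr_t from the compact scan_io_t representation. */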
405 static inline void
406 sio2bp(const scan_io_t *sio, blkptr_t *bp)
407 {
408 bzero(bp, sizeof (*bp));
409 bp->blk_prop = sio->sio_blk_prop;
410 bp->blk_phys_birth = sio->sio_phys_birth;
411 bp->blk_birth = sio->sio_birth;
412 bp->blk_fill = 1; /* we always only work with data pointers */
413 bp->blk_cksum = sio->sio_cksum;
414
415 ASSERT3U(sio->sio_nr_dvas, >, 0);
416 ASSERT3U(sio->sio_nr_dvas, <=, SPA_DVAS_PER_BP);
417
418 bcopy(sio->sio_dva, bp->blk_dva, sio->sio_nr_dvas * sizeof (dva_t));
419 }
420
421 static inline void
422 bp2sio(const blkptr_t *bp, scan_io_t *sio, int dva_i)
423 {
424 sio->sio_blk_prop = bp->blk_prop;
425 sio->sio_phys_birth = bp->blk_phys_birth;
426 sio->sio_birth = bp->blk_birth;
427 sio->sio_cksum = bp->blk_cksum;
428 sio->sio_nr_dvas = BP_GET_NDVAS(bp);
429
430 /*
431 * Copy the DVAs to the sio. We need all copies of the block so
432 * that the self healing code can use the alternate copies if the
433 * first is corrupted. We want the DVA at index dva_i to be first
434 * in the sio since this is the primary one that we want to issue.
435 */
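/*
 * For example, with a 3-DVA bp and dva_i == 1, the resulting sio
 * DVA order is bp DVAs 1, 2, 0.
 */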
436 for (int i = 0, j = dva_i; i < sio->sio_nr_dvas; i++, j++) {
437 sio->sio_dva[i] = bp->blk_dva[j % sio->sio_nr_dvas];
438 }
439 }
440
441 int
442 dsl_scan_init(dsl_pool_t *dp, uint64_t txg)
443 {
444 int err;
445 dsl_scan_t *scn;
446 spa_t *spa = dp->dp_spa;
447 uint64_t f;
448
449 scn = dp->dp_scan = kmem_zalloc(sizeof (dsl_scan_t), KM_SLEEP);
450 scn->scn_dp = dp;
451
452 /*
453 * It's possible that we're resuming a scan after a reboot so
454 * make sure that the scan_async_destroying flag is initialized
455 * appropriately.
456 */
457 ASSERT(!scn->scn_async_destroying);
458 scn->scn_async_destroying = spa_feature_is_active(dp->dp_spa,
459 SPA_FEATURE_ASYNC_DESTROY);
460
461 avl_create(&scn->scn_queue, scan_ds_queue_compare, sizeof (scan_ds_t),
462 offsetof(scan_ds_t, sds_node));
463 avl_create(&scn->scn_prefetch_queue, scan_prefetch_queue_compare,
464 sizeof (scan_prefetch_issue_ctx_t),
465 offsetof(scan_prefetch_issue_ctx_t, spic_avl_node));
466
467 err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
468 "scrub_func", sizeof (uint64_t), 1, &f);
469 if (err == 0) {
470 /*
471 * There was an old-style scrub in progress. Restart a
472 * new-style scrub from the beginning.
473 */
474 scn->scn_restart_txg = txg;
475 zfs_dbgmsg("old-style scrub was in progress; "
476 "restarting new-style scrub in txg %llu",
477 (longlong_t)scn->scn_restart_txg);
478
479 /*
480 * Load the queue obj from the old location so that it
481 * can be freed by dsl_scan_done().
482 */
483 (void) zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
484 "scrub_queue", sizeof (uint64_t), 1,
485 &scn->scn_phys.scn_queue_obj);
486 } else {
487 err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
488 DMU_POOL_SCAN, sizeof (uint64_t), SCAN_PHYS_NUMINTS,
489 &scn->scn_phys);
490
491 /*
492 * Detect if the pool contains the signature of #2094. If it
493 * does, properly update the scn->scn_phys structure and notify
494 * the administrator by setting an errata for the pool.
495 */
496 if (err == EOVERFLOW) {
497 uint64_t zaptmp[SCAN_PHYS_NUMINTS + 1];
498 VERIFY3S(SCAN_PHYS_NUMINTS, ==, 24);
499 VERIFY3S(offsetof(dsl_scan_phys_t, scn_flags), ==,
500 (23 * sizeof (uint64_t)));
501
502 err = zap_lookup(dp->dp_meta_objset,
503 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SCAN,
504 sizeof (uint64_t), SCAN_PHYS_NUMINTS + 1, &zaptmp);
505 if (err == 0) {
506 uint64_t overflow = zaptmp[SCAN_PHYS_NUMINTS];
507
508 if (overflow & ~DSF_VISIT_DS_AGAIN ||
509 scn->scn_async_destroying) {
510 spa->spa_errata =
511 ZPOOL_ERRATA_ZOL_2094_ASYNC_DESTROY;
512 return (EOVERFLOW);
513 }
514
515 bcopy(zaptmp, &scn->scn_phys,
516 SCAN_PHYS_NUMINTS * sizeof (uint64_t));
517 scn->scn_phys.scn_flags = overflow;
518
519 /* Required scrub already in progress. */
520 if (scn->scn_phys.scn_state == DSS_FINISHED ||
521 scn->scn_phys.scn_state == DSS_CANCELED)
522 spa->spa_errata =
523 ZPOOL_ERRATA_ZOL_2094_SCRUB;
524 }
525 }
526
527 if (err == ENOENT)
528 return (0);
529 else if (err)
530 return (err);
531
532 /*
533 * We might be restarting after a reboot, so jump the issued
534 * counter to how far we've scanned. We know we're consistent
535 * up to here.
536 */
537 scn->scn_issued_before_pass = scn->scn_phys.scn_examined;
538
539 if (dsl_scan_is_running(scn) &&
540 spa_prev_software_version(dp->dp_spa) < SPA_VERSION_SCAN) {
541 /*
542 * A new-type scrub was in progress on an old
543 * pool, and the pool was accessed by old
544 * software. Restart from the beginning, since
545 * the old software may have changed the pool in
546 * the meantime.
547 */
548 scn->scn_restart_txg = txg;
549 zfs_dbgmsg("new-style scrub was modified "
550 "by old software; restarting in txg %llu",
551 (longlong_t)scn->scn_restart_txg);
552 } else if (dsl_scan_resilvering(dp)) {
553 /*
554 * If a resilver is in progress and there are already
555 * errors, restart it instead of finishing this scan and
556 * then restarting it. If there haven't been any errors
557 * then remember that the incore DTL is valid.
558 */
559 if (scn->scn_phys.scn_errors > 0) {
560 scn->scn_restart_txg = txg;
561 zfs_dbgmsg("resilver can't excise DTL_MISSING "
562 "when finished; restarting in txg %llu",
563 (u_longlong_t)scn->scn_restart_txg);
564 } else {
565 /* it's safe to excise DTL when finished */
566 spa->spa_scrub_started = B_TRUE;
567 }
568 }
569 }
570
571 bcopy(&scn->scn_phys, &scn->scn_phys_cached, sizeof (scn->scn_phys));
572
573 /* reload the queue into the in-core state */
574 if (scn->scn_phys.scn_queue_obj != 0) {
575 zap_cursor_t zc;
576 zap_attribute_t za;
577
578 for (zap_cursor_init(&zc, dp->dp_meta_objset,
579 scn->scn_phys.scn_queue_obj);
580 zap_cursor_retrieve(&zc, &za) == 0;
581 (void) zap_cursor_advance(&zc)) {
582 scan_ds_queue_insert(scn,
583 zfs_strtonum(za.za_name, NULL),
584 za.za_first_integer);
585 }
586 zap_cursor_fini(&zc);
587 }
588
589 spa_scan_stat_init(spa);
590 return (0);
591 }
592
593 void
594 dsl_scan_fini(dsl_pool_t *dp)
595 {
596 if (dp->dp_scan != NULL) {
597 dsl_scan_t *scn = dp->dp_scan;
598
599 if (scn->scn_taskq != NULL)
600 taskq_destroy(scn->scn_taskq);
601 scan_ds_queue_clear(scn);
602 avl_destroy(&scn->scn_queue);
603 avl_destroy(&scn->scn_prefetch_queue);
604
605 kmem_free(dp->dp_scan, sizeof (dsl_scan_t));
606 dp->dp_scan = NULL;
607 }
608 }
609
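/*
 * Returns B_TRUE if a scan restart has been requested and the requested
 * restart txg has been reached by this transaction.
 */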
610 static boolean_t
611 dsl_scan_restarting(dsl_scan_t *scn, dmu_tx_t *tx)
612 {
613 return (scn->scn_restart_txg != 0 &&
614 scn->scn_restart_txg <= tx->tx_txg);
615 }
616
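/*
 * Returns B_TRUE if a resilver has been scheduled, either via a pending
 * scan restart txg or an outstanding SPA_ASYNC_RESILVER request.
 */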
617 boolean_t
618 dsl_scan_resilver_scheduled(dsl_pool_t *dp)
619 {
620 return ((dp->dp_scan && dp->dp_scan->scn_restart_txg != 0) ||
621 (spa_async_tasks(dp->dp_spa) & SPA_ASYNC_RESILVER));
622 }
623
624 boolean_t
625 dsl_scan_scrubbing(const dsl_pool_t *dp)
626 {
627 dsl_scan_phys_t *scn_phys = &dp->dp_scan->scn_phys;
628
629 return (scn_phys->scn_state == DSS_SCANNING &&
630 scn_phys->scn_func == POOL_SCAN_SCRUB);
631 }
632
633 boolean_t
634 dsl_scan_is_paused_scrub(const dsl_scan_t *scn)
635 {
636 return (dsl_scan_scrubbing(scn->scn_dp) &&
637 scn->scn_phys.scn_flags & DSF_SCRUB_PAUSED);
638 }
639
640 /*
641 * Writes out a persistent dsl_scan_phys_t record to the pool directory.
642 * Because we can be running in the block sorting algorithm, we do not always
643 * want to write out the record, only when it is "safe" to do so. This safety
644 * condition is achieved by making sure that the sorting queues are empty
645 * (scn_bytes_pending == 0). When this condition is not true, the sync'd state
646 * is inconsistent with how much actual scanning progress has been made. The
647 * kind of sync to be performed is specified by the sync_type argument. If the
648 * sync is optional, we only sync if the queues are empty. If the sync is
649 * mandatory, we do a hard ASSERT to make sure that the queues are empty. The
650 * third possible state is a "cached" sync. This is done in response to:
651 * 1) The dataset that was in the last sync'd dsl_scan_phys_t having been
652 * destroyed, so we wouldn't be able to restart scanning from it.
653 * 2) The snapshot that was in the last sync'd dsl_scan_phys_t having been
654 * superseded by a newer snapshot.
655 * 3) The dataset that was in the last sync'd dsl_scan_phys_t having been
656 * swapped with its clone.
657 * In all cases, a cached sync simply rewrites the last record we've written,
658 * just slightly modified. For the modifications that are performed to the
659 * last written dsl_scan_phys_t, see dsl_scan_ds_destroyed,
660 * dsl_scan_ds_snapshotted and dsl_scan_ds_clone_swapped.
661 */
662 static void
663 dsl_scan_sync_state(dsl_scan_t *scn, dmu_tx_t *tx, state_sync_type_t sync_type)
664 {
665 int i;
666 spa_t *spa = scn->scn_dp->dp_spa;
667
668 ASSERT(sync_type != SYNC_MANDATORY || scn->scn_bytes_pending == 0);
669 if (scn->scn_bytes_pending == 0) {
670 for (i = 0; i < spa->spa_root_vdev->vdev_children; i++) {
671 vdev_t *vd = spa->spa_root_vdev->vdev_child[i];
672 dsl_scan_io_queue_t *q = vd->vdev_scan_io_queue;
673
674 if (q == NULL)
675 continue;
676
677 mutex_enter(&vd->vdev_scan_io_queue_lock);
678 ASSERT3P(avl_first(&q->q_sios_by_addr), ==, NULL);
679 ASSERT3P(zfs_btree_first(&q->q_exts_by_size, NULL), ==,
680 NULL);
681 ASSERT3P(range_tree_first(q->q_exts_by_addr), ==, NULL);
682 mutex_exit(&vd->vdev_scan_io_queue_lock);
683 }
684
685 if (scn->scn_phys.scn_queue_obj != 0)
686 scan_ds_queue_sync(scn, tx);
687 VERIFY0(zap_update(scn->scn_dp->dp_meta_objset,
688 DMU_POOL_DIRECTORY_OBJECT,
689 DMU_POOL_SCAN, sizeof (uint64_t), SCAN_PHYS_NUMINTS,
690 &scn->scn_phys, tx));
691 bcopy(&scn->scn_phys, &scn->scn_phys_cached,
692 sizeof (scn->scn_phys));
693
694 if (scn->scn_checkpointing)
695 zfs_dbgmsg("finish scan checkpoint");
696
697 scn->scn_checkpointing = B_FALSE;
698 scn->scn_last_checkpoint = ddi_get_lbolt();
699 } else if (sync_type == SYNC_CACHED) {
700 VERIFY0(zap_update(scn->scn_dp->dp_meta_objset,
701 DMU_POOL_DIRECTORY_OBJECT,
702 DMU_POOL_SCAN, sizeof (uint64_t), SCAN_PHYS_NUMINTS,
703 &scn->scn_phys_cached, tx));
704 }
705 }
706
707 /* ARGSUSED */
708 static int
709 dsl_scan_setup_check(void *arg, dmu_tx_t *tx)
710 {
711 dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan;
712
713 if (dsl_scan_is_running(scn))
714 return (SET_ERROR(EBUSY));
715
716 return (0);
717 }
718
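/*
 * Sync task that initializes the in-core and on-disk state for a new
 * scan (scrub or resilver) and records the event in pool history.
 */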
719 static void
720 dsl_scan_setup_sync(void *arg, dmu_tx_t *tx)
721 {
722 dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan;
723 pool_scan_func_t *funcp = arg;
724 dmu_object_type_t ot = 0;
725 dsl_pool_t *dp = scn->scn_dp;
726 spa_t *spa = dp->dp_spa;
727
728 ASSERT(!dsl_scan_is_running(scn));
729 ASSERT(*funcp > POOL_SCAN_NONE && *funcp < POOL_SCAN_FUNCS);
730 bzero(&scn->scn_phys, sizeof (scn->scn_phys));
731 scn->scn_phys.scn_func = *funcp;
732 scn->scn_phys.scn_state = DSS_SCANNING;
733 scn->scn_phys.scn_min_txg = 0;
734 scn->scn_phys.scn_max_txg = tx->tx_txg;
735 scn->scn_phys.scn_ddt_class_max = DDT_CLASSES - 1; /* the entire DDT */
736 scn->scn_phys.scn_start_time = gethrestime_sec();
737 scn->scn_phys.scn_errors = 0;
738 scn->scn_phys.scn_to_examine = spa->spa_root_vdev->vdev_stat.vs_alloc;
739 scn->scn_issued_before_pass = 0;
740 scn->scn_restart_txg = 0;
741 scn->scn_done_txg = 0;
742 scn->scn_last_checkpoint = 0;
743 scn->scn_checkpointing = B_FALSE;
744 spa_scan_stat_init(spa);
745
746 if (DSL_SCAN_IS_SCRUB_RESILVER(scn)) {
747 scn->scn_phys.scn_ddt_class_max = zfs_scrub_ddt_class_max;
748
749 /* rewrite all disk labels */
750 vdev_config_dirty(spa->spa_root_vdev);
751
752 if (vdev_resilver_needed(spa->spa_root_vdev,
753 &scn->scn_phys.scn_min_txg, &scn->scn_phys.scn_max_txg)) {
754 spa_event_notify(spa, NULL, NULL,
755 ESC_ZFS_RESILVER_START);
756 } else {
757 spa_event_notify(spa, NULL, NULL, ESC_ZFS_SCRUB_START);
758 }
759
760 spa->spa_scrub_started = B_TRUE;
761 /*
762 * If this is an incremental scrub, limit the DDT scrub phase
763 * to just the auto-ditto class (for correctness); the rest
764 * of the scrub should go faster using top-down pruning.
765 */
766 if (scn->scn_phys.scn_min_txg > TXG_INITIAL)
767 scn->scn_phys.scn_ddt_class_max = DDT_CLASS_DITTO;
768
769 }
770
771 /* back to the generic stuff */
772
773 if (dp->dp_blkstats == NULL) {
774 dp->dp_blkstats =
775 kmem_alloc(sizeof (zfs_all_blkstats_t), KM_SLEEP);
776 mutex_init(&dp->dp_blkstats->zab_lock, NULL,
777 MUTEX_DEFAULT, NULL);
778 }
779 bzero(&dp->dp_blkstats->zab_type, sizeof (dp->dp_blkstats->zab_type));
780
781 if (spa_version(spa) < SPA_VERSION_DSL_SCRUB)
782 ot = DMU_OT_ZAP_OTHER;
783
784 scn->scn_phys.scn_queue_obj = zap_create(dp->dp_meta_objset,
785 ot ? ot : DMU_OT_SCAN_QUEUE, DMU_OT_NONE, 0, tx);
786
787 bcopy(&scn->scn_phys, &scn->scn_phys_cached, sizeof (scn->scn_phys));
788
789 dsl_scan_sync_state(scn, tx, SYNC_MANDATORY);
790
791 spa_history_log_internal(spa, "scan setup", tx,
792 "func=%u mintxg=%llu maxtxg=%llu",
793 *funcp, scn->scn_phys.scn_min_txg, scn->scn_phys.scn_max_txg);
794 }
795
796 /*
797 * Called by the ZFS_IOC_POOL_SCAN ioctl to start a scrub or resilver.
798 * Can also be called to resume a paused scrub.
799 */
800 int
801 dsl_scan(dsl_pool_t *dp, pool_scan_func_t func)
802 {
803 spa_t *spa = dp->dp_spa;
804 dsl_scan_t *scn = dp->dp_scan;
805
806 /*
807 * Purge all vdev caches and probe all devices. We do this here
808 * rather than in sync context because this requires a writer lock
809 * on the spa_config lock, which we can't do from sync context. The
810 * spa_scrub_reopen flag indicates that vdev_open() should not
811 * attempt to start another scrub.
812 */
813 spa_vdev_state_enter(spa, SCL_NONE);
814 spa->spa_scrub_reopen = B_TRUE;
815 vdev_reopen(spa->spa_root_vdev);
816 spa->spa_scrub_reopen = B_FALSE;
817 (void) spa_vdev_state_exit(spa, NULL, 0);
818
819 if (func == POOL_SCAN_RESILVER) {
820 dsl_scan_restart_resilver(spa->spa_dsl_pool, 0);
821 return (0);
822 }
823
824 if (func == POOL_SCAN_SCRUB && dsl_scan_is_paused_scrub(scn)) {
825 /* got scrub start cmd, resume paused scrub */
826 int err = dsl_scrub_set_pause_resume(scn->scn_dp,
827 POOL_SCRUB_NORMAL);
828 if (err == 0) {
829 spa_event_notify(spa, NULL, NULL, ESC_ZFS_SCRUB_RESUME);
830 return (ECANCELED);
831 }
832 return (SET_ERROR(err));
833 }
834
835 return (dsl_sync_task(spa_name(spa), dsl_scan_setup_check,
836 dsl_scan_setup_sync, &func, 0, ZFS_SPACE_CHECK_EXTRA_RESERVED));
837 }
838
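/*
 * Tears down scan state when a scan finishes, is cancelled, or is about
 * to be restarted, updating DTLs and pool history as appropriate.
 */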
839 /* ARGSUSED */
840 static void
841 dsl_scan_done(dsl_scan_t *scn, boolean_t complete, dmu_tx_t *tx)
842 {
843 static const char *old_names[] = {
844 "scrub_bookmark",
845 "scrub_ddt_bookmark",
846 "scrub_ddt_class_max",
847 "scrub_queue",
848 "scrub_min_txg",
849 "scrub_max_txg",
850 "scrub_func",
851 "scrub_errors",
852 NULL
853 };
854
855 dsl_pool_t *dp = scn->scn_dp;
856 spa_t *spa = dp->dp_spa;
857 int i;
858
859 /* Remove any remnants of an old-style scrub. */
860 for (i = 0; old_names[i]; i++) {
861 (void) zap_remove(dp->dp_meta_objset,
862 DMU_POOL_DIRECTORY_OBJECT, old_names[i], tx);
863 }
864
865 if (scn->scn_phys.scn_queue_obj != 0) {
866 VERIFY0(dmu_object_free(dp->dp_meta_objset,
867 scn->scn_phys.scn_queue_obj, tx));
868 scn->scn_phys.scn_queue_obj = 0;
869 }
870 scan_ds_queue_clear(scn);
871
872 scn->scn_phys.scn_flags &= ~DSF_SCRUB_PAUSED;
873
874 /*
875 * If we were "restarted" from a stopped state, don't bother
876 * with anything else.
877 */
878 if (!dsl_scan_is_running(scn)) {
879 ASSERT(!scn->scn_is_sorted);
880 return;
881 }
882
883 if (scn->scn_is_sorted) {
884 scan_io_queues_destroy(scn);
885 scn->scn_is_sorted = B_FALSE;
886
887 if (scn->scn_taskq != NULL) {
888 taskq_destroy(scn->scn_taskq);
889 scn->scn_taskq = NULL;
890 }
891 }
892
893 scn->scn_phys.scn_state = complete ? DSS_FINISHED : DSS_CANCELED;
894
895 if (dsl_scan_restarting(scn, tx))
896 spa_history_log_internal(spa, "scan aborted, restarting", tx,
897 "errors=%llu", spa_get_errlog_size(spa));
898 else if (!complete)
899 spa_history_log_internal(spa, "scan cancelled", tx,
900 "errors=%llu", spa_get_errlog_size(spa));
901 else
902 spa_history_log_internal(spa, "scan done", tx,
903 "errors=%llu", spa_get_errlog_size(spa));
904
905 if (DSL_SCAN_IS_SCRUB_RESILVER(scn)) {
906 spa->spa_scrub_active = B_FALSE;
907
908 /*
909 * If the scrub/resilver completed, update all DTLs to
910 * reflect this. Whether it succeeded or not, vacate
911 * all temporary scrub DTLs.
912 *
913 * As the scrub does not currently support traversing
914 * data that have been freed but are part of a checkpoint,
915 * we don't mark the scrub as done in the DTLs as faults
916 * may still exist in those vdevs.
917 */
918 if (complete &&
919 !spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) {
920 vdev_dtl_reassess(spa->spa_root_vdev, tx->tx_txg,
921 scn->scn_phys.scn_max_txg, B_TRUE);
922
923 spa_event_notify(spa, NULL, NULL,
924 scn->scn_phys.scn_min_txg ?
925 ESC_ZFS_RESILVER_FINISH : ESC_ZFS_SCRUB_FINISH);
926 } else {
927 vdev_dtl_reassess(spa->spa_root_vdev, tx->tx_txg,
928 0, B_TRUE);
929 }
930 spa_errlog_rotate(spa);
931
932 /*
933 * Don't clear flag until after vdev_dtl_reassess to ensure that
934 * DTL_MISSING will get updated when possible.
935 */
936 spa->spa_scrub_started = B_FALSE;
937
938 /*
939 * We may have finished replacing a device.
940 * Let the async thread assess this and handle the detach.
941 */
942 spa_async_request(spa, SPA_ASYNC_RESILVER_DONE);
943
944 /*
945 * Clear any resilver_deferred flags in the config.
946 * If there are drives that need resilvering, kick
947 * off an asynchronous request to start resilver.
948 * vdev_clear_resilver_deferred() may update the config
949 * before the resilver can restart. In the event of
950 * a crash during this period, the spa loading code
951 * will find the drives that need to be resilvered
952 * and start the resilver then.
953 */
954 if (spa_feature_is_enabled(spa, SPA_FEATURE_RESILVER_DEFER) &&
955 vdev_clear_resilver_deferred(spa->spa_root_vdev, tx)) {
956 spa_history_log_internal(spa,
957 "starting deferred resilver", tx, "errors=%llu",
958 (u_longlong_t)spa_get_errlog_size(spa));
959 spa_async_request(spa, SPA_ASYNC_RESILVER);
960 }
961 }
962
963 scn->scn_phys.scn_end_time = gethrestime_sec();
964
965 ASSERT(!dsl_scan_is_running(scn));
966 }
967
968 /* ARGSUSED */
969 static int
970 dsl_scan_cancel_check(void *arg, dmu_tx_t *tx)
971 {
972 dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan;
973
974 if (!dsl_scan_is_running(scn))
975 return (SET_ERROR(ENOENT));
976 return (0);
977 }
978
979 /* ARGSUSED */
980 static void
981 dsl_scan_cancel_sync(void *arg, dmu_tx_t *tx)
982 {
983 dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan;
984
985 dsl_scan_done(scn, B_FALSE, tx);
986 dsl_scan_sync_state(scn, tx, SYNC_MANDATORY);
987 spa_event_notify(scn->scn_dp->dp_spa, NULL, NULL, ESC_ZFS_SCRUB_ABORT);
988 }
989
990 int
991 dsl_scan_cancel(dsl_pool_t *dp)
992 {
993 return (dsl_sync_task(spa_name(dp->dp_spa), dsl_scan_cancel_check,
994 dsl_scan_cancel_sync, NULL, 3, ZFS_SPACE_CHECK_RESERVED));
995 }
996
997 static int
998 dsl_scrub_pause_resume_check(void *arg, dmu_tx_t *tx)
999 {
1000 pool_scrub_cmd_t *cmd = arg;
1001 dsl_pool_t *dp = dmu_tx_pool(tx);
1002 dsl_scan_t *scn = dp->dp_scan;
1003
1004 if (*cmd == POOL_SCRUB_PAUSE) {
1005 /* can't pause a scrub when there is no in-progress scrub */
1006 if (!dsl_scan_scrubbing(dp))
1007 return (SET_ERROR(ENOENT));
1008
1009 /* can't pause a paused scrub */
1010 if (dsl_scan_is_paused_scrub(scn))
1011 return (SET_ERROR(EBUSY));
1012 } else if (*cmd != POOL_SCRUB_NORMAL) {
1013 return (SET_ERROR(ENOTSUP));
1014 }
1015
1016 return (0);
1017 }
1018
1019 static void
1020 dsl_scrub_pause_resume_sync(void *arg, dmu_tx_t *tx)
1021 {
1022 pool_scrub_cmd_t *cmd = arg;
1023 dsl_pool_t *dp = dmu_tx_pool(tx);
1024 spa_t *spa = dp->dp_spa;
1025 dsl_scan_t *scn = dp->dp_scan;
1026
1027 if (*cmd == POOL_SCRUB_PAUSE) {
1028 /* pause the in-progress scrub and record when the pause began */
1029 spa->spa_scan_pass_scrub_pause = gethrestime_sec();
1030 scn->scn_phys.scn_flags |= DSF_SCRUB_PAUSED;
1031 scn->scn_phys_cached.scn_flags |= DSF_SCRUB_PAUSED;
1032 dsl_scan_sync_state(scn, tx, SYNC_CACHED);
1033 spa_event_notify(spa, NULL, NULL, ESC_ZFS_SCRUB_PAUSED);
1034 } else {
1035 ASSERT3U(*cmd, ==, POOL_SCRUB_NORMAL);
1036 if (dsl_scan_is_paused_scrub(scn)) {
1037 /*
1038 * We need to keep track of how much time we spend
1039 * paused per pass so that we can adjust the scrub rate
1040 * shown in the output of 'zpool status'.
1041 */
1042 spa->spa_scan_pass_scrub_spent_paused +=
1043 gethrestime_sec() - spa->spa_scan_pass_scrub_pause;
1044 spa->spa_scan_pass_scrub_pause = 0;
1045 scn->scn_phys.scn_flags &= ~DSF_SCRUB_PAUSED;
1046 scn->scn_phys_cached.scn_flags &= ~DSF_SCRUB_PAUSED;
1047 dsl_scan_sync_state(scn, tx, SYNC_CACHED);
1048 }
1049 }
1050 }
1051
1052 /*
1053 * Set scrub pause/resume state if it makes sense to do so
1054 */
1055 int
1056 dsl_scrub_set_pause_resume(const dsl_pool_t *dp, pool_scrub_cmd_t cmd)
1057 {
1058 return (dsl_sync_task(spa_name(dp->dp_spa),
1059 dsl_scrub_pause_resume_check, dsl_scrub_pause_resume_sync, &cmd, 3,
1060 ZFS_SPACE_CHECK_RESERVED));
1061 }
1062
1063
1064 /* start a new scan, or restart an existing one. */
1065 void
1066 dsl_scan_restart_resilver(dsl_pool_t *dp, uint64_t txg)
1067 {
1068 if (txg == 0) {
1069 dmu_tx_t *tx;
1070 tx = dmu_tx_create_dd(dp->dp_mos_dir);
1071 VERIFY(0 == dmu_tx_assign(tx, TXG_WAIT));
1072
1073 txg = dmu_tx_get_txg(tx);
1074 dp->dp_scan->scn_restart_txg = txg;
1075 dmu_tx_commit(tx);
1076 } else {
1077 dp->dp_scan->scn_restart_txg = txg;
1078 }
1079 zfs_dbgmsg("restarting resilver txg=%llu", txg);
1080 }
1081
1082 void
1083 dsl_free(dsl_pool_t *dp, uint64_t txg, const blkptr_t *bp)
1084 {
1085 zio_free(dp->dp_spa, txg, bp);
1086 }
1087
1088 void
1089 dsl_free_sync(zio_t *pio, dsl_pool_t *dp, uint64_t txg, const blkptr_t *bpp)
1090 {
1091 ASSERT(dsl_pool_sync_context(dp));
1092 zio_nowait(zio_free_sync(pio, dp->dp_spa, txg, bpp, pio->io_flags));
1093 }
1094
1095 static int
1096 scan_ds_queue_compare(const void *a, const void *b)
1097 {
1098 const scan_ds_t *sds_a = a, *sds_b = b;
1099
1100 if (sds_a->sds_dsobj < sds_b->sds_dsobj)
1101 return (-1);
1102 if (sds_a->sds_dsobj == sds_b->sds_dsobj)
1103 return (0);
1104 return (1);
1105 }
1106
1107 static void
1108 scan_ds_queue_clear(dsl_scan_t *scn)
1109 {
1110 void *cookie = NULL;
1111 scan_ds_t *sds;
1112 while ((sds = avl_destroy_nodes(&scn->scn_queue, &cookie)) != NULL) {
1113 kmem_free(sds, sizeof (*sds));
1114 }
1115 }
1116
1117 static boolean_t
1118 scan_ds_queue_contains(dsl_scan_t *scn, uint64_t dsobj, uint64_t *txg)
1119 {
1120 scan_ds_t srch, *sds;
1121
1122 srch.sds_dsobj = dsobj;
1123 sds = avl_find(&scn->scn_queue, &srch, NULL);
1124 if (sds != NULL && txg != NULL)
1125 *txg = sds->sds_txg;
1126 return (sds != NULL);
1127 }
1128
1129 static void
1130 scan_ds_queue_insert(dsl_scan_t *scn, uint64_t dsobj, uint64_t txg)
1131 {
1132 scan_ds_t *sds;
1133 avl_index_t where;
1134
1135 sds = kmem_zalloc(sizeof (*sds), KM_SLEEP);
1136 sds->sds_dsobj = dsobj;
1137 sds->sds_txg = txg;
1138
1139 VERIFY3P(avl_find(&scn->scn_queue, sds, &where), ==, NULL);
1140 avl_insert(&scn->scn_queue, sds, where);
1141 }
1142
1143 static void
1144 scan_ds_queue_remove(dsl_scan_t *scn, uint64_t dsobj)
1145 {
1146 scan_ds_t srch, *sds;
1147
1148 srch.sds_dsobj = dsobj;
1149
1150 sds = avl_find(&scn->scn_queue, &srch, NULL);
1151 VERIFY(sds != NULL);
1152 avl_remove(&scn->scn_queue, sds);
1153 kmem_free(sds, sizeof (*sds));
1154 }
1155
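/*
 * Rewrites the on-disk scan queue object from the in-core scn_queue AVL
 * tree; only valid while the sorting queues are empty
 * (scn_bytes_pending == 0).
 */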
1156 static void
1157 scan_ds_queue_sync(dsl_scan_t *scn, dmu_tx_t *tx)
1158 {
1159 dsl_pool_t *dp = scn->scn_dp;
1160 spa_t *spa = dp->dp_spa;
1161 dmu_object_type_t ot = (spa_version(spa) >= SPA_VERSION_DSL_SCRUB) ?
1162 DMU_OT_SCAN_QUEUE : DMU_OT_ZAP_OTHER;
1163
1164 ASSERT0(scn->scn_bytes_pending);
1165 ASSERT(scn->scn_phys.scn_queue_obj != 0);
1166
1167 VERIFY0(dmu_object_free(dp->dp_meta_objset,
1168 scn->scn_phys.scn_queue_obj, tx));
1169 scn->scn_phys.scn_queue_obj = zap_create(dp->dp_meta_objset, ot,
1170 DMU_OT_NONE, 0, tx);
1171 for (scan_ds_t *sds = avl_first(&scn->scn_queue);
1172 sds != NULL; sds = AVL_NEXT(&scn->scn_queue, sds)) {
1173 VERIFY0(zap_add_int_key(dp->dp_meta_objset,
1174 scn->scn_phys.scn_queue_obj, sds->sds_dsobj,
1175 sds->sds_txg, tx));
1176 }
1177 }
1178
1179 /*
1180 * Computes the memory limit state that we're currently in. A sorted scan
1181 * needs quite a bit of memory to hold the sorting queue, so we need to
1182 * reasonably constrain the size so it doesn't impact overall system
1183 * performance. We compute two limits:
1184 * 1) Hard memory limit: if the amount of memory used by the sorting
1185 * queues on a pool gets above this value, we stop the metadata
1186 * scanning portion and start issuing the queued up and sorted
1187 * I/Os to reduce memory usage.
1188 * This limit is calculated as a fraction of physmem (by default 5%).
1189 * We constrain the lower bound of the hard limit to an absolute
1190 * minimum of zfs_scan_mem_lim_min (default: 16 MiB). We also constrain
1191 * the upper bound to 5% of the total pool size - no chance we'll
1192 * ever need that much memory, but just to keep the value in check.
1193 * 2) Soft memory limit: once we hit the hard memory limit, we start
1194 * issuing I/O to reduce queue memory usage, but we don't want to
1195 * completely empty out the queues, since we might be able to find I/Os
1196 * that will fill in the gaps of our non-sequential IOs at some point
1197 * in the future. So we stop the issuing of I/Os once the amount of
1198 * memory used drops below the soft limit (at which point we stop issuing
1199 * I/O and start scanning metadata again).
1200 *
1201 * This limit is calculated by subtracting a fraction of the hard
1202 * limit from the hard limit. By default this fraction is 5%, so
1203 * the soft limit is 95% of the hard limit. We cap the size of the
1204 * difference between the hard and soft limits at an absolute
1205 * maximum of zfs_scan_mem_lim_soft_max (default: 128 MiB) - this is
1206 * sufficient to not cause too frequent switching between the
1207 * metadata scan and I/O issue (even at 2k recordsize, 128 MiB's
1208 * worth of queues is about 1.2 GiB of on-pool data, so scanning
1209 * that should take at least a decent fraction of a second).
1210 */
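/*
 * As a concrete example with the default tunables on a machine with
 * 64 GiB of physical memory: the hard limit is 64 GiB / 20 = ~3.2 GiB
 * (further capped at 5% of the allocated pool space), and the soft limit
 * is the hard limit minus min(hard limit / 20, 128 MiB), i.e. roughly
 * 3.1 GiB.
 */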
1211 static boolean_t
1212 dsl_scan_should_clear(dsl_scan_t *scn)
1213 {
1214 spa_t *spa = scn->scn_dp->dp_spa;
1215 vdev_t *rvd = scn->scn_dp->dp_spa->spa_root_vdev;
1216 uint64_t alloc, mlim_hard, mlim_soft, mused;
1217
1218 alloc = metaslab_class_get_alloc(spa_normal_class(spa));
1219 alloc += metaslab_class_get_alloc(spa_special_class(spa));
1220 alloc += metaslab_class_get_alloc(spa_dedup_class(spa));
1221
1222 mlim_hard = MAX((physmem / zfs_scan_mem_lim_fact) * PAGESIZE,
1223 zfs_scan_mem_lim_min);
1224 mlim_hard = MIN(mlim_hard, alloc / 20);
1225 mlim_soft = mlim_hard - MIN(mlim_hard / zfs_scan_mem_lim_soft_fact,
1226 zfs_scan_mem_lim_soft_max);
1227 mused = 0;
1228 for (uint64_t i = 0; i < rvd->vdev_children; i++) {
1229 vdev_t *tvd = rvd->vdev_child[i];
1230 dsl_scan_io_queue_t *queue;
1231
1232 mutex_enter(&tvd->vdev_scan_io_queue_lock);
1233 queue = tvd->vdev_scan_io_queue;
1234 if (queue != NULL) {
1235 /* # extents in exts_by_size = # in exts_by_addr */
1236 mused += zfs_btree_numnodes(&queue->q_exts_by_size) *
1237 sizeof (range_seg_gap_t) + queue->q_sio_memused;
1238 }
1239 mutex_exit(&tvd->vdev_scan_io_queue_lock);
1240 }
1241
1242 dprintf("current scan memory usage: %llu bytes\n", (longlong_t)mused);
1243
1244 if (mused == 0)
1245 ASSERT0(scn->scn_bytes_pending);
1246
1247 /*
1248 * If we are above our hard limit, we need to clear out memory.
1249 * If we are below our soft limit, we need to accumulate sequential IOs.
1250 * Otherwise, we should keep doing whatever we are currently doing.
1251 */
1252 if (mused >= mlim_hard)
1253 return (B_TRUE);
1254 else if (mused < mlim_soft)
1255 return (B_FALSE);
1256 else
1257 return (scn->scn_clearing);
1258 }
1259
1260 static boolean_t
1261 dsl_scan_check_suspend(dsl_scan_t *scn, const zbookmark_phys_t *zb)
1262 {
1263 /* we never skip user/group accounting objects */
1264 if (zb && (int64_t)zb->zb_object < 0)
1265 return (B_FALSE);
1266
1267 if (scn->scn_suspending)
1268 return (B_TRUE); /* we're already suspending */
1269
1270 if (!ZB_IS_ZERO(&scn->scn_phys.scn_bookmark))
1271 return (B_FALSE); /* we're resuming */
1272
1273 /* We only know how to resume from level-0 blocks. */
1274 if (zb && zb->zb_level != 0)
1275 return (B_FALSE);
1276
1277 /*
1278 * We suspend if:
1279 * - we have scanned for at least the minimum time (default 1 sec
1280 * for scrub, 3 sec for resilver), and either we have sufficient
1281 * dirty data that we are starting to write more quickly
1282 * (default 30%), or someone is explicitly waiting for this txg
1283 * to complete.
1284 * or
1285 * - the spa is shutting down because this pool is being exported
1286 * or the machine is rebooting.
1287 * or
1288 * - the scan queue has reached its memory use limit
1289 */
1290 hrtime_t curr_time_ns = gethrtime();
1291 uint64_t scan_time_ns = curr_time_ns - scn->scn_sync_start_time;
1292 uint64_t sync_time_ns = curr_time_ns -
1293 scn->scn_dp->dp_spa->spa_sync_starttime;
1294
1295 int dirty_pct = scn->scn_dp->dp_dirty_total * 100 / zfs_dirty_data_max;
1296 int mintime = (scn->scn_phys.scn_func == POOL_SCAN_RESILVER) ?
1297 zfs_resilver_min_time_ms : zfs_scrub_min_time_ms;
1298
1299 if ((NSEC2MSEC(scan_time_ns) > mintime &&
1300 (dirty_pct >= zfs_vdev_async_write_active_min_dirty_percent ||
1301 txg_sync_waiting(scn->scn_dp) ||
1302 NSEC2SEC(sync_time_ns) >= zfs_txg_timeout)) ||
1303 spa_shutting_down(scn->scn_dp->dp_spa) ||
1304 (zfs_scan_strict_mem_lim && dsl_scan_should_clear(scn))) {
1305 if (zb) {
1306 dprintf("suspending at bookmark %llx/%llx/%llx/%llx\n",
1307 (longlong_t)zb->zb_objset,
1308 (longlong_t)zb->zb_object,
1309 (longlong_t)zb->zb_level,
1310 (longlong_t)zb->zb_blkid);
1311 scn->scn_phys.scn_bookmark = *zb;
1312 } else {
1313 dsl_scan_phys_t *scnp = &scn->scn_phys;
1314
1315 dprintf("suspending at DDT bookmark "
1316 "%llx/%llx/%llx/%llx\n",
1317 (longlong_t)scnp->scn_ddt_bookmark.ddb_class,
1318 (longlong_t)scnp->scn_ddt_bookmark.ddb_type,
1319 (longlong_t)scnp->scn_ddt_bookmark.ddb_checksum,
1320 (longlong_t)scnp->scn_ddt_bookmark.ddb_cursor);
1321 }
1322 scn->scn_suspending = B_TRUE;
1323 return (B_TRUE);
1324 }
1325 return (B_FALSE);
1326 }
1327
1328 typedef struct zil_scan_arg {
1329 dsl_pool_t *zsa_dp;
1330 zil_header_t *zsa_zh;
1331 } zil_scan_arg_t;
1332
1333 /* ARGSUSED */
1334 static int
1335 dsl_scan_zil_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg)
1336 {
1337 zil_scan_arg_t *zsa = arg;
1338 dsl_pool_t *dp = zsa->zsa_dp;
1339 dsl_scan_t *scn = dp->dp_scan;
1340 zil_header_t *zh = zsa->zsa_zh;
1341 zbookmark_phys_t zb;
1342
1343 if (BP_IS_HOLE(bp) || bp->blk_birth <= scn->scn_phys.scn_cur_min_txg)
1344 return (0);
1345
1346 /*
1347 * One block (the "stubby") may have been allocated a long time ago;
1348 * we still want to visit it because it has been allocated
1349 * (on-disk) even if it hasn't been claimed (even though for
1350 * scrub there's nothing to do to it).
1351 */
1352 if (claim_txg == 0 && bp->blk_birth >= spa_min_claim_txg(dp->dp_spa))
1353 return (0);
1354
1355 SET_BOOKMARK(&zb, zh->zh_log.blk_cksum.zc_word[ZIL_ZC_OBJSET],
1356 ZB_ZIL_OBJECT, ZB_ZIL_LEVEL, bp->blk_cksum.zc_word[ZIL_ZC_SEQ]);
1357
1358 VERIFY(0 == scan_funcs[scn->scn_phys.scn_func](dp, bp, &zb));
1359 return (0);
1360 }
1361
1362 /* ARGSUSED */
1363 static int
1364 dsl_scan_zil_record(zilog_t *zilog, lr_t *lrc, void *arg, uint64_t claim_txg)
1365 {
1366 if (lrc->lrc_txtype == TX_WRITE) {
1367 zil_scan_arg_t *zsa = arg;
1368 dsl_pool_t *dp = zsa->zsa_dp;
1369 dsl_scan_t *scn = dp->dp_scan;
1370 zil_header_t *zh = zsa->zsa_zh;
1371 lr_write_t *lr = (lr_write_t *)lrc;
1372 blkptr_t *bp = &lr->lr_blkptr;
1373 zbookmark_phys_t zb;
1374
1375 if (BP_IS_HOLE(bp) ||
1376 bp->blk_birth <= scn->scn_phys.scn_cur_min_txg)
1377 return (0);
1378
1379 /*
1380 * birth can be < claim_txg if this record's txg is
1381 * already txg sync'ed (but this log block contains
1382 * other records that are not synced)
1383 */
1384 if (claim_txg == 0 || bp->blk_birth < claim_txg)
1385 return (0);
1386
1387 SET_BOOKMARK(&zb, zh->zh_log.blk_cksum.zc_word[ZIL_ZC_OBJSET],
1388 lr->lr_foid, ZB_ZIL_LEVEL,
1389 lr->lr_offset / BP_GET_LSIZE(bp));
1390
1391 VERIFY(0 == scan_funcs[scn->scn_phys.scn_func](dp, bp, &zb));
1392 }
1393 return (0);
1394 }
1395
1396 static void
1397 dsl_scan_zil(dsl_pool_t *dp, zil_header_t *zh)
1398 {
1399 uint64_t claim_txg = zh->zh_claim_txg;
1400 zil_scan_arg_t zsa = { dp, zh };
1401 zilog_t *zilog;
1402
1403 ASSERT(spa_writeable(dp->dp_spa));
1404
1405 /*
1406 * We only want to visit blocks that have been claimed
1407 * but not yet replayed.
1408 */
1409 if (claim_txg == 0)
1410 return;
1411
1412 zilog = zil_alloc(dp->dp_meta_objset, zh);
1413
1414 (void) zil_parse(zilog, dsl_scan_zil_block, dsl_scan_zil_record, &zsa,
1415 claim_txg, B_FALSE);
1416
1417 zil_free(zilog);
1418 }
1419
1420 /*
1421 * We compare scan_prefetch_issue_ctx_t's based on their bookmarks. The idea
1422 * here is to sort the AVL tree by the order each block will be needed.
1423 */
1424 static int
1425 scan_prefetch_queue_compare(const void *a, const void *b)
1426 {
1427 const scan_prefetch_issue_ctx_t *spic_a = a, *spic_b = b;
1428 const scan_prefetch_ctx_t *spc_a = spic_a->spic_spc;
1429 const scan_prefetch_ctx_t *spc_b = spic_b->spic_spc;
1430
1431 return (zbookmark_compare(spc_a->spc_datablkszsec,
1432 spc_a->spc_indblkshift, spc_b->spc_datablkszsec,
1433 spc_b->spc_indblkshift, &spic_a->spic_zb, &spic_b->spic_zb));
1434 }
1435
1436 static void
1437 scan_prefetch_ctx_rele(scan_prefetch_ctx_t *spc, void *tag)
1438 {
1439 if (zfs_refcount_remove(&spc->spc_refcnt, tag) == 0) {
1440 zfs_refcount_destroy(&spc->spc_refcnt);
1441 kmem_free(spc, sizeof (scan_prefetch_ctx_t));
1442 }
1443 }
1444
1445 static scan_prefetch_ctx_t *
1446 scan_prefetch_ctx_create(dsl_scan_t *scn, dnode_phys_t *dnp, void *tag)
1447 {
1448 scan_prefetch_ctx_t *spc;
1449
1450 spc = kmem_alloc(sizeof (scan_prefetch_ctx_t), KM_SLEEP);
1451 zfs_refcount_create(&spc->spc_refcnt);
1452 zfs_refcount_add(&spc->spc_refcnt, tag);
1453 spc->spc_scn = scn;
1454 if (dnp != NULL) {
1455 spc->spc_datablkszsec = dnp->dn_datablkszsec;
1456 spc->spc_indblkshift = dnp->dn_indblkshift;
1457 spc->spc_root = B_FALSE;
1458 } else {
1459 spc->spc_datablkszsec = 0;
1460 spc->spc_indblkshift = 0;
1461 spc->spc_root = B_TRUE;
1462 }
1463
1464 return (spc);
1465 }
1466
1467 static void
1468 scan_prefetch_ctx_add_ref(scan_prefetch_ctx_t *spc, void *tag)
1469 {
1470 zfs_refcount_add(&spc->spc_refcnt, tag);
1471 }
1472
1473 static boolean_t
1474 dsl_scan_check_prefetch_resume(scan_prefetch_ctx_t *spc,
1475 const zbookmark_phys_t *zb)
1476 {
1477 zbookmark_phys_t *last_zb = &spc->spc_scn->scn_prefetch_bookmark;
1478 dnode_phys_t tmp_dnp;
1479 dnode_phys_t *dnp = (spc->spc_root) ? NULL : &tmp_dnp;
1480
1481 if (zb->zb_objset != last_zb->zb_objset)
1482 return (B_TRUE);
1483 if ((int64_t)zb->zb_object < 0)
1484 return (B_FALSE);
1485
1486 tmp_dnp.dn_datablkszsec = spc->spc_datablkszsec;
1487 tmp_dnp.dn_indblkshift = spc->spc_indblkshift;
1488
1489 if (zbookmark_subtree_completed(dnp, zb, last_zb))
1490 return (B_TRUE);
1491
1492 return (B_FALSE);
1493 }
1494
1495 static void
1496 dsl_scan_prefetch(scan_prefetch_ctx_t *spc, blkptr_t *bp, zbookmark_phys_t *zb)
1497 {
1498 avl_index_t idx;
1499 dsl_scan_t *scn = spc->spc_scn;
1500 spa_t *spa = scn->scn_dp->dp_spa;
1501 scan_prefetch_issue_ctx_t *spic;
1502
1503 if (zfs_no_scrub_prefetch)
1504 return;
1505
1506 if (BP_IS_HOLE(bp) || bp->blk_birth <= scn->scn_phys.scn_cur_min_txg ||
1507 (BP_GET_LEVEL(bp) == 0 && BP_GET_TYPE(bp) != DMU_OT_DNODE &&
1508 BP_GET_TYPE(bp) != DMU_OT_OBJSET))
1509 return;
1510
1511 if (dsl_scan_check_prefetch_resume(spc, zb))
1512 return;
1513
1514 scan_prefetch_ctx_add_ref(spc, scn);
1515 spic = kmem_alloc(sizeof (scan_prefetch_issue_ctx_t), KM_SLEEP);
1516 spic->spic_spc = spc;
1517 spic->spic_bp = *bp;
1518 spic->spic_zb = *zb;
1519
1520 /*
1521 * Add the IO to the queue of blocks to prefetch. This allows us to
1522 * prioritize blocks that we will need first for the main traversal
1523 * thread.
1524 */
1525 mutex_enter(&spa->spa_scrub_lock);
1526 if (avl_find(&scn->scn_prefetch_queue, spic, &idx) != NULL) {
1527 /* this block is already queued for prefetch */
1528 kmem_free(spic, sizeof (scan_prefetch_issue_ctx_t));
1529 scan_prefetch_ctx_rele(spc, scn);
1530 mutex_exit(&spa->spa_scrub_lock);
1531 return;
1532 }
1533
1534 avl_insert(&scn->scn_prefetch_queue, spic, idx);
1535 cv_broadcast(&spa->spa_scrub_io_cv);
1536 mutex_exit(&spa->spa_scrub_lock);
1537 }
1538
1539 static void
1540 dsl_scan_prefetch_dnode(dsl_scan_t *scn, dnode_phys_t *dnp,
1541 uint64_t objset, uint64_t object)
1542 {
1543 int i;
1544 zbookmark_phys_t zb;
1545 scan_prefetch_ctx_t *spc;
1546
1547 if (dnp->dn_nblkptr == 0 && !(dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR))
1548 return;
1549
1550 SET_BOOKMARK(&zb, objset, object, 0, 0);
1551
1552 spc = scan_prefetch_ctx_create(scn, dnp, FTAG);
1553
1554 for (i = 0; i < dnp->dn_nblkptr; i++) {
1555 zb.zb_level = BP_GET_LEVEL(&dnp->dn_blkptr[i]);
1556 zb.zb_blkid = i;
1557 dsl_scan_prefetch(spc, &dnp->dn_blkptr[i], &zb);
1558 }
1559
1560 if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) {
1561 zb.zb_level = 0;
1562 zb.zb_blkid = DMU_SPILL_BLKID;
1563 dsl_scan_prefetch(spc, &dnp->dn_spill, &zb);
1564 }
1565
1566 scan_prefetch_ctx_rele(spc, FTAG);
1567 }
1568
1569 void
1570 dsl_scan_prefetch_cb(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp,
1571 arc_buf_t *buf, void *private)
1572 {
1573 scan_prefetch_ctx_t *spc = private;
1574 dsl_scan_t *scn = spc->spc_scn;
1575 spa_t *spa = scn->scn_dp->dp_spa;
1576
1577 /* broadcast that the IO has completed for rate limiting purposes */
1578 mutex_enter(&spa->spa_scrub_lock);
1579 ASSERT3U(spa->spa_scrub_inflight, >=, BP_GET_PSIZE(bp));
1580 spa->spa_scrub_inflight -= BP_GET_PSIZE(bp);
1581 cv_broadcast(&spa->spa_scrub_io_cv);
1582 mutex_exit(&spa->spa_scrub_lock);
1583
1584 /* if there was an error or we are done prefetching, just cleanup */
1585 if (buf == NULL || scn->scn_suspending)
1586 goto out;
1587
1588 if (BP_GET_LEVEL(bp) > 0) {
1589 int i;
1590 blkptr_t *cbp;
1591 int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT;
1592 zbookmark_phys_t czb;
1593
1594 for (i = 0, cbp = buf->b_data; i < epb; i++, cbp++) {
1595 SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object,
1596 zb->zb_level - 1, zb->zb_blkid * epb + i);
1597 dsl_scan_prefetch(spc, cbp, &czb);
1598 }
1599 } else if (BP_GET_TYPE(bp) == DMU_OT_DNODE) {
1600 dnode_phys_t *cdnp = buf->b_data;
1601 int i;
1602 int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT;
1603
1604 for (i = 0, cdnp = buf->b_data; i < epb;
1605 i += cdnp->dn_extra_slots + 1,
1606 cdnp += cdnp->dn_extra_slots + 1) {
1607 dsl_scan_prefetch_dnode(scn, cdnp,
1608 zb->zb_objset, zb->zb_blkid * epb + i);
1609 }
1610 } else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) {
1611 objset_phys_t *osp = buf->b_data;
1612
1613 dsl_scan_prefetch_dnode(scn, &osp->os_meta_dnode,
1614 zb->zb_objset, DMU_META_DNODE_OBJECT);
1615
1616 if (OBJSET_BUF_HAS_USERUSED(buf)) {
1617 dsl_scan_prefetch_dnode(scn,
1618 &osp->os_groupused_dnode, zb->zb_objset,
1619 DMU_GROUPUSED_OBJECT);
1620 dsl_scan_prefetch_dnode(scn,
1621 &osp->os_userused_dnode, zb->zb_objset,
1622 DMU_USERUSED_OBJECT);
1623 }
1624 }
1625
1626 out:
1627 if (buf != NULL)
1628 arc_buf_destroy(buf, private);
1629 scan_prefetch_ctx_rele(spc, scn);
1630 }
1631
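/*
 * Body of the prefetch thread. It repeatedly takes the highest-priority
 * entry (the one the main traversal thread will need soonest) off
 * scn_prefetch_queue, throttles itself against scn_maxinflight_bytes, and
 * issues the read asynchronously with dsl_scan_prefetch_cb() as the
 * completion callback. When told to stop, it drains and frees whatever
 * remains in the queue.
 */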
1632 /* ARGSUSED */
1633 static void
1634 dsl_scan_prefetch_thread(void *arg)
1635 {
1636 dsl_scan_t *scn = arg;
1637 spa_t *spa = scn->scn_dp->dp_spa;
1638 vdev_t *rvd = spa->spa_root_vdev;
1639 uint64_t maxinflight = rvd->vdev_children * zfs_top_maxinflight;
1640 scan_prefetch_issue_ctx_t *spic;
1641
1642 /* loop until we are told to stop */
1643 while (!scn->scn_prefetch_stop) {
1644 arc_flags_t flags = ARC_FLAG_NOWAIT |
1645 ARC_FLAG_PRESCIENT_PREFETCH | ARC_FLAG_PREFETCH;
1646 int zio_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SCAN_THREAD;
1647
1648 mutex_enter(&spa->spa_scrub_lock);
1649
1650 /*
1651 * Wait until we have an IO to issue and are not above our
1652 * maximum in flight limit.
1653 */
1654 while (!scn->scn_prefetch_stop &&
1655 (avl_numnodes(&scn->scn_prefetch_queue) == 0 ||
1656 spa->spa_scrub_inflight >= scn->scn_maxinflight_bytes)) {
1657 cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
1658 }
1659
1660 /* recheck if we should stop since we waited for the cv */
1661 if (scn->scn_prefetch_stop) {
1662 mutex_exit(&spa->spa_scrub_lock);
1663 break;
1664 }
1665
1666 /* remove the prefetch IO from the tree */
1667 spic = avl_first(&scn->scn_prefetch_queue);
1668 spa->spa_scrub_inflight += BP_GET_PSIZE(&spic->spic_bp);
1669 avl_remove(&scn->scn_prefetch_queue, spic);
1670
1671 mutex_exit(&spa->spa_scrub_lock);
1672
1673 if (BP_IS_PROTECTED(&spic->spic_bp)) {
1674 ASSERT(BP_GET_TYPE(&spic->spic_bp) == DMU_OT_DNODE ||
1675 BP_GET_TYPE(&spic->spic_bp) == DMU_OT_OBJSET);
1676 ASSERT3U(BP_GET_LEVEL(&spic->spic_bp), ==, 0);
1677 zio_flags |= ZIO_FLAG_RAW;
1678 }
1679
1680 /* issue the prefetch asynchronously */
1681 (void) arc_read(scn->scn_zio_root, scn->scn_dp->dp_spa,
1682 &spic->spic_bp, dsl_scan_prefetch_cb, spic->spic_spc,
1683 ZIO_PRIORITY_SCRUB, zio_flags, &flags, &spic->spic_zb);
1684
1685 kmem_free(spic, sizeof (scan_prefetch_issue_ctx_t));
1686 }
1687
1688 ASSERT(scn->scn_prefetch_stop);
1689
1690 /* free any prefetches we didn't get to complete */
1691 mutex_enter(&spa->spa_scrub_lock);
1692 while ((spic = avl_first(&scn->scn_prefetch_queue)) != NULL) {
1693 avl_remove(&scn->scn_prefetch_queue, spic);
1694 scan_prefetch_ctx_rele(spic->spic_spc, scn);
1695 kmem_free(spic, sizeof (scan_prefetch_issue_ctx_t));
1696 }
1697 ASSERT0(avl_numnodes(&scn->scn_prefetch_queue));
1698 mutex_exit(&spa->spa_scrub_lock);
1699 }
1700
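/*
 * When resuming from a saved bookmark, decide whether the block at *zb
 * still needs to be visited. Returns B_TRUE if the subtree rooted at zb
 * was already completed in a prior txg and can be skipped.
 */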
1701 static boolean_t
1702 dsl_scan_check_resume(dsl_scan_t *scn, const dnode_phys_t *dnp,
1703 const zbookmark_phys_t *zb)
1704 {
1705 /*
1706 * We never skip over user/group accounting objects (obj<0)
1707 */
1708 if (!ZB_IS_ZERO(&scn->scn_phys.scn_bookmark) &&
1709 (int64_t)zb->zb_object >= 0) {
1710 /*
1711 * If we already visited this bp & everything below (in
1712 * a prior txg sync), don't bother doing it again.
1713 */
1714 if (zbookmark_subtree_completed(dnp, zb,
1715 &scn->scn_phys.scn_bookmark))
1716 return (B_TRUE);
1717
1718 /*
1719 * If we found the block we're trying to resume from, or
1720 * we went past it to a different object, zero it out to
1721 * indicate that it's OK to start checking for suspending
1722 * again.
1723 */
1724 if (bcmp(zb, &scn->scn_phys.scn_bookmark, sizeof (*zb)) == 0 ||
1725 zb->zb_object > scn->scn_phys.scn_bookmark.zb_object) {
1726 dprintf("resuming at %llx/%llx/%llx/%llx\n",
1727 (longlong_t)zb->zb_objset,
1728 (longlong_t)zb->zb_object,
1729 (longlong_t)zb->zb_level,
1730 (longlong_t)zb->zb_blkid);
1731 bzero(&scn->scn_phys.scn_bookmark, sizeof (*zb));
1732 }
1733 }
1734 return (B_FALSE);
1735 }
1736
1737 static void dsl_scan_visitbp(blkptr_t *bp, const zbookmark_phys_t *zb,
1738 dnode_phys_t *dnp, dsl_dataset_t *ds, dsl_scan_t *scn,
1739 dmu_objset_type_t ostype, dmu_tx_t *tx);
1740 static void dsl_scan_visitdnode(
1741 dsl_scan_t *, dsl_dataset_t *ds, dmu_objset_type_t ostype,
1742 dnode_phys_t *dnp, uint64_t object, dmu_tx_t *tx);
1743
1744 /*
1745 * Read the block at bp (an indirect block, a dnode block, or an objset)
1746 * and visit its contents. Return nonzero on i/o error.
1747 */
1748 static int
1749 dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype,
1750 dnode_phys_t *dnp, const blkptr_t *bp,
1751 const zbookmark_phys_t *zb, dmu_tx_t *tx)
1752 {
1753 dsl_pool_t *dp = scn->scn_dp;
1754 int zio_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SCAN_THREAD;
1755 int err;
1756
1757 if (BP_GET_LEVEL(bp) > 0) {
1758 arc_flags_t flags = ARC_FLAG_WAIT;
1759 int i;
1760 blkptr_t *cbp;
1761 int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT;
1762 arc_buf_t *buf;
1763
1764 err = arc_read(NULL, dp->dp_spa, bp, arc_getbuf_func, &buf,
1765 ZIO_PRIORITY_SCRUB, zio_flags, &flags, zb);
1766 if (err) {
1767 scn->scn_phys.scn_errors++;
1768 return (err);
1769 }
1770 for (i = 0, cbp = buf->b_data; i < epb; i++, cbp++) {
1771 zbookmark_phys_t czb;
1772
1773 SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object,
1774 zb->zb_level - 1,
1775 zb->zb_blkid * epb + i);
1776 dsl_scan_visitbp(cbp, &czb, dnp,
1777 ds, scn, ostype, tx);
1778 }
1779 arc_buf_destroy(buf, &buf);
1780 } else if (BP_GET_TYPE(bp) == DMU_OT_DNODE) {
1781 arc_flags_t flags = ARC_FLAG_WAIT;
1782 dnode_phys_t *cdnp;
1783 int i;
1784 int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT;
1785 arc_buf_t *buf;
1786
1787 if (BP_IS_PROTECTED(bp)) {
1788 ASSERT3U(BP_GET_COMPRESS(bp), ==, ZIO_COMPRESS_OFF);
1789 zio_flags |= ZIO_FLAG_RAW;
1790 }
1791
1792 err = arc_read(NULL, dp->dp_spa, bp, arc_getbuf_func, &buf,
1793 ZIO_PRIORITY_SCRUB, zio_flags, &flags, zb);
1794 if (err) {
1795 scn->scn_phys.scn_errors++;
1796 return (err);
1797 }
1798 for (i = 0, cdnp = buf->b_data; i < epb;
1799 i += cdnp->dn_extra_slots + 1,
1800 cdnp += cdnp->dn_extra_slots + 1) {
1801 dsl_scan_visitdnode(scn, ds, ostype,
1802 cdnp, zb->zb_blkid * epb + i, tx);
1803 }
1804
1805 arc_buf_destroy(buf, &buf);
1806 } else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) {
1807 arc_flags_t flags = ARC_FLAG_WAIT;
1808 objset_phys_t *osp;
1809 arc_buf_t *buf;
1810
1811 err = arc_read(NULL, dp->dp_spa, bp, arc_getbuf_func, &buf,
1812 ZIO_PRIORITY_SCRUB, zio_flags, &flags, zb);
1813 if (err) {
1814 scn->scn_phys.scn_errors++;
1815 return (err);
1816 }
1817
1818 osp = buf->b_data;
1819
1820 dsl_scan_visitdnode(scn, ds, osp->os_type,
1821 &osp->os_meta_dnode, DMU_META_DNODE_OBJECT, tx);
1822
1823 if (OBJSET_BUF_HAS_USERUSED(buf)) {
1824 /*
1825 * We also always visit user/group/project accounting
1826 * objects, and never skip them, even if we are
1827 * suspending. This is necessary so that the space
1828 * deltas from this txg get integrated.
1829 */
1830 if (OBJSET_BUF_HAS_PROJECTUSED(buf))
1831 dsl_scan_visitdnode(scn, ds, osp->os_type,
1832 &osp->os_projectused_dnode,
1833 DMU_PROJECTUSED_OBJECT, tx);
1834 dsl_scan_visitdnode(scn, ds, osp->os_type,
1835 &osp->os_groupused_dnode,
1836 DMU_GROUPUSED_OBJECT, tx);
1837 dsl_scan_visitdnode(scn, ds, osp->os_type,
1838 &osp->os_userused_dnode,
1839 DMU_USERUSED_OBJECT, tx);
1840 }
1841 arc_buf_destroy(buf, &buf);
1842 }
1843
1844 return (0);
1845 }
1846
1847 static void
1848 dsl_scan_visitdnode(dsl_scan_t *scn, dsl_dataset_t *ds,
1849 dmu_objset_type_t ostype, dnode_phys_t *dnp,
1850 uint64_t object, dmu_tx_t *tx)
1851 {
1852 int j;
1853
1854 for (j = 0; j < dnp->dn_nblkptr; j++) {
1855 zbookmark_phys_t czb;
1856
1857 SET_BOOKMARK(&czb, ds ? ds->ds_object : 0, object,
1858 dnp->dn_nlevels - 1, j);
1859 dsl_scan_visitbp(&dnp->dn_blkptr[j],
1860 &czb, dnp, ds, scn, ostype, tx);
1861 }
1862
1863 if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) {
1864 zbookmark_phys_t czb;
1865 SET_BOOKMARK(&czb, ds ? ds->ds_object : 0, object,
1866 0, DMU_SPILL_BLKID);
1867 dsl_scan_visitbp(DN_SPILL_BLKPTR(dnp),
1868 &czb, dnp, ds, scn, ostype, tx);
1869 }
1870 }
1871
1872 /*
1873 * The arguments are in this order because mdb can only print the
1874 * first 5; we want them to be useful.
1875 */
1876 static void
1877 dsl_scan_visitbp(blkptr_t *bp, const zbookmark_phys_t *zb,
1878 dnode_phys_t *dnp, dsl_dataset_t *ds, dsl_scan_t *scn,
1879 dmu_objset_type_t ostype, dmu_tx_t *tx)
1880 {
1881 dsl_pool_t *dp = scn->scn_dp;
1882 blkptr_t *bp_toread = NULL;
1883
1884 if (dsl_scan_check_suspend(scn, zb))
1885 return;
1886
1887 if (dsl_scan_check_resume(scn, dnp, zb))
1888 return;
1889
1890 scn->scn_visited_this_txg++;
1891
1892 /*
1893 * This debugging is commented out to conserve stack space. This
1894 * function is called recursively and the debugging adds several
1895 * bytes to the stack for each call. It can be commented back in
1896 * if required to debug an issue in dsl_scan_visitbp().
1897 *
1898 * dprintf_bp(bp,
1899 * "visiting ds=%p/%llu zb=%llx/%llx/%llx/%llx bp=%p",
1900 * ds, ds ? ds->ds_object : 0,
1901 * zb->zb_objset, zb->zb_object, zb->zb_level, zb->zb_blkid,
1902 * bp);
1903 */
1904
1905 if (BP_IS_HOLE(bp)) {
1906 scn->scn_holes_this_txg++;
1907 return;
1908 }
1909
1910 if (bp->blk_birth <= scn->scn_phys.scn_cur_min_txg) {
1911 scn->scn_lt_min_this_txg++;
1912 return;
1913 }
1914
1915 bp_toread = kmem_alloc(sizeof (blkptr_t), KM_SLEEP);
1916 *bp_toread = *bp;
1917
1918 if (dsl_scan_recurse(scn, ds, ostype, dnp, bp_toread, zb, tx) != 0)
1919 goto out;
1920
1921 /*
1922 * If dsl_scan_ddt() has already visited this block, it will have
1923 * already done any translations or scrubbing, so don't call the
1924 * callback again.
1925 */
1926 if (ddt_class_contains(dp->dp_spa,
1927 scn->scn_phys.scn_ddt_class_max, bp)) {
1928 scn->scn_ddt_contained_this_txg++;
1929 goto out;
1930 }
1931
1932 /*
1933 * If this block is from the future (after cur_max_txg), then we
1934 * are doing this on behalf of a deleted snapshot, and we will
1935 * revisit the future block on the next pass of this dataset.
1936 * Don't scan it now unless we need to because something
1937 * under it was modified.
1938 */
1939 if (BP_PHYSICAL_BIRTH(bp) > scn->scn_phys.scn_cur_max_txg) {
1940 scn->scn_gt_max_this_txg++;
1941 goto out;
1942 }
1943
1944 scan_funcs[scn->scn_phys.scn_func](dp, bp, zb);
1945
1946 out:
1947 kmem_free(bp_toread, sizeof (blkptr_t));
1948 }
1949
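/*
 * Traverse a single root block pointer (the MOS rootbp or a dataset's
 * ds_bp): set up the prefetch bookmark, queue a prefetch for the root bp
 * itself, and then visit it synchronously with dsl_scan_visitbp().
 */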
1950 static void
1951 dsl_scan_visit_rootbp(dsl_scan_t *scn, dsl_dataset_t *ds, blkptr_t *bp,
1952 dmu_tx_t *tx)
1953 {
1954 zbookmark_phys_t zb;
1955 scan_prefetch_ctx_t *spc;
1956
1957 SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET,
1958 ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID);
1959
1960 if (ZB_IS_ZERO(&scn->scn_phys.scn_bookmark)) {
1961 SET_BOOKMARK(&scn->scn_prefetch_bookmark,
1962 zb.zb_objset, 0, 0, 0);
1963 } else {
1964 scn->scn_prefetch_bookmark = scn->scn_phys.scn_bookmark;
1965 }
1966
1967 scn->scn_objsets_visited_this_txg++;
1968
1969 spc = scan_prefetch_ctx_create(scn, NULL, FTAG);
1970 dsl_scan_prefetch(spc, bp, &zb);
1971 scan_prefetch_ctx_rele(spc, FTAG);
1972
1973 dsl_scan_visitbp(bp, &zb, NULL, ds, scn, DMU_OST_NONE, tx);
1974
1975 dprintf_ds(ds, "finished scan%s", "");
1976 }
1977
1978 static void
1979 ds_destroyed_scn_phys(dsl_dataset_t *ds, dsl_scan_phys_t *scn_phys)
1980 {
1981 if (scn_phys->scn_bookmark.zb_objset == ds->ds_object) {
1982 if (ds->ds_is_snapshot) {
1983 /*
1984 * Note:
1985 * - scn_cur_{min,max}_txg stays the same.
1986 * - Setting the flag is not really necessary if
1987 * scn_cur_max_txg == scn_max_txg, because there
1988 * is nothing after this snapshot that we care
1989 * about. However, we set it anyway and then
1990 * ignore it when we retraverse it in
1991 * dsl_scan_visitds().
1992 */
1993 scn_phys->scn_bookmark.zb_objset =
1994 dsl_dataset_phys(ds)->ds_next_snap_obj;
1995 zfs_dbgmsg("destroying ds %llu; currently traversing; "
1996 "reset zb_objset to %llu",
1997 (u_longlong_t)ds->ds_object,
1998 (u_longlong_t)dsl_dataset_phys(ds)->
1999 ds_next_snap_obj);
2000 scn_phys->scn_flags |= DSF_VISIT_DS_AGAIN;
2001 } else {
2002 SET_BOOKMARK(&scn_phys->scn_bookmark,
2003 ZB_DESTROYED_OBJSET, 0, 0, 0);
2004 zfs_dbgmsg("destroying ds %llu; currently traversing; "
2005 "reset bookmark to -1,0,0,0",
2006 (u_longlong_t)ds->ds_object);
2007 }
2008 }
2009 }
2010
2011 /*
2012 * Invoked when a dataset is destroyed. We need to make sure that:
2013 *
2014 * 1) If it is the dataset that was currently being scanned, we write
2015 * a new dsl_scan_phys_t, marking the objset reference in it
2016 * as destroyed.
2017 * 2) Remove it from the work queue, if it was present.
2018 *
2019 * If the dataset was actually a snapshot, instead of marking the dataset
2020 * as destroyed, we instead substitute the next snapshot in line.
2021 */
2022 void
2023 dsl_scan_ds_destroyed(dsl_dataset_t *ds, dmu_tx_t *tx)
2024 {
2025 dsl_pool_t *dp = ds->ds_dir->dd_pool;
2026 dsl_scan_t *scn = dp->dp_scan;
2027 uint64_t mintxg;
2028
2029 if (!dsl_scan_is_running(scn))
2030 return;
2031
2032 ds_destroyed_scn_phys(ds, &scn->scn_phys);
2033 ds_destroyed_scn_phys(ds, &scn->scn_phys_cached);
2034
2035 if (scan_ds_queue_contains(scn, ds->ds_object, &mintxg)) {
2036 scan_ds_queue_remove(scn, ds->ds_object);
2037 if (ds->ds_is_snapshot)
2038 scan_ds_queue_insert(scn,
2039 dsl_dataset_phys(ds)->ds_next_snap_obj, mintxg);
2040 }
2041
2042 if (zap_lookup_int_key(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj,
2043 ds->ds_object, &mintxg) == 0) {
2044 ASSERT3U(dsl_dataset_phys(ds)->ds_num_children, <=, 1);
2045 VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset,
2046 scn->scn_phys.scn_queue_obj, ds->ds_object, tx));
2047 if (ds->ds_is_snapshot) {
2048 /*
2049 * We keep the same mintxg; it could be >
2050 * ds_creation_txg if the previous snapshot was
2051 * deleted too.
2052 */
2053 VERIFY(zap_add_int_key(dp->dp_meta_objset,
2054 scn->scn_phys.scn_queue_obj,
2055 dsl_dataset_phys(ds)->ds_next_snap_obj,
2056 mintxg, tx) == 0);
2057 zfs_dbgmsg("destroying ds %llu; in queue; "
2058 "replacing with %llu",
2059 (u_longlong_t)ds->ds_object,
2060 (u_longlong_t)dsl_dataset_phys(ds)->
2061 ds_next_snap_obj);
2062 } else {
2063 zfs_dbgmsg("destroying ds %llu; in queue; removing",
2064 (u_longlong_t)ds->ds_object);
2065 }
2066 }
2067
2068 /*
2069 * dsl_scan_sync() should be called after this, and should sync
2070 * out our changed state, but just to be safe, do it here.
2071 */
2072 dsl_scan_sync_state(scn, tx, SYNC_CACHED);
2073 }
2074
2075 static void
2076 ds_snapshotted_bookmark(dsl_dataset_t *ds, zbookmark_phys_t *scn_bookmark)
2077 {
2078 if (scn_bookmark->zb_objset == ds->ds_object) {
2079 scn_bookmark->zb_objset =
2080 dsl_dataset_phys(ds)->ds_prev_snap_obj;
2081 zfs_dbgmsg("snapshotting ds %llu; currently traversing; "
2082 "reset zb_objset to %llu",
2083 (u_longlong_t)ds->ds_object,
2084 (u_longlong_t)dsl_dataset_phys(ds)->ds_prev_snap_obj);
2085 }
2086 }
2087
2088 /*
2089 * Called when a dataset is snapshotted. If we were currently traversing
2090 * this snapshot, we reset our bookmark to point at the newly created
2091 * snapshot. We also modify our work queue to remove the old snapshot and
2092 * replace with the new one.
2093 */
2094 void
2095 dsl_scan_ds_snapshotted(dsl_dataset_t *ds, dmu_tx_t *tx)
2096 {
2097 dsl_pool_t *dp = ds->ds_dir->dd_pool;
2098 dsl_scan_t *scn = dp->dp_scan;
2099 uint64_t mintxg;
2100
2101 if (!dsl_scan_is_running(scn))
2102 return;
2103
2104 ASSERT(dsl_dataset_phys(ds)->ds_prev_snap_obj != 0);
2105
2106 ds_snapshotted_bookmark(ds, &scn->scn_phys.scn_bookmark);
2107 ds_snapshotted_bookmark(ds, &scn->scn_phys_cached.scn_bookmark);
2108
2109 if (scan_ds_queue_contains(scn, ds->ds_object, &mintxg)) {
2110 scan_ds_queue_remove(scn, ds->ds_object);
2111 scan_ds_queue_insert(scn,
2112 dsl_dataset_phys(ds)->ds_prev_snap_obj, mintxg);
2113 }
2114
2115 if (zap_lookup_int_key(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj,
2116 ds->ds_object, &mintxg) == 0) {
2117 VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset,
2118 scn->scn_phys.scn_queue_obj, ds->ds_object, tx));
2119 VERIFY(zap_add_int_key(dp->dp_meta_objset,
2120 scn->scn_phys.scn_queue_obj,
2121 dsl_dataset_phys(ds)->ds_prev_snap_obj, mintxg, tx) == 0);
2122 zfs_dbgmsg("snapshotting ds %llu; in queue; "
2123 "replacing with %llu",
2124 (u_longlong_t)ds->ds_object,
2125 (u_longlong_t)dsl_dataset_phys(ds)->ds_prev_snap_obj);
2126 }
2127
2128 dsl_scan_sync_state(scn, tx, SYNC_CACHED);
2129 }
2130
2131 static void
2132 ds_clone_swapped_bookmark(dsl_dataset_t *ds1, dsl_dataset_t *ds2,
2133 zbookmark_phys_t *scn_bookmark)
2134 {
2135 if (scn_bookmark->zb_objset == ds1->ds_object) {
2136 scn_bookmark->zb_objset = ds2->ds_object;
2137 zfs_dbgmsg("clone_swap ds %llu; currently traversing; "
2138 "reset zb_objset to %llu",
2139 (u_longlong_t)ds1->ds_object,
2140 (u_longlong_t)ds2->ds_object);
2141 } else if (scn_bookmark->zb_objset == ds2->ds_object) {
2142 scn_bookmark->zb_objset = ds1->ds_object;
2143 zfs_dbgmsg("clone_swap ds %llu; currently traversing; "
2144 "reset zb_objset to %llu",
2145 (u_longlong_t)ds2->ds_object,
2146 (u_longlong_t)ds1->ds_object);
2147 }
2148 }
2149
2150 /*
2151 * Called when a parent dataset and its clone are swapped. If we were
2152 * currently traversing the dataset, we need to switch to traversing the
2153 * newly promoted parent.
2154 */
2155 void
2156 dsl_scan_ds_clone_swapped(dsl_dataset_t *ds1, dsl_dataset_t *ds2, dmu_tx_t *tx)
2157 {
2158 dsl_pool_t *dp = ds1->ds_dir->dd_pool;
2159 dsl_scan_t *scn = dp->dp_scan;
2160 uint64_t mintxg;
2161
2162 if (!dsl_scan_is_running(scn))
2163 return;
2164
2165 ds_clone_swapped_bookmark(ds1, ds2, &scn->scn_phys.scn_bookmark);
2166 ds_clone_swapped_bookmark(ds1, ds2, &scn->scn_phys_cached.scn_bookmark);
2167
2168 if (scan_ds_queue_contains(scn, ds1->ds_object, &mintxg)) {
2169 scan_ds_queue_remove(scn, ds1->ds_object);
2170 scan_ds_queue_insert(scn, ds2->ds_object, mintxg);
2171 }
2172 if (scan_ds_queue_contains(scn, ds2->ds_object, &mintxg)) {
2173 scan_ds_queue_remove(scn, ds2->ds_object);
2174 scan_ds_queue_insert(scn, ds1->ds_object, mintxg);
2175 }
2176
2177 if (zap_lookup_int_key(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj,
2178 ds1->ds_object, &mintxg) == 0) {
2179 int err;
2180 ASSERT3U(mintxg, ==, dsl_dataset_phys(ds1)->ds_prev_snap_txg);
2181 ASSERT3U(mintxg, ==, dsl_dataset_phys(ds2)->ds_prev_snap_txg);
2182 VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset,
2183 scn->scn_phys.scn_queue_obj, ds1->ds_object, tx));
2184 err = zap_add_int_key(dp->dp_meta_objset,
2185 scn->scn_phys.scn_queue_obj, ds2->ds_object, mintxg, tx);
2186 VERIFY(err == 0 || err == EEXIST);
2187 if (err == EEXIST) {
2188 /* Both were there to begin with */
2189 VERIFY(0 == zap_add_int_key(dp->dp_meta_objset,
2190 scn->scn_phys.scn_queue_obj,
2191 ds1->ds_object, mintxg, tx));
2192 }
2193 zfs_dbgmsg("clone_swap ds %llu; in queue; "
2194 "replacing with %llu",
2195 (u_longlong_t)ds1->ds_object,
2196 (u_longlong_t)ds2->ds_object);
2197 }
2198 if (zap_lookup_int_key(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj,
2199 ds2->ds_object, &mintxg) == 0) {
2200 ASSERT3U(mintxg, ==, dsl_dataset_phys(ds1)->ds_prev_snap_txg);
2201 ASSERT3U(mintxg, ==, dsl_dataset_phys(ds2)->ds_prev_snap_txg);
2202 VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset,
2203 scn->scn_phys.scn_queue_obj, ds2->ds_object, tx));
2204 VERIFY(0 == zap_add_int_key(dp->dp_meta_objset,
2205 scn->scn_phys.scn_queue_obj, ds1->ds_object, mintxg, tx));
2206 zfs_dbgmsg("clone_swap ds %llu; in queue; "
2207 "replacing with %llu",
2208 (u_longlong_t)ds2->ds_object,
2209 (u_longlong_t)ds1->ds_object);
2210 }
2211
2212 dsl_scan_sync_state(scn, tx, SYNC_CACHED);
2213 }
2214
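/*
 * dmu_objset_find_dp() callback: if hds is a clone of the origin snapshot
 * identified by arg, walk back through its snapshots to the oldest one
 * whose previous snapshot is that origin (or the head itself if there are
 * no snapshots in between) and add it to the in-memory scan queue.
 */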
2215 /* ARGSUSED */
2216 static int
2217 enqueue_clones_cb(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg)
2218 {
2219 uint64_t originobj = *(uint64_t *)arg;
2220 dsl_dataset_t *ds;
2221 int err;
2222 dsl_scan_t *scn = dp->dp_scan;
2223
2224 if (dsl_dir_phys(hds->ds_dir)->dd_origin_obj != originobj)
2225 return (0);
2226
2227 err = dsl_dataset_hold_obj(dp, hds->ds_object, FTAG, &ds);
2228 if (err)
2229 return (err);
2230
2231 while (dsl_dataset_phys(ds)->ds_prev_snap_obj != originobj) {
2232 dsl_dataset_t *prev;
2233 err = dsl_dataset_hold_obj(dp,
2234 dsl_dataset_phys(ds)->ds_prev_snap_obj, FTAG, &prev);
2235
2236 dsl_dataset_rele(ds, FTAG);
2237 if (err)
2238 return (err);
2239 ds = prev;
2240 }
2241 scan_ds_queue_insert(scn, ds->ds_object,
2242 dsl_dataset_phys(ds)->ds_prev_snap_txg);
2243 dsl_dataset_rele(ds, FTAG);
2244 return (0);
2245 }
2246
2247 static void
2248 dsl_scan_visitds(dsl_scan_t *scn, uint64_t dsobj, dmu_tx_t *tx)
2249 {
2250 dsl_pool_t *dp = scn->scn_dp;
2251 dsl_dataset_t *ds;
2252
2253 VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds));
2254
2255 if (scn->scn_phys.scn_cur_min_txg >=
2256 scn->scn_phys.scn_max_txg) {
2257 /*
2258 * This can happen if this snapshot was created after the
2259 * scan started, and we already completed a previous snapshot
2260 * that was created after the scan started. This snapshot
2261 * only references blocks with:
2262 *
2263 * birth < our ds_creation_txg
2264 * cur_min_txg is no less than ds_creation_txg.
2265 * We have already visited these blocks.
2266 * or
2267 * birth > scn_max_txg
2268 * The scan requested not to visit these blocks.
2269 *
2270 * Subsequent snapshots (and clones) can reference our
2271 * blocks, or blocks with even higher birth times.
2272 * Therefore we do not need to visit them either,
2273 * so we do not add them to the work queue.
2274 *
2275 * Note that checking for cur_min_txg >= cur_max_txg
2276 * is not sufficient, because in that case we may need to
2277 * visit subsequent snapshots. This happens when min_txg > 0,
2278 * which raises cur_min_txg. In this case we will visit
2279 * this dataset but skip all of its blocks, because the
2280 * rootbp's birth time is < cur_min_txg. Then we will
2281 * add the next snapshots/clones to the work queue.
2282 */
2283 char *dsname = kmem_alloc(MAXNAMELEN, KM_SLEEP);
2284 dsl_dataset_name(ds, dsname);
2285 zfs_dbgmsg("scanning dataset %llu (%s) is unnecessary because "
2286 "cur_min_txg (%llu) >= max_txg (%llu)",
2287 (longlong_t)dsobj, dsname,
2288 (longlong_t)scn->scn_phys.scn_cur_min_txg,
2289 (longlong_t)scn->scn_phys.scn_max_txg);
2290 kmem_free(dsname, MAXNAMELEN);
2291
2292 goto out;
2293 }
2294
2295 /*
2296 * Only the ZIL in the head (non-snapshot) is valid. Even though
2297 * snapshots can have ZIL block pointers (which may be the same
2298 * BP as in the head), they must be ignored. In addition, $ORIGIN
2299 * doesn't have an objset (i.e. its ds_bp is a hole) so we don't
2300 * need to look for a ZIL in it either. So we traverse the ZIL here,
2301 * rather than in dsl_scan_recurse(), because the regular snapshot
2302 * block-sharing rules don't apply to it.
2303 */
2304 if (DSL_SCAN_IS_SCRUB_RESILVER(scn) && !dsl_dataset_is_snapshot(ds) &&
2305 (dp->dp_origin_snap == NULL ||
2306 ds->ds_dir != dp->dp_origin_snap->ds_dir)) {
2307 objset_t *os;
2308 if (dmu_objset_from_ds(ds, &os) != 0) {
2309 goto out;
2310 }
2311 dsl_scan_zil(dp, &os->os_zil_header);
2312 }
2313
2314 /*
2315 * Iterate over the bps in this ds.
2316 */
2317 dmu_buf_will_dirty(ds->ds_dbuf, tx);
2318 rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG);
2319 dsl_scan_visit_rootbp(scn, ds, &dsl_dataset_phys(ds)->ds_bp, tx);
2320 rrw_exit(&ds->ds_bp_rwlock, FTAG);
2321
2322 char *dsname = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP);
2323 dsl_dataset_name(ds, dsname);
2324 zfs_dbgmsg("scanned dataset %llu (%s) with min=%llu max=%llu; "
2325 "suspending=%u",
2326 (longlong_t)dsobj, dsname,
2327 (longlong_t)scn->scn_phys.scn_cur_min_txg,
2328 (longlong_t)scn->scn_phys.scn_cur_max_txg,
2329 (int)scn->scn_suspending);
2330 kmem_free(dsname, ZFS_MAX_DATASET_NAME_LEN);
2331
2332 if (scn->scn_suspending)
2333 goto out;
2334
2335 /*
2336 * We've finished this pass over this dataset.
2337 */
2338
2339 /*
2340 * If we did not completely visit this dataset, do another pass.
2341 */
2342 if (scn->scn_phys.scn_flags & DSF_VISIT_DS_AGAIN) {
2343 zfs_dbgmsg("incomplete pass; visiting again");
2344 scn->scn_phys.scn_flags &= ~DSF_VISIT_DS_AGAIN;
2345 scan_ds_queue_insert(scn, ds->ds_object,
2346 scn->scn_phys.scn_cur_max_txg);
2347 goto out;
2348 }
2349
2350 /*
2351 * Add descendent datasets to work queue.
2352 */
2353 if (dsl_dataset_phys(ds)->ds_next_snap_obj != 0) {
2354 scan_ds_queue_insert(scn,
2355 dsl_dataset_phys(ds)->ds_next_snap_obj,
2356 dsl_dataset_phys(ds)->ds_creation_txg);
2357 }
2358 if (dsl_dataset_phys(ds)->ds_num_children > 1) {
2359 boolean_t usenext = B_FALSE;
2360 if (dsl_dataset_phys(ds)->ds_next_clones_obj != 0) {
2361 uint64_t count;
2362 /*
2363 * A bug in a previous version of the code could
2364 * cause upgrade_clones_cb() to not set
2365 * ds_next_snap_obj when it should, leading to a
2366 * missing entry. Therefore we can only use the
2367 * next_clones_obj when its count is correct.
2368 */
2369 int err = zap_count(dp->dp_meta_objset,
2370 dsl_dataset_phys(ds)->ds_next_clones_obj, &count);
2371 if (err == 0 &&
2372 count == dsl_dataset_phys(ds)->ds_num_children - 1)
2373 usenext = B_TRUE;
2374 }
2375
2376 if (usenext) {
2377 zap_cursor_t zc;
2378 zap_attribute_t za;
2379 for (zap_cursor_init(&zc, dp->dp_meta_objset,
2380 dsl_dataset_phys(ds)->ds_next_clones_obj);
2381 zap_cursor_retrieve(&zc, &za) == 0;
2382 (void) zap_cursor_advance(&zc)) {
2383 scan_ds_queue_insert(scn,
2384 zfs_strtonum(za.za_name, NULL),
2385 dsl_dataset_phys(ds)->ds_creation_txg);
2386 }
2387 zap_cursor_fini(&zc);
2388 } else {
2389 VERIFY0(dmu_objset_find_dp(dp, dp->dp_root_dir_obj,
2390 enqueue_clones_cb, &ds->ds_object,
2391 DS_FIND_CHILDREN));
2392 }
2393 }
2394
2395 out:
2396 dsl_dataset_rele(ds, FTAG);
2397 }
2398
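/*
 * dmu_objset_find_dp() callback used when the pool predates
 * SPA_VERSION_DSL_SCRUB: walk each filesystem back to its oldest
 * non-clone snapshot and add that dataset to the in-memory scan queue.
 * Clones themselves are left alone here (see the comment in the loop).
 */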
2399 /* ARGSUSED */
2400 static int
2401 enqueue_cb(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg)
2402 {
2403 dsl_dataset_t *ds;
2404 int err;
2405 dsl_scan_t *scn = dp->dp_scan;
2406
2407 err = dsl_dataset_hold_obj(dp, hds->ds_object, FTAG, &ds);
2408 if (err)
2409 return (err);
2410
2411 while (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0) {
2412 dsl_dataset_t *prev;
2413 err = dsl_dataset_hold_obj(dp,
2414 dsl_dataset_phys(ds)->ds_prev_snap_obj, FTAG, &prev);
2415 if (err) {
2416 dsl_dataset_rele(ds, FTAG);
2417 return (err);
2418 }
2419
2420 /*
2421 * If this is a clone, we don't need to worry about it for now.
2422 */
2423 if (dsl_dataset_phys(prev)->ds_next_snap_obj != ds->ds_object) {
2424 dsl_dataset_rele(ds, FTAG);
2425 dsl_dataset_rele(prev, FTAG);
2426 return (0);
2427 }
2428 dsl_dataset_rele(ds, FTAG);
2429 ds = prev;
2430 }
2431
2432 scan_ds_queue_insert(scn, ds->ds_object,
2433 dsl_dataset_phys(ds)->ds_prev_snap_txg);
2434 dsl_dataset_rele(ds, FTAG);
2435 return (0);
2436 }
2437
2438 /* ARGSUSED */
2439 void
2440 dsl_scan_ddt_entry(dsl_scan_t *scn, enum zio_checksum checksum,
2441 ddt_entry_t *dde, dmu_tx_t *tx)
2442 {
2443 const ddt_key_t *ddk = &dde->dde_key;
2444 ddt_phys_t *ddp = dde->dde_phys;
2445 blkptr_t bp;
2446 zbookmark_phys_t zb = { 0 };
2447 int p;
2448
2449 if (scn->scn_phys.scn_state != DSS_SCANNING)
2450 return;
2451
2452 /*
2453 * This function is special because it is the only thing
2454 * that can add scan_io_t's to the vdev scan queues from
2455 * outside dsl_scan_sync(). For the most part this is ok
2456 * as long as it is called from within syncing context.
2457 * However, dsl_scan_sync() expects that no new sio's will
2458 * be added between when all the work for a scan is done
2459 * and the next txg when the scan is actually marked as
2460 * completed. This check ensures we do not issue new sio's
2461 * during this period.
2462 */
2463 if (scn->scn_done_txg != 0)
2464 return;
2465
2466 for (p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
2467 if (ddp->ddp_phys_birth == 0 ||
2468 ddp->ddp_phys_birth > scn->scn_phys.scn_max_txg)
2469 continue;
2470 ddt_bp_create(checksum, ddk, ddp, &bp);
2471
2472 scn->scn_visited_this_txg++;
2473 scan_funcs[scn->scn_phys.scn_func](scn->scn_dp, &bp, &zb);
2474 }
2475 }
2476
2477 /*
2478 * Scrub/dedup interaction.
2479 *
2480 * If there are N references to a deduped block, we don't want to scrub it
2481 * N times -- ideally, we should scrub it exactly once.
2482 *
2483 * We leverage the fact that the dde's replication class (enum ddt_class)
2484 * is ordered from highest replication class (DDT_CLASS_DITTO) to lowest
2485 * (DDT_CLASS_UNIQUE) so that we may walk the DDT in that order.
2486 *
2487 * To prevent excess scrubbing, the scrub begins by walking the DDT
2488 * to find all blocks with refcnt > 1, and scrubs each of these once.
2489 * Since there are two replication classes which contain blocks with
2490 * refcnt > 1, we scrub the highest replication class (DDT_CLASS_DITTO) first.
2491 * Finally the top-down scrub begins, only visiting blocks with refcnt == 1.
2492 *
2493 * There would be nothing more to say if a block's refcnt couldn't change
2494 * during a scrub, but of course it can so we must account for changes
2495 * in a block's replication class.
2496 *
2497 * Here's an example of what can occur:
2498 *
2499 * If a block has refcnt > 1 during the DDT scrub phase, but has refcnt == 1
2500 * when visited during the top-down scrub phase, it will be scrubbed twice.
2501 * This negates our scrub optimization, but is otherwise harmless.
2502 *
2503 * If a block has refcnt == 1 during the DDT scrub phase, but has refcnt > 1
2504 * on each visit during the top-down scrub phase, it will never be scrubbed.
2505 * To catch this, ddt_sync_entry() notifies the scrub code whenever a block's
2506 * reference class transitions to a higher level (i.e. DDT_CLASS_UNIQUE to
2507 * DDT_CLASS_DUPLICATE); if it transitions from refcnt == 1 to refcnt > 1
2508 * while a scrub is in progress, it scrubs the block right then.
2509 */
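/*
 * Illustrative example of the interaction described above: suppose block
 * B is referenced by two snapshots, so its DDT entry is in a refcnt > 1
 * class. The DDT walk in dsl_scan_ddt() issues one scrub for B. When the
 * top-down traversal later reaches B through either snapshot,
 * ddt_class_contains() reports that B's class is <= scn_ddt_class_max and
 * dsl_scan_visitbp() skips it, so B ends up scrubbed exactly once.
 */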
2510 static void
2511 dsl_scan_ddt(dsl_scan_t *scn, dmu_tx_t *tx)
2512 {
2513 ddt_bookmark_t *ddb = &scn->scn_phys.scn_ddt_bookmark;
2514 ddt_entry_t dde = { 0 };
2515 int error;
2516 uint64_t n = 0;
2517
2518 while ((error = ddt_walk(scn->scn_dp->dp_spa, ddb, &dde)) == 0) {
2519 ddt_t *ddt;
2520
2521 if (ddb->ddb_class > scn->scn_phys.scn_ddt_class_max)
2522 break;
2523 dprintf("visiting ddb=%llu/%llu/%llu/%llx\n",
2524 (longlong_t)ddb->ddb_class,
2525 (longlong_t)ddb->ddb_type,
2526 (longlong_t)ddb->ddb_checksum,
2527 (longlong_t)ddb->ddb_cursor);
2528
2529 /* There should be no pending changes to the dedup table */
2530 ddt = scn->scn_dp->dp_spa->spa_ddt[ddb->ddb_checksum];
2531 ASSERT(avl_first(&ddt->ddt_tree) == NULL);
2532
2533 dsl_scan_ddt_entry(scn, ddb->ddb_checksum, &dde, tx);
2534 n++;
2535
2536 if (dsl_scan_check_suspend(scn, NULL))
2537 break;
2538 }
2539
2540 zfs_dbgmsg("scanned %llu ddt entries with class_max = %u; "
2541 "suspending=%u", (longlong_t)n,
2542 (int)scn->scn_phys.scn_ddt_class_max, (int)scn->scn_suspending);
2543
2544 ASSERT(error == 0 || error == ENOENT);
2545 ASSERT(error != ENOENT ||
2546 ddb->ddb_class > scn->scn_phys.scn_ddt_class_max);
2547 }
2548
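/*
 * The highest txg this scan needs to visit for the given dataset: the
 * scan-wide max txg, clamped to the creation txg for snapshots, since a
 * snapshot cannot reference blocks born after it was taken.
 */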
2549 static uint64_t
2550 dsl_scan_ds_maxtxg(dsl_dataset_t *ds)
2551 {
2552 uint64_t smt = ds->ds_dir->dd_pool->dp_scan->scn_phys.scn_max_txg;
2553 if (ds->ds_is_snapshot)
2554 return (MIN(smt, dsl_dataset_phys(ds)->ds_creation_txg));
2555 return (smt);
2556 }
2557
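/*
 * One pass of the metadata traversal: finish (or continue) the DDT-based
 * portion of the scan first, then the MOS and $ORIGIN, then the dataset we
 * were suspended on (if any), and finally whatever is waiting in the
 * in-memory dataset queue, until we run out of work or decide to suspend.
 */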
2558 static void
2559 dsl_scan_visit(dsl_scan_t *scn, dmu_tx_t *tx)
2560 {
2561 scan_ds_t *sds;
2562 dsl_pool_t *dp = scn->scn_dp;
2563
2564 if (scn->scn_phys.scn_ddt_bookmark.ddb_class <=
2565 scn->scn_phys.scn_ddt_class_max) {
2566 scn->scn_phys.scn_cur_min_txg = scn->scn_phys.scn_min_txg;
2567 scn->scn_phys.scn_cur_max_txg = scn->scn_phys.scn_max_txg;
2568 dsl_scan_ddt(scn, tx);
2569 if (scn->scn_suspending)
2570 return;
2571 }
2572
2573 if (scn->scn_phys.scn_bookmark.zb_objset == DMU_META_OBJSET) {
2574 /* First do the MOS & ORIGIN */
2575
2576 scn->scn_phys.scn_cur_min_txg = scn->scn_phys.scn_min_txg;
2577 scn->scn_phys.scn_cur_max_txg = scn->scn_phys.scn_max_txg;
2578 dsl_scan_visit_rootbp(scn, NULL,
2579 &dp->dp_meta_rootbp, tx);
2580 spa_set_rootblkptr(dp->dp_spa, &dp->dp_meta_rootbp);
2581 if (scn->scn_suspending)
2582 return;
2583
2584 if (spa_version(dp->dp_spa) < SPA_VERSION_DSL_SCRUB) {
2585 VERIFY0(dmu_objset_find_dp(dp, dp->dp_root_dir_obj,
2586 enqueue_cb, NULL, DS_FIND_CHILDREN));
2587 } else {
2588 dsl_scan_visitds(scn,
2589 dp->dp_origin_snap->ds_object, tx);
2590 }
2591 ASSERT(!scn->scn_suspending);
2592 } else if (scn->scn_phys.scn_bookmark.zb_objset !=
2593 ZB_DESTROYED_OBJSET) {
2594 uint64_t dsobj = scn->scn_phys.scn_bookmark.zb_objset;
2595 /*
2596 * If we were suspended, continue from here. Note if the
2597 * ds we were suspended on was deleted, the zb_objset may
2598 * be -1, so we will skip this and find a new objset
2599 * below.
2600 */
2601 dsl_scan_visitds(scn, dsobj, tx);
2602 if (scn->scn_suspending)
2603 return;
2604 }
2605
2606 /*
2607 * In case we suspended right at the end of the ds, zero the
2608 * bookmark so we don't think that we're still trying to resume.
2609 */
2610 bzero(&scn->scn_phys.scn_bookmark, sizeof (zbookmark_phys_t));
2611
2612 /*
2613 * Keep pulling things out of the dataset avl queue. Updates to the
2614 * persistent zap-object-as-queue happen only at checkpoints.
2615 */
2616 while ((sds = avl_first(&scn->scn_queue)) != NULL) {
2617 dsl_dataset_t *ds;
2618 uint64_t dsobj = sds->sds_dsobj;
2619 uint64_t txg = sds->sds_txg;
2620
2621 /* dequeue and free the ds from the queue */
2622 scan_ds_queue_remove(scn, dsobj);
2623 sds = NULL; /* must not be touched after removal */
2624
2625 /* Set up min / max txg */
2626 VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds));
2627 if (txg != 0) {
2628 scn->scn_phys.scn_cur_min_txg =
2629 MAX(scn->scn_phys.scn_min_txg, txg);
2630 } else {
2631 scn->scn_phys.scn_cur_min_txg =
2632 MAX(scn->scn_phys.scn_min_txg,
2633 dsl_dataset_phys(ds)->ds_prev_snap_txg);
2634 }
2635 scn->scn_phys.scn_cur_max_txg = dsl_scan_ds_maxtxg(ds);
2636 dsl_dataset_rele(ds, FTAG);
2637
2638 dsl_scan_visitds(scn, dsobj, tx);
2639 if (scn->scn_suspending)
2640 return;
2641 }
2642 /* No more objsets to fetch, we're done */
2643 scn->scn_phys.scn_bookmark.zb_objset = ZB_DESTROYED_OBJSET;
2644 ASSERT0(scn->scn_suspending);
2645 }
2646
2647 static uint64_t
2648 dsl_scan_count_leaves(vdev_t *vd)
2649 {
2650 uint64_t i, leaves = 0;
2651
2652 /* we only count leaves that belong to the main pool and are readable */
2653 if (vd->vdev_islog || vd->vdev_isspare ||
2654 vd->vdev_isl2cache || !vdev_readable(vd))
2655 return (0);
2656
2657 if (vd->vdev_ops->vdev_op_leaf)
2658 return (1);
2659
2660 for (i = 0; i < vd->vdev_children; i++) {
2661 leaves += dsl_scan_count_leaves(vd->vdev_child[i]);
2662 }
2663
2664 return (leaves);
2665 }
2666
2667
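/* Accumulate per-txg zio and segment statistics for this queue (debugging). */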
2668 static void
2669 scan_io_queues_update_zio_stats(dsl_scan_io_queue_t *q, const blkptr_t *bp)
2670 {
2671 int i;
2672 uint64_t cur_size = 0;
2673
2674 for (i = 0; i < BP_GET_NDVAS(bp); i++) {
2675 cur_size += DVA_GET_ASIZE(&bp->blk_dva[i]);
2676 }
2677
2678 q->q_total_zio_size_this_txg += cur_size;
2679 q->q_zios_this_txg++;
2680 }
2681
2682 static void
2683 scan_io_queues_update_seg_stats(dsl_scan_io_queue_t *q, uint64_t start,
2684 uint64_t end)
2685 {
2686 q->q_total_seg_size_this_txg += end - start;
2687 q->q_segs_this_txg++;
2688 }
2689
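/*
 * Returns B_TRUE when the queue-emptying work for this sync pass should
 * stop: we have spent at least the minimum scan time for this txg and
 * either the pool is dirty enough that async writes are being throttled,
 * the txg sync is waiting on us, or the txg timeout has elapsed; or the
 * pool is being shut down.
 */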
2690 static boolean_t
2691 scan_io_queue_check_suspend(dsl_scan_t *scn)
2692 {
2693 /* See comment in dsl_scan_check_suspend() */
2694 uint64_t curr_time_ns = gethrtime();
2695 uint64_t scan_time_ns = curr_time_ns - scn->scn_sync_start_time;
2696 uint64_t sync_time_ns = curr_time_ns -
2697 scn->scn_dp->dp_spa->spa_sync_starttime;
2698 int dirty_pct = scn->scn_dp->dp_dirty_total * 100 / zfs_dirty_data_max;
2699 int mintime = (scn->scn_phys.scn_func == POOL_SCAN_RESILVER) ?
2700 zfs_resilver_min_time_ms : zfs_scrub_min_time_ms;
2701
2702 return ((NSEC2MSEC(scan_time_ns) > mintime &&
2703 (dirty_pct >= zfs_vdev_async_write_active_min_dirty_percent ||
2704 txg_sync_waiting(scn->scn_dp) ||
2705 NSEC2SEC(sync_time_ns) >= zfs_txg_timeout)) ||
2706 spa_shutting_down(scn->scn_dp->dp_spa));
2707 }
2708
2709 /*
2710 * Given a list of scan_io_t's in io_list, this issues the io's out to
2711 * disk. This consumes the io_list and frees the scan_io_t's. This is
2712 * called when emptying queues, either when we're up against the memory
2713 * limit or when we have finished scanning. Returns B_TRUE if we stopped
2714 * processing the list before we finished. Any zios that were not issued
2715 * will remain in the io_list.
2716 */
2717 static boolean_t
2718 scan_io_queue_issue(dsl_scan_io_queue_t *queue, list_t *io_list)
2719 {
2720 dsl_scan_t *scn = queue->q_scn;
2721 scan_io_t *sio;
2722 int64_t bytes_issued = 0;
2723 boolean_t suspended = B_FALSE;
2724
2725 while ((sio = list_head(io_list)) != NULL) {
2726 blkptr_t bp;
2727
2728 if (scan_io_queue_check_suspend(scn)) {
2729 suspended = B_TRUE;
2730 break;
2731 }
2732
2733 sio2bp(sio, &bp);
2734 bytes_issued += SIO_GET_ASIZE(sio);
2735 scan_exec_io(scn->scn_dp, &bp, sio->sio_flags,
2736 &sio->sio_zb, queue);
2737 (void) list_remove_head(io_list);
2738 scan_io_queues_update_zio_stats(queue, &bp);
2739 sio_free(sio);
2740 }
2741
2742 atomic_add_64(&scn->scn_bytes_pending, -bytes_issued);
2743
2744 return (suspended);
2745 }
2746
2747 /*
2748 * Given a range_seg_t (extent) and a list, this function passes over a
2749 * scan queue and gathers up the appropriate ios which fit into that
2750 * scan seg (starting from lowest LBA). At the end, we remove the segment
2751 * from the q_exts_by_addr range tree.
2752 */
2753 static boolean_t
2754 scan_io_queue_gather(dsl_scan_io_queue_t *queue, range_seg_t *rs, list_t *list)
2755 {
2756 scan_io_t *srch_sio, *sio, *next_sio;
2757 avl_index_t idx;
2758 uint_t num_sios = 0;
2759 int64_t bytes_issued = 0;
2760
2761 ASSERT(rs != NULL);
2762 ASSERT(MUTEX_HELD(&queue->q_vd->vdev_scan_io_queue_lock));
2763
2764 srch_sio = sio_alloc(1);
2765 srch_sio->sio_nr_dvas = 1;
2766 SIO_SET_OFFSET(srch_sio, rs_get_start(rs, queue->q_exts_by_addr));
2767
2768 /*
2769 * The exact start of the extent might not contain any matching zios,
2770 * so if that's the case, examine the next one in the tree.
2771 */
2772 sio = avl_find(&queue->q_sios_by_addr, srch_sio, &idx);
2773 sio_free(srch_sio);
2774
2775 if (sio == NULL)
2776 sio = avl_nearest(&queue->q_sios_by_addr, idx, AVL_AFTER);
2777
2778 while (sio != NULL && SIO_GET_OFFSET(sio) < rs_get_end(rs,
2779 queue->q_exts_by_addr) && num_sios <= 32) {
2780 ASSERT3U(SIO_GET_OFFSET(sio), >=, rs_get_start(rs,
2781 queue->q_exts_by_addr));
2782 ASSERT3U(SIO_GET_END_OFFSET(sio), <=, rs_get_end(rs,
2783 queue->q_exts_by_addr));
2784
2785 next_sio = AVL_NEXT(&queue->q_sios_by_addr, sio);
2786 avl_remove(&queue->q_sios_by_addr, sio);
2787 queue->q_sio_memused -= SIO_GET_MUSED(sio);
2788
2789 bytes_issued += SIO_GET_ASIZE(sio);
2790 num_sios++;
2791 list_insert_tail(list, sio);
2792 sio = next_sio;
2793 }
2794
2795 /*
2796 * We limit the number of sios we process at once to 32 to avoid
2797 * biting off more than we can chew. If we didn't take everything
2798 * in the segment we update it to reflect the work we were able to
2799 * complete. Otherwise, we remove it from the range tree entirely.
2800 */
2801 if (sio != NULL && SIO_GET_OFFSET(sio) < rs_get_end(rs,
2802 queue->q_exts_by_addr)) {
2803 range_tree_adjust_fill(queue->q_exts_by_addr, rs,
2804 -bytes_issued);
2805 range_tree_resize_segment(queue->q_exts_by_addr, rs,
2806 SIO_GET_OFFSET(sio), rs_get_end(rs,
2807 queue->q_exts_by_addr) - SIO_GET_OFFSET(sio));
2808
2809 return (B_TRUE);
2810 } else {
2811 uint64_t rstart = rs_get_start(rs, queue->q_exts_by_addr);
2812 uint64_t rend = rs_get_end(rs, queue->q_exts_by_addr);
2813 range_tree_remove(queue->q_exts_by_addr, rstart, rend - rstart);
2814 return (B_FALSE);
2815 }
2816 }
2817
2818
2819 /*
2820 * This is called from the queue emptying thread and selects the next
2821 * extent from which we are to issue io's. The behavior of this function
2822 * depends on the state of the scan, the current memory consumption and
2823 * whether or not we are performing a scan shutdown.
2824 * 1) We select extents in an elevator algorithm (LBA-order) if the scan
2825 * needs to perform a checkpoint
2826 * 2) We select the largest available extent if we are up against the
2827 * memory limit.
2828 * 3) Otherwise we don't select any extents.
2829 */
2830 static const range_seg_t *
2831 scan_io_queue_fetch_ext(dsl_scan_io_queue_t *queue)
2832 {
2833 dsl_scan_t *scn = queue->q_scn;
2834 range_tree_t *rt = queue->q_exts_by_addr;
2835
2836 ASSERT(MUTEX_HELD(&queue->q_vd->vdev_scan_io_queue_lock));
2837 ASSERT(scn->scn_is_sorted);
2838
2839 /* handle tunable overrides */
2840 if (scn->scn_checkpointing || scn->scn_clearing) {
2841 if (zfs_scan_issue_strategy == 1) {
2842 return (range_tree_first(rt));
2843 } else if (zfs_scan_issue_strategy == 2) {
2844 /*
2845 * We need to get the original entry in the by_addr
2846 * tree so we can modify it.
2847 */
2848 range_seg_t *size_rs =
2849 zfs_btree_first(&queue->q_exts_by_size, NULL);
2850 if (size_rs == NULL)
2851 return (NULL);
2852 uint64_t start = rs_get_start(size_rs, rt);
2853 uint64_t size = rs_get_end(size_rs, rt) - start;
2854 range_seg_t *addr_rs = range_tree_find(rt, start,
2855 size);
2856 ASSERT3P(addr_rs, !=, NULL);
2857 ASSERT3U(rs_get_start(size_rs, rt), ==,
2858 rs_get_start(addr_rs, rt));
2859 ASSERT3U(rs_get_end(size_rs, rt), ==,
2860 rs_get_end(addr_rs, rt));
2861 return (addr_rs);
2862 }
2863 }
2864
2865 /*
2866 * During normal clearing, we want to issue our largest segments
2867 * first, keeping IO as sequential as possible, and leaving the
2868 * smaller extents for later with the hope that they might eventually
2869 * grow to larger sequential segments. However, when the scan is
2870 * checkpointing, no new extents will be added to the sorting queue,
2871 * so the way we are sorted now is as good as it will ever get.
2872 * In this case, we instead switch to issuing extents in LBA order.
2873 */
2874 if (scn->scn_checkpointing) {
2875 return (range_tree_first(rt));
2876 } else if (scn->scn_clearing) {
2877 /*
2878 * We need to get the original entry in the by_addr
2879 * tree so we can modify it.
2880 */
2881 range_seg_t *size_rs = zfs_btree_first(&queue->q_exts_by_size,
2882 NULL);
2883 if (size_rs == NULL)
2884 return (NULL);
2885 uint64_t start = rs_get_start(size_rs, rt);
2886 uint64_t size = rs_get_end(size_rs, rt) - start;
2887 range_seg_t *addr_rs = range_tree_find(rt, start, size);
2888 ASSERT3P(addr_rs, !=, NULL);
2889 ASSERT3U(rs_get_start(size_rs, rt), ==, rs_get_start(addr_rs,
2890 rt));
2891 ASSERT3U(rs_get_end(size_rs, rt), ==, rs_get_end(addr_rs, rt));
2892 return (addr_rs);
2893 } else {
2894 return (NULL);
2895 }
2896 }
2897
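/*
 * Taskq callback that empties a single top-level vdev's scan queue for
 * this txg: repeatedly fetch the next extent, gather its sios in LBA
 * order, and issue them, stopping early if scan_io_queue_issue() reports
 * that we ran out of time. Any sios gathered but not issued are put back
 * on the queue.
 */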
2898 static void
2899 scan_io_queues_run_one(void *arg)
2900 {
2901 dsl_scan_io_queue_t *queue = arg;
2902 kmutex_t *q_lock = &queue->q_vd->vdev_scan_io_queue_lock;
2903 boolean_t suspended = B_FALSE;
2904 range_seg_t *rs = NULL;
2905 scan_io_t *sio = NULL;
2906 list_t sio_list;
2907 uint64_t bytes_per_leaf = zfs_scan_vdev_limit;
2908 uint64_t nr_leaves = dsl_scan_count_leaves(queue->q_vd);
2909
2910 ASSERT(queue->q_scn->scn_is_sorted);
2911
2912 list_create(&sio_list, sizeof (scan_io_t),
2913 offsetof(scan_io_t, sio_nodes.sio_list_node));
2914 mutex_enter(q_lock);
2915
2916 /* calculate maximum in-flight bytes for this txg (min 1MB) */
2917 queue->q_maxinflight_bytes =
2918 MAX(nr_leaves * bytes_per_leaf, 1ULL << 20);
2919
2920 /* reset per-queue scan statistics for this txg */
2921 queue->q_total_seg_size_this_txg = 0;
2922 queue->q_segs_this_txg = 0;
2923 queue->q_total_zio_size_this_txg = 0;
2924 queue->q_zios_this_txg = 0;
2925
2926 /* loop until we have run out of time or sios */
2927 while ((rs = (range_seg_t *)scan_io_queue_fetch_ext(queue)) != NULL) {
2928 uint64_t seg_start = 0, seg_end = 0;
2929 boolean_t more_left = B_TRUE;
2930
2931 ASSERT(list_is_empty(&sio_list));
2932
2933 /* loop while we still have sios left to process in this rs */
2934 while (more_left) {
2935 scan_io_t *first_sio, *last_sio;
2936
2937 /*
2938 * We have selected which extent needs to be
2939 * processed next. Gather up the corresponding sios.
2940 */
2941 more_left = scan_io_queue_gather(queue, rs, &sio_list);
2942 ASSERT(!list_is_empty(&sio_list));
2943 first_sio = list_head(&sio_list);
2944 last_sio = list_tail(&sio_list);
2945
2946 seg_end = SIO_GET_END_OFFSET(last_sio);
2947 if (seg_start == 0)
2948 seg_start = SIO_GET_OFFSET(first_sio);
2949
2950 /*
2951 * Issuing sios can take a long time so drop the
2952 * queue lock. The sio queue won't be updated by
2953 * other threads since we're in syncing context so
2954 * we can be sure that our trees will remain exactly
2955 * as we left them.
2956 */
2957 mutex_exit(q_lock);
2958 suspended = scan_io_queue_issue(queue, &sio_list);
2959 mutex_enter(q_lock);
2960
2961 if (suspended)
2962 break;
2963 }
2964 /* update statistics for debugging purposes */
2965 scan_io_queues_update_seg_stats(queue, seg_start, seg_end);
2966
2967 if (suspended)
2968 break;
2969 }
2970
2971
2972 /*
2973 * If we were suspended in the middle of processing,
2974 * requeue any unfinished sios and exit.
2975 */
2976 while ((sio = list_head(&sio_list)) != NULL) {
2977 list_remove(&sio_list, sio);
2978 scan_io_queue_insert_impl(queue, sio);
2979 }
2980
2981 mutex_exit(q_lock);
2982 list_destroy(&sio_list);
2983 }
2984
2985 /*
2986 * Performs an emptying run on all scan queues in the pool. This just
2987 * punches out one thread per top-level vdev, each of which processes
2988 * only that vdev's scan queue. We can parallelize the I/O here because
2989 * we know that each queue's io's only affect its own top-level vdev.
2990 *
2991 * This function waits for the queue runs to complete, and must be
2992 * called from dsl_scan_sync (or in general, syncing context).
2993 */
2994 static void
2995 scan_io_queues_run(dsl_scan_t *scn)
2996 {
2997 spa_t *spa = scn->scn_dp->dp_spa;
2998
2999 ASSERT(scn->scn_is_sorted);
3000 ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER));
3001
3002 if (scn->scn_bytes_pending == 0)
3003 return;
3004
3005 if (scn->scn_taskq == NULL) {
3006 char *tq_name = kmem_zalloc(ZFS_MAX_DATASET_NAME_LEN + 16,
3007 KM_SLEEP);
3008 int nthreads = spa->spa_root_vdev->vdev_children;
3009
3010 /*
3011 * We need to make this taskq *always* execute as many
3012 * threads in parallel as we have top-level vdevs and no
3013 * less, otherwise strange serialization of the calls to
3014 * scan_io_queues_run_one can occur during spa_sync runs
3015 * and that significantly impacts performance.
3016 */
3017 (void) snprintf(tq_name, ZFS_MAX_DATASET_NAME_LEN + 16,
3018 "dsl_scan_tq_%s", spa->spa_name);
3019 scn->scn_taskq = taskq_create(tq_name, nthreads, minclsyspri,
3020 nthreads, nthreads, TASKQ_PREPOPULATE);
3021 kmem_free(tq_name, ZFS_MAX_DATASET_NAME_LEN + 16);
3022 }
3023
3024 for (uint64_t i = 0; i < spa->spa_root_vdev->vdev_children; i++) {
3025 vdev_t *vd = spa->spa_root_vdev->vdev_child[i];
3026
3027 mutex_enter(&vd->vdev_scan_io_queue_lock);
3028 if (vd->