/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
 * Copyright 2016 Gary Mills
 * Copyright (c) 2011, 2017 by Delphix. All rights reserved.
 * Copyright 2019 Joyent, Inc.
 * Copyright (c) 2017 Datto Inc.
 */

#include <sys/dsl_scan.h>
#include <sys/dsl_pool.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_prop.h>
#include <sys/dsl_dir.h>
#include <sys/dsl_synctask.h>
#include <sys/dnode.h>
#include <sys/dmu_tx.h>
#include <sys/dmu_objset.h>
#include <sys/arc.h>
#include <sys/zap.h>
#include <sys/zio.h>
#include <sys/zfs_context.h>
#include <sys/fs/zfs.h>
#include <sys/zfs_znode.h>
#include <sys/spa_impl.h>
#include <sys/vdev_impl.h>
#include <sys/zil_impl.h>
#include <sys/zio_checksum.h>
#include <sys/ddt.h>
#include <sys/sa.h>
#include <sys/sa_impl.h>
#include <sys/zfeature.h>
#include <sys/abd.h>
#include <sys/range_tree.h>
#ifdef _KERNEL
#include <sys/zfs_vfsops.h>
#endif

/*
 * Grand theory statement on scan queue sorting
 *
 * Scanning is implemented by recursively traversing all indirection levels
 * in an object and reading all blocks referenced from that object. This
 * results in us approximately traversing the object from lowest logical
 * offset to the highest. For best performance, we would want the logical
 * blocks to be physically contiguous. However, this is frequently not the
 * case with pools given the allocation patterns of copy-on-write filesystems.
 * So instead, we put the I/Os into a reordering queue and issue them in a
 * way that will most benefit physical disks (LBA-order).
 *
 * Queue management:
 *
 * Ideally, we would want to scan all metadata and queue up all block I/O
 * prior to starting to issue it, because that allows us to do an optimal
 * sorting job. This can however consume large amounts of memory. Therefore
 * we continuously monitor the size of the queues and constrain them to 5%
 * (zfs_scan_mem_lim_fact) of physmem. If the queues grow larger than this
 * limit, we clear out a few of the largest extents at the head of the queues
 * to make room for more scanning. Hopefully, these extents will be fairly
 * large and contiguous, allowing us to approach sequential I/O throughput
 * even without a fully sorted tree.
 *
 * Metadata scanning takes place in dsl_scan_visit(), which is called from
 * dsl_scan_sync() every spa_sync(). If we have either fully scanned all
 * metadata on the pool, or we need to make room in memory because our
 * queues are too large, dsl_scan_visit() is postponed and
 * scan_io_queues_run() is called from dsl_scan_sync() instead. This implies
 * that metadata scanning and queued I/O issuing are mutually exclusive. This
 * allows us to provide maximum sequential I/O throughput for the majority of
 * I/Os issued since sequential I/O performance is significantly negatively
 * impacted if it is interleaved with random I/O.
 *
 * Implementation Notes
 *
 * One side effect of the queued scanning algorithm is that the scanning code
 * needs to be notified whenever a block is freed. This is needed to allow
 * the scanning code to remove these I/Os from the issuing queue. Additionally,
 * we do not attempt to queue gang blocks to be issued sequentially since this
 * is very hard to do and would have an extremely limited performance benefit.
 * Instead, we simply issue gang I/Os as soon as we find them using the legacy
 * algorithm.
 *
 * Backwards compatibility
 *
 * This new algorithm is backwards compatible with the legacy on-disk data
 * structures (and therefore does not require a new feature flag).
 * Periodically during scanning (see zfs_scan_checkpoint_intval), the scan
 * will stop scanning metadata (in logical order) and wait for all outstanding
 * sorted I/O to complete. Once this is done, we write out a checkpoint
 * bookmark, indicating that we have scanned everything logically before it.
 * If the pool is imported on a machine without the new sorting algorithm,
 * the scan simply resumes from the last checkpoint using the legacy algorithm.
 */

typedef int (scan_cb_t)(dsl_pool_t *, const blkptr_t *,
    const zbookmark_phys_t *);

static scan_cb_t dsl_scan_scrub_cb;

static int scan_ds_queue_compare(const void *a, const void *b);
static int scan_prefetch_queue_compare(const void *a, const void *b);
static void scan_ds_queue_clear(dsl_scan_t *scn);
static boolean_t scan_ds_queue_contains(dsl_scan_t *scn, uint64_t dsobj,
    uint64_t *txg);
static void scan_ds_queue_insert(dsl_scan_t *scn, uint64_t dsobj, uint64_t txg);
static void scan_ds_queue_remove(dsl_scan_t *scn, uint64_t dsobj);
static void scan_ds_queue_sync(dsl_scan_t *scn, dmu_tx_t *tx);

extern int zfs_vdev_async_write_active_min_dirty_percent;

/*
 * By default zfs will check to ensure it is not over the hard memory
 * limit before each txg. If finer-grained control of this is needed,
 * this value can be set to 1 to enable checking before scanning each
 * block.
 */
int zfs_scan_strict_mem_lim = B_FALSE;

/*
 * Maximum number of concurrently executing I/Os per top-level vdev.
 * Tune with care. Very high settings (hundreds) are known to trigger
 * some firmware bugs and resets on certain SSDs.
 */
int zfs_top_maxinflight = 32;		/* maximum I/Os per top-level */
unsigned int zfs_resilver_delay = 2;	/* number of ticks to delay resilver */
unsigned int zfs_scrub_delay = 4;	/* number of ticks to delay scrub */
unsigned int zfs_scan_idle = 50;	/* idle window in clock ticks */

/*
 * Maximum number of concurrently issued bytes per leaf vdev. We attempt
 * to strike a balance here between keeping the vdev queues full of I/Os
 * at all times and not overflowing the queues, which would cause long
 * latency and therefore long txg sync times. No matter what, we will not
 * overload the drives with I/O, since that is protected by
 * zfs_vdev_scrub_max_active.
 */
unsigned long zfs_scan_vdev_limit = 4 << 20;

int zfs_scan_issue_strategy = 0;
int zfs_scan_legacy = B_FALSE;	/* don't queue & sort zios, go direct */
uint64_t zfs_scan_max_ext_gap = 2 << 20;	/* in bytes */

unsigned int zfs_scan_checkpoint_intval = 7200;	/* seconds */
#define	ZFS_SCAN_CHECKPOINT_INTVAL	SEC_TO_TICK(zfs_scan_checkpoint_intval)

/*
 * fill_weight is non-tunable at runtime, so we copy it at module init from
 * zfs_scan_fill_weight. Runtime adjustments to zfs_scan_fill_weight would
 * break queue sorting.
 */
uint64_t zfs_scan_fill_weight = 3;
static uint64_t fill_weight;

/* See dsl_scan_should_clear() for details on the memory limit tunables */
uint64_t zfs_scan_mem_lim_min = 16 << 20;	/* bytes */
uint64_t zfs_scan_mem_lim_soft_max = 128 << 20;	/* bytes */
int zfs_scan_mem_lim_fact = 20;		/* fraction of physmem */
int zfs_scan_mem_lim_soft_fact = 20;	/* fraction of mem lim above */

unsigned int zfs_scrub_min_time_ms = 1000; /* min millisecs to scrub per txg */
unsigned int zfs_free_min_time_ms = 1000; /* min millisecs to free per txg */
/* min millisecs to obsolete per txg */
unsigned int zfs_obsolete_min_time_ms = 500;
/* min millisecs to resilver per txg */
unsigned int zfs_resilver_min_time_ms = 3000;
int zfs_scan_suspend_progress = 0; /* set to prevent scans from progressing */
boolean_t zfs_no_scrub_io = B_FALSE; /* set to disable scrub i/o */
boolean_t zfs_no_scrub_prefetch = B_FALSE; /* set to disable scrub prefetch */
enum ddt_class zfs_scrub_ddt_class_max = DDT_CLASS_DUPLICATE;
/* max number of blocks to free in a single TXG */
uint64_t zfs_async_block_max_blocks = UINT64_MAX;

int zfs_resilver_disable_defer = 0; /* set to disable resilver deferring */

/*
 * We wait a few txgs after importing a pool to begin scanning so that
 * the import / mounting code isn't held up by scrub / resilver IO.
 * Unfortunately, it is a bit difficult to determine exactly how long
 * this will take since userspace will trigger fs mounts asynchronously
 * and the kernel will create zvol minors asynchronously. As a result,
 * the value provided here is a bit arbitrary, but represents a
 * reasonable estimate of how many txgs it will take to finish fully
 * importing a pool.
 */
#define	SCAN_IMPORT_WAIT_TXGS		5


#define	DSL_SCAN_IS_SCRUB_RESILVER(scn) \
	((scn)->scn_phys.scn_func == POOL_SCAN_SCRUB || \
	(scn)->scn_phys.scn_func == POOL_SCAN_RESILVER)

extern int zfs_txg_timeout;

/*
 * Enable/disable the processing of the free_bpobj object.
 */
boolean_t zfs_free_bpobj_enabled = B_TRUE;

/* the order has to match pool_scan_type */
static scan_cb_t *scan_funcs[POOL_SCAN_FUNCS] = {
	NULL,
	dsl_scan_scrub_cb,	/* POOL_SCAN_SCRUB */
	dsl_scan_scrub_cb,	/* POOL_SCAN_RESILVER */
};

/* In core node for the scn->scn_queue. Represents a dataset to be scanned */
typedef struct {
	uint64_t	sds_dsobj;
	uint64_t	sds_txg;
	avl_node_t	sds_node;
} scan_ds_t;

/*
 * This controls what conditions are placed on dsl_scan_sync_state():
 * SYNC_OPTIONAL) write out scn_phys iff scn_bytes_pending == 0
 * SYNC_MANDATORY) write out scn_phys always. scn_bytes_pending must be 0.
 * SYNC_CACHED) if scn_bytes_pending == 0, write out scn_phys. Otherwise
 *	write out the scn_phys_cached version.
 * See dsl_scan_sync_state for details.
 */
typedef enum {
	SYNC_OPTIONAL,
	SYNC_MANDATORY,
	SYNC_CACHED
} state_sync_type_t;

/*
 * This struct represents the minimum information needed to reconstruct a
 * zio for sequential scanning. This is useful because many of these will
 * accumulate in the sequential IO queues before being issued, so saving
 * memory matters here.
 */
typedef struct scan_io {
	/* fields from blkptr_t */
	uint64_t		sio_blk_prop;
	uint64_t		sio_phys_birth;
	uint64_t		sio_birth;
	zio_cksum_t		sio_cksum;
	uint32_t		sio_nr_dvas;

	/* fields from zio_t */
	uint32_t		sio_flags;
	zbookmark_phys_t	sio_zb;

	/* members for queue sorting */
	union {
		avl_node_t	sio_addr_node; /* link into issuing queue */
		list_node_t	sio_list_node; /* link for issuing to disk */
	} sio_nodes;

	/*
	 * There may be up to SPA_DVAS_PER_BP DVAs here from the bp,
	 * depending on how many were in the original bp. Only the
	 * first DVA is really used for sorting and issuing purposes.
	 * The other DVAs (if provided) simply exist so that the zio
	 * layer can find additional copies to repair from in the
	 * event of an error. This array must go at the end of the
	 * struct to allow for the variable number of elements.
	 */
	dva_t			sio_dva[0];
} scan_io_t;

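/*
 * Accessors for the first (sorting/issuing) DVA of a scan_io_t, plus a
 * helper that computes the actual in-memory footprint of a scan_io_t
 * given its variable-length DVA array.
 */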
#define	SIO_SET_OFFSET(sio, x)		DVA_SET_OFFSET(&(sio)->sio_dva[0], x)
#define	SIO_SET_ASIZE(sio, x)		DVA_SET_ASIZE(&(sio)->sio_dva[0], x)
#define	SIO_GET_OFFSET(sio)		DVA_GET_OFFSET(&(sio)->sio_dva[0])
#define	SIO_GET_ASIZE(sio)		DVA_GET_ASIZE(&(sio)->sio_dva[0])
#define	SIO_GET_END_OFFSET(sio)		\
	(SIO_GET_OFFSET(sio) + SIO_GET_ASIZE(sio))
#define	SIO_GET_MUSED(sio)		\
	(sizeof (scan_io_t) + ((sio)->sio_nr_dvas * sizeof (dva_t)))

struct dsl_scan_io_queue {
	dsl_scan_t	*q_scn; /* associated dsl_scan_t */
	vdev_t		*q_vd; /* top-level vdev that this queue represents */

	/* trees used for sorting I/Os and extents of I/Os */
	range_tree_t	*q_exts_by_addr;
	zfs_btree_t	q_exts_by_size;
	avl_tree_t	q_sios_by_addr;
	uint64_t	q_sio_memused;

	/* members for zio rate limiting */
	uint64_t	q_maxinflight_bytes;
	uint64_t	q_inflight_bytes;
	kcondvar_t	q_zio_cv; /* used under vd->vdev_scan_io_queue_lock */

	/* per txg statistics */
	uint64_t	q_total_seg_size_this_txg;
	uint64_t	q_segs_this_txg;
	uint64_t	q_total_zio_size_this_txg;
	uint64_t	q_zios_this_txg;
};

/* private data for dsl_scan_prefetch_cb() */
typedef struct scan_prefetch_ctx {
	zfs_refcount_t spc_refcnt;	/* refcount for memory management */
	dsl_scan_t *spc_scn;		/* dsl_scan_t for the pool */
	boolean_t spc_root;		/* is this prefetch for an objset? */
	uint8_t spc_indblkshift;	/* dn_indblkshift of current dnode */
	uint16_t spc_datablkszsec;	/* dn_datablkszsec of current dnode */
} scan_prefetch_ctx_t;

/* private data for dsl_scan_prefetch() */
typedef struct scan_prefetch_issue_ctx {
	avl_node_t spic_avl_node;	/* link into scn->scn_prefetch_queue */
	scan_prefetch_ctx_t *spic_spc;	/* spc for the callback */
	blkptr_t spic_bp;		/* bp to prefetch */
	zbookmark_phys_t spic_zb;	/* bookmark to prefetch */
} scan_prefetch_issue_ctx_t;

static void scan_exec_io(dsl_pool_t *dp, const blkptr_t *bp, int zio_flags,
    const zbookmark_phys_t *zb, dsl_scan_io_queue_t *queue);
static void scan_io_queue_insert_impl(dsl_scan_io_queue_t *queue,
    scan_io_t *sio);

static dsl_scan_io_queue_t *scan_io_queue_create(vdev_t *vd);
static void scan_io_queues_destroy(dsl_scan_t *scn);

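/*
 * One kmem cache per possible DVA count (1..SPA_DVAS_PER_BP), so that each
 * scan_io_t is allocated at exactly the size needed for the number of DVAs
 * it carries. The caches are created in scan_init() and destroyed in
 * scan_fini().
 */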
static kmem_cache_t *sio_cache[SPA_DVAS_PER_BP];

/* sio->sio_nr_dvas must be set so we know which cache to free from */
static void
sio_free(scan_io_t *sio)
{
	ASSERT3U(sio->sio_nr_dvas, >, 0);
	ASSERT3U(sio->sio_nr_dvas, <=, SPA_DVAS_PER_BP);

	kmem_cache_free(sio_cache[sio->sio_nr_dvas - 1], sio);
}

/* It is up to the caller to set sio->sio_nr_dvas for freeing */
static scan_io_t *
sio_alloc(unsigned short nr_dvas)
{
	ASSERT3U(nr_dvas, >, 0);
	ASSERT3U(nr_dvas, <=, SPA_DVAS_PER_BP);

	return (kmem_cache_alloc(sio_cache[nr_dvas - 1], KM_SLEEP));
}

void
scan_init(void)
{
	/*
	 * This is used in ext_size_compare() to weight segments
	 * based on how sparse they are. This cannot be changed
	 * mid-scan and the tree comparison functions don't currently
	 * have a mechanism for passing additional context to the
	 * compare functions. Thus we store this value globally and
	 * we only allow it to be set at module initialization time.
	 */
	fill_weight = zfs_scan_fill_weight;

	for (int i = 0; i < SPA_DVAS_PER_BP; i++) {
		char name[36];

		(void) sprintf(name, "sio_cache_%d", i);
		sio_cache[i] = kmem_cache_create(name,
		    (sizeof (scan_io_t) + ((i + 1) * sizeof (dva_t))),
		    0, NULL, NULL, NULL, NULL, NULL, 0);
	}
}

void
scan_fini(void)
{
	for (int i = 0; i < SPA_DVAS_PER_BP; i++) {
		kmem_cache_destroy(sio_cache[i]);
	}
}

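/*
 * Returns B_TRUE while the persistent scan state (scn_phys) indicates that
 * a scan (scrub or resilver) is in progress, i.e. scn_state == DSS_SCANNING.
 */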
static inline boolean_t
dsl_scan_is_running(const dsl_scan_t *scn)
{
	return (scn->scn_phys.scn_state == DSS_SCANNING);
}

boolean_t
dsl_scan_resilvering(dsl_pool_t *dp)
{
	return (dsl_scan_is_running(dp->dp_scan) &&
	    dp->dp_scan->scn_phys.scn_func == POOL_SCAN_RESILVER);
}

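/*
 * Reconstitute a full blkptr_t from the compact scan_io_t representation
 * so that it can be handed to the zio layer when the queued I/O is issued.
 */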
static inline void
sio2bp(const scan_io_t *sio, blkptr_t *bp)
{
	bzero(bp, sizeof (*bp));
	bp->blk_prop = sio->sio_blk_prop;
	bp->blk_phys_birth = sio->sio_phys_birth;
	bp->blk_birth = sio->sio_birth;
	bp->blk_fill = 1;	/* we always only work with data pointers */
	bp->blk_cksum = sio->sio_cksum;

	ASSERT3U(sio->sio_nr_dvas, >, 0);
	ASSERT3U(sio->sio_nr_dvas, <=, SPA_DVAS_PER_BP);

	bcopy(sio->sio_dva, bp->blk_dva, sio->sio_nr_dvas * sizeof (dva_t));
}

static inline void
bp2sio(const blkptr_t *bp, scan_io_t *sio, int dva_i)
{
	sio->sio_blk_prop = bp->blk_prop;
	sio->sio_phys_birth = bp->blk_phys_birth;
	sio->sio_birth = bp->blk_birth;
	sio->sio_cksum = bp->blk_cksum;
	sio->sio_nr_dvas = BP_GET_NDVAS(bp);

	/*
	 * Copy the DVAs to the sio. We need all copies of the block so
	 * that the self healing code can use the alternate copies if the
	 * first is corrupted. We want the DVA at index dva_i to be first
	 * in the sio since this is the primary one that we want to issue.
	 */
	for (int i = 0, j = dva_i; i < sio->sio_nr_dvas; i++, j++) {
		sio->sio_dva[i] = bp->blk_dva[j % sio->sio_nr_dvas];
	}
}

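/*
 * Set up the in-core scan state when the pool is loaded: allocate the
 * dsl_scan_t, read any persistent scan state (dsl_scan_phys_t and the
 * dataset queue ZAP) out of the MOS, and arrange for interrupted or
 * old-style scans to be restarted.
 */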
int
dsl_scan_init(dsl_pool_t *dp, uint64_t txg)
{
	int err;
	dsl_scan_t *scn;
	spa_t *spa = dp->dp_spa;
	uint64_t f;

	scn = dp->dp_scan = kmem_zalloc(sizeof (dsl_scan_t), KM_SLEEP);
	scn->scn_dp = dp;

	/*
	 * It's possible that we're resuming a scan after a reboot so
	 * make sure that the scan_async_destroying flag is initialized
	 * appropriately.
	 */
	ASSERT(!scn->scn_async_destroying);
	scn->scn_async_destroying = spa_feature_is_active(dp->dp_spa,
	    SPA_FEATURE_ASYNC_DESTROY);

	avl_create(&scn->scn_queue, scan_ds_queue_compare, sizeof (scan_ds_t),
	    offsetof(scan_ds_t, sds_node));
	avl_create(&scn->scn_prefetch_queue, scan_prefetch_queue_compare,
	    sizeof (scan_prefetch_issue_ctx_t),
	    offsetof(scan_prefetch_issue_ctx_t, spic_avl_node));

	err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
	    "scrub_func", sizeof (uint64_t), 1, &f);
	if (err == 0) {
		/*
		 * There was an old-style scrub in progress.  Restart a
		 * new-style scrub from the beginning.
		 */
		scn->scn_restart_txg = txg;
		zfs_dbgmsg("old-style scrub was in progress; "
		    "restarting new-style scrub in txg %llu",
		    (longlong_t)scn->scn_restart_txg);

		/*
		 * Load the queue obj from the old location so that it
		 * can be freed by dsl_scan_done().
		 */
		(void) zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
		    "scrub_queue", sizeof (uint64_t), 1,
		    &scn->scn_phys.scn_queue_obj);
	} else {
		err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
		    DMU_POOL_SCAN, sizeof (uint64_t), SCAN_PHYS_NUMINTS,
		    &scn->scn_phys);

		/*
		 * Detect if the pool contains the signature of #2094.  If it
		 * does, properly update the scn->scn_phys structure and notify
		 * the administrator by setting an errata for the pool.
		 */
		if (err == EOVERFLOW) {
			uint64_t zaptmp[SCAN_PHYS_NUMINTS + 1];
			VERIFY3S(SCAN_PHYS_NUMINTS, ==, 24);
			VERIFY3S(offsetof(dsl_scan_phys_t, scn_flags), ==,
			    (23 * sizeof (uint64_t)));

			err = zap_lookup(dp->dp_meta_objset,
			    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SCAN,
			    sizeof (uint64_t), SCAN_PHYS_NUMINTS + 1, &zaptmp);
			if (err == 0) {
				uint64_t overflow = zaptmp[SCAN_PHYS_NUMINTS];

				if (overflow & ~DSF_VISIT_DS_AGAIN ||
				    scn->scn_async_destroying) {
					spa->spa_errata =
					    ZPOOL_ERRATA_ZOL_2094_ASYNC_DESTROY;
					return (EOVERFLOW);
				}

				bcopy(zaptmp, &scn->scn_phys,
				    SCAN_PHYS_NUMINTS * sizeof (uint64_t));
				scn->scn_phys.scn_flags = overflow;

				/* Required scrub already in progress. */
				if (scn->scn_phys.scn_state == DSS_FINISHED ||
				    scn->scn_phys.scn_state == DSS_CANCELED)
					spa->spa_errata =
					    ZPOOL_ERRATA_ZOL_2094_SCRUB;
			}
		}

		if (err == ENOENT)
			return (0);
		else if (err)
			return (err);

		/*
		 * We might be restarting after a reboot, so jump the issued
		 * counter to how far we've scanned. We know we're consistent
		 * up to here.
		 */
		scn->scn_issued_before_pass = scn->scn_phys.scn_examined;

		if (dsl_scan_is_running(scn) &&
		    spa_prev_software_version(dp->dp_spa) < SPA_VERSION_SCAN) {
			/*
			 * A new-type scrub was in progress on an old
			 * pool, and the pool was accessed by old
			 * software.  Restart from the beginning, since
			 * the old software may have changed the pool in
			 * the meantime.
			 */
			scn->scn_restart_txg = txg;
			zfs_dbgmsg("new-style scrub was modified "
			    "by old software; restarting in txg %llu",
			    (longlong_t)scn->scn_restart_txg);
		}
	}

	bcopy(&scn->scn_phys, &scn->scn_phys_cached, sizeof (scn->scn_phys));

	/* reload the queue into the in-core state */
	if (scn->scn_phys.scn_queue_obj != 0) {
		zap_cursor_t zc;
		zap_attribute_t za;

		for (zap_cursor_init(&zc, dp->dp_meta_objset,
		    scn->scn_phys.scn_queue_obj);
		    zap_cursor_retrieve(&zc, &za) == 0;
		    (void) zap_cursor_advance(&zc)) {
			scan_ds_queue_insert(scn,
			    zfs_strtonum(za.za_name, NULL),
			    za.za_first_integer);
		}
		zap_cursor_fini(&zc);
	}

	spa_scan_stat_init(spa);
	return (0);
}

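/*
 * Tear down the in-core scan state created by dsl_scan_init() when the
 * pool is unloaded.
 */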
void
dsl_scan_fini(dsl_pool_t *dp)
{
	if (dp->dp_scan != NULL) {
		dsl_scan_t *scn = dp->dp_scan;

		if (scn->scn_taskq != NULL)
			taskq_destroy(scn->scn_taskq);
		scan_ds_queue_clear(scn);
		avl_destroy(&scn->scn_queue);
		avl_destroy(&scn->scn_prefetch_queue);

		kmem_free(dp->dp_scan, sizeof (dsl_scan_t));
		dp->dp_scan = NULL;
	}
}

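/*
 * Returns B_TRUE if a scan restart has been requested for a txg at or
 * before the one currently being synced.
 */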
static boolean_t
dsl_scan_restarting(dsl_scan_t *scn, dmu_tx_t *tx)
{
	return (scn->scn_restart_txg != 0 &&
	    scn->scn_restart_txg <= tx->tx_txg);
}

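/*
 * Returns B_TRUE while a scrub (as opposed to a resilver) is in progress.
 * Note that a paused scrub still reports B_TRUE here; see
 * dsl_scan_is_paused_scrub() below.
 */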
boolean_t
dsl_scan_scrubbing(const dsl_pool_t *dp)
{
	dsl_scan_phys_t *scn_phys = &dp->dp_scan->scn_phys;

	return (scn_phys->scn_state == DSS_SCANNING &&
	    scn_phys->scn_func == POOL_SCAN_SCRUB);
}

boolean_t
dsl_scan_is_paused_scrub(const dsl_scan_t *scn)
{
	return (dsl_scan_scrubbing(scn->scn_dp) &&
	    scn->scn_phys.scn_flags & DSF_SCRUB_PAUSED);
}

/*
 * Writes out a persistent dsl_scan_phys_t record to the pool directory.
 * Because we can be running in the block sorting algorithm, we do not always
 * want to write out the record, only when it is "safe" to do so. This safety
 * condition is achieved by making sure that the sorting queues are empty
 * (scn_bytes_pending == 0). When this condition is not true, the sync'd state
 * is inconsistent with how much actual scanning progress has been made. The
 * kind of sync to be performed is specified by the sync_type argument. If the
 * sync is optional, we only sync if the queues are empty. If the sync is
 * mandatory, we do a hard ASSERT to make sure that the queues are empty. The
 * third possible state is a "cached" sync. This is done in response to:
 * 1) The dataset that was in the last sync'd dsl_scan_phys_t having been
 *	destroyed, so we wouldn't be able to restart scanning from it.
 * 2) The snapshot that was in the last sync'd dsl_scan_phys_t having been
 *	superseded by a newer snapshot.
 * 3) The dataset that was in the last sync'd dsl_scan_phys_t having been
 *	swapped with its clone.
 * In all cases, a cached sync simply rewrites the last record we've written,
 * just slightly modified. For the modifications that are performed to the
 * last written dsl_scan_phys_t, see dsl_scan_ds_destroyed,
 * dsl_scan_ds_snapshotted and dsl_scan_ds_clone_swapped.
 */
static void
dsl_scan_sync_state(dsl_scan_t *scn, dmu_tx_t *tx, state_sync_type_t sync_type)
{
	int i;
	spa_t *spa = scn->scn_dp->dp_spa;