1084fd14Brian Behlendorf/*
2084fd14Brian Behlendorf * CDDL HEADER START
3084fd14Brian Behlendorf *
4084fd14Brian Behlendorf * The contents of this file are subject to the terms of the
5084fd14Brian Behlendorf * Common Development and Distribution License (the "License").
6084fd14Brian Behlendorf * You may not use this file except in compliance with the License.
7084fd14Brian Behlendorf *
8084fd14Brian Behlendorf * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9084fd14Brian Behlendorf * or http://www.opensolaris.org/os/licensing.
10084fd14Brian Behlendorf * See the License for the specific language governing permissions
11084fd14Brian Behlendorf * and limitations under the License.
12084fd14Brian Behlendorf *
13084fd14Brian Behlendorf * When distributing Covered Code, include this CDDL HEADER in each
14084fd14Brian Behlendorf * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15084fd14Brian Behlendorf * If applicable, add the following below this CDDL HEADER, with the
16084fd14Brian Behlendorf * fields enclosed by brackets "[]" replaced with your own identifying
17084fd14Brian Behlendorf * information: Portions Copyright [yyyy] [name of copyright owner]
18084fd14Brian Behlendorf *
19084fd14Brian Behlendorf * CDDL HEADER END
20084fd14Brian Behlendorf */
21084fd14Brian Behlendorf
22084fd14Brian Behlendorf/*
23084fd14Brian Behlendorf * Copyright (c) 2016 by Delphix. All rights reserved.
24084fd14Brian Behlendorf * Copyright (c) 2019 by Lawrence Livermore National Security, LLC.
25084fd14Brian Behlendorf * Copyright 2019 Joyent, Inc.
26084fd14Brian Behlendorf */
27084fd14Brian Behlendorf
28084fd14Brian Behlendorf#include <sys/spa.h>
29084fd14Brian Behlendorf#include <sys/spa_impl.h>
30084fd14Brian Behlendorf#include <sys/txg.h>
31084fd14Brian Behlendorf#include <sys/vdev_impl.h>
32084fd14Brian Behlendorf#include <sys/vdev_trim.h>
33084fd14Brian Behlendorf#include <sys/refcount.h>
34084fd14Brian Behlendorf#include <sys/metaslab_impl.h>
35084fd14Brian Behlendorf#include <sys/dsl_synctask.h>
36084fd14Brian Behlendorf#include <sys/zap.h>
37084fd14Brian Behlendorf#include <sys/dmu_tx.h>
38084fd14Brian Behlendorf
39084fd14Brian Behlendorf/*
40084fd14Brian Behlendorf * TRIM is a feature which is used to notify a SSD that some previously
41084fd14Brian Behlendorf * written space is no longer allocated by the pool.  This is useful because
42084fd14Brian Behlendorf * writes to a SSD must be performed to blocks which have first been erased.
43084fd14Brian Behlendorf * Ensuring the SSD always has a supply of erased blocks for new writes
44084fd14Brian Behlendorf * helps prevent the performance from deteriorating.
45084fd14Brian Behlendorf *
 * There are two supported TRIM methods: manual and automatic.
47084fd14Brian Behlendorf *
48084fd14Brian Behlendorf * Manual TRIM:
49084fd14Brian Behlendorf *
50084fd14Brian Behlendorf * A manual TRIM is initiated by running the 'zpool trim' command.  A single
51084fd14Brian Behlendorf * 'vdev_trim' thread is created for each leaf vdev, and it is responsible for
 * managing that vdev's TRIM process.  This involves iterating over all the
53084fd14Brian Behlendorf * metaslabs, calculating the unallocated space ranges, and then issuing the
54084fd14Brian Behlendorf * required TRIM I/Os.
55084fd14Brian Behlendorf *
56084fd14Brian Behlendorf * While a metaslab is being actively trimmed it is not eligible to perform
57084fd14Brian Behlendorf * new allocations.  After traversing all of the metaslabs the thread is
58084fd14Brian Behlendorf * terminated.  Finally, both the requested options and current progress of
59084fd14Brian Behlendorf * the TRIM are regularly written to the pool.  This allows the TRIM to be
60084fd14Brian Behlendorf * suspended and resumed as needed.
61084fd14Brian Behlendorf *
62084fd14Brian Behlendorf * Automatic TRIM:
63084fd14Brian Behlendorf *
64084fd14Brian Behlendorf * An automatic TRIM is enabled by setting the 'autotrim' pool property
 * to 'on'.  When enabled, a 'vdev_autotrim' thread is created for each
66084fd14Brian Behlendorf * top-level (not leaf) vdev in the pool.  These threads perform the same
67084fd14Brian Behlendorf * core TRIM process as a manual TRIM, but with a few key differences.
68084fd14Brian Behlendorf *
69084fd14Brian Behlendorf * 1) Automatic TRIM happens continuously in the background and operates
70084fd14Brian Behlendorf *    solely on recently freed blocks (ms_trim not ms_allocatable).
71084fd14Brian Behlendorf *
72084fd14Brian Behlendorf * 2) Each thread is associated with a top-level (not leaf) vdev.  This has
73084fd14Brian Behlendorf *    the benefit of simplifying the threading model, it makes it easier
74084fd14Brian Behlendorf *    to coordinate administrative commands, and it ensures only a single
75084fd14Brian Behlendorf *    metaslab is disabled at a time.  Unlike manual TRIM, this means each
76084fd14Brian Behlendorf *    'vdev_autotrim' thread is responsible for issuing TRIM I/Os for its
77084fd14Brian Behlendorf *    children.
78084fd14Brian Behlendorf *
79084fd14Brian Behlendorf * 3) There is no automatic TRIM progress information stored on disk, nor
80084fd14Brian Behlendorf *    is it reported by 'zpool status'.
81084fd14Brian Behlendorf *
82084fd14Brian Behlendorf * While the automatic TRIM process is highly effective it is more likely
83084fd14Brian Behlendorf * than a manual TRIM to encounter tiny ranges.  Ranges less than or equal to
84084fd14Brian Behlendorf * 'zfs_trim_extent_bytes_min' (32k) are considered too small to efficiently
85084fd14Brian Behlendorf * TRIM and are skipped.  This means small amounts of freed space may not
86084fd14Brian Behlendorf * be automatically trimmed.
87084fd14Brian Behlendorf *
88084fd14Brian Behlendorf * Furthermore, devices with attached hot spares and devices being actively
89084fd14Brian Behlendorf * replaced are skipped.  This is done to avoid adding additional stress to
90084fd14Brian Behlendorf * a potentially unhealthy device and to minimize the required rebuild time.
91084fd14Brian Behlendorf *
92084fd14Brian Behlendorf * For this reason it may be beneficial to occasionally manually TRIM a pool
93084fd14Brian Behlendorf * even when automatic TRIM is enabled.
94084fd14Brian Behlendorf */
95084fd14Brian Behlendorf
/*
 * Maximum size of a single TRIM I/O; larger ranges are chunked into
 * 128 MiB lengths.
 */
unsigned int zfs_trim_extent_bytes_max = 128 * 1024 * 1024;

/*
 * Minimum size of a single TRIM I/O; extents smaller than 32 KiB are
 * skipped.
 */
unsigned int zfs_trim_extent_bytes_min = 32 * 1024;

/*
 * Skip uninitialized metaslabs during the TRIM process.  This option is
 * useful for pools constructed from large thinly-provisioned devices where
 * TRIM operations are slow.  As a pool ages, an increasing fraction of
 * the pool's metaslabs will be initialized, progressively degrading the
 * usefulness of this option.  This setting is stored when starting a
 * manual TRIM and will persist for the duration of the requested TRIM.
 */
unsigned int zfs_trim_metaslab_skip = 0;

/*
 * Maximum number of queued TRIM I/Os per leaf vdev.  The number of
 * concurrent TRIM I/Os issued to the device is controlled by the
 * zfs_vdev_trim_min_active and zfs_vdev_trim_max_active module options.
 */
unsigned int zfs_trim_queue_limit = 10;

/*
 * The minimum number of transaction groups between automatic trims of a
 * metaslab.  This setting represents a trade-off between issuing more
 * efficient TRIM operations, by allowing them to be aggregated longer,
 * and issuing them promptly so the trimmed space is available.  Note
 * that this value is a minimum; metaslabs can be trimmed less frequently
 * when there are a large number of ranges which need to be trimmed.
 *
 * Increasing this value will allow frees to be aggregated for a longer
 * time.  This can result in larger TRIM operations, and increased memory
 * usage in order to track the ranges to be trimmed.  Decreasing this value
 * has the opposite effect.  The default value of 32 was determined through
 * testing to be a reasonable compromise.
 */
unsigned int zfs_trim_txg_batch = 32;
138084fd14Brian Behlendorf
139084fd14Brian Behlendorf/*
140084fd14Brian Behlendorf * The trim_args are a control structure which describe how a leaf vdev
141084fd14Brian Behlendorf * should be trimmed.  The core elements are the vdev, the metaslab being
142084fd14Brian Behlendorf * trimmed and a range tree containing the extents to TRIM.  All provided
143084fd14Brian Behlendorf * ranges must be within the metaslab.
144084fd14Brian Behlendorf */
145084fd14Brian Behlendorftypedef struct trim_args {
146084fd14Brian Behlendorf	/*
147084fd14Brian Behlendorf	 * These fields are set by the caller of vdev_trim_ranges().
148084fd14Brian Behlendorf	 */
149084fd14Brian Behlendorf	vdev_t		*trim_vdev;		/* Leaf vdev to TRIM */
150084fd14Brian Behlendorf	metaslab_t	*trim_msp;		/* Disabled metaslab */
151084fd14Brian Behlendorf	range_tree_t	*trim_tree;		/* TRIM ranges (in metaslab) */
152084fd14Brian Behlendorf	trim_type_t	trim_type;		/* Manual or auto TRIM */
153084fd14Brian Behlendorf	uint64_t	trim_extent_bytes_max;	/* Maximum TRIM I/O size */
154084fd14Brian Behlendorf	uint64_t	trim_extent_bytes_min;	/* Minimum TRIM I/O size */
155084fd14Brian Behlendorf	enum trim_flag	trim_flags;		/* TRIM flags (secure) */
156084fd14Brian Behlendorf
157084fd14Brian Behlendorf	/*
158084fd14Brian Behlendorf	 * These fields are updated by vdev_trim_ranges().
159084fd14Brian Behlendorf	 */
160084fd14Brian Behlendorf	hrtime_t	trim_start_time;	/* Start time */
161084fd14Brian Behlendorf	uint64_t	trim_bytes_done;	/* Bytes trimmed */
162084fd14Brian Behlendorf} trim_args_t;
163084fd14Brian Behlendorf
164084fd14Brian Behlendorf/*
165084fd14Brian Behlendorf * Determines whether a vdev_trim_thread() should be stopped.
166084fd14Brian Behlendorf */
167084fd14Brian Behlendorfstatic boolean_t
168084fd14Brian Behlendorfvdev_trim_should_stop(vdev_t *vd)
169084fd14Brian Behlendorf{
170084fd14Brian Behlendorf	return (vd->vdev_trim_exit_wanted || !vdev_writeable(vd) ||
171084fd14Brian Behlendorf	    vd->vdev_detached || vd->vdev_top->vdev_removing);
172084fd14Brian Behlendorf}
173084fd14Brian Behlendorf
174084fd14Brian Behlendorf/*
175084fd14Brian Behlendorf * Determines whether a vdev_autotrim_thread() should be stopped.
176084fd14Brian Behlendorf */
177084fd14Brian Behlendorfstatic boolean_t
178084fd14Brian Behlendorfvdev_autotrim_should_stop(vdev_t *tvd)
179084fd14Brian Behlendorf{
180084fd14Brian Behlendorf	return (tvd->vdev_autotrim_exit_wanted ||
181084fd14Brian Behlendorf	    !vdev_writeable(tvd) || tvd->vdev_removing ||
182084fd14Brian Behlendorf	    spa_get_autotrim(tvd->vdev_spa) == SPA_AUTOTRIM_OFF);
183084fd14Brian Behlendorf}
184084fd14Brian Behlendorf
185084fd14Brian Behlendorf/*
186084fd14Brian Behlendorf * The sync task for updating the on-disk state of a manual TRIM.  This
187084fd14Brian Behlendorf * is scheduled by vdev_trim_change_state().
188084fd14Brian Behlendorf */
189084fd14Brian Behlendorfstatic void
190084fd14Brian Behlendorfvdev_trim_zap_update_sync(void *arg, dmu_tx_t *tx)
191084fd14Brian Behlendorf{
192084fd14Brian Behlendorf	/*
193084fd14Brian Behlendorf	 * We pass in the guid instead of the vdev_t since the vdev may
194084fd14Brian Behlendorf	 * have been freed prior to the sync task being processed.  This
195084fd14Brian Behlendorf	 * happens when a vdev is detached as we call spa_config_vdev_exit(),
196084fd14Brian Behlendorf	 * stop the trimming thread, schedule the sync task, and free
197084fd14Brian Behlendorf	 * the vdev. Later when the scheduled sync task is invoked, it would
198084fd14Brian Behlendorf	 * find that the vdev has been freed.
199084fd14Brian Behlendorf	 */
200084fd14Brian Behlendorf	uint64_t guid = *(uint64_t *)arg;
201084fd14Brian Behlendorf	uint64_t txg = dmu_tx_get_txg(tx);
202084fd14Brian Behlendorf	kmem_free(arg, sizeof (uint64_t));
203084fd14Brian Behlendorf
204084fd14Brian Behlendorf	vdev_t *vd = spa_lookup_by_guid(tx->tx_pool->dp_spa, guid, B_FALSE);
205084fd14Brian Behlendorf	if (vd == NULL || vd->vdev_top->vdev_removing || !vdev_is_concrete(vd))
206084fd14Brian Behlendorf		return;
207084fd14Brian Behlendorf
208084fd14Brian Behlendorf	uint64_t last_offset = vd->vdev_trim_offset[txg & TXG_MASK];
209084fd14Brian Behlendorf	vd->vdev_trim_offset[txg & TXG_MASK] = 0;
210084fd14Brian Behlendorf
211084fd14Brian Behlendorf	VERIFY3U(vd->vdev_leaf_zap, !=, 0);
212084fd14Brian Behlendorf
213084fd14Brian Behlendorf	objset_t *mos = vd->vdev_spa->spa_meta_objset;
214084fd14Brian Behlendorf
215084fd14Brian Behlendorf	if (last_offset > 0 || vd->vdev_trim_last_offset == UINT64_MAX) {
216084fd14Brian Behlendorf
217084fd14Brian Behlendorf		if (vd->vdev_trim_last_offset == UINT64_MAX)
218084fd14Brian Behlendorf			last_offset = 0;
219084fd14Brian Behlendorf
220084fd14Brian Behlendorf		vd->vdev_trim_last_offset = last_offset;
221084fd14Brian Behlendorf		VERIFY0(zap_update(mos, vd->vdev_leaf_zap,
222084fd14Brian Behlendorf		    VDEV_LEAF_ZAP_TRIM_LAST_OFFSET,
223084fd14Brian Behlendorf		    sizeof (last_offset), 1, &last_offset, tx));
224084fd14Brian Behlendorf	}
225084fd14Brian Behlendorf
226084fd14Brian Behlendorf	if (vd->vdev_trim_action_time > 0) {
227084fd14Brian Behlendorf		uint64_t val = (uint64_t)vd->vdev_trim_action_time;
228084fd14Brian Behlendorf		VERIFY0(zap_update(mos, vd->vdev_leaf_zap,
229084fd14Brian Behlendorf		    VDEV_LEAF_ZAP_TRIM_ACTION_TIME, sizeof (val),
230084fd14Brian Behlendorf		    1, &val, tx));
231084fd14Brian Behlendorf	}
232084fd14Brian Behlendorf
233084fd14Brian Behlendorf	if (vd->vdev_trim_rate > 0) {
234084fd14Brian Behlendorf		uint64_t rate = (uint64_t)vd->vdev_trim_rate;
235084fd14Brian Behlendorf
236084fd14Brian Behlendorf		if (rate == UINT64_MAX)
237084fd14Brian Behlendorf			rate = 0;
238084fd14Brian Behlendorf
239084fd14Brian Behlendorf		VERIFY0(zap_update(mos, vd->vdev_leaf_zap,
240084fd14Brian Behlendorf		    VDEV_LEAF_ZAP_TRIM_RATE, sizeof (rate), 1, &rate, tx));
241084fd14Brian Behlendorf	}
242084fd14Brian Behlendorf
243084fd14Brian Behlendorf	uint64_t partial = vd->vdev_trim_partial;
244084fd14Brian Behlendorf	if (partial == UINT64_MAX)
245084fd14Brian Behlendorf		partial = 0;
246084fd14Brian Behlendorf
247084fd14Brian Behlendorf	VERIFY0(zap_update(mos, vd->vdev_leaf_zap, VDEV_LEAF_ZAP_TRIM_PARTIAL,
248084fd14Brian Behlendorf	    sizeof (partial), 1, &partial, tx));
249084fd14Brian Behlendorf
250084fd14Brian Behlendorf	uint64_t secure = vd->vdev_trim_secure;
251084fd14Brian Behlendorf	if (secure == UINT64_MAX)
252084fd14Brian Behlendorf		secure = 0;
253084fd14Brian Behlendorf
254084fd14Brian Behlendorf	VERIFY0(zap_update(mos, vd->vdev_leaf_zap, VDEV_LEAF_ZAP_TRIM_SECURE,
255084fd14Brian Behlendorf	    sizeof (secure), 1, &secure, tx));
256084fd14Brian Behlendorf
257084fd14Brian Behlendorf
258084fd14Brian Behlendorf	uint64_t trim_state = vd->vdev_trim_state;
259084fd14Brian Behlendorf	VERIFY0(zap_update(mos, vd->vdev_leaf_zap, VDEV_LEAF_ZAP_TRIM_STATE,
260084fd14Brian Behlendorf	    sizeof (trim_state), 1, &trim_state, tx));
261084fd14Brian Behlendorf}
262084fd14Brian Behlendorf
263084fd14Brian Behlendorf/*
264084fd14Brian Behlendorf * Update the on-disk state of a manual TRIM.  This is called to request
265084fd14Brian Behlendorf * that a TRIM be started/suspended/canceled, or to change one of the
266084fd14Brian Behlendorf * TRIM options (partial, secure, rate).
267084fd14Brian Behlendorf */
268084fd14Brian Behlendorfstatic void
269084fd14Brian Behlendorfvdev_trim_change_state(vdev_t *vd, vdev_trim_state_t new_state,
270084fd14Brian Behlendorf    uint64_t rate, boolean_t partial, boolean_t secure)
271084fd14Brian Behlendorf{
272084fd14Brian Behlendorf	ASSERT(MUTEX_HELD(&vd->vdev_trim_lock));
273084fd14Brian Behlendorf	spa_t *spa = vd->vdev_spa;
274084fd14Brian Behlendorf
275084fd14Brian Behlendorf	if (new_state == vd->vdev_trim_state)
276084fd14Brian Behlendorf		return;
277084fd14Brian Behlendorf
278084fd14Brian Behlendorf	/*
279084fd14Brian Behlendorf	 * Copy the vd's guid, this will be freed by the sync task.
280084fd14Brian Behlendorf	 */
281084fd14Brian Behlendorf	uint64_t *guid = kmem_zalloc(sizeof (uint64_t), KM_SLEEP);
282084fd14Brian Behlendorf	*guid = vd->vdev_guid;
283084fd14Brian Behlendorf
284084fd14Brian Behlendorf	/*
285084fd14Brian Behlendorf	 * If we're suspending, then preserve the original start time.
286084fd14Brian Behlendorf	 */
287084fd14Brian Behlendorf	if (vd->vdev_trim_state != VDEV_TRIM_SUSPENDED) {
288084fd14Brian Behlendorf		vd->vdev_trim_action_time = gethrestime_sec();
289084fd14Brian Behlendorf	}
290084fd14Brian Behlendorf
291084fd14Brian Behlendorf	/*
292084fd14Brian Behlendorf	 * If we're activating, then preserve the requested rate and trim
293084fd14Brian Behlendorf	 * method.  Setting the last offset and rate to UINT64_MAX is used
294084fd14Brian Behlendorf	 * as a sentinel to indicate they should be reset to default values.
295084fd14Brian Behlendorf	 */
296084fd14Brian Behlendorf	if (new_state == VDEV_TRIM_ACTIVE) {
297084fd14Brian Behlendorf		if (vd->vdev_trim_state == VDEV_TRIM_COMPLETE ||
298084fd14Brian Behlendorf		    vd->vdev_trim_state == VDEV_TRIM_CANCELED) {
299084fd14Brian Behlendorf			vd->vdev_trim_last_offset = UINT64_MAX;
300084fd14Brian Behlendorf			vd->vdev_trim_rate = UINT64_MAX;
301084fd14Brian Behlendorf			vd->vdev_trim_partial = UINT64_MAX;
302084fd14Brian Behlendorf			vd->vdev_trim_secure = UINT64_MAX;
303084fd14Brian Behlendorf		}
304084fd14Brian Behlendorf
305084fd14Brian Behlendorf		if (rate != 0)
306084fd14Brian Behlendorf			vd->vdev_trim_rate = rate;
307084fd14Brian Behlendorf
308084fd14Brian Behlendorf		if (partial != 0)
309084fd14Brian Behlendorf			vd->vdev_trim_partial = partial;
310084fd14Brian Behlendorf
311084fd14Brian Behlendorf		if (secure != 0)
312084fd14Brian Behlendorf			vd->vdev_trim_secure = secure;
313084fd14Brian Behlendorf	}
314084fd14Brian Behlendorf
315084fd14Brian Behlendorf	boolean_t resumed = !!(vd->vdev_trim_state == VDEV_TRIM_SUSPENDED);
316084fd14Brian Behlendorf	vd->vdev_trim_state = new_state;
317084fd14Brian Behlendorf
318084fd14Brian Behlendorf	dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
319084fd14Brian Behlendorf	VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
320084fd14Brian Behlendorf	dsl_sync_task_nowait(spa_get_dsl(spa), vdev_trim_zap_update_sync,
321084fd14Brian Behlendorf	    guid, 2, ZFS_SPACE_CHECK_NONE, tx);
322084fd14Brian Behlendorf
323084fd14Brian Behlendorf	switch (new_state) {
324084fd14Brian Behlendorf	case VDEV_TRIM_ACTIVE:
325084fd14Brian Behlendorf		spa_event_notify(spa, vd, NULL,
326084fd14Brian Behlendorf		    resumed ? ESC_ZFS_TRIM_RESUME : ESC_ZFS_TRIM_START);
327084fd14Brian Behlendorf		spa_history_log_internal(spa, "trim", tx,
328084fd14Brian Behlendorf		    "vdev=%s activated", vd->vdev_path);
329084fd14Brian Behlendorf		break;
330084fd14Brian Behlendorf	case VDEV_TRIM_SUSPENDED:
331084fd14Brian Behlendorf		spa_event_notify(spa, vd, NULL, ESC_ZFS_TRIM_SUSPEND);
332084fd14Brian Behlendorf		spa_history_log_internal(spa, "trim", tx,
333084fd14Brian Behlendorf		    "vdev=%s suspended", vd->vdev_path);
334084fd14Brian Behlendorf		break;
335084fd14Brian Behlendorf	case VDEV_TRIM_CANCELED:
336084fd14Brian Behlendorf		spa_event_notify(spa, vd, NULL, ESC_ZFS_TRIM_CANCEL);
337084fd14Brian Behlendorf		spa_history_log_internal(spa, "trim", tx,
338084fd14Brian Behlendorf		    "vdev=%s canceled", vd->vdev_path);
339084fd14Brian Behlendorf		break;
340084fd14Brian Behlendorf	case VDEV_TRIM_COMPLETE:
341084fd14Brian Behlendorf		spa_event_notify(spa, vd, NULL, ESC_ZFS_TRIM_FINISH);
342084fd14Brian Behlendorf		spa_history_log_internal(spa, "trim", tx,
343084fd14Brian Behlendorf		    "vdev=%s complete", vd->vdev_path);
344084fd14Brian Behlendorf		break;
345084fd14Brian Behlendorf	default:
346084fd14Brian Behlendorf		panic("invalid state %llu", (unsigned long long)new_state);
347084fd14Brian Behlendorf	}
348084fd14Brian Behlendorf
349084fd14Brian Behlendorf	dmu_tx_commit(tx);
350084fd14Brian Behlendorf}
351084fd14Brian Behlendorf
352084fd14Brian Behlendorf/*
353084fd14Brian Behlendorf * The zio_done_func_t done callback for each manual TRIM issued.  It is
354084fd14Brian Behlendorf * responsible for updating the TRIM stats, reissuing failed TRIM I/Os,
355084fd14Brian Behlendorf * and limiting the number of in-flight TRIM I/Os.
356084fd14Brian Behlendorf */
357084fd14Brian Behlendorfstatic void
358084fd14Brian Behlendorfvdev_trim_cb(zio_t *zio)
359084fd14Brian Behlendorf{
360084fd14Brian Behlendorf	vdev_t *vd = zio->io_vd;
361084fd14Brian Behlendorf
362084fd14Brian Behlendorf	mutex_enter(&vd->vdev_trim_io_lock);
363084fd14Brian Behlendorf	if (zio->io_error == ENXIO && !vdev_writeable(vd)) {
364084fd14Brian Behlendorf		/*
365084fd14Brian Behlendorf		 * The I/O failed because the vdev was unavailable; roll the
366084fd14Brian Behlendorf		 * last offset back. (This works because spa_sync waits on
367084fd14Brian Behlendorf		 * spa_txg_zio before it runs sync tasks.)
368084fd14Brian Behlendorf		 */
369084fd14Brian Behlendorf		uint64_t *offset =
370084fd14Brian Behlendorf		    &vd->vdev_trim_offset[zio->io_txg & TXG_MASK];
371084fd14Brian Behlendorf		*offset = MIN(*offset, zio->io_offset);
372084fd14Brian Behlendorf	} else {
373084fd14Brian Behlendorf		if (zio->io_error != 0) {
374084fd14Brian Behlendorf			vd->vdev_stat.vs_trim_errors++;
375084fd14Brian Behlendorf			/*
376084fd14Brian Behlendorf			 * spa_iostats_trim_add(vd->vdev_spa, TRIM_TYPE_MANUAL,
377084fd14Brian Behlendorf			 *  0, 0, 0, 0, 1, zio->io_orig_size);
378084fd14Brian Behlendorf			 */
379084fd14Brian Behlendorf		} else {
380084fd14Brian Behlendorf			/*
381084fd14Brian Behlendorf			 * spa_iostats_trim_add(vd->vdev_spa, TRIM_TYPE_MANUAL,
382084fd14Brian Behlendorf			 *  1, zio->io_orig_size, 0, 0, 0, 0);
383084fd14Brian Behlendorf			 */
384084fd14Brian Behlendorf		}
385084fd14Brian Behlendorf
386084fd14Brian Behlendorf		vd->vdev_trim_bytes_done += zio->io_orig_size;
387084fd14Brian Behlendorf	}
388084fd14Brian Behlendorf
389084fd14Brian Behlendorf	ASSERT3U(vd->vdev_trim_inflight[TRIM_TYPE_MANUAL], >, 0);
390084fd14Brian Behlendorf	vd->vdev_trim_inflight[TRIM_TYPE_MANUAL]--;
391084fd14Brian Behlendorf	cv_broadcast(&vd->vdev_trim_io_cv);
392084fd14Brian Behlendorf	mutex_exit(&vd->vdev_trim_io_lock);
393084fd14Brian Behlendorf
394084fd14Brian Behlendorf	spa_config_exit(vd->vdev_spa, SCL_STATE_ALL, vd);
395084fd14Brian Behlendorf}
396084fd14Brian Behlendorf
397084fd14Brian Behlendorf/*
398084fd14Brian Behlendorf * The zio_done_func_t done callback for each automatic TRIM issued.  It
399084fd14Brian Behlendorf * is responsible for updating the TRIM stats and limiting the number of
400084fd14Brian Behlendorf * in-flight TRIM I/Os.  Automatic TRIM I/Os are best effort and are
401084fd14Brian Behlendorf * never reissued on failure.
402084fd14Brian Behlendorf */
403084fd14Brian Behlendorfstatic void
404084fd14Brian Behlendorfvdev_autotrim_cb(zio_t *zio)
405084fd14Brian Behlendorf{
406084fd14Brian Behlendorf	vdev_t *vd = zio->io_vd;
407084fd14Brian Behlendorf
408084fd14Brian Behlendorf	mutex_enter(&vd->vdev_trim_io_lock);
409084fd14Brian Behlendorf
410084fd14Brian Behlendorf	if (zio->io_error != 0) {
411084fd14Brian Behlendorf		vd->vdev_stat.vs_trim_errors++;