1814dcd4Serapheim Dimitropoulos/*
2814dcd4Serapheim Dimitropoulos * CDDL HEADER START
3814dcd4Serapheim Dimitropoulos *
4814dcd4Serapheim Dimitropoulos * The contents of this file are subject to the terms of the
5814dcd4Serapheim Dimitropoulos * Common Development and Distribution License (the "License").
6814dcd4Serapheim Dimitropoulos * You may not use this file except in compliance with the License.
7814dcd4Serapheim Dimitropoulos *
8814dcd4Serapheim Dimitropoulos * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9814dcd4Serapheim Dimitropoulos * or http://www.opensolaris.org/os/licensing.
10814dcd4Serapheim Dimitropoulos * See the License for the specific language governing permissions
11814dcd4Serapheim Dimitropoulos * and limitations under the License.
12814dcd4Serapheim Dimitropoulos *
13814dcd4Serapheim Dimitropoulos * When distributing Covered Code, include this CDDL HEADER in each
14814dcd4Serapheim Dimitropoulos * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15814dcd4Serapheim Dimitropoulos * If applicable, add the following below this CDDL HEADER, with the
16814dcd4Serapheim Dimitropoulos * fields enclosed by brackets "[]" replaced with your own identifying
17814dcd4Serapheim Dimitropoulos * information: Portions Copyright [yyyy] [name of copyright owner]
18814dcd4Serapheim Dimitropoulos *
19814dcd4Serapheim Dimitropoulos * CDDL HEADER END
20814dcd4Serapheim Dimitropoulos */
21814dcd4Serapheim Dimitropoulos
22814dcd4Serapheim Dimitropoulos/*
23814dcd4Serapheim Dimitropoulos * Copyright (c) 2018, 2019 by Delphix. All rights reserved.
24814dcd4Serapheim Dimitropoulos */
25814dcd4Serapheim Dimitropoulos
26814dcd4Serapheim Dimitropoulos#include <sys/dmu_objset.h>
27814dcd4Serapheim Dimitropoulos#include <sys/metaslab.h>
28814dcd4Serapheim Dimitropoulos#include <sys/metaslab_impl.h>
29814dcd4Serapheim Dimitropoulos#include <sys/spa.h>
30814dcd4Serapheim Dimitropoulos#include <sys/spa_impl.h>
31814dcd4Serapheim Dimitropoulos#include <sys/spa_log_spacemap.h>
32814dcd4Serapheim Dimitropoulos#include <sys/vdev_impl.h>
33814dcd4Serapheim Dimitropoulos#include <sys/zap.h>
34814dcd4Serapheim Dimitropoulos
35814dcd4Serapheim Dimitropoulos/*
36814dcd4Serapheim Dimitropoulos * Log Space Maps
37814dcd4Serapheim Dimitropoulos *
38814dcd4Serapheim Dimitropoulos * Log space maps are an optimization in ZFS metadata allocations for pools
39814dcd4Serapheim Dimitropoulos * whose workloads are primarily random-writes. Random-write workloads are also
40814dcd4Serapheim Dimitropoulos * typically random-free, meaning that they are freeing from locations scattered
41814dcd4Serapheim Dimitropoulos * throughout the pool. This means that each TXG we will have to append some
42814dcd4Serapheim Dimitropoulos * FREE records to almost every metaslab. With log space maps, we hold their
43814dcd4Serapheim Dimitropoulos * changes in memory and log them altogether in one pool-wide space map on-disk
44814dcd4Serapheim Dimitropoulos * for persistence. As more blocks are accumulated in the log space maps and
45814dcd4Serapheim Dimitropoulos * more unflushed changes are accounted in memory, we flush a selected group
46814dcd4Serapheim Dimitropoulos * of metaslabs every TXG to relieve memory pressure and potential overheads
47814dcd4Serapheim Dimitropoulos * when loading the pool. Flushing a metaslab to disk relieves memory as we
48814dcd4Serapheim Dimitropoulos * flush any unflushed changes from memory to disk (i.e. the metaslab's space
49814dcd4Serapheim Dimitropoulos * map) and saves import time by making old log space maps obsolete and
50814dcd4Serapheim Dimitropoulos * eventually destroying them. [A log space map is said to be obsolete when all
51814dcd4Serapheim Dimitropoulos * its entries have made it to their corresponding metaslab space maps].
52814dcd4Serapheim Dimitropoulos *
53814dcd4Serapheim Dimitropoulos * == On disk data structures used ==
54814dcd4Serapheim Dimitropoulos *
55814dcd4Serapheim Dimitropoulos * - The pool has a new feature flag and a new entry in the MOS. The feature
56814dcd4Serapheim Dimitropoulos *   is activated when we create the first log space map and remains active
57814dcd4Serapheim Dimitropoulos *   for the lifetime of the pool. The new entry in the MOS Directory [refer
58814dcd4Serapheim Dimitropoulos *   to DMU_POOL_LOG_SPACEMAP_ZAP] is populated with a ZAP whose key-value
59814dcd4Serapheim Dimitropoulos *   pairs are of the form <key: txg, value: log space map object for that txg>.
60814dcd4Serapheim Dimitropoulos *   This entry is our on-disk reference of the log space maps that exist in
61814dcd4Serapheim Dimitropoulos *   the pool for each TXG and it is used during import to load all the
62814dcd4Serapheim Dimitropoulos *   metaslab unflushed changes in memory. To see how this structure is first
63814dcd4Serapheim Dimitropoulos *   created and later populated refer to spa_generate_syncing_log_sm(). To see
64814dcd4Serapheim Dimitropoulos *   how it is used during import time refer to spa_ld_log_sm_metadata().
65814dcd4Serapheim Dimitropoulos *
66814dcd4Serapheim Dimitropoulos * - Each vdev has a new entry in its vdev_top_zap (see field
67814dcd4Serapheim Dimitropoulos *   VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS) which holds the msp_unflushed_txg of
 *   each metaslab in this vdev. This field is the on-disk counterpart of the
 *   in-memory field ms_unflushed_txg which tells us from which TXG and onwards
 *   the metaslab hasn't had its changes flushed. During import, we use this
71814dcd4Serapheim Dimitropoulos *   to ignore any entries in the space map log that are for this metaslab but
72814dcd4Serapheim Dimitropoulos *   from a TXG before msp_unflushed_txg. At that point, we also populate its
73814dcd4Serapheim Dimitropoulos *   in-memory counterpart and from there both fields are updated every time
74814dcd4Serapheim Dimitropoulos *   we flush that metaslab.
75814dcd4Serapheim Dimitropoulos *
76814dcd4Serapheim Dimitropoulos * - A space map is created every TXG and, during that TXG, it is used to log
77814dcd4Serapheim Dimitropoulos *   all incoming changes (the log space map). When created, the log space map
78814dcd4Serapheim Dimitropoulos *   is referenced in memory by spa_syncing_log_sm and its object ID is inserted
79814dcd4Serapheim Dimitropoulos *   to the space map ZAP mentioned above. The log space map is closed at the
80814dcd4Serapheim Dimitropoulos *   end of the TXG and will be destroyed when it becomes fully obsolete. We
81814dcd4Serapheim Dimitropoulos *   know when a log space map has become obsolete by looking at the oldest
82814dcd4Serapheim Dimitropoulos *   (and smallest) ms_unflushed_txg in the pool. If the value of that is bigger
 *   than the log space map's TXG, then it means that there is no metaslab that
 *   doesn't have the changes from that log and we can therefore destroy it.
85814dcd4Serapheim Dimitropoulos *   [see spa_cleanup_old_sm_logs()].
86814dcd4Serapheim Dimitropoulos *
87814dcd4Serapheim Dimitropoulos * == Important in-memory structures ==
88814dcd4Serapheim Dimitropoulos *
89814dcd4Serapheim Dimitropoulos * - The per-spa field spa_metaslabs_by_flushed sorts all the metaslabs in
90814dcd4Serapheim Dimitropoulos *   the pool by their ms_unflushed_txg field. It is primarily used for three
91814dcd4Serapheim Dimitropoulos *   reasons. First of all, it is used during flushing where we try to flush
92814dcd4Serapheim Dimitropoulos *   metaslabs in-order from the oldest-flushed to the most recently flushed
93814dcd4Serapheim Dimitropoulos *   every TXG. Secondly, it helps us to lookup the ms_unflushed_txg of the
94814dcd4Serapheim Dimitropoulos *   oldest flushed metaslab to distinguish which log space maps have become
95814dcd4Serapheim Dimitropoulos *   obsolete and which ones are still relevant. Finally it tells us which
96814dcd4Serapheim Dimitropoulos *   metaslabs have unflushed changes in a pool where this feature was just
97814dcd4Serapheim Dimitropoulos *   enabled, as we don't immediately add all of the pool's metaslabs but we
98814dcd4Serapheim Dimitropoulos *   add them over time as they go through metaslab_sync(). The reason that
99814dcd4Serapheim Dimitropoulos *   we do that is to ease these pools into the behavior of the flushing
100814dcd4Serapheim Dimitropoulos *   algorithm (described later on).
101814dcd4Serapheim Dimitropoulos *
 * - The per-spa field spa_sm_logs_by_txg can be thought of as the in-memory
103814dcd4Serapheim Dimitropoulos *   counterpart of the space map ZAP mentioned above. It's an AVL tree whose
104814dcd4Serapheim Dimitropoulos *   nodes represent the log space maps in the pool. This in-memory
105814dcd4Serapheim Dimitropoulos *   representation of log space maps in the pool sorts the log space maps by
106814dcd4Serapheim Dimitropoulos *   the TXG that they were created (which is also the TXG of their unflushed
107814dcd4Serapheim Dimitropoulos *   changes). It also contains the following extra information for each
108814dcd4Serapheim Dimitropoulos *   space map:
109814dcd4Serapheim Dimitropoulos *   [1] The number of metaslabs that were last flushed on that TXG. This is
110814dcd4Serapheim Dimitropoulos *       important because if that counter is zero and this is the oldest
111814dcd4Serapheim Dimitropoulos *       log then it means that it is also obsolete.
112814dcd4Serapheim Dimitropoulos *   [2] The number of blocks of that space map. This field is used by the
113814dcd4Serapheim Dimitropoulos *       block heuristic of our flushing algorithm (described later on).
114814dcd4Serapheim Dimitropoulos *       It represents how many blocks of metadata changes ZFS had to write
115814dcd4Serapheim Dimitropoulos *       to disk for that TXG.
116814dcd4Serapheim Dimitropoulos *
117814dcd4Serapheim Dimitropoulos * - The per-spa field spa_log_summary is a list of entries that summarizes
118814dcd4Serapheim Dimitropoulos *   the metaslab and block counts of all the nodes of the spa_sm_logs_by_txg
119814dcd4Serapheim Dimitropoulos *   AVL tree mentioned above. The reason this exists is that our flushing
120814dcd4Serapheim Dimitropoulos *   algorithm (described later) tries to estimate how many metaslabs to flush
121814dcd4Serapheim Dimitropoulos *   in each TXG by iterating over all the log space maps and looking at their
 *   block counts. Summarizing that information means that we don't have to
123814dcd4Serapheim Dimitropoulos *   iterate through each space map, minimizing the runtime overhead of the
124814dcd4Serapheim Dimitropoulos *   flushing algorithm which would be induced in syncing context. In terms of
125814dcd4Serapheim Dimitropoulos *   implementation the log summary is used as a queue:
126814dcd4Serapheim Dimitropoulos *   * we modify or pop entries from its head when we flush metaslabs
127814dcd4Serapheim Dimitropoulos *   * we modify or append entries to its tail when we sync changes.
128814dcd4Serapheim Dimitropoulos *
129814dcd4Serapheim Dimitropoulos * - Each metaslab has two new range trees that hold its unflushed changes,
130814dcd4Serapheim Dimitropoulos *   ms_unflushed_allocs and ms_unflushed_frees. These are always disjoint.
131814dcd4Serapheim Dimitropoulos *
132814dcd4Serapheim Dimitropoulos * == Flushing algorithm ==
133814dcd4Serapheim Dimitropoulos *
 * The decision of how many metaslabs to flush on a given TXG is guided by
135814dcd4Serapheim Dimitropoulos * two heuristics:
136814dcd4Serapheim Dimitropoulos *
137814dcd4Serapheim Dimitropoulos * [1] The memory heuristic -
138814dcd4Serapheim Dimitropoulos * We keep track of the memory used by the unflushed trees from all the
139814dcd4Serapheim Dimitropoulos * metaslabs [see sus_memused of spa_unflushed_stats] and we ensure that it
140814dcd4Serapheim Dimitropoulos * stays below a certain threshold which is determined by an arbitrary hard
141814dcd4Serapheim Dimitropoulos * limit and an arbitrary percentage of the system's memory [see
142814dcd4Serapheim Dimitropoulos * spa_log_exceeds_memlimit()]. When we see that the memory usage of the
143814dcd4Serapheim Dimitropoulos * unflushed changes are passing that threshold, we flush metaslabs, which
144814dcd4Serapheim Dimitropoulos * empties their unflushed range trees, reducing the memory used.
145814dcd4Serapheim Dimitropoulos *
146814dcd4Serapheim Dimitropoulos * [2] The block heuristic -
147814dcd4Serapheim Dimitropoulos * We try to keep the total number of blocks in the log space maps in check
148814dcd4Serapheim Dimitropoulos * so the log doesn't grow indefinitely and we don't induce a lot of overhead
149814dcd4Serapheim Dimitropoulos * when loading the pool. At the same time we don't want to flush a lot of
150814dcd4Serapheim Dimitropoulos * metaslabs too often as this would defeat the purpose of the log space map.
151814dcd4Serapheim Dimitropoulos * As a result we set a limit in the amount of blocks that we think it's
152814dcd4Serapheim Dimitropoulos * acceptable for the log space maps to have and try not to cross it.
153814dcd4Serapheim Dimitropoulos * [see sus_blocklimit from spa_unflushed_stats].
154814dcd4Serapheim Dimitropoulos *
 * In order to stay below the block limit every TXG we have to estimate how
 * many metaslabs we need to flush based on the current rate of incoming blocks
 * and our history of log space map blocks. The main idea here is to answer
 * the question of how many metaslabs we need to flush in order to get rid of
 * at least X log space map blocks. We can answer this question
 * by iterating backwards from the oldest log space map to the newest one
 * and looking at their metaslab and block counts. At this point the log summary
 * mentioned above comes in handy as it reduces the amount of things that we
 * have to iterate (even though it may reduce the preciseness of our estimates
 * due to its aggregation of data). So with that in mind, we project the
 * incoming rate of the current TXG into the future and attempt to approximate
 * how many metaslabs we would need to flush from now in order to avoid
 * exceeding our block limit at different points in the future (granted that we
 * would keep flushing the same number of metaslabs for every TXG). Then we take
 * the maximum number from all these estimates to be on the safe side. For the
 * exact implementation details of the algorithm refer to
 * spa_estimate_metaslabs_to_flush().
172814dcd4Serapheim Dimitropoulos */
173814dcd4Serapheim Dimitropoulos
/*
 * This is used as the block size for the space maps used for the
 * log space map feature. These space maps benefit from a bigger
 * block size as we expect to be writing a lot of data to them at
 * once.
 *
 * The default is 1 << 17 (131072).
 */
unsigned long zfs_log_sm_blksz = 1ULL << 17;
181814dcd4Serapheim Dimitropoulos
/*
 * Percentage of the overall system's memory that ZFS allows to be
 * used for unflushed changes (e.g. the sum of size of all the nodes
 * in the unflushed trees).
 *
 * Note that this value is calculated over 1000000 for finer granularity
 * (thus the _ppm suffix; reads as "parts per million"). As an example,
 * the default of 1000 allows 0.1% of memory to be used.
 */
unsigned long zfs_unflushed_max_mem_ppm = 1000;
192814dcd4Serapheim Dimitropoulos
/*
 * Specific hard-limit in memory that ZFS allows to be used for
 * unflushed changes.
 *
 * The default is 1 << 30 (1073741824).
 */
unsigned long zfs_unflushed_max_mem_amt = 1ULL << 30;
198814dcd4Serapheim Dimitropoulos
/*
 * The following tunable determines the number of blocks that can be used for
 * the log space maps. It is expressed as a percentage of the total number of
 * metaslabs in the pool (i.e. the default of 400 means that the number of log
 * blocks is capped at 4 times the number of metaslabs).
 *
 * This value exists to tune our flushing algorithm, with higher values
 * flushing metaslabs less often (doing less I/Os) per TXG versus lower values
 * flushing metaslabs more aggressively with the upside of saving overheads
 * when loading the pool. Another factor in this tradeoff is that flushing
 * less often can potentially lead to better utilization of the metaslab space
 * map's block size as we accumulate more changes per flush.
 *
 * This tunable indirectly controls the flush rate (metaslabs flushed per
 * txg), which is why expressing it as a percentage of the number of
 * metaslabs in the pool makes sense here.
 *
 * As a rule of thumb we default this tunable to 400% based on the following:
 *
 * 1] Assuming a constant flush rate and a constant incoming rate of log blocks
 *    it is reasonable to expect that the amount of obsolete entries changes
 *    linearly from txg to txg (e.g. the oldest log should have the most
 *    obsolete entries, and the most recent one the least). With this we could
 *    say that, at any given time, about half of the entries in the whole space
 *    map log are obsolete. Thus for every two entries for a metaslab in the
 *    log space map, only one of them is valid and actually makes it to the
 *    metaslab's space map.
 *    [factor of 2]
 * 2] Each entry in the log space map is guaranteed to be two words while
 *    entries in metaslab space maps are generally single-word.
 *    [an extra factor of 2 - 400% overall]
 * 3] Even if [1] and [2] are slightly less than 2 each, we haven't taken into
 *    account any consolidation of segments from the log space map to the
 *    unflushed range trees nor their history (e.g. a segment being allocated,
 *    then freed, then allocated again means 3 log space map entries but 0
 *    metaslab space map entries). Depending on the workload, we've seen ~1.8
 *    non-obsolete log space map entries per metaslab entry, for a total of
 *    ~600%. Since most of these estimates though are workload dependent, we
 *    default on 400% to be conservative.
 *
 *    Thus we could say that even in the worst
 *    case of [1] and [2], the factor should end up being 4.
 *
 * That said, regardless of the number of metaslabs in the pool we need to
 * provide upper and lower bounds for the log block limit.
 * [see zfs_unflushed_log_block_{min,max}]
 */
unsigned long zfs_unflushed_log_block_pct = 400;
247814dcd4Serapheim Dimitropoulos
/*
 * If the number of metaslabs is small and our incoming rate is high, we could
 * get into a situation where we are flushing all our metaslabs every TXG.
 * Thus we always allow at least this many log blocks (lower bound of the log
 * block limit).
 */
unsigned long zfs_unflushed_log_block_min = 1000;
254814dcd4Serapheim Dimitropoulos
/*
 * If the log becomes too big, the import time of the pool can take a hit in
 * terms of performance. Thus we have a hard limit on the size of the log in
 * terms of blocks (upper bound of the log block limit).
 *
 * The default is 1 << 18 (262144) blocks.
 */
unsigned long zfs_unflushed_log_block_max = (1ULL << 18);
261814dcd4Serapheim Dimitropoulos
/*
 * Max # of rows allowed for the log_summary. The tradeoff here is accuracy and
 * stability of the flushing algorithm (longer summary) vs its runtime overhead
 * (smaller summary is faster to traverse).
 */
unsigned long zfs_max_logsm_summary_length = 10;
268814dcd4Serapheim Dimitropoulos
/*
 * Tunable that sets the lower bound on the metaslabs to flush every TXG.
 *
 * Setting this to 0 has no effect since if the pool is idle we won't even be
 * creating log space maps and therefore we won't be flushing. On the other
 * hand if the pool has any incoming workload our block heuristic will start
 * flushing metaslabs anyway.
 *
 * The point of this tunable is to be used in extreme cases where we really
 * want to flush more metaslabs than our adaptable heuristic plans to flush.
 */
unsigned long zfs_min_metaslabs_to_flush = 1;
281814dcd4Serapheim Dimitropoulos
/*
 * Tunable that specifies how far in the past we want to look when trying to
 * estimate the incoming log blocks for the current TXG.
 *
 * Setting this too high may not only increase runtime but also minimize the
 * effect of the incoming rates from the most recent TXGs as we take the
 * average over all the blocks that we walk
 * [see spa_estimate_incoming_log_blocks].
 */
unsigned long zfs_max_log_walking = 5;
292814dcd4Serapheim Dimitropoulos
/*
 * This tunable exists solely for testing purposes. It ensures that the log
 * spacemaps are not flushed and destroyed during export in order for the
 * relevant log spacemap import code paths to be tested (effectively simulating
 * a crash).
 *
 * Disabled (0) by default.
 */
int zfs_keep_log_spacemaps_at_export = 0;
300814dcd4Serapheim Dimitropoulos
301814dcd4Serapheim Dimitropoulosstatic uint64_t
302814dcd4Serapheim Dimitropoulosspa_estimate_incoming_log_blocks(spa_t *spa)
303814dcd4Serapheim Dimitropoulos{
304814dcd4Serapheim Dimitropoulos	ASSERT3U(spa_sync_pass(spa), ==, 1);
305814dcd4Serapheim Dimitropoulos	uint64_t steps = 0, sum = 0;
306814dcd4Serapheim Dimitropoulos
307814dcd4Serapheim Dimitropoulos	for (spa_log_sm_t *sls = avl_last(&spa->spa_sm_logs_by_txg);
308814dcd4Serapheim Dimitropoulos	    sls != NULL && steps < zfs_max_log_walking;
309814dcd4Serapheim Dimitropoulos	    sls = AVL_PREV(&spa->spa_sm_logs_by_txg, sls)) {
310814dcd4Serapheim Dimitropoulos		if (sls->sls_txg == spa_syncing_txg(spa)) {
311814dcd4Serapheim Dimitropoulos			/*
312814dcd4Serapheim Dimitropoulos			 * skip the log created in this TXG as this would
313814dcd4Serapheim Dimitropoulos			 * make our estimations inaccurate.
314814dcd4Serapheim Dimitropoulos			 */
315814dcd4Serapheim Dimitropoulos			continue;
316814dcd4Serapheim Dimitropoulos		}
317814dcd4Serapheim Dimitropoulos		sum += sls->sls_nblocks;
318814dcd4Serapheim Dimitropoulos		steps++;
319814dcd4Serapheim Dimitropoulos	}
320814dcd4Serapheim Dimitropoulos	return ((steps > 0) ? DIV_ROUND_UP(sum, steps) : 0);
321814dcd4Serapheim Dimitropoulos}
322814dcd4Serapheim Dimitropoulos
323814dcd4Serapheim Dimitropoulosuint64_t
324814dcd4Serapheim Dimitropoulosspa_log_sm_blocklimit(spa_t *spa)
325814dcd4Serapheim Dimitropoulos{
326814dcd4Serapheim Dimitropoulos	return (spa->spa_unflushed_stats.sus_blocklimit);
327814dcd4Serapheim Dimitropoulos}
328814dcd4Serapheim Dimitropoulos
329814dcd4Serapheim Dimitropoulosvoid
330814dcd4Serapheim Dimitropoulosspa_log_sm_set_blocklimit(spa_t *spa)
331814dcd4Serapheim Dimitropoulos{
332814dcd4Serapheim Dimitropoulos	if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) {
333814dcd4Serapheim Dimitropoulos		ASSERT0(spa_log_sm_blocklimit(spa));
334814dcd4Serapheim Dimitropoulos		return;
335814dcd4Serapheim Dimitropoulos	}
336814dcd4Serapheim Dimitropoulos
337814dcd4Serapheim Dimitropoulos	uint64_t calculated_limit =
338814dcd4Serapheim Dimitropoulos	    (spa_total_metaslabs(spa) * zfs_unflushed_log_block_pct) / 100;
339814dcd4Serapheim Dimitropoulos	spa->spa_unflushed_stats.sus_blocklimit = MIN(MAX(calculated_limit,
340814dcd4Serapheim Dimitropoulos	    zfs_unflushed_log_block_min), zfs_unflushed_log_block_max);
341814dcd4Serapheim Dimitropoulos}
342814dcd4Serapheim Dimitropoulos
343814dcd4Serapheim Dimitropoulosuint64_t
344814dcd4Serapheim Dimitropoulosspa_log_sm_nblocks(spa_t *spa)
345814dcd4Serapheim Dimitropoulos{
346814dcd4Serapheim Dimitropoulos	return (spa->spa_unflushed_stats.sus_nblocks);
347814dcd4Serapheim Dimitropoulos}
348814dcd4Serapheim Dimitropoulos
349814dcd4Serapheim Dimitropoulos/*
350814dcd4Serapheim Dimitropoulos * Ensure that the in-memory log space map structures and the summary
351814dcd4Serapheim Dimitropoulos * have the same block and metaslab counts.
352814dcd4Serapheim Dimitropoulos */
353814dcd4Serapheim Dimitropoulosstatic void
354814dcd4Serapheim Dimitropoulosspa_log_summary_verify_counts(spa_t *spa)
355814dcd4Serapheim Dimitropoulos{
356814dcd4Serapheim Dimitropoulos	ASSERT(spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP));
357814dcd4Serapheim Dimitropoulos
358814dcd4Serapheim Dimitropoulos	if ((zfs_flags & ZFS_DEBUG_LOG_SPACEMAP) == 0)
359814dcd4Serapheim Dimitropoulos		return;
360814dcd4Serapheim Dimitropoulos
361814dcd4Serapheim Dimitropoulos	uint64_t ms_in_avl = avl_numnodes(&spa->spa_metaslabs_by_flushed);
362814dcd4Serapheim Dimitropoulos
363814dcd4Serapheim Dimitropoulos	uint64_t ms_in_summary = 0, blk_in_summary = 0;
364814dcd4Serapheim Dimitropoulos	for (log_summary_entry_t *e = list_head(&spa->spa_log_summary);
365814dcd4Serapheim Dimitropoulos	    e; e = list_next(&spa->spa_log_summary, e)) {
366814dcd4Serapheim Dimitropoulos		ms_in_summary += e->lse_mscount;
367814dcd4Serapheim Dimitropoulos		blk_in_summary += e->lse_blkcount;
368814dcd4Serapheim Dimitropoulos	}
369814dcd4Serapheim Dimitropoulos
370814dcd4Serapheim Dimitropoulos	uint64_t ms_in_logs = 0, blk_in_logs = 0;
371814dcd4Serapheim Dimitropoulos	for (spa_log_sm_t *sls = avl_first(&spa->spa_sm_logs_by_txg);
372814dcd4Serapheim Dimitropoulos	    sls; sls = AVL_NEXT(&spa->spa_sm_logs_by_txg, sls)) {
373814dcd4Serapheim Dimitropoulos		ms_in_logs += sls->sls_mscount;
374814dcd4Serapheim Dimitropoulos		blk_in_logs += sls->sls_nblocks;
375814dcd4Serapheim Dimitropoulos	}
376814dcd4Serapheim Dimitropoulos
377814dcd4Serapheim Dimitropoulos	VERIFY3U(ms_in_logs, ==, ms_in_summary);
378814dcd4Serapheim Dimitropoulos	VERIFY3U(ms_in_logs, ==, ms_in_avl);
379814dcd4Serapheim Dimitropoulos	VERIFY3U(blk_in_logs, ==, blk_in_summary);
380814dcd4Serapheim Dimitropoulos	VERIFY3U(blk_in_logs, ==, spa_log_sm_nblocks(spa));
381814dcd4Serapheim Dimitropoulos}
382814dcd4Serapheim Dimitropoulos
383814dcd4Serapheim Dimitropoulosstatic boolean_t
384814dcd4Serapheim Dimitropoulossummary_entry_is_full(spa_t *spa, log_summary_entry_t *e)
385814dcd4Serapheim Dimitropoulos{
386814dcd4Serapheim Dimitropoulos	uint64_t blocks_per_row = MAX(1,
387814dcd4Serapheim Dimitropoulos	    DIV_ROUND_UP(spa_log_sm_blocklimit(spa),
388814dcd4Serapheim Dimitropoulos	    zfs_max_logsm_summary_length));
389814dcd4Serapheim Dimitropoulos
390814dcd4Serapheim Dimitropoulos	return (blocks_per_row <= e->lse_blkcount);
391814dcd4Serapheim Dimitropoulos}
392814dcd4Serapheim Dimitropoulos
393814dcd4Serapheim Dimitropoulos/*
394814dcd4Serapheim Dimitropoulos * Update the log summary information to reflect the fact that a metaslab
395814dcd4Serapheim Dimitropoulos * was flushed or destroyed (e.g due to device removal or pool export/destroy).
396814dcd4Serapheim Dimitropoulos *
397814dcd4Serapheim Dimitropoulos * We typically flush the oldest flushed metaslab so the first (and oldest)
398814dcd4Serapheim Dimitropoulos * entry of the summary is updated. However if that metaslab is getting loaded
399814dcd4Serapheim Dimitropoulos * we may flush the second oldest one which may be part of an entry later in
400814dcd4Serapheim Dimitropoulos * the summary. Moreover, if we call into this function from metaslab_fini()
401814dcd4Serapheim Dimitropoulos * the metaslabs probably won't be ordered by ms_unflushed_txg. Thus we ask
402814dcd4Serapheim Dimitropoulos * for a txg as an argument so we can locate the appropriate summary entry for
403814dcd4Serapheim Dimitropoulos * the metaslab.
404814dcd4Serapheim Dimitropoulos */
405814dcd4Serapheim Dimitropoulosvoid
406814dcd4Serapheim Dimitropoulosspa_log_summary_decrement_mscount(spa_t *spa, uint64_t txg)
407814dcd4Serapheim Dimitropoulos{
408814dcd4Serapheim Dimitropoulos	/*
409