xref: /illumos-gate/usr/src/uts/common/fs/zfs/spa_log_spacemap.c (revision 814dcd43c3de9925fd6226c256e4d4327841a0e1)
1*814dcd43SSerapheim Dimitropoulos /*
2*814dcd43SSerapheim Dimitropoulos  * CDDL HEADER START
3*814dcd43SSerapheim Dimitropoulos  *
4*814dcd43SSerapheim Dimitropoulos  * The contents of this file are subject to the terms of the
5*814dcd43SSerapheim Dimitropoulos  * Common Development and Distribution License (the "License").
6*814dcd43SSerapheim Dimitropoulos  * You may not use this file except in compliance with the License.
7*814dcd43SSerapheim Dimitropoulos  *
8*814dcd43SSerapheim Dimitropoulos  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9*814dcd43SSerapheim Dimitropoulos  * or http://www.opensolaris.org/os/licensing.
10*814dcd43SSerapheim Dimitropoulos  * See the License for the specific language governing permissions
11*814dcd43SSerapheim Dimitropoulos  * and limitations under the License.
12*814dcd43SSerapheim Dimitropoulos  *
13*814dcd43SSerapheim Dimitropoulos  * When distributing Covered Code, include this CDDL HEADER in each
14*814dcd43SSerapheim Dimitropoulos  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15*814dcd43SSerapheim Dimitropoulos  * If applicable, add the following below this CDDL HEADER, with the
16*814dcd43SSerapheim Dimitropoulos  * fields enclosed by brackets "[]" replaced with your own identifying
17*814dcd43SSerapheim Dimitropoulos  * information: Portions Copyright [yyyy] [name of copyright owner]
18*814dcd43SSerapheim Dimitropoulos  *
19*814dcd43SSerapheim Dimitropoulos  * CDDL HEADER END
20*814dcd43SSerapheim Dimitropoulos  */
21*814dcd43SSerapheim Dimitropoulos 
22*814dcd43SSerapheim Dimitropoulos /*
23*814dcd43SSerapheim Dimitropoulos  * Copyright (c) 2018, 2019 by Delphix. All rights reserved.
24*814dcd43SSerapheim Dimitropoulos  */
25*814dcd43SSerapheim Dimitropoulos 
26*814dcd43SSerapheim Dimitropoulos #include <sys/dmu_objset.h>
27*814dcd43SSerapheim Dimitropoulos #include <sys/metaslab.h>
28*814dcd43SSerapheim Dimitropoulos #include <sys/metaslab_impl.h>
29*814dcd43SSerapheim Dimitropoulos #include <sys/spa.h>
30*814dcd43SSerapheim Dimitropoulos #include <sys/spa_impl.h>
31*814dcd43SSerapheim Dimitropoulos #include <sys/spa_log_spacemap.h>
32*814dcd43SSerapheim Dimitropoulos #include <sys/vdev_impl.h>
33*814dcd43SSerapheim Dimitropoulos #include <sys/zap.h>
34*814dcd43SSerapheim Dimitropoulos 
35*814dcd43SSerapheim Dimitropoulos /*
36*814dcd43SSerapheim Dimitropoulos  * Log Space Maps
37*814dcd43SSerapheim Dimitropoulos  *
38*814dcd43SSerapheim Dimitropoulos  * Log space maps are an optimization in ZFS metadata allocations for pools
39*814dcd43SSerapheim Dimitropoulos  * whose workloads are primarily random-writes. Random-write workloads are also
40*814dcd43SSerapheim Dimitropoulos  * typically random-free, meaning that they are freeing from locations scattered
41*814dcd43SSerapheim Dimitropoulos  * throughout the pool. This means that each TXG we will have to append some
42*814dcd43SSerapheim Dimitropoulos  * FREE records to almost every metaslab. With log space maps, we hold their
43*814dcd43SSerapheim Dimitropoulos  * changes in memory and log them altogether in one pool-wide space map on-disk
44*814dcd43SSerapheim Dimitropoulos  * for persistence. As more blocks are accumulated in the log space maps and
45*814dcd43SSerapheim Dimitropoulos  * more unflushed changes are accounted in memory, we flush a selected group
46*814dcd43SSerapheim Dimitropoulos  * of metaslabs every TXG to relieve memory pressure and potential overheads
47*814dcd43SSerapheim Dimitropoulos  * when loading the pool. Flushing a metaslab to disk relieves memory as we
48*814dcd43SSerapheim Dimitropoulos  * flush any unflushed changes from memory to disk (i.e. the metaslab's space
49*814dcd43SSerapheim Dimitropoulos  * map) and saves import time by making old log space maps obsolete and
50*814dcd43SSerapheim Dimitropoulos  * eventually destroying them. [A log space map is said to be obsolete when all
51*814dcd43SSerapheim Dimitropoulos  * its entries have made it to their corresponding metaslab space maps].
52*814dcd43SSerapheim Dimitropoulos  *
53*814dcd43SSerapheim Dimitropoulos  * == On disk data structures used ==
54*814dcd43SSerapheim Dimitropoulos  *
55*814dcd43SSerapheim Dimitropoulos  * - The pool has a new feature flag and a new entry in the MOS. The feature
56*814dcd43SSerapheim Dimitropoulos  *   is activated when we create the first log space map and remains active
57*814dcd43SSerapheim Dimitropoulos  *   for the lifetime of the pool. The new entry in the MOS Directory [refer
58*814dcd43SSerapheim Dimitropoulos  *   to DMU_POOL_LOG_SPACEMAP_ZAP] is populated with a ZAP whose key-value
59*814dcd43SSerapheim Dimitropoulos  *   pairs are of the form <key: txg, value: log space map object for that txg>.
60*814dcd43SSerapheim Dimitropoulos  *   This entry is our on-disk reference of the log space maps that exist in
61*814dcd43SSerapheim Dimitropoulos  *   the pool for each TXG and it is used during import to load all the
62*814dcd43SSerapheim Dimitropoulos  *   metaslab unflushed changes in memory. To see how this structure is first
63*814dcd43SSerapheim Dimitropoulos  *   created and later populated refer to spa_generate_syncing_log_sm(). To see
64*814dcd43SSerapheim Dimitropoulos  *   how it is used during import time refer to spa_ld_log_sm_metadata().
65*814dcd43SSerapheim Dimitropoulos  *
66*814dcd43SSerapheim Dimitropoulos  * - Each vdev has a new entry in its vdev_top_zap (see field
67*814dcd43SSerapheim Dimitropoulos  *   VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS) which holds the msp_unflushed_txg of
68*814dcd43SSerapheim Dimitropoulos  *   each metaslab in this vdev. This field is the on-disk counterpart of the
69*814dcd43SSerapheim Dimitropoulos  *   in-memory field ms_unflushed_txg which tells us from which TXG and onwards
70*814dcd43SSerapheim Dimitropoulos  *   the metaslab hasn't had its changes flushed. During import, we use this
71*814dcd43SSerapheim Dimitropoulos  *   to ignore any entries in the space map log that are for this metaslab but
72*814dcd43SSerapheim Dimitropoulos  *   from a TXG before msp_unflushed_txg. At that point, we also populate its
73*814dcd43SSerapheim Dimitropoulos  *   in-memory counterpart and from there both fields are updated every time
74*814dcd43SSerapheim Dimitropoulos  *   we flush that metaslab.
75*814dcd43SSerapheim Dimitropoulos  *
76*814dcd43SSerapheim Dimitropoulos  * - A space map is created every TXG and, during that TXG, it is used to log
77*814dcd43SSerapheim Dimitropoulos  *   all incoming changes (the log space map). When created, the log space map
78*814dcd43SSerapheim Dimitropoulos  *   is referenced in memory by spa_syncing_log_sm and its object ID is inserted
79*814dcd43SSerapheim Dimitropoulos  *   to the space map ZAP mentioned above. The log space map is closed at the
80*814dcd43SSerapheim Dimitropoulos  *   end of the TXG and will be destroyed when it becomes fully obsolete. We
81*814dcd43SSerapheim Dimitropoulos  *   know when a log space map has become obsolete by looking at the oldest
82*814dcd43SSerapheim Dimitropoulos  *   (and smallest) ms_unflushed_txg in the pool. If the value of that is bigger
83*814dcd43SSerapheim Dimitropoulos  *   than the log space map's TXG, then it means that there is no metaslab who
84*814dcd43SSerapheim Dimitropoulos  *   doesn't have the changes from that log and we can therefore destroy it.
85*814dcd43SSerapheim Dimitropoulos  *   [see spa_cleanup_old_sm_logs()].
86*814dcd43SSerapheim Dimitropoulos  *
87*814dcd43SSerapheim Dimitropoulos  * == Important in-memory structures ==
88*814dcd43SSerapheim Dimitropoulos  *
89*814dcd43SSerapheim Dimitropoulos  * - The per-spa field spa_metaslabs_by_flushed sorts all the metaslabs in
90*814dcd43SSerapheim Dimitropoulos  *   the pool by their ms_unflushed_txg field. It is primarily used for three
91*814dcd43SSerapheim Dimitropoulos  *   reasons. First of all, it is used during flushing where we try to flush
92*814dcd43SSerapheim Dimitropoulos  *   metaslabs in-order from the oldest-flushed to the most recently flushed
93*814dcd43SSerapheim Dimitropoulos  *   every TXG. Secondly, it helps us to lookup the ms_unflushed_txg of the
94*814dcd43SSerapheim Dimitropoulos  *   oldest flushed metaslab to distinguish which log space maps have become
95*814dcd43SSerapheim Dimitropoulos  *   obsolete and which ones are still relevant. Finally it tells us which
96*814dcd43SSerapheim Dimitropoulos  *   metaslabs have unflushed changes in a pool where this feature was just
97*814dcd43SSerapheim Dimitropoulos  *   enabled, as we don't immediately add all of the pool's metaslabs but we
98*814dcd43SSerapheim Dimitropoulos  *   add them over time as they go through metaslab_sync(). The reason that
99*814dcd43SSerapheim Dimitropoulos  *   we do that is to ease these pools into the behavior of the flushing
100*814dcd43SSerapheim Dimitropoulos  *   algorithm (described later on).
101*814dcd43SSerapheim Dimitropoulos  *
102*814dcd43SSerapheim Dimitropoulos  * - The per-spa field spa_sm_logs_by_txg can be thought of as the in-memory
103*814dcd43SSerapheim Dimitropoulos  *   counterpart of the space map ZAP mentioned above. It's an AVL tree whose
104*814dcd43SSerapheim Dimitropoulos  *   nodes represent the log space maps in the pool. This in-memory
105*814dcd43SSerapheim Dimitropoulos  *   representation of log space maps in the pool sorts the log space maps by
106*814dcd43SSerapheim Dimitropoulos  *   the TXG that they were created (which is also the TXG of their unflushed
107*814dcd43SSerapheim Dimitropoulos  *   changes). It also contains the following extra information for each
108*814dcd43SSerapheim Dimitropoulos  *   space map:
109*814dcd43SSerapheim Dimitropoulos  *   [1] The number of metaslabs that were last flushed on that TXG. This is
110*814dcd43SSerapheim Dimitropoulos  *       important because if that counter is zero and this is the oldest
111*814dcd43SSerapheim Dimitropoulos  *       log then it means that it is also obsolete.
112*814dcd43SSerapheim Dimitropoulos  *   [2] The number of blocks of that space map. This field is used by the
113*814dcd43SSerapheim Dimitropoulos  *       block heuristic of our flushing algorithm (described later on).
114*814dcd43SSerapheim Dimitropoulos  *       It represents how many blocks of metadata changes ZFS had to write
115*814dcd43SSerapheim Dimitropoulos  *       to disk for that TXG.
116*814dcd43SSerapheim Dimitropoulos  *
117*814dcd43SSerapheim Dimitropoulos  * - The per-spa field spa_log_summary is a list of entries that summarizes
118*814dcd43SSerapheim Dimitropoulos  *   the metaslab and block counts of all the nodes of the spa_sm_logs_by_txg
119*814dcd43SSerapheim Dimitropoulos  *   AVL tree mentioned above. The reason this exists is that our flushing
120*814dcd43SSerapheim Dimitropoulos  *   algorithm (described later) tries to estimate how many metaslabs to flush
121*814dcd43SSerapheim Dimitropoulos  *   in each TXG by iterating over all the log space maps and looking at their
122*814dcd43SSerapheim Dimitropoulos  *   block counts. Summarizing that information means that we don't have to
123*814dcd43SSerapheim Dimitropoulos  *   iterate through each space map, minimizing the runtime overhead of the
124*814dcd43SSerapheim Dimitropoulos  *   flushing algorithm which would be induced in syncing context. In terms of
125*814dcd43SSerapheim Dimitropoulos  *   implementation the log summary is used as a queue:
126*814dcd43SSerapheim Dimitropoulos  *   * we modify or pop entries from its head when we flush metaslabs
127*814dcd43SSerapheim Dimitropoulos  *   * we modify or append entries to its tail when we sync changes.
128*814dcd43SSerapheim Dimitropoulos  *
129*814dcd43SSerapheim Dimitropoulos  * - Each metaslab has two new range trees that hold its unflushed changes,
130*814dcd43SSerapheim Dimitropoulos  *   ms_unflushed_allocs and ms_unflushed_frees. These are always disjoint.
131*814dcd43SSerapheim Dimitropoulos  *
132*814dcd43SSerapheim Dimitropoulos  * == Flushing algorithm ==
133*814dcd43SSerapheim Dimitropoulos  *
134*814dcd43SSerapheim Dimitropoulos  * The decision of how many metaslabs to flush on a given TXG is guided by
135*814dcd43SSerapheim Dimitropoulos  * two heuristics:
136*814dcd43SSerapheim Dimitropoulos  *
137*814dcd43SSerapheim Dimitropoulos  * [1] The memory heuristic -
138*814dcd43SSerapheim Dimitropoulos  * We keep track of the memory used by the unflushed trees from all the
139*814dcd43SSerapheim Dimitropoulos  * metaslabs [see sus_memused of spa_unflushed_stats] and we ensure that it
140*814dcd43SSerapheim Dimitropoulos  * stays below a certain threshold which is determined by an arbitrary hard
141*814dcd43SSerapheim Dimitropoulos  * limit and an arbitrary percentage of the system's memory [see
142*814dcd43SSerapheim Dimitropoulos  * spa_log_exceeds_memlimit()]. When we see that the memory usage of the
143*814dcd43SSerapheim Dimitropoulos  * unflushed changes are passing that threshold, we flush metaslabs, which
144*814dcd43SSerapheim Dimitropoulos  * empties their unflushed range trees, reducing the memory used.
145*814dcd43SSerapheim Dimitropoulos  *
146*814dcd43SSerapheim Dimitropoulos  * [2] The block heuristic -
147*814dcd43SSerapheim Dimitropoulos  * We try to keep the total number of blocks in the log space maps in check
148*814dcd43SSerapheim Dimitropoulos  * so the log doesn't grow indefinitely and we don't induce a lot of overhead
149*814dcd43SSerapheim Dimitropoulos  * when loading the pool. At the same time we don't want to flush a lot of
150*814dcd43SSerapheim Dimitropoulos  * metaslabs too often as this would defeat the purpose of the log space map.
151*814dcd43SSerapheim Dimitropoulos  * As a result we set a limit in the amount of blocks that we think it's
152*814dcd43SSerapheim Dimitropoulos  * acceptable for the log space maps to have and try not to cross it.
153*814dcd43SSerapheim Dimitropoulos  * [see sus_blocklimit from spa_unflushed_stats].
154*814dcd43SSerapheim Dimitropoulos  *
155*814dcd43SSerapheim Dimitropoulos  * In order to stay below the block limit every TXG we have to estimate how
156*814dcd43SSerapheim Dimitropoulos  * many metaslabs we need to flush based on the current rate of incoming blocks
157*814dcd43SSerapheim Dimitropoulos  * and our history of log space map blocks. The main idea here is to answer
158*814dcd43SSerapheim Dimitropoulos  * the question of how many metaslabs do we need to flush in order to get rid
159*814dcd43SSerapheim Dimitropoulos  * of at least an X amount of log space map blocks. We can answer this question
160*814dcd43SSerapheim Dimitropoulos  * by iterating backwards from the oldest log space map to the newest one
161*814dcd43SSerapheim Dimitropoulos  * and looking at their metaslab and block counts. At this point the log summary
162*814dcd43SSerapheim Dimitropoulos  * mentioned above comes handy as it reduces the amount of things that we have
163*814dcd43SSerapheim Dimitropoulos  * to iterate (even though it may reduce the preciseness of our estimates due
164*814dcd43SSerapheim Dimitropoulos  * to its aggregation of data). So with that in mind, we project the incoming
165*814dcd43SSerapheim Dimitropoulos  * rate of the current TXG into the future and attempt to approximate how many
166*814dcd43SSerapheim Dimitropoulos  * metaslabs would we need to flush from now in order to avoid exceeding our
167*814dcd43SSerapheim Dimitropoulos  * block limit in different points in the future (granted that we would keep
168*814dcd43SSerapheim Dimitropoulos  * flushing the same number of metaslabs for every TXG). Then we take the
169*814dcd43SSerapheim Dimitropoulos  * maximum number from all these estimates to be on the safe side. For the
170*814dcd43SSerapheim Dimitropoulos  * exact implementation details of the algorithm refer to
171*814dcd43SSerapheim Dimitropoulos  * spa_estimate_metaslabs_to_flush.
172*814dcd43SSerapheim Dimitropoulos  */
173*814dcd43SSerapheim Dimitropoulos 
174*814dcd43SSerapheim Dimitropoulos /*
175*814dcd43SSerapheim Dimitropoulos  * This is used as the block size for the space maps used for the
176*814dcd43SSerapheim Dimitropoulos  * log space map feature. These space maps benefit from a bigger
177*814dcd43SSerapheim Dimitropoulos  * block size as we expect to be writing a lot of data to them at
178*814dcd43SSerapheim Dimitropoulos  * once. The default is 1 << 17 (131072).
179*814dcd43SSerapheim Dimitropoulos  */
180*814dcd43SSerapheim Dimitropoulos unsigned long zfs_log_sm_blksz = 1ULL << 17;
181*814dcd43SSerapheim Dimitropoulos 
182*814dcd43SSerapheim Dimitropoulos /*
183*814dcd43SSerapheim Dimitropoulos  * Portion of the overall system's memory that ZFS allows to be
184*814dcd43SSerapheim Dimitropoulos  * used for unflushed changes (i.e. the sum of the sizes of all the
185*814dcd43SSerapheim Dimitropoulos  * nodes in the unflushed trees).
186*814dcd43SSerapheim Dimitropoulos  *
187*814dcd43SSerapheim Dimitropoulos  * Note that this value is calculated over 1000000 for finer granularity
188*814dcd43SSerapheim Dimitropoulos  * (thus the _ppm suffix; reads as "parts per million"). As an example,
189*814dcd43SSerapheim Dimitropoulos  * the default of 1000 allows 0.1% of memory to be used.
190*814dcd43SSerapheim Dimitropoulos  */
191*814dcd43SSerapheim Dimitropoulos unsigned long zfs_unflushed_max_mem_ppm = 1000;
192*814dcd43SSerapheim Dimitropoulos 
193*814dcd43SSerapheim Dimitropoulos /*
194*814dcd43SSerapheim Dimitropoulos  * Specific hard-limit in memory that ZFS allows to be used for
195*814dcd43SSerapheim Dimitropoulos  * unflushed changes. The default is 1 << 30 (1073741824).
196*814dcd43SSerapheim Dimitropoulos  */
197*814dcd43SSerapheim Dimitropoulos unsigned long zfs_unflushed_max_mem_amt = 1ULL << 30;
198*814dcd43SSerapheim Dimitropoulos 
199*814dcd43SSerapheim Dimitropoulos /*
200*814dcd43SSerapheim Dimitropoulos  * The following tunable determines the number of blocks that can be used for
201*814dcd43SSerapheim Dimitropoulos  * the log space maps. It is expressed as a percentage of the total number of
202*814dcd43SSerapheim Dimitropoulos  * metaslabs in the pool (i.e. the default of 400 means that the number of log
203*814dcd43SSerapheim Dimitropoulos  * blocks is capped at 4 times the number of metaslabs).
204*814dcd43SSerapheim Dimitropoulos  *
205*814dcd43SSerapheim Dimitropoulos  * This value exists to tune our flushing algorithm, with higher values
206*814dcd43SSerapheim Dimitropoulos  * flushing metaslabs less often (doing fewer I/Os) per TXG versus lower values
207*814dcd43SSerapheim Dimitropoulos  * flushing metaslabs more aggressively with the upside of saving overheads
208*814dcd43SSerapheim Dimitropoulos  * when loading the pool. Another factor in this tradeoff is that flushing
209*814dcd43SSerapheim Dimitropoulos  * less often can potentially lead to better utilization of the metaslab space
210*814dcd43SSerapheim Dimitropoulos  * map's block size as we accumulate more changes per flush.
211*814dcd43SSerapheim Dimitropoulos  *
212*814dcd43SSerapheim Dimitropoulos  * This tunable indirectly controls the flush rate (metaslabs flushed per
213*814dcd43SSerapheim Dimitropoulos  * txg), which is why expressing it as a percentage of the number of
214*814dcd43SSerapheim Dimitropoulos  * metaslabs in the pool makes sense here.
215*814dcd43SSerapheim Dimitropoulos  *
216*814dcd43SSerapheim Dimitropoulos  * As a rule of thumb we default this tunable to 400% based on the following:
217*814dcd43SSerapheim Dimitropoulos  *
218*814dcd43SSerapheim Dimitropoulos  * 1] Assuming a constant flush rate and a constant incoming rate of log blocks
219*814dcd43SSerapheim Dimitropoulos  *    it is reasonable to expect that the amount of obsolete entries changes
220*814dcd43SSerapheim Dimitropoulos  *    linearly from txg to txg (e.g. the oldest log should have the most
221*814dcd43SSerapheim Dimitropoulos  *    obsolete entries, and the most recent one the least). With this we could
222*814dcd43SSerapheim Dimitropoulos  *    say that, at any given time, about half of the entries in the whole space
223*814dcd43SSerapheim Dimitropoulos  *    map log are obsolete. Thus for every two entries for a metaslab in the
224*814dcd43SSerapheim Dimitropoulos  *    log space map, only one of them is valid and actually makes it to the
225*814dcd43SSerapheim Dimitropoulos  *    metaslab's space map.
226*814dcd43SSerapheim Dimitropoulos  *    [factor of 2]
227*814dcd43SSerapheim Dimitropoulos  * 2] Each entry in the log space map is guaranteed to be two words while
228*814dcd43SSerapheim Dimitropoulos  *    entries in metaslab space maps are generally single-word.
229*814dcd43SSerapheim Dimitropoulos  *    [an extra factor of 2 - 400% overall]
230*814dcd43SSerapheim Dimitropoulos  * 3] Even if [1] and [2] are slightly less than 2 each, we haven't taken into
231*814dcd43SSerapheim Dimitropoulos  *    account any consolidation of segments from the log space map to the
232*814dcd43SSerapheim Dimitropoulos  *    unflushed range trees nor their history (e.g. a segment being allocated,
233*814dcd43SSerapheim Dimitropoulos  *    then freed, then allocated again means 3 log space map entries but 0
234*814dcd43SSerapheim Dimitropoulos  *    metaslab space map entries). Depending on the workload, we've seen ~1.8
235*814dcd43SSerapheim Dimitropoulos  *    non-obsolete log space map entries per metaslab entry, for a total of
236*814dcd43SSerapheim Dimitropoulos  *    ~600%. Since most of these estimates though are workload dependent, we
237*814dcd43SSerapheim Dimitropoulos  *    default on 400% to be conservative.
238*814dcd43SSerapheim Dimitropoulos  *
239*814dcd43SSerapheim Dimitropoulos  *    Thus we could say that even in the worst
240*814dcd43SSerapheim Dimitropoulos  *    case of [1] and [2], the factor should end up being 4.
241*814dcd43SSerapheim Dimitropoulos  *
242*814dcd43SSerapheim Dimitropoulos  * That said, regardless of the number of metaslabs in the pool we need to
243*814dcd43SSerapheim Dimitropoulos  * provide upper and lower bounds for the log block limit.
244*814dcd43SSerapheim Dimitropoulos  * [see zfs_unflushed_log_block_{min,max} and spa_log_sm_set_blocklimit()]
245*814dcd43SSerapheim Dimitropoulos  */
246*814dcd43SSerapheim Dimitropoulos unsigned long zfs_unflushed_log_block_pct = 400;
247*814dcd43SSerapheim Dimitropoulos 
248*814dcd43SSerapheim Dimitropoulos /*
249*814dcd43SSerapheim Dimitropoulos  * If the number of metaslabs is small and our incoming rate is high, we could
250*814dcd43SSerapheim Dimitropoulos  * get into a situation where we are flushing all our metaslabs every TXG. Thus
251*814dcd43SSerapheim Dimitropoulos  * we allow at least this many log blocks [see spa_log_sm_set_blocklimit()].
252*814dcd43SSerapheim Dimitropoulos  */
253*814dcd43SSerapheim Dimitropoulos unsigned long zfs_unflushed_log_block_min = 1000;
254*814dcd43SSerapheim Dimitropoulos 
255*814dcd43SSerapheim Dimitropoulos /*
256*814dcd43SSerapheim Dimitropoulos  * If the log becomes too big, the import time of the pool can take a hit in
257*814dcd43SSerapheim Dimitropoulos  * terms of performance. Thus we have a hard limit on the size of the log in
258*814dcd43SSerapheim Dimitropoulos  * terms of blocks (default 1 << 18 = 262144; see spa_log_sm_set_blocklimit()).
259*814dcd43SSerapheim Dimitropoulos  */
260*814dcd43SSerapheim Dimitropoulos unsigned long zfs_unflushed_log_block_max = (1ULL << 18);
261*814dcd43SSerapheim Dimitropoulos 
262*814dcd43SSerapheim Dimitropoulos /*
263*814dcd43SSerapheim Dimitropoulos  * Max # of rows allowed for the log_summary. The tradeoff here is accuracy and
264*814dcd43SSerapheim Dimitropoulos  * stability of the flushing algorithm (longer summary) vs its runtime overhead
265*814dcd43SSerapheim Dimitropoulos  * (smaller summary is faster to traverse) [see summary_entry_is_full()].
266*814dcd43SSerapheim Dimitropoulos  */
267*814dcd43SSerapheim Dimitropoulos unsigned long zfs_max_logsm_summary_length = 10;
268*814dcd43SSerapheim Dimitropoulos 
269*814dcd43SSerapheim Dimitropoulos /*
270*814dcd43SSerapheim Dimitropoulos  * Tunable that sets the lower bound on the number of metaslabs to flush every
271*814dcd43SSerapheim Dimitropoulos  * TXG.
272*814dcd43SSerapheim Dimitropoulos  * Setting this to 0 has no effect since if the pool is idle we won't even be
273*814dcd43SSerapheim Dimitropoulos  * creating log space maps and therefore we won't be flushing. On the other
274*814dcd43SSerapheim Dimitropoulos  * hand if the pool has any incoming workload our block heuristic will start
275*814dcd43SSerapheim Dimitropoulos  * flushing metaslabs anyway.
276*814dcd43SSerapheim Dimitropoulos  *
277*814dcd43SSerapheim Dimitropoulos  * The point of this tunable is to be used in extreme cases where we really
278*814dcd43SSerapheim Dimitropoulos  * want to flush more metaslabs than our adaptable heuristic plans to flush.
279*814dcd43SSerapheim Dimitropoulos  */
280*814dcd43SSerapheim Dimitropoulos unsigned long zfs_min_metaslabs_to_flush = 1;
281*814dcd43SSerapheim Dimitropoulos 
282*814dcd43SSerapheim Dimitropoulos /*
283*814dcd43SSerapheim Dimitropoulos  * Tunable that specifies how far into the past we want to look (in number of
284*814dcd43SSerapheim Dimitropoulos  * log space maps) when estimating the incoming log blocks for the current TXG.
285*814dcd43SSerapheim Dimitropoulos  *
286*814dcd43SSerapheim Dimitropoulos  * Setting this too high may not only increase runtime but also minimize the
287*814dcd43SSerapheim Dimitropoulos  * effect of the incoming rates from the most recent TXGs as we take the
288*814dcd43SSerapheim Dimitropoulos  * average over all the blocks that we walk
289*814dcd43SSerapheim Dimitropoulos  * [see spa_estimate_incoming_log_blocks()].
290*814dcd43SSerapheim Dimitropoulos  */
291*814dcd43SSerapheim Dimitropoulos unsigned long zfs_max_log_walking = 5;
292*814dcd43SSerapheim Dimitropoulos 
293*814dcd43SSerapheim Dimitropoulos /*
294*814dcd43SSerapheim Dimitropoulos  * This tunable exists solely for testing purposes. It ensures that the log
295*814dcd43SSerapheim Dimitropoulos  * space maps are not flushed and destroyed during export in order for the
296*814dcd43SSerapheim Dimitropoulos  * relevant log space map import code paths to be tested (effectively
297*814dcd43SSerapheim Dimitropoulos  * simulating a crash).
298*814dcd43SSerapheim Dimitropoulos  */
299*814dcd43SSerapheim Dimitropoulos int zfs_keep_log_spacemaps_at_export = 0;
300*814dcd43SSerapheim Dimitropoulos 
301*814dcd43SSerapheim Dimitropoulos static uint64_t
302*814dcd43SSerapheim Dimitropoulos spa_estimate_incoming_log_blocks(spa_t *spa)
303*814dcd43SSerapheim Dimitropoulos {
304*814dcd43SSerapheim Dimitropoulos 	ASSERT3U(spa_sync_pass(spa), ==, 1);
305*814dcd43SSerapheim Dimitropoulos 	uint64_t steps = 0, sum = 0;
306*814dcd43SSerapheim Dimitropoulos 
307*814dcd43SSerapheim Dimitropoulos 	for (spa_log_sm_t *sls = avl_last(&spa->spa_sm_logs_by_txg);
308*814dcd43SSerapheim Dimitropoulos 	    sls != NULL && steps < zfs_max_log_walking;
309*814dcd43SSerapheim Dimitropoulos 	    sls = AVL_PREV(&spa->spa_sm_logs_by_txg, sls)) {
310*814dcd43SSerapheim Dimitropoulos 		if (sls->sls_txg == spa_syncing_txg(spa)) {
311*814dcd43SSerapheim Dimitropoulos 			/*
312*814dcd43SSerapheim Dimitropoulos 			 * skip the log created in this TXG as this would
313*814dcd43SSerapheim Dimitropoulos 			 * make our estimations inaccurate.
314*814dcd43SSerapheim Dimitropoulos 			 */
315*814dcd43SSerapheim Dimitropoulos 			continue;
316*814dcd43SSerapheim Dimitropoulos 		}
317*814dcd43SSerapheim Dimitropoulos 		sum += sls->sls_nblocks;
318*814dcd43SSerapheim Dimitropoulos 		steps++;
319*814dcd43SSerapheim Dimitropoulos 	}
320*814dcd43SSerapheim Dimitropoulos 	return ((steps > 0) ? DIV_ROUND_UP(sum, steps) : 0);
321*814dcd43SSerapheim Dimitropoulos }
322*814dcd43SSerapheim Dimitropoulos 
323*814dcd43SSerapheim Dimitropoulos uint64_t
324*814dcd43SSerapheim Dimitropoulos spa_log_sm_blocklimit(spa_t *spa)
325*814dcd43SSerapheim Dimitropoulos {
326*814dcd43SSerapheim Dimitropoulos 	return (spa->spa_unflushed_stats.sus_blocklimit);
327*814dcd43SSerapheim Dimitropoulos }
328*814dcd43SSerapheim Dimitropoulos 
329*814dcd43SSerapheim Dimitropoulos void
330*814dcd43SSerapheim Dimitropoulos spa_log_sm_set_blocklimit(spa_t *spa)
331*814dcd43SSerapheim Dimitropoulos {
332*814dcd43SSerapheim Dimitropoulos 	if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) {
333*814dcd43SSerapheim Dimitropoulos 		ASSERT0(spa_log_sm_blocklimit(spa));
334*814dcd43SSerapheim Dimitropoulos 		return;
335*814dcd43SSerapheim Dimitropoulos 	}
336*814dcd43SSerapheim Dimitropoulos 
337*814dcd43SSerapheim Dimitropoulos 	uint64_t calculated_limit =
338*814dcd43SSerapheim Dimitropoulos 	    (spa_total_metaslabs(spa) * zfs_unflushed_log_block_pct) / 100;
339*814dcd43SSerapheim Dimitropoulos 	spa->spa_unflushed_stats.sus_blocklimit = MIN(MAX(calculated_limit,
340*814dcd43SSerapheim Dimitropoulos 	    zfs_unflushed_log_block_min), zfs_unflushed_log_block_max);
341*814dcd43SSerapheim Dimitropoulos }
342*814dcd43SSerapheim Dimitropoulos 
343*814dcd43SSerapheim Dimitropoulos uint64_t
344*814dcd43SSerapheim Dimitropoulos spa_log_sm_nblocks(spa_t *spa)
345*814dcd43SSerapheim Dimitropoulos {
346*814dcd43SSerapheim Dimitropoulos 	return (spa->spa_unflushed_stats.sus_nblocks);
347*814dcd43SSerapheim Dimitropoulos }
348*814dcd43SSerapheim Dimitropoulos 
349*814dcd43SSerapheim Dimitropoulos /*
350*814dcd43SSerapheim Dimitropoulos  * Ensure that the in-memory log space map structures and the summary
351*814dcd43SSerapheim Dimitropoulos  * have the same block and metaslab counts.
352*814dcd43SSerapheim Dimitropoulos  */
353*814dcd43SSerapheim Dimitropoulos static void
354*814dcd43SSerapheim Dimitropoulos spa_log_summary_verify_counts(spa_t *spa)
355*814dcd43SSerapheim Dimitropoulos {
356*814dcd43SSerapheim Dimitropoulos 	ASSERT(spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP));
357*814dcd43SSerapheim Dimitropoulos 
358*814dcd43SSerapheim Dimitropoulos 	if ((zfs_flags & ZFS_DEBUG_LOG_SPACEMAP) == 0)
359*814dcd43SSerapheim Dimitropoulos 		return;
360*814dcd43SSerapheim Dimitropoulos 
361*814dcd43SSerapheim Dimitropoulos 	uint64_t ms_in_avl = avl_numnodes(&spa->spa_metaslabs_by_flushed);
362*814dcd43SSerapheim Dimitropoulos 
363*814dcd43SSerapheim Dimitropoulos 	uint64_t ms_in_summary = 0, blk_in_summary = 0;
364*814dcd43SSerapheim Dimitropoulos 	for (log_summary_entry_t *e = list_head(&spa->spa_log_summary);
365*814dcd43SSerapheim Dimitropoulos 	    e; e = list_next(&spa->spa_log_summary, e)) {
366*814dcd43SSerapheim Dimitropoulos 		ms_in_summary += e->lse_mscount;
367*814dcd43SSerapheim Dimitropoulos 		blk_in_summary += e->lse_blkcount;
368*814dcd43SSerapheim Dimitropoulos 	}
369*814dcd43SSerapheim Dimitropoulos 
370*814dcd43SSerapheim Dimitropoulos 	uint64_t ms_in_logs = 0, blk_in_logs = 0;
371*814dcd43SSerapheim Dimitropoulos 	for (spa_log_sm_t *sls = avl_first(&spa->spa_sm_logs_by_txg);
372*814dcd43SSerapheim Dimitropoulos 	    sls; sls = AVL_NEXT(&spa->spa_sm_logs_by_txg, sls)) {
373*814dcd43SSerapheim Dimitropoulos 		ms_in_logs += sls->sls_mscount;
374*814dcd43SSerapheim Dimitropoulos 		blk_in_logs += sls->sls_nblocks;
375*814dcd43SSerapheim Dimitropoulos 	}
376*814dcd43SSerapheim Dimitropoulos 
377*814dcd43SSerapheim Dimitropoulos 	VERIFY3U(ms_in_logs, ==, ms_in_summary);
378*814dcd43SSerapheim Dimitropoulos 	VERIFY3U(ms_in_logs, ==, ms_in_avl);
379*814dcd43SSerapheim Dimitropoulos 	VERIFY3U(blk_in_logs, ==, blk_in_summary);
380*814dcd43SSerapheim Dimitropoulos 	VERIFY3U(blk_in_logs, ==, spa_log_sm_nblocks(spa));
381*814dcd43SSerapheim Dimitropoulos }
382*814dcd43SSerapheim Dimitropoulos 
383*814dcd43SSerapheim Dimitropoulos static boolean_t
384*814dcd43SSerapheim Dimitropoulos summary_entry_is_full(spa_t *spa, log_summary_entry_t *e)
385*814dcd43SSerapheim Dimitropoulos {
386*814dcd43SSerapheim Dimitropoulos 	uint64_t blocks_per_row = MAX(1,
387*814dcd43SSerapheim Dimitropoulos 	    DIV_ROUND_UP(spa_log_sm_blocklimit(spa),
388*814dcd43SSerapheim Dimitropoulos 	    zfs_max_logsm_summary_length));
389*814dcd43SSerapheim Dimitropoulos 
390*814dcd43SSerapheim Dimitropoulos 	return (blocks_per_row <= e->lse_blkcount);
391*814dcd43SSerapheim Dimitropoulos }
392*814dcd43SSerapheim Dimitropoulos 
393*814dcd43SSerapheim Dimitropoulos /*
394*814dcd43SSerapheim Dimitropoulos  * Update the log summary information to reflect the fact that a metaslab
395*814dcd43SSerapheim Dimitropoulos  * was flushed or destroyed (e.g due to device removal or pool export/destroy).
396*814dcd43SSerapheim Dimitropoulos  *
397*814dcd43SSerapheim Dimitropoulos  * We typically flush the oldest flushed metaslab so the first (and oldest)
398*814dcd43SSerapheim Dimitropoulos  * entry of the summary is updated. However if that metaslab is getting loaded
399*814dcd43SSerapheim Dimitropoulos  * we may flush the second oldest one which may be part of an entry later in
400*814dcd43SSerapheim Dimitropoulos  * the summary. Moreover, if we call into this function from metaslab_fini()
401*814dcd43SSerapheim Dimitropoulos  * the metaslabs probably won't be ordered by ms_unflushed_txg. Thus we ask
402*814dcd43SSerapheim Dimitropoulos  * for a txg as an argument so we can locate the appropriate summary entry for
403*814dcd43SSerapheim Dimitropoulos  * the metaslab.
404*814dcd43SSerapheim Dimitropoulos  */
405*814dcd43SSerapheim Dimitropoulos void
406*814dcd43SSerapheim Dimitropoulos spa_log_summary_decrement_mscount(spa_t *spa, uint64_t txg)
407*814dcd43SSerapheim Dimitropoulos {
408*814dcd43SSerapheim Dimitropoulos 	/*
409*814dcd43SSerapheim Dimitropoulos 	 * We don't track summary data for read-only pools and this function
410*814dcd43SSerapheim Dimitropoulos 	 * can be called from metaslab_fini(). In that case return immediately.
411*814dcd43SSerapheim Dimitropoulos 	 */
412*814dcd43SSerapheim Dimitropoulos 	if (!spa_writeable(spa))
413*814dcd43SSerapheim Dimitropoulos 		return;
414*814dcd43SSerapheim Dimitropoulos 
415*814dcd43SSerapheim Dimitropoulos 	log_summary_entry_t *target = NULL;
416*814dcd43SSerapheim Dimitropoulos 	for (log_summary_entry_t *e = list_head(&spa->spa_log_summary);
417*814dcd43SSerapheim Dimitropoulos 	    e != NULL; e = list_next(&spa->spa_log_summary, e)) {
418*814dcd43SSerapheim Dimitropoulos 		if (e->lse_start > txg)
419*814dcd43SSerapheim Dimitropoulos 			break;
420*814dcd43SSerapheim Dimitropoulos 		target = e;
421*814dcd43SSerapheim Dimitropoulos 	}
422*814dcd43SSerapheim Dimitropoulos 
423*814dcd43SSerapheim Dimitropoulos 	if (target == NULL || target->lse_mscount == 0) {
424*814dcd43SSerapheim Dimitropoulos 		/*
425*814dcd43SSerapheim Dimitropoulos 		 * We didn't find a summary entry for this metaslab. We must be
426*814dcd43SSerapheim Dimitropoulos 		 * at the teardown of a spa_load() attempt that got an error
427*814dcd43SSerapheim Dimitropoulos 		 * while reading the log space maps.
428*814dcd43SSerapheim Dimitropoulos 		 */
429*814dcd43SSerapheim Dimitropoulos 		VERIFY3S(spa_load_state(spa), ==, SPA_LOAD_ERROR);
430*814dcd43SSerapheim Dimitropoulos 		return;
431*814dcd43SSerapheim Dimitropoulos 	}
432*814dcd43SSerapheim Dimitropoulos 
433*814dcd43SSerapheim Dimitropoulos 	target->lse_mscount--;
434*814dcd43SSerapheim Dimitropoulos }
435*814dcd43SSerapheim Dimitropoulos 
436*814dcd43SSerapheim Dimitropoulos /*
437*814dcd43SSerapheim Dimitropoulos  * Update the log summary information to reflect the fact that we destroyed
438*814dcd43SSerapheim Dimitropoulos  * old log space maps. Since we can only destroy the oldest log space maps,
439*814dcd43SSerapheim Dimitropoulos  * we decrement the block count of the oldest summary entry and potentially
440*814dcd43SSerapheim Dimitropoulos  * destroy it when that count hits 0.
441*814dcd43SSerapheim Dimitropoulos  *
442*814dcd43SSerapheim Dimitropoulos  * This function is called after a metaslab is flushed and typically that
443*814dcd43SSerapheim Dimitropoulos  * metaslab is the oldest flushed, which means that this function will
444*814dcd43SSerapheim Dimitropoulos  * typically decrement the block count of the first entry of the summary and
445*814dcd43SSerapheim Dimitropoulos  * potentially free it if the block count gets to zero (its metaslab count
446*814dcd43SSerapheim Dimitropoulos  * should be zero too at that point).
447*814dcd43SSerapheim Dimitropoulos  *
448*814dcd43SSerapheim Dimitropoulos  * There are certain scenarios though that don't work exactly like that so we
449*814dcd43SSerapheim Dimitropoulos  * need to account for them:
450*814dcd43SSerapheim Dimitropoulos  *
451*814dcd43SSerapheim Dimitropoulos  * Scenario [1]: It is possible that after we flushed the oldest flushed
452*814dcd43SSerapheim Dimitropoulos  * metaslab and we destroyed the oldest log space map, more recent logs had 0
453*814dcd43SSerapheim Dimitropoulos  * metaslabs pointing to them so we got rid of them too. This can happen due
454*814dcd43SSerapheim Dimitropoulos  * to metaslabs being destroyed through device removal, or because the oldest
455*814dcd43SSerapheim Dimitropoulos  * flushed metaslab was loading but we kept flushing more recently flushed
456*814dcd43SSerapheim Dimitropoulos  * metaslabs due to the memory pressure of unflushed changes. Because of that,
457*814dcd43SSerapheim Dimitropoulos  * we always iterate from the beginning of the summary and if blocks_gone is
458*814dcd43SSerapheim Dimitropoulos  * bigger than the block_count of the current entry we free that entry (we
459*814dcd43SSerapheim Dimitropoulos  * expect its metaslab count to be zero), we decrement blocks_gone and on to
460*814dcd43SSerapheim Dimitropoulos  * the next entry repeating this procedure until blocks_gone gets decremented
461*814dcd43SSerapheim Dimitropoulos  * to 0. Doing this also works for the typical case mentioned above.
462*814dcd43SSerapheim Dimitropoulos  *
463*814dcd43SSerapheim Dimitropoulos  * Scenario [2]: The oldest flushed metaslab isn't necessarily accounted by
464*814dcd43SSerapheim Dimitropoulos  * the first (and oldest) entry in the summary. If the first few entries of
465*814dcd43SSerapheim Dimitropoulos  * the summary were only accounting metaslabs from a device that was just
466*814dcd43SSerapheim Dimitropoulos  * removed, then the current oldest flushed metaslab could be accounted by an
467*814dcd43SSerapheim Dimitropoulos  * entry somewhere in the middle of the summary. Moreover flushing that
468*814dcd43SSerapheim Dimitropoulos  * metaslab will destroy all the log space maps older than its ms_unflushed_txg
469*814dcd43SSerapheim Dimitropoulos  * because they became obsolete after the removal. Thus, iterating as we did
470*814dcd43SSerapheim Dimitropoulos  * for scenario [1] works out for this case too.
471*814dcd43SSerapheim Dimitropoulos  *
472*814dcd43SSerapheim Dimitropoulos  * Scenario [3]: At times we decide to flush all the metaslabs in the pool
473*814dcd43SSerapheim Dimitropoulos  * in one TXG (either because we are exporting the pool or because our flushing
474*814dcd43SSerapheim Dimitropoulos  * heuristics decided to do so). When that happens all the log space maps get
475*814dcd43SSerapheim Dimitropoulos  * destroyed except the one created for the current TXG which doesn't have
476*814dcd43SSerapheim Dimitropoulos  * any log blocks yet. As log space maps get destroyed with every metaslab that
477*814dcd43SSerapheim Dimitropoulos  * we flush, entries in the summary are also destroyed. This brings a weird
478*814dcd43SSerapheim Dimitropoulos  * corner-case when we flush the last metaslab and the log space map of the
479*814dcd43SSerapheim Dimitropoulos  * current TXG is in the same summary entry with other log space maps that
480*814dcd43SSerapheim Dimitropoulos  * are older. When that happens we are eventually left with this one last
481*814dcd43SSerapheim Dimitropoulos  * summary entry whose blocks are gone (blocks_gone equals the entry's block
482*814dcd43SSerapheim Dimitropoulos  * count) but its metaslab count is non-zero (because it accounts all the
483*814dcd43SSerapheim Dimitropoulos  * metaslabs in the pool as they all got flushed). Under this scenario we can't
484*814dcd43SSerapheim Dimitropoulos  * free this last summary entry as it's referencing all the metaslabs in the
485*814dcd43SSerapheim Dimitropoulos  * pool and its block count will get incremented at the end of this sync (when
486*814dcd43SSerapheim Dimitropoulos  * we close the syncing log space map). Thus we just decrement its current
487*814dcd43SSerapheim Dimitropoulos  * block count and leave it alone. In the case that the pool gets exported,
488*814dcd43SSerapheim Dimitropoulos  * its metaslab count will be decremented over time as we call metaslab_fini()
489*814dcd43SSerapheim Dimitropoulos  * for all the metaslabs in the pool and the entry will be freed at
490*814dcd43SSerapheim Dimitropoulos  * spa_unload_log_sm_metadata().
491*814dcd43SSerapheim Dimitropoulos  */
492*814dcd43SSerapheim Dimitropoulos void
493*814dcd43SSerapheim Dimitropoulos spa_log_summary_decrement_blkcount(spa_t *spa, uint64_t blocks_gone)
494*814dcd43SSerapheim Dimitropoulos {
495*814dcd43SSerapheim Dimitropoulos 	for (log_summary_entry_t *e = list_head(&spa->spa_log_summary);
496*814dcd43SSerapheim Dimitropoulos 	    e != NULL; e = list_head(&spa->spa_log_summary)) {
497*814dcd43SSerapheim Dimitropoulos 		if (e->lse_blkcount > blocks_gone) {
498*814dcd43SSerapheim Dimitropoulos 			/*
499*814dcd43SSerapheim Dimitropoulos 			 * Assert that we stopped at an entry that is not
500*814dcd43SSerapheim Dimitropoulos 			 * obsolete.
501*814dcd43SSerapheim Dimitropoulos 			 */
502*814dcd43SSerapheim Dimitropoulos 			ASSERT(e->lse_mscount != 0);
503*814dcd43SSerapheim Dimitropoulos 
504*814dcd43SSerapheim Dimitropoulos 			e->lse_blkcount -= blocks_gone;
505*814dcd43SSerapheim Dimitropoulos 			blocks_gone = 0;
506*814dcd43SSerapheim Dimitropoulos 			break;
507*814dcd43SSerapheim Dimitropoulos 		} else if (e->lse_mscount == 0) {
508*814dcd43SSerapheim Dimitropoulos 			/* remove obsolete entry */
509*814dcd43SSerapheim Dimitropoulos 			blocks_gone -= e->lse_blkcount;
510*814dcd43SSerapheim Dimitropoulos 			list_remove(&spa->spa_log_summary, e);
511*814dcd43SSerapheim Dimitropoulos 			kmem_free(e, sizeof (log_summary_entry_t));
512*814dcd43SSerapheim Dimitropoulos 		} else {
513*814dcd43SSerapheim Dimitropoulos 			/* Verify that this is scenario [3] mentioned above. */
514*814dcd43SSerapheim Dimitropoulos 			VERIFY3U(blocks_gone, ==, e->lse_blkcount);
515*814dcd43SSerapheim Dimitropoulos 
516*814dcd43SSerapheim Dimitropoulos 			/*
517*814dcd43SSerapheim Dimitropoulos 			 * Assert that this is scenario [3] further by ensuring
518*814dcd43SSerapheim Dimitropoulos 			 * that this is the only entry in the summary.
519*814dcd43SSerapheim Dimitropoulos 			 */
520*814dcd43SSerapheim Dimitropoulos 			VERIFY3P(e, ==, list_tail(&spa->spa_log_summary));
521*814dcd43SSerapheim Dimitropoulos 			ASSERT3P(e, ==, list_head(&spa->spa_log_summary));
522*814dcd43SSerapheim Dimitropoulos 
523*814dcd43SSerapheim Dimitropoulos 			blocks_gone = e->lse_blkcount = 0;
524*814dcd43SSerapheim Dimitropoulos 			break;
525*814dcd43SSerapheim Dimitropoulos 		}
526*814dcd43SSerapheim Dimitropoulos 	}
527*814dcd43SSerapheim Dimitropoulos 
528*814dcd43SSerapheim Dimitropoulos 	/*
529*814dcd43SSerapheim Dimitropoulos 	 * Ensure that there is no way we are trying to remove more blocks
530*814dcd43SSerapheim Dimitropoulos 	 * than the # of blocks in the summary.
531*814dcd43SSerapheim Dimitropoulos 	 */
532*814dcd43SSerapheim Dimitropoulos 	ASSERT0(blocks_gone);
533*814dcd43SSerapheim Dimitropoulos }
534*814dcd43SSerapheim Dimitropoulos 
535*814dcd43SSerapheim Dimitropoulos void
536*814dcd43SSerapheim Dimitropoulos spa_log_sm_decrement_mscount(spa_t *spa, uint64_t txg)
537*814dcd43SSerapheim Dimitropoulos {
538*814dcd43SSerapheim Dimitropoulos 	spa_log_sm_t target = { .sls_txg = txg };
539*814dcd43SSerapheim Dimitropoulos 	spa_log_sm_t *sls = avl_find(&spa->spa_sm_logs_by_txg,
540*814dcd43SSerapheim Dimitropoulos 	    &target, NULL);
541*814dcd43SSerapheim Dimitropoulos 
542*814dcd43SSerapheim Dimitropoulos 	if (sls == NULL) {
543*814dcd43SSerapheim Dimitropoulos 		/*
544*814dcd43SSerapheim Dimitropoulos 		 * We must be at the teardown of a spa_load() attempt that
545*814dcd43SSerapheim Dimitropoulos 		 * got an error while reading the log space maps.
546*814dcd43SSerapheim Dimitropoulos 		 */
547*814dcd43SSerapheim Dimitropoulos 		VERIFY3S(spa_load_state(spa), ==, SPA_LOAD_ERROR);
548*814dcd43SSerapheim Dimitropoulos 		return;
549*814dcd43SSerapheim Dimitropoulos 	}
550*814dcd43SSerapheim Dimitropoulos 
551*814dcd43SSerapheim Dimitropoulos 	ASSERT(sls->sls_mscount > 0);
552*814dcd43SSerapheim Dimitropoulos 	sls->sls_mscount--;
553*814dcd43SSerapheim Dimitropoulos }
554*814dcd43SSerapheim Dimitropoulos 
555*814dcd43SSerapheim Dimitropoulos void
556*814dcd43SSerapheim Dimitropoulos spa_log_sm_increment_current_mscount(spa_t *spa)
557*814dcd43SSerapheim Dimitropoulos {
558*814dcd43SSerapheim Dimitropoulos 	spa_log_sm_t *last_sls = avl_last(&spa->spa_sm_logs_by_txg);
559*814dcd43SSerapheim Dimitropoulos 
560*814dcd43SSerapheim Dimitropoulos 	ASSERT3U(last_sls->sls_txg, ==, spa_syncing_txg(spa));
561*814dcd43SSerapheim Dimitropoulos 	last_sls->sls_mscount++;
562*814dcd43SSerapheim Dimitropoulos }
563*814dcd43SSerapheim Dimitropoulos 
564*814dcd43SSerapheim Dimitropoulos static void
565*814dcd43SSerapheim Dimitropoulos summary_add_data(spa_t *spa, uint64_t txg, uint64_t metaslabs_flushed,
566*814dcd43SSerapheim Dimitropoulos     uint64_t nblocks)
567*814dcd43SSerapheim Dimitropoulos {
568*814dcd43SSerapheim Dimitropoulos 	log_summary_entry_t *e = list_tail(&spa->spa_log_summary);
569*814dcd43SSerapheim Dimitropoulos 
570*814dcd43SSerapheim Dimitropoulos 	if (e == NULL || summary_entry_is_full(spa, e)) {
571*814dcd43SSerapheim Dimitropoulos 		e = kmem_zalloc(sizeof (log_summary_entry_t), KM_SLEEP);
572*814dcd43SSerapheim Dimitropoulos 		e->lse_start = txg;
573*814dcd43SSerapheim Dimitropoulos 		list_insert_tail(&spa->spa_log_summary, e);
574*814dcd43SSerapheim Dimitropoulos 	}
575*814dcd43SSerapheim Dimitropoulos 
576*814dcd43SSerapheim Dimitropoulos 	ASSERT3U(e->lse_start, <=, txg);
577*814dcd43SSerapheim Dimitropoulos 	e->lse_mscount += metaslabs_flushed;
578*814dcd43SSerapheim Dimitropoulos 	e->lse_blkcount += nblocks;
579*814dcd43SSerapheim Dimitropoulos }
580*814dcd43SSerapheim Dimitropoulos 
581*814dcd43SSerapheim Dimitropoulos static void
582*814dcd43SSerapheim Dimitropoulos spa_log_summary_add_incoming_blocks(spa_t *spa, uint64_t nblocks)
583*814dcd43SSerapheim Dimitropoulos {
584*814dcd43SSerapheim Dimitropoulos 	summary_add_data(spa, spa_syncing_txg(spa), 0, nblocks);
585*814dcd43SSerapheim Dimitropoulos }
586*814dcd43SSerapheim Dimitropoulos 
587*814dcd43SSerapheim Dimitropoulos void
588*814dcd43SSerapheim Dimitropoulos spa_log_summary_add_flushed_metaslab(spa_t *spa)
589*814dcd43SSerapheim Dimitropoulos {
590*814dcd43SSerapheim Dimitropoulos 	summary_add_data(spa, spa_syncing_txg(spa), 1, 0);
591*814dcd43SSerapheim Dimitropoulos }
592*814dcd43SSerapheim Dimitropoulos 
593*814dcd43SSerapheim Dimitropoulos /*
594*814dcd43SSerapheim Dimitropoulos  * This function attempts to estimate how many metaslabs should
595*814dcd43SSerapheim Dimitropoulos  * we flush to satisfy our block heuristic for the log spacemap
596*814dcd43SSerapheim Dimitropoulos  * for the upcoming TXGs.
597*814dcd43SSerapheim Dimitropoulos  *
598*814dcd43SSerapheim Dimitropoulos  * Specifically, it first tries to estimate the number of incoming
599*814dcd43SSerapheim Dimitropoulos  * blocks in this TXG. Then by projecting that incoming rate to
600*814dcd43SSerapheim Dimitropoulos  * future TXGs and using the log summary, it figures out how many
601*814dcd43SSerapheim Dimitropoulos  * flushes we would need to do for future TXGs individually to
602*814dcd43SSerapheim Dimitropoulos  * stay below our block limit and returns the maximum number of
603*814dcd43SSerapheim Dimitropoulos  * flushes from those estimates.
604*814dcd43SSerapheim Dimitropoulos  */
605*814dcd43SSerapheim Dimitropoulos static uint64_t
606*814dcd43SSerapheim Dimitropoulos spa_estimate_metaslabs_to_flush(spa_t *spa)
607*814dcd43SSerapheim Dimitropoulos {
608*814dcd43SSerapheim Dimitropoulos 	ASSERT(spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP));
609*814dcd43SSerapheim Dimitropoulos 	ASSERT3U(spa_sync_pass(spa), ==, 1);
610*814dcd43SSerapheim Dimitropoulos 	ASSERT(spa_log_sm_blocklimit(spa) != 0);
611*814dcd43SSerapheim Dimitropoulos 
612*814dcd43SSerapheim Dimitropoulos 	/*
613*814dcd43SSerapheim Dimitropoulos 	 * This variable contains the incoming rate that will be projected
614*814dcd43SSerapheim Dimitropoulos 	 * and used for our flushing estimates in the future.
615*814dcd43SSerapheim Dimitropoulos 	 */
616*814dcd43SSerapheim Dimitropoulos 	uint64_t incoming = spa_estimate_incoming_log_blocks(spa);
617*814dcd43SSerapheim Dimitropoulos 
618*814dcd43SSerapheim Dimitropoulos 	/*
619*814dcd43SSerapheim Dimitropoulos 	 * At any point in time this variable tells us how many
620*814dcd43SSerapheim Dimitropoulos 	 * TXGs in the future we are so we can make our estimations.
621*814dcd43SSerapheim Dimitropoulos 	 */
622*814dcd43SSerapheim Dimitropoulos 	uint64_t txgs_in_future = 1;
623*814dcd43SSerapheim Dimitropoulos 
624*814dcd43SSerapheim Dimitropoulos 	/*
625*814dcd43SSerapheim Dimitropoulos 	 * This variable tells us how much room do we have until we hit
626*814dcd43SSerapheim Dimitropoulos 	 * our limit. When it goes negative, it means that we've exceeded
627*814dcd43SSerapheim Dimitropoulos 	 * our limit and we need to flush.
628*814dcd43SSerapheim Dimitropoulos 	 *
629*814dcd43SSerapheim Dimitropoulos 	 * Note that since we start at the first TXG in the future (i.e.
630*814dcd43SSerapheim Dimitropoulos 	 * txgs_in_future starts from 1) we already decrement this
631*814dcd43SSerapheim Dimitropoulos 	 * variable by the incoming rate.
632*814dcd43SSerapheim Dimitropoulos 	 */
633*814dcd43SSerapheim Dimitropoulos 	int64_t available_blocks =
634*814dcd43SSerapheim Dimitropoulos 	    spa_log_sm_blocklimit(spa) - spa_log_sm_nblocks(spa) - incoming;
635*814dcd43SSerapheim Dimitropoulos 
636*814dcd43SSerapheim Dimitropoulos 	/*
637*814dcd43SSerapheim Dimitropoulos 	 * This variable tells us the total number of flushes needed to
638*814dcd43SSerapheim Dimitropoulos 	 * keep the log size within the limit when we reach txgs_in_future.
639*814dcd43SSerapheim Dimitropoulos 	 */
640*814dcd43SSerapheim Dimitropoulos 	uint64_t total_flushes = 0;
641*814dcd43SSerapheim Dimitropoulos 
642*814dcd43SSerapheim Dimitropoulos 	/* Holds the current maximum of our estimates so far. */
643*814dcd43SSerapheim Dimitropoulos 	uint64_t max_flushes_pertxg =
644*814dcd43SSerapheim Dimitropoulos 	    MIN(avl_numnodes(&spa->spa_metaslabs_by_flushed),
645*814dcd43SSerapheim Dimitropoulos 	    zfs_min_metaslabs_to_flush);
646*814dcd43SSerapheim Dimitropoulos 
647*814dcd43SSerapheim Dimitropoulos 	/*
648*814dcd43SSerapheim Dimitropoulos 	 * For our estimations we only look as far in the future
649*814dcd43SSerapheim Dimitropoulos 	 * as the summary allows us.
650*814dcd43SSerapheim Dimitropoulos 	 */
651*814dcd43SSerapheim Dimitropoulos 	for (log_summary_entry_t *e = list_head(&spa->spa_log_summary);
652*814dcd43SSerapheim Dimitropoulos 	    e; e = list_next(&spa->spa_log_summary, e)) {
653*814dcd43SSerapheim Dimitropoulos 
654*814dcd43SSerapheim Dimitropoulos 		/*
655*814dcd43SSerapheim Dimitropoulos 		 * If there is still room before we exceed our limit
656*814dcd43SSerapheim Dimitropoulos 		 * then keep skipping TXGs accumulating more blocks
657*814dcd43SSerapheim Dimitropoulos 		 * based on the incoming rate until we exceed it.
658*814dcd43SSerapheim Dimitropoulos 		 */
659*814dcd43SSerapheim Dimitropoulos 		if (available_blocks >= 0) {
660*814dcd43SSerapheim Dimitropoulos 			uint64_t skip_txgs = (available_blocks / incoming) + 1;
661*814dcd43SSerapheim Dimitropoulos 			available_blocks -= (skip_txgs * incoming);
662*814dcd43SSerapheim Dimitropoulos 			txgs_in_future += skip_txgs;
663*814dcd43SSerapheim Dimitropoulos 			ASSERT3S(available_blocks, >=, -incoming);
664*814dcd43SSerapheim Dimitropoulos 		}
665*814dcd43SSerapheim Dimitropoulos 
666*814dcd43SSerapheim Dimitropoulos 		/*
667*814dcd43SSerapheim Dimitropoulos 		 * At this point we're far enough into the future where
668*814dcd43SSerapheim Dimitropoulos 		 * the limit was just exceeded and we flush metaslabs
669*814dcd43SSerapheim Dimitropoulos 		 * based on the current entry in the summary, updating
670*814dcd43SSerapheim Dimitropoulos 		 * our available_blocks.
671*814dcd43SSerapheim Dimitropoulos 		 */
672*814dcd43SSerapheim Dimitropoulos 		ASSERT3S(available_blocks, <, 0);
673*814dcd43SSerapheim Dimitropoulos 		available_blocks += e->lse_blkcount;
674*814dcd43SSerapheim Dimitropoulos 		total_flushes += e->lse_mscount;
675*814dcd43SSerapheim Dimitropoulos 
676*814dcd43SSerapheim Dimitropoulos 		/*
677*814dcd43SSerapheim Dimitropoulos 		 * Keep the running maximum of the total_flushes that
678*814dcd43SSerapheim Dimitropoulos 		 * we've done so far over the number of TXGs in the
679*814dcd43SSerapheim Dimitropoulos 		 * future that we are. The idea here is to estimate
680*814dcd43SSerapheim Dimitropoulos 		 * the average number of flushes that we should do
681*814dcd43SSerapheim Dimitropoulos 		 * every TXG so that when we are that many TXGs in the
682*814dcd43SSerapheim Dimitropoulos 		 * future we stay under the limit.
683*814dcd43SSerapheim Dimitropoulos 		 */
684*814dcd43SSerapheim Dimitropoulos 		max_flushes_pertxg = MAX(max_flushes_pertxg,
685*814dcd43SSerapheim Dimitropoulos 		    DIV_ROUND_UP(total_flushes, txgs_in_future));
686*814dcd43SSerapheim Dimitropoulos 		ASSERT3U(avl_numnodes(&spa->spa_metaslabs_by_flushed), >=,
687*814dcd43SSerapheim Dimitropoulos 		    max_flushes_pertxg);
688*814dcd43SSerapheim Dimitropoulos 	}
689*814dcd43SSerapheim Dimitropoulos 	return (max_flushes_pertxg);
690*814dcd43SSerapheim Dimitropoulos }
691*814dcd43SSerapheim Dimitropoulos 
692*814dcd43SSerapheim Dimitropoulos uint64_t
693*814dcd43SSerapheim Dimitropoulos spa_log_sm_memused(spa_t *spa)
694*814dcd43SSerapheim Dimitropoulos {
695*814dcd43SSerapheim Dimitropoulos 	return (spa->spa_unflushed_stats.sus_memused);
696*814dcd43SSerapheim Dimitropoulos }
697*814dcd43SSerapheim Dimitropoulos 
698*814dcd43SSerapheim Dimitropoulos static boolean_t
699*814dcd43SSerapheim Dimitropoulos spa_log_exceeds_memlimit(spa_t *spa)
700*814dcd43SSerapheim Dimitropoulos {
701*814dcd43SSerapheim Dimitropoulos 	if (spa_log_sm_memused(spa) > zfs_unflushed_max_mem_amt)
702*814dcd43SSerapheim Dimitropoulos 		return (B_TRUE);
703*814dcd43SSerapheim Dimitropoulos 
704*814dcd43SSerapheim Dimitropoulos 	uint64_t system_mem_allowed = ((physmem * PAGESIZE) *
705*814dcd43SSerapheim Dimitropoulos 	    zfs_unflushed_max_mem_ppm) / 1000000;
706*814dcd43SSerapheim Dimitropoulos 	if (spa_log_sm_memused(spa) > system_mem_allowed)
707*814dcd43SSerapheim Dimitropoulos 		return (B_TRUE);
708*814dcd43SSerapheim Dimitropoulos 
709*814dcd43SSerapheim Dimitropoulos 	return (B_FALSE);
710*814dcd43SSerapheim Dimitropoulos }
711*814dcd43SSerapheim Dimitropoulos 
712*814dcd43SSerapheim Dimitropoulos boolean_t
713*814dcd43SSerapheim Dimitropoulos spa_flush_all_logs_requested(spa_t *spa)
714*814dcd43SSerapheim Dimitropoulos {
715*814dcd43SSerapheim Dimitropoulos 	return (spa->spa_log_flushall_txg != 0);
716*814dcd43SSerapheim Dimitropoulos }
717*814dcd43SSerapheim Dimitropoulos 
718*814dcd43SSerapheim Dimitropoulos void
719*814dcd43SSerapheim Dimitropoulos spa_flush_metaslabs(spa_t *spa, dmu_tx_t *tx)
720*814dcd43SSerapheim Dimitropoulos {
721*814dcd43SSerapheim Dimitropoulos 	uint64_t txg = dmu_tx_get_txg(tx);
722*814dcd43SSerapheim Dimitropoulos 
723*814dcd43SSerapheim Dimitropoulos 	if (spa_sync_pass(spa) != 1)
724*814dcd43SSerapheim Dimitropoulos 		return;
725*814dcd43SSerapheim Dimitropoulos 
726*814dcd43SSerapheim Dimitropoulos 	if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP))
727*814dcd43SSerapheim Dimitropoulos 		return;
728*814dcd43SSerapheim Dimitropoulos 
729*814dcd43SSerapheim Dimitropoulos 	/*
730*814dcd43SSerapheim Dimitropoulos 	 * If we don't have any metaslabs with unflushed changes
731*814dcd43SSerapheim Dimitropoulos 	 * return immediately.
732*814dcd43SSerapheim Dimitropoulos 	 */
733*814dcd43SSerapheim Dimitropoulos 	if (avl_numnodes(&spa->spa_metaslabs_by_flushed) == 0)
734*814dcd43SSerapheim Dimitropoulos 		return;
735*814dcd43SSerapheim Dimitropoulos 
736*814dcd43SSerapheim Dimitropoulos 	/*
737*814dcd43SSerapheim Dimitropoulos 	 * During SPA export we leave a few empty TXGs to go by [see
738*814dcd43SSerapheim Dimitropoulos 	 * spa_final_dirty_txg() to understand why]. For this specific
739*814dcd43SSerapheim Dimitropoulos 	 * case, it is important to not flush any metaslabs as that
740*814dcd43SSerapheim Dimitropoulos 	 * would dirty this TXG.
741*814dcd43SSerapheim Dimitropoulos 	 *
742*814dcd43SSerapheim Dimitropoulos 	 * That said, during one of these dirty TXGs that is less or
743*814dcd43SSerapheim Dimitropoulos 	 * equal to spa_final_dirty(), spa_unload() will request that
744*814dcd43SSerapheim Dimitropoulos 	 * we try to flush all the metaslabs for that TXG before
745*814dcd43SSerapheim Dimitropoulos 	 * exporting the pool, thus we ensure that we didn't get a
746*814dcd43SSerapheim Dimitropoulos 	 * request of flushing everything before we attempt to return
747*814dcd43SSerapheim Dimitropoulos 	 * immediately.
748*814dcd43SSerapheim Dimitropoulos 	 */
749*814dcd43SSerapheim Dimitropoulos 	if (spa->spa_uberblock.ub_rootbp.blk_birth < txg &&
750*814dcd43SSerapheim Dimitropoulos 	    !dmu_objset_is_dirty(spa_meta_objset(spa), txg) &&
751*814dcd43SSerapheim Dimitropoulos 	    !spa_flush_all_logs_requested(spa))
752*814dcd43SSerapheim Dimitropoulos 		return;
753*814dcd43SSerapheim Dimitropoulos 
754*814dcd43SSerapheim Dimitropoulos 	/*
755*814dcd43SSerapheim Dimitropoulos 	 * We need to generate a log space map before flushing because this
756*814dcd43SSerapheim Dimitropoulos 	 * will set up the in-memory data (i.e. node in spa_sm_logs_by_txg)
757*814dcd43SSerapheim Dimitropoulos 	 * for this TXG's flushed metaslab count (aka sls_mscount which is
758*814dcd43SSerapheim Dimitropoulos 	 * manipulated in many ways down the metaslab_flush() codepath).
759*814dcd43SSerapheim Dimitropoulos 	 *
760*814dcd43SSerapheim Dimitropoulos 	 * That is not to say that we may generate a log space map when we
761*814dcd43SSerapheim Dimitropoulos 	 * don't need it. If we are flushing metaslabs, that means that we
762*814dcd43SSerapheim Dimitropoulos 	 * were going to write changes to disk anyway, so even if we were
763*814dcd43SSerapheim Dimitropoulos 	 * not flushing, a log space map would have been created anyway in
764*814dcd43SSerapheim Dimitropoulos 	 * metaslab_sync().
765*814dcd43SSerapheim Dimitropoulos 	 */
766*814dcd43SSerapheim Dimitropoulos 	spa_generate_syncing_log_sm(spa, tx);
767*814dcd43SSerapheim Dimitropoulos 
768*814dcd43SSerapheim Dimitropoulos 	/*
769*814dcd43SSerapheim Dimitropoulos 	 * This variable tells us how many metaslabs we want to flush based
770*814dcd43SSerapheim Dimitropoulos 	 * on the block-heuristic of our flushing algorithm (see block comment
771*814dcd43SSerapheim Dimitropoulos 	 * of log space map feature). We also decrement this as we flush
772*814dcd43SSerapheim Dimitropoulos 	 * metaslabs and attempt to destroy old log space maps.
773*814dcd43SSerapheim Dimitropoulos 	 */
774*814dcd43SSerapheim Dimitropoulos 	uint64_t want_to_flush;
775*814dcd43SSerapheim Dimitropoulos 	if (spa_flush_all_logs_requested(spa)) {
776*814dcd43SSerapheim Dimitropoulos 		ASSERT3S(spa_state(spa), ==, POOL_STATE_EXPORTED);
777*814dcd43SSerapheim Dimitropoulos 		want_to_flush = avl_numnodes(&spa->spa_metaslabs_by_flushed);
778*814dcd43SSerapheim Dimitropoulos 	} else {
779*814dcd43SSerapheim Dimitropoulos 		want_to_flush = spa_estimate_metaslabs_to_flush(spa);
780*814dcd43SSerapheim Dimitropoulos 	}
781*814dcd43SSerapheim Dimitropoulos 
782*814dcd43SSerapheim Dimitropoulos 	ASSERT3U(avl_numnodes(&spa->spa_metaslabs_by_flushed), >=,
783*814dcd43SSerapheim Dimitropoulos 	    want_to_flush);
784*814dcd43SSerapheim Dimitropoulos 
785*814dcd43SSerapheim Dimitropoulos 	/* Used purely for verification purposes */
786*814dcd43SSerapheim Dimitropoulos 	uint64_t visited = 0;
787*814dcd43SSerapheim Dimitropoulos 
788*814dcd43SSerapheim Dimitropoulos 	/*
789*814dcd43SSerapheim Dimitropoulos 	 * Ideally we would only iterate through spa_metaslabs_by_flushed
790*814dcd43SSerapheim Dimitropoulos 	 * using only one variable (curr). We can't do that because
791*814dcd43SSerapheim Dimitropoulos 	 * metaslab_flush() mutates position of curr in the AVL when
792*814dcd43SSerapheim Dimitropoulos 	 * it flushes that metaslab by moving it to the end of the tree.
793*814dcd43SSerapheim Dimitropoulos 	 * Thus we always keep track of the original next node of the
794*814dcd43SSerapheim Dimitropoulos 	 * current node (curr) in another variable (next).
795*814dcd43SSerapheim Dimitropoulos 	 */
796*814dcd43SSerapheim Dimitropoulos 	metaslab_t *next = NULL;
797*814dcd43SSerapheim Dimitropoulos 	for (metaslab_t *curr = avl_first(&spa->spa_metaslabs_by_flushed);
798*814dcd43SSerapheim Dimitropoulos 	    curr != NULL; curr = next) {
799*814dcd43SSerapheim Dimitropoulos 		next = AVL_NEXT(&spa->spa_metaslabs_by_flushed, curr);
800*814dcd43SSerapheim Dimitropoulos 
801*814dcd43SSerapheim Dimitropoulos 		/*
802*814dcd43SSerapheim Dimitropoulos 		 * If this metaslab has been flushed this txg then we've done
803*814dcd43SSerapheim Dimitropoulos 		 * a full circle over the metaslabs.
804*814dcd43SSerapheim Dimitropoulos 		 */
805*814dcd43SSerapheim Dimitropoulos 		if (metaslab_unflushed_txg(curr) == txg)
806*814dcd43SSerapheim Dimitropoulos 			break;
807*814dcd43SSerapheim Dimitropoulos 
808*814dcd43SSerapheim Dimitropoulos 		/*
809*814dcd43SSerapheim Dimitropoulos 		 * If we are done flushing for the block heuristic and the
810*814dcd43SSerapheim Dimitropoulos 		 * unflushed changes don't exceed the memory limit just stop.
811*814dcd43SSerapheim Dimitropoulos 		 */
812*814dcd43SSerapheim Dimitropoulos 		if (want_to_flush == 0 && !spa_log_exceeds_memlimit(spa))
813*814dcd43SSerapheim Dimitropoulos 			break;
814*814dcd43SSerapheim Dimitropoulos 
815*814dcd43SSerapheim Dimitropoulos 		mutex_enter(&curr->ms_sync_lock);
816*814dcd43SSerapheim Dimitropoulos 		mutex_enter(&curr->ms_lock);
817*814dcd43SSerapheim Dimitropoulos 		boolean_t flushed = metaslab_flush(curr, tx);
818*814dcd43SSerapheim Dimitropoulos 		mutex_exit(&curr->ms_lock);
819*814dcd43SSerapheim Dimitropoulos 		mutex_exit(&curr->ms_sync_lock);
820*814dcd43SSerapheim Dimitropoulos 
821*814dcd43SSerapheim Dimitropoulos 		/*
822*814dcd43SSerapheim Dimitropoulos 		 * If we failed to flush a metaslab (because it was loading),
823*814dcd43SSerapheim Dimitropoulos 		 * then we are done with the block heuristic as it's not
824*814dcd43SSerapheim Dimitropoulos 		 * possible to destroy any log space maps once you've skipped
825*814dcd43SSerapheim Dimitropoulos 		 * a metaslab. In that case we just set our counter to 0 but
826*814dcd43SSerapheim Dimitropoulos 		 * we continue looping in case there is still memory pressure
827*814dcd43SSerapheim Dimitropoulos 		 * due to unflushed changes. Note that, flushing a metaslab
828*814dcd43SSerapheim Dimitropoulos 		 * that is not the oldest flushed in the pool, will never
829*814dcd43SSerapheim Dimitropoulos 		 * destroy any log space maps [see spa_cleanup_old_sm_logs()].
830*814dcd43SSerapheim Dimitropoulos 		 */
831*814dcd43SSerapheim Dimitropoulos 		if (!flushed) {
832*814dcd43SSerapheim Dimitropoulos 			want_to_flush = 0;
833*814dcd43SSerapheim Dimitropoulos 		} else if (want_to_flush > 0) {
834*814dcd43SSerapheim Dimitropoulos 			want_to_flush--;
835*814dcd43SSerapheim Dimitropoulos 		}
836*814dcd43SSerapheim Dimitropoulos 
837*814dcd43SSerapheim Dimitropoulos 		visited++;
838*814dcd43SSerapheim Dimitropoulos 	}
839*814dcd43SSerapheim Dimitropoulos 	ASSERT3U(avl_numnodes(&spa->spa_metaslabs_by_flushed), >=, visited);
840*814dcd43SSerapheim Dimitropoulos }
841*814dcd43SSerapheim Dimitropoulos 
842*814dcd43SSerapheim Dimitropoulos /*
843*814dcd43SSerapheim Dimitropoulos  * Close the log space map for this TXG and update the block counts
844*814dcd43SSerapheim Dimitropoulos  * for the the log's in-memory structure and the summary.
845*814dcd43SSerapheim Dimitropoulos  */
846*814dcd43SSerapheim Dimitropoulos void
847*814dcd43SSerapheim Dimitropoulos spa_sync_close_syncing_log_sm(spa_t *spa)
848*814dcd43SSerapheim Dimitropoulos {
849*814dcd43SSerapheim Dimitropoulos 	if (spa_syncing_log_sm(spa) == NULL)
850*814dcd43SSerapheim Dimitropoulos 		return;
851*814dcd43SSerapheim Dimitropoulos 	ASSERT(spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP));
852*814dcd43SSerapheim Dimitropoulos 
853*814dcd43SSerapheim Dimitropoulos 	spa_log_sm_t *sls = avl_last(&spa->spa_sm_logs_by_txg);
854*814dcd43SSerapheim Dimitropoulos 	ASSERT3U(sls->sls_txg, ==, spa_syncing_txg(spa));
855*814dcd43SSerapheim Dimitropoulos 
856*814dcd43SSerapheim Dimitropoulos 	sls->sls_nblocks = space_map_nblocks(spa_syncing_log_sm(spa));
857*814dcd43SSerapheim Dimitropoulos 	spa->spa_unflushed_stats.sus_nblocks += sls->sls_nblocks;
858*814dcd43SSerapheim Dimitropoulos 
859*814dcd43SSerapheim Dimitropoulos 	/*
860*814dcd43SSerapheim Dimitropoulos 	 * Note that we can't assert that sls_mscount is not 0,
861*814dcd43SSerapheim Dimitropoulos 	 * because there is the case where the first metaslab
862*814dcd43SSerapheim Dimitropoulos 	 * in spa_metaslabs_by_flushed is loading and we were
863*814dcd43SSerapheim Dimitropoulos 	 * not able to flush any metaslabs the current TXG.
864*814dcd43SSerapheim Dimitropoulos 	 */
865*814dcd43SSerapheim Dimitropoulos 	ASSERT(sls->sls_nblocks != 0);
866*814dcd43SSerapheim Dimitropoulos 
867*814dcd43SSerapheim Dimitropoulos 	spa_log_summary_add_incoming_blocks(spa, sls->sls_nblocks);
868*814dcd43SSerapheim Dimitropoulos 	spa_log_summary_verify_counts(spa);
869*814dcd43SSerapheim Dimitropoulos 
870*814dcd43SSerapheim Dimitropoulos 	space_map_close(spa->spa_syncing_log_sm);
871*814dcd43SSerapheim Dimitropoulos 	spa->spa_syncing_log_sm = NULL;
872*814dcd43SSerapheim Dimitropoulos 
873*814dcd43SSerapheim Dimitropoulos 	/*
874*814dcd43SSerapheim Dimitropoulos 	 * At this point we tried to flush as many metaslabs as we
875*814dcd43SSerapheim Dimitropoulos 	 * can as the pool is getting exported. Reset the "flush all"
876*814dcd43SSerapheim Dimitropoulos 	 * so the last few TXGs before closing the pool can be empty
877*814dcd43SSerapheim Dimitropoulos 	 * (e.g. not dirty).
878*814dcd43SSerapheim Dimitropoulos 	 */
879*814dcd43SSerapheim Dimitropoulos 	if (spa_flush_all_logs_requested(spa)) {
880*814dcd43SSerapheim Dimitropoulos 		ASSERT3S(spa_state(spa), ==, POOL_STATE_EXPORTED);
881*814dcd43SSerapheim Dimitropoulos 		spa->spa_log_flushall_txg = 0;
882*814dcd43SSerapheim Dimitropoulos 	}
883*814dcd43SSerapheim Dimitropoulos }
884*814dcd43SSerapheim Dimitropoulos 
885*814dcd43SSerapheim Dimitropoulos void
886*814dcd43SSerapheim Dimitropoulos spa_cleanup_old_sm_logs(spa_t *spa, dmu_tx_t *tx)
887*814dcd43SSerapheim Dimitropoulos {
888*814dcd43SSerapheim Dimitropoulos 	objset_t *mos = spa_meta_objset(spa);
889*814dcd43SSerapheim Dimitropoulos 
890*814dcd43SSerapheim Dimitropoulos 	uint64_t spacemap_zap;
891*814dcd43SSerapheim Dimitropoulos 	int error = zap_lookup(mos, DMU_POOL_DIRECTORY_OBJECT,
892*814dcd43SSerapheim Dimitropoulos 	    DMU_POOL_LOG_SPACEMAP_ZAP, sizeof (spacemap_zap), 1, &spacemap_zap);
893*814dcd43SSerapheim Dimitropoulos 	if (error == ENOENT) {
894*814dcd43SSerapheim Dimitropoulos 		ASSERT(avl_is_empty(&spa->spa_sm_logs_by_txg));
895*814dcd43SSerapheim Dimitropoulos 		return;
896*814dcd43SSerapheim Dimitropoulos 	}
897*814dcd43SSerapheim Dimitropoulos 	VERIFY0(error);
898*814dcd43SSerapheim Dimitropoulos 
899*814dcd43SSerapheim Dimitropoulos 	metaslab_t *oldest = avl_first(&spa->spa_metaslabs_by_flushed);
900*814dcd43SSerapheim Dimitropoulos 	uint64_t oldest_flushed_txg = metaslab_unflushed_txg(oldest);
901*814dcd43SSerapheim Dimitropoulos 
902*814dcd43SSerapheim Dimitropoulos 	/* Free all log space maps older than the oldest_flushed_txg. */
903*814dcd43SSerapheim Dimitropoulos 	for (spa_log_sm_t *sls = avl_first(&spa->spa_sm_logs_by_txg);
904*814dcd43SSerapheim Dimitropoulos 	    sls && sls->sls_txg < oldest_flushed_txg;
905*814dcd43SSerapheim Dimitropoulos 	    sls = avl_first(&spa->spa_sm_logs_by_txg)) {
906*814dcd43SSerapheim Dimitropoulos 		ASSERT0(sls->sls_mscount);
907*814dcd43SSerapheim Dimitropoulos 		avl_remove(&spa->spa_sm_logs_by_txg, sls);
908*814dcd43SSerapheim Dimitropoulos 		space_map_free_obj(mos, sls->sls_sm_obj, tx);
909*814dcd43SSerapheim Dimitropoulos 		VERIFY0(zap_remove_int(mos, spacemap_zap, sls->sls_txg, tx));
910*814dcd43SSerapheim Dimitropoulos 		spa->spa_unflushed_stats.sus_nblocks -= sls->sls_nblocks;
911*814dcd43SSerapheim Dimitropoulos 		kmem_free(sls, sizeof (spa_log_sm_t));
912*814dcd43SSerapheim Dimitropoulos 	}
913*814dcd43SSerapheim Dimitropoulos }
914*814dcd43SSerapheim Dimitropoulos 
915*814dcd43SSerapheim Dimitropoulos static spa_log_sm_t *
916*814dcd43SSerapheim Dimitropoulos spa_log_sm_alloc(uint64_t sm_obj, uint64_t txg)
917*814dcd43SSerapheim Dimitropoulos {
918*814dcd43SSerapheim Dimitropoulos 	spa_log_sm_t *sls = kmem_zalloc(sizeof (*sls), KM_SLEEP);
919*814dcd43SSerapheim Dimitropoulos 
920*814dcd43SSerapheim Dimitropoulos 	sls->sls_sm_obj = sm_obj;
921*814dcd43SSerapheim Dimitropoulos 	sls->sls_txg = txg;
922*814dcd43SSerapheim Dimitropoulos 	return (sls);
923*814dcd43SSerapheim Dimitropoulos }
924*814dcd43SSerapheim Dimitropoulos 
925*814dcd43SSerapheim Dimitropoulos void
926*814dcd43SSerapheim Dimitropoulos spa_generate_syncing_log_sm(spa_t *spa, dmu_tx_t *tx)
927*814dcd43SSerapheim Dimitropoulos {
928*814dcd43SSerapheim Dimitropoulos 	uint64_t txg = dmu_tx_get_txg(tx);
929*814dcd43SSerapheim Dimitropoulos 	objset_t *mos = spa_meta_objset(spa);
930*814dcd43SSerapheim Dimitropoulos 
931*814dcd43SSerapheim Dimitropoulos 	if (spa_syncing_log_sm(spa) != NULL)
932*814dcd43SSerapheim Dimitropoulos 		return;
933*814dcd43SSerapheim Dimitropoulos 
934*814dcd43SSerapheim Dimitropoulos 	if (!spa_feature_is_enabled(spa, SPA_FEATURE_LOG_SPACEMAP))
935*814dcd43SSerapheim Dimitropoulos 		return;
936*814dcd43SSerapheim Dimitropoulos 
937*814dcd43SSerapheim Dimitropoulos 	uint64_t spacemap_zap;
938*814dcd43SSerapheim Dimitropoulos 	int error = zap_lookup(mos, DMU_POOL_DIRECTORY_OBJECT,
939*814dcd43SSerapheim Dimitropoulos 	    DMU_POOL_LOG_SPACEMAP_ZAP, sizeof (spacemap_zap), 1, &spacemap_zap);
940*814dcd43SSerapheim Dimitropoulos 	if (error == ENOENT) {
941*814dcd43SSerapheim Dimitropoulos 		ASSERT(avl_is_empty(&spa->spa_sm_logs_by_txg));
942*814dcd43SSerapheim Dimitropoulos 
943*814dcd43SSerapheim Dimitropoulos 		error = 0;
944*814dcd43SSerapheim Dimitropoulos 		spacemap_zap = zap_create(mos,
945*814dcd43SSerapheim Dimitropoulos 		    DMU_OTN_ZAP_METADATA, DMU_OT_NONE, 0, tx);
946*814dcd43SSerapheim Dimitropoulos 		VERIFY0(zap_add(mos, DMU_POOL_DIRECTORY_OBJECT,
947*814dcd43SSerapheim Dimitropoulos 		    DMU_POOL_LOG_SPACEMAP_ZAP, sizeof (spacemap_zap), 1,
948*814dcd43SSerapheim Dimitropoulos 		    &spacemap_zap, tx));
949*814dcd43SSerapheim Dimitropoulos 		spa_feature_incr(spa, SPA_FEATURE_LOG_SPACEMAP, tx);
950*814dcd43SSerapheim Dimitropoulos 	}
951*814dcd43SSerapheim Dimitropoulos 	VERIFY0(error);
952*814dcd43SSerapheim Dimitropoulos 
953*814dcd43SSerapheim Dimitropoulos 	uint64_t sm_obj;
954*814dcd43SSerapheim Dimitropoulos 	ASSERT3U(zap_lookup_int_key(mos, spacemap_zap, txg, &sm_obj),
955*814dcd43SSerapheim Dimitropoulos 	    ==, ENOENT);
956*814dcd43SSerapheim Dimitropoulos 	sm_obj = space_map_alloc(mos, zfs_log_sm_blksz, tx);
957*814dcd43SSerapheim Dimitropoulos 	VERIFY0(zap_add_int_key(mos, spacemap_zap, txg, sm_obj, tx));
958*814dcd43SSerapheim Dimitropoulos 	avl_add(&spa->spa_sm_logs_by_txg, spa_log_sm_alloc(sm_obj, txg));
959*814dcd43SSerapheim Dimitropoulos 
960*814dcd43SSerapheim Dimitropoulos 	/*
961*814dcd43SSerapheim Dimitropoulos 	 * We pass UINT64_MAX as the space map's representation size
962*814dcd43SSerapheim Dimitropoulos 	 * and SPA_MINBLOCKSHIFT as the shift, to make the space map
963*814dcd43SSerapheim Dimitropoulos 	 * accept any sorts of segments since there's no real advantage
964*814dcd43SSerapheim Dimitropoulos 	 * to being more restrictive (given that we're already going
965*814dcd43SSerapheim Dimitropoulos 	 * to be using 2-word entries).
966*814dcd43SSerapheim Dimitropoulos 	 */
967*814dcd43SSerapheim Dimitropoulos 	VERIFY0(space_map_open(&spa->spa_syncing_log_sm, mos, sm_obj,
968*814dcd43SSerapheim Dimitropoulos 	    0, UINT64_MAX, SPA_MINBLOCKSHIFT));
969*814dcd43SSerapheim Dimitropoulos 
970*814dcd43SSerapheim Dimitropoulos 	/*
971*814dcd43SSerapheim Dimitropoulos 	 * If the log space map feature was just enabled, the blocklimit
972*814dcd43SSerapheim Dimitropoulos 	 * has not yet been set.
973*814dcd43SSerapheim Dimitropoulos 	 */
974*814dcd43SSerapheim Dimitropoulos 	if (spa_log_sm_blocklimit(spa) == 0)
975*814dcd43SSerapheim Dimitropoulos 		spa_log_sm_set_blocklimit(spa);
976*814dcd43SSerapheim Dimitropoulos }
977*814dcd43SSerapheim Dimitropoulos 
978*814dcd43SSerapheim Dimitropoulos /*
979*814dcd43SSerapheim Dimitropoulos  * Find all the log space maps stored in the space map ZAP and sort
980*814dcd43SSerapheim Dimitropoulos  * them by their TXG in spa_sm_logs_by_txg.
981*814dcd43SSerapheim Dimitropoulos  */
982*814dcd43SSerapheim Dimitropoulos static int
983*814dcd43SSerapheim Dimitropoulos spa_ld_log_sm_metadata(spa_t *spa)
984*814dcd43SSerapheim Dimitropoulos {
985*814dcd43SSerapheim Dimitropoulos 	int error;
986*814dcd43SSerapheim Dimitropoulos 	uint64_t spacemap_zap;
987*814dcd43SSerapheim Dimitropoulos 
988*814dcd43SSerapheim Dimitropoulos 	ASSERT(avl_is_empty(&spa->spa_sm_logs_by_txg));
989*814dcd43SSerapheim Dimitropoulos 
990*814dcd43SSerapheim Dimitropoulos 	error = zap_lookup(spa_meta_objset(spa), DMU_POOL_DIRECTORY_OBJECT,
991*814dcd43SSerapheim Dimitropoulos 	    DMU_POOL_LOG_SPACEMAP_ZAP, sizeof (spacemap_zap), 1, &spacemap_zap);
992*814dcd43SSerapheim Dimitropoulos 	if (error == ENOENT) {
993*814dcd43SSerapheim Dimitropoulos 		/* the space map ZAP doesn't exist yet */
994*814dcd43SSerapheim Dimitropoulos 		return (0);
995*814dcd43SSerapheim Dimitropoulos 	} else if (error != 0) {
996*814dcd43SSerapheim Dimitropoulos 		spa_load_failed(spa, "spa_ld_log_sm_metadata(): failed at "
997*814dcd43SSerapheim Dimitropoulos 		    "zap_lookup(DMU_POOL_DIRECTORY_OBJECT) [error %d]",
998*814dcd43SSerapheim Dimitropoulos 		    error);
999*814dcd43SSerapheim Dimitropoulos 		return (error);
1000*814dcd43SSerapheim Dimitropoulos 	}
1001*814dcd43SSerapheim Dimitropoulos 
1002*814dcd43SSerapheim Dimitropoulos 	zap_cursor_t zc;
1003*814dcd43SSerapheim Dimitropoulos 	zap_attribute_t za;
1004*814dcd43SSerapheim Dimitropoulos 	for (zap_cursor_init(&zc, spa_meta_objset(spa), spacemap_zap);
1005*814dcd43SSerapheim Dimitropoulos 	    (error = zap_cursor_retrieve(&zc, &za)) == 0;
1006*814dcd43SSerapheim Dimitropoulos 	    zap_cursor_advance(&zc)) {
1007*814dcd43SSerapheim Dimitropoulos 		uint64_t log_txg = zfs_strtonum(za.za_name, NULL);
1008*814dcd43SSerapheim Dimitropoulos 		spa_log_sm_t *sls =
1009*814dcd43SSerapheim Dimitropoulos 		    spa_log_sm_alloc(za.za_first_integer, log_txg);
1010*814dcd43SSerapheim Dimitropoulos 		avl_add(&spa->spa_sm_logs_by_txg, sls);
1011*814dcd43SSerapheim Dimitropoulos 	}
1012*814dcd43SSerapheim Dimitropoulos 	zap_cursor_fini(&zc);
1013*814dcd43SSerapheim Dimitropoulos 	if (error != ENOENT) {
1014*814dcd43SSerapheim Dimitropoulos 		spa_load_failed(spa, "spa_ld_log_sm_metadata(): failed at "
1015*814dcd43SSerapheim Dimitropoulos 		    "zap_cursor_retrieve(spacemap_zap) [error %d]",
1016*814dcd43SSerapheim Dimitropoulos 		    error);
1017*814dcd43SSerapheim Dimitropoulos 		return (error);
1018*814dcd43SSerapheim Dimitropoulos 	}
1019*814dcd43SSerapheim Dimitropoulos 
1020*814dcd43SSerapheim Dimitropoulos 	for (metaslab_t *m = avl_first(&spa->spa_metaslabs_by_flushed);
1021*814dcd43SSerapheim Dimitropoulos 	    m; m = AVL_NEXT(&spa->spa_metaslabs_by_flushed, m)) {
1022*814dcd43SSerapheim Dimitropoulos 		spa_log_sm_t target = { .sls_txg = metaslab_unflushed_txg(m) };
1023*814dcd43SSerapheim Dimitropoulos 		spa_log_sm_t *sls = avl_find(&spa->spa_sm_logs_by_txg,
1024*814dcd43SSerapheim Dimitropoulos 		    &target, NULL);
1025*814dcd43SSerapheim Dimitropoulos 
1026*814dcd43SSerapheim Dimitropoulos 		/*
1027*814dcd43SSerapheim Dimitropoulos 		 * At this point if sls is zero it means that a bug occurred
1028*814dcd43SSerapheim Dimitropoulos 		 * in ZFS the last time the pool was open or earlier in the
1029*814dcd43SSerapheim Dimitropoulos 		 * import code path. In general, we would have placed a
1030*814dcd43SSerapheim Dimitropoulos 		 * VERIFY() here or in this case just let the kernel panic
1031*814dcd43SSerapheim Dimitropoulos 		 * with NULL pointer dereference when incrementing sls_mscount,
1032*814dcd43SSerapheim Dimitropoulos 		 * but since this is the import code path we can be a bit more
1033*814dcd43SSerapheim Dimitropoulos 		 * lenient. Thus, for DEBUG bits we always cause a panic, while
1034*814dcd43SSerapheim Dimitropoulos 		 * in production we log the error and just fail the import.
1035*814dcd43SSerapheim Dimitropoulos 		 */
1036*814dcd43SSerapheim Dimitropoulos 		ASSERT(sls != NULL);
1037*814dcd43SSerapheim Dimitropoulos 		if (sls == NULL) {
1038*814dcd43SSerapheim Dimitropoulos 			spa_load_failed(spa, "spa_ld_log_sm_metadata(): bug "
1039*814dcd43SSerapheim Dimitropoulos 			    "encountered: could not find log spacemap for "
1040*814dcd43SSerapheim Dimitropoulos 			    "TXG %ld [error %d]",
1041*814dcd43SSerapheim Dimitropoulos 			    metaslab_unflushed_txg(m), ENOENT);
1042*814dcd43SSerapheim Dimitropoulos 			return (ENOENT);
1043*814dcd43SSerapheim Dimitropoulos 		}
1044*814dcd43SSerapheim Dimitropoulos 		sls->sls_mscount++;
1045*814dcd43SSerapheim Dimitropoulos 	}
1046*814dcd43SSerapheim Dimitropoulos 
1047*814dcd43SSerapheim Dimitropoulos 	return (0);
1048*814dcd43SSerapheim Dimitropoulos }
1049*814dcd43SSerapheim Dimitropoulos 
/*
 * Argument handed to spa_ld_log_sm_cb() through space_map_iterate()
 * while a log space map is being read.
 */
typedef struct spa_ld_log_sm_arg {
	spa_t *slls_spa;	/* pool whose log space maps are being read */
	uint64_t slls_txg;	/* TXG of the log space map being iterated */
} spa_ld_log_sm_arg_t;
1054*814dcd43SSerapheim Dimitropoulos 
1055*814dcd43SSerapheim Dimitropoulos static int
1056*814dcd43SSerapheim Dimitropoulos spa_ld_log_sm_cb(space_map_entry_t *sme, void *arg)
1057*814dcd43SSerapheim Dimitropoulos {
1058*814dcd43SSerapheim Dimitropoulos 	uint64_t offset = sme->sme_offset;
1059*814dcd43SSerapheim Dimitropoulos 	uint64_t size = sme->sme_run;
1060*814dcd43SSerapheim Dimitropoulos 	uint32_t vdev_id = sme->sme_vdev;
1061*814dcd43SSerapheim Dimitropoulos 	spa_ld_log_sm_arg_t *slls = arg;
1062*814dcd43SSerapheim Dimitropoulos 	spa_t *spa = slls->slls_spa;
1063*814dcd43SSerapheim Dimitropoulos 
1064*814dcd43SSerapheim Dimitropoulos 	vdev_t *vd = vdev_lookup_top(spa, vdev_id);
1065*814dcd43SSerapheim Dimitropoulos 
1066*814dcd43SSerapheim Dimitropoulos 	/*
1067*814dcd43SSerapheim Dimitropoulos 	 * If the vdev has been removed (i.e. it is indirect or a hole)
1068*814dcd43SSerapheim Dimitropoulos 	 * skip this entry. The contents of this vdev have already moved
1069*814dcd43SSerapheim Dimitropoulos 	 * elsewhere.
1070*814dcd43SSerapheim Dimitropoulos 	 */
1071*814dcd43SSerapheim Dimitropoulos 	if (!vdev_is_concrete(vd))
1072*814dcd43SSerapheim Dimitropoulos 		return (0);
1073*814dcd43SSerapheim Dimitropoulos 
1074*814dcd43SSerapheim Dimitropoulos 	metaslab_t *ms = vd->vdev_ms[offset >> vd->vdev_ms_shift];
1075*814dcd43SSerapheim Dimitropoulos 	ASSERT(!ms->ms_loaded);
1076*814dcd43SSerapheim Dimitropoulos 
1077*814dcd43SSerapheim Dimitropoulos 	/*
1078*814dcd43SSerapheim Dimitropoulos 	 * If we have already flushed entries for this TXG to this
1079*814dcd43SSerapheim Dimitropoulos 	 * metaslab's space map, then ignore it. Note that we flush
1080*814dcd43SSerapheim Dimitropoulos 	 * before processing any allocations/frees for that TXG, so
1081*814dcd43SSerapheim Dimitropoulos 	 * the metaslab's space map only has entries from *before*
1082*814dcd43SSerapheim Dimitropoulos 	 * the unflushed TXG.
1083*814dcd43SSerapheim Dimitropoulos 	 */
1084*814dcd43SSerapheim Dimitropoulos 	if (slls->slls_txg < metaslab_unflushed_txg(ms))
1085*814dcd43SSerapheim Dimitropoulos 		return (0);
1086*814dcd43SSerapheim Dimitropoulos 
1087*814dcd43SSerapheim Dimitropoulos 	switch (sme->sme_type) {
1088*814dcd43SSerapheim Dimitropoulos 	case SM_ALLOC:
1089*814dcd43SSerapheim Dimitropoulos 		range_tree_remove_xor_add_segment(offset, offset + size,
1090*814dcd43SSerapheim Dimitropoulos 		    ms->ms_unflushed_frees, ms->ms_unflushed_allocs);
1091*814dcd43SSerapheim Dimitropoulos 		break;
1092*814dcd43SSerapheim Dimitropoulos 	case SM_FREE:
1093*814dcd43SSerapheim Dimitropoulos 		range_tree_remove_xor_add_segment(offset, offset + size,
1094*814dcd43SSerapheim Dimitropoulos 		    ms->ms_unflushed_allocs, ms->ms_unflushed_frees);
1095*814dcd43SSerapheim Dimitropoulos 		break;
1096*814dcd43SSerapheim Dimitropoulos 	default:
1097*814dcd43SSerapheim Dimitropoulos 		panic("invalid maptype_t");
1098*814dcd43SSerapheim Dimitropoulos 		break;
1099*814dcd43SSerapheim Dimitropoulos 	}
1100*814dcd43SSerapheim Dimitropoulos 	return (0);
1101*814dcd43SSerapheim Dimitropoulos }
1102*814dcd43SSerapheim Dimitropoulos 
1103*814dcd43SSerapheim Dimitropoulos static int
1104*814dcd43SSerapheim Dimitropoulos spa_ld_log_sm_data(spa_t *spa)
1105*814dcd43SSerapheim Dimitropoulos {
1106*814dcd43SSerapheim Dimitropoulos 	int error = 0;
1107*814dcd43SSerapheim Dimitropoulos 
1108*814dcd43SSerapheim Dimitropoulos 	/*
1109*814dcd43SSerapheim Dimitropoulos 	 * If we are not going to do any writes there is no need
1110*814dcd43SSerapheim Dimitropoulos 	 * to read the log space maps.
1111*814dcd43SSerapheim Dimitropoulos 	 */
1112*814dcd43SSerapheim Dimitropoulos 	if (!spa_writeable(spa))
1113*814dcd43SSerapheim Dimitropoulos 		return (0);
1114*814dcd43SSerapheim Dimitropoulos 
1115*814dcd43SSerapheim Dimitropoulos 	ASSERT0(spa->spa_unflushed_stats.sus_nblocks);
1116*814dcd43SSerapheim Dimitropoulos 	ASSERT0(spa->spa_unflushed_stats.sus_memused);
1117*814dcd43SSerapheim Dimitropoulos 
1118*814dcd43SSerapheim Dimitropoulos 	hrtime_t read_logs_starttime = gethrtime();
1119*814dcd43SSerapheim Dimitropoulos 	/* this is a no-op when we don't have space map logs */
1120*814dcd43SSerapheim Dimitropoulos 	for (spa_log_sm_t *sls = avl_first(&spa->spa_sm_logs_by_txg);
1121*814dcd43SSerapheim Dimitropoulos 	    sls; sls = AVL_NEXT(&spa->spa_sm_logs_by_txg, sls)) {
1122*814dcd43SSerapheim Dimitropoulos 		space_map_t *sm = NULL;
1123*814dcd43SSerapheim Dimitropoulos 		error = space_map_open(&sm, spa_meta_objset(spa),
1124*814dcd43SSerapheim Dimitropoulos 		    sls->sls_sm_obj, 0, UINT64_MAX, SPA_MINBLOCKSHIFT);
1125*814dcd43SSerapheim Dimitropoulos 		if (error != 0) {
1126*814dcd43SSerapheim Dimitropoulos 			spa_load_failed(spa, "spa_ld_log_sm_data(): failed at "
1127*814dcd43SSerapheim Dimitropoulos 			    "space_map_open(obj=%llu) [error %d]",
1128*814dcd43SSerapheim Dimitropoulos 			    (u_longlong_t)sls->sls_sm_obj, error);
1129*814dcd43SSerapheim Dimitropoulos 			goto out;
1130*814dcd43SSerapheim Dimitropoulos 		}
1131*814dcd43SSerapheim Dimitropoulos 
1132*814dcd43SSerapheim Dimitropoulos 		struct spa_ld_log_sm_arg vla = {
1133*814dcd43SSerapheim Dimitropoulos 			.slls_spa = spa,
1134*814dcd43SSerapheim Dimitropoulos 			.slls_txg = sls->sls_txg
1135*814dcd43SSerapheim Dimitropoulos 		};
1136*814dcd43SSerapheim Dimitropoulos 		error = space_map_iterate(sm, space_map_length(sm),
1137*814dcd43SSerapheim Dimitropoulos 		    spa_ld_log_sm_cb, &vla);
1138*814dcd43SSerapheim Dimitropoulos 		if (error != 0) {
1139*814dcd43SSerapheim Dimitropoulos 			space_map_close(sm);
1140*814dcd43SSerapheim Dimitropoulos 			spa_load_failed(spa, "spa_ld_log_sm_data(): failed "
1141*814dcd43SSerapheim Dimitropoulos 			    "at space_map_iterate(obj=%llu) [error %d]",
1142*814dcd43SSerapheim Dimitropoulos 			    (u_longlong_t)sls->sls_sm_obj, error);
1143*814dcd43SSerapheim Dimitropoulos 			goto out;
1144*814dcd43SSerapheim Dimitropoulos 		}
1145*814dcd43SSerapheim Dimitropoulos 
1146*814dcd43SSerapheim Dimitropoulos 		ASSERT0(sls->sls_nblocks);
1147*814dcd43SSerapheim Dimitropoulos 		sls->sls_nblocks = space_map_nblocks(sm);
1148*814dcd43SSerapheim Dimitropoulos 		spa->spa_unflushed_stats.sus_nblocks += sls->sls_nblocks;
1149*814dcd43SSerapheim Dimitropoulos 		summary_add_data(spa, sls->sls_txg,
1150*814dcd43SSerapheim Dimitropoulos 		    sls->sls_mscount, sls->sls_nblocks);
1151*814dcd43SSerapheim Dimitropoulos 
1152*814dcd43SSerapheim Dimitropoulos 		space_map_close(sm);
1153*814dcd43SSerapheim Dimitropoulos 	}
1154*814dcd43SSerapheim Dimitropoulos 	hrtime_t read_logs_endtime = gethrtime();
1155*814dcd43SSerapheim Dimitropoulos 	spa_load_note(spa,
1156*814dcd43SSerapheim Dimitropoulos 	    "read %llu log space maps (%llu total blocks - blksz = %llu bytes) "
1157*814dcd43SSerapheim Dimitropoulos 	    "in %lld ms", (u_longlong_t)avl_numnodes(&spa->spa_sm_logs_by_txg),
1158*814dcd43SSerapheim Dimitropoulos 	    (u_longlong_t)spa_log_sm_nblocks(spa),
1159*814dcd43SSerapheim Dimitropoulos 	    (u_longlong_t)zfs_log_sm_blksz,
1160*814dcd43SSerapheim Dimitropoulos 	    (longlong_t)((read_logs_endtime - read_logs_starttime) / 1000000));
1161*814dcd43SSerapheim Dimitropoulos 
1162*814dcd43SSerapheim Dimitropoulos out:
1163*814dcd43SSerapheim Dimitropoulos 	/*
1164*814dcd43SSerapheim Dimitropoulos 	 * Now that the metaslabs contain their unflushed changes:
1165*814dcd43SSerapheim Dimitropoulos 	 * [1] recalculate their actual allocated space
1166*814dcd43SSerapheim Dimitropoulos 	 * [2] recalculate their weights
1167*814dcd43SSerapheim Dimitropoulos 	 * [3] sum up the memory usage of their unflushed range trees
1168*814dcd43SSerapheim Dimitropoulos 	 * [4] optionally load them, if debug_load is set
1169*814dcd43SSerapheim Dimitropoulos 	 *
1170*814dcd43SSerapheim Dimitropoulos 	 * Note that even in the case where we get here because of an
1171*814dcd43SSerapheim Dimitropoulos 	 * error (e.g. error != 0), we still want to update the fields
1172*814dcd43SSerapheim Dimitropoulos 	 * below in order to have a proper teardown in spa_unload().
1173*814dcd43SSerapheim Dimitropoulos 	 */
1174*814dcd43SSerapheim Dimitropoulos 	for (metaslab_t *m = avl_first(&spa->spa_metaslabs_by_flushed);
1175*814dcd43SSerapheim Dimitropoulos 	    m != NULL; m = AVL_NEXT(&spa->spa_metaslabs_by_flushed, m)) {
1176*814dcd43SSerapheim Dimitropoulos 		mutex_enter(&m->ms_lock);
1177*814dcd43SSerapheim Dimitropoulos 		m->ms_allocated_space = space_map_allocated(m->ms_sm) +
1178*814dcd43SSerapheim Dimitropoulos 		    range_tree_space(m->ms_unflushed_allocs) -
1179*814dcd43SSerapheim Dimitropoulos 		    range_tree_space(m->ms_unflushed_frees);
1180*814dcd43SSerapheim Dimitropoulos 
1181*814dcd43SSerapheim Dimitropoulos 		vdev_t *vd = m->ms_group->mg_vd;
1182*814dcd43SSerapheim Dimitropoulos 		metaslab_space_update(vd, m->ms_group->mg_class,
1183*814dcd43SSerapheim Dimitropoulos 		    range_tree_space(m->ms_unflushed_allocs), 0, 0);
1184*814dcd43SSerapheim Dimitropoulos 		metaslab_space_update(vd, m->ms_group->mg_class,
1185*814dcd43SSerapheim Dimitropoulos 		    -range_tree_space(m->ms_unflushed_frees), 0, 0);
1186*814dcd43SSerapheim Dimitropoulos 
1187*814dcd43SSerapheim Dimitropoulos 		ASSERT0(m->ms_weight & METASLAB_ACTIVE_MASK);
1188*814dcd43SSerapheim Dimitropoulos 		metaslab_recalculate_weight_and_sort(m);
1189*814dcd43SSerapheim Dimitropoulos 
1190*814dcd43SSerapheim Dimitropoulos 		spa->spa_unflushed_stats.sus_memused +=
1191*814dcd43SSerapheim Dimitropoulos 		    metaslab_unflushed_changes_memused(m);
1192*814dcd43SSerapheim Dimitropoulos 
1193*814dcd43SSerapheim Dimitropoulos 		if (metaslab_debug_load && m->ms_sm != NULL) {
1194*814dcd43SSerapheim Dimitropoulos 			VERIFY0(metaslab_load(m));
1195*814dcd43SSerapheim Dimitropoulos 		}
1196*814dcd43SSerapheim Dimitropoulos 		mutex_exit(&m->ms_lock);
1197*814dcd43SSerapheim Dimitropoulos 	}
1198*814dcd43SSerapheim Dimitropoulos 
1199*814dcd43SSerapheim Dimitropoulos 	return (error);
1200*814dcd43SSerapheim Dimitropoulos }
1201*814dcd43SSerapheim Dimitropoulos 
1202*814dcd43SSerapheim Dimitropoulos static int
1203*814dcd43SSerapheim Dimitropoulos spa_ld_unflushed_txgs(vdev_t *vd)
1204*814dcd43SSerapheim Dimitropoulos {
1205*814dcd43SSerapheim Dimitropoulos 	spa_t *spa = vd->vdev_spa;
1206*814dcd43SSerapheim Dimitropoulos 	objset_t *mos = spa_meta_objset(spa);
1207*814dcd43SSerapheim Dimitropoulos 
1208*814dcd43SSerapheim Dimitropoulos 	if (vd->vdev_top_zap == 0)
1209*814dcd43SSerapheim Dimitropoulos 		return (0);
1210*814dcd43SSerapheim Dimitropoulos 
1211*814dcd43SSerapheim Dimitropoulos 	uint64_t object = 0;
1212*814dcd43SSerapheim Dimitropoulos 	int error = zap_lookup(mos, vd->vdev_top_zap,
1213*814dcd43SSerapheim Dimitropoulos 	    VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS,
1214*814dcd43SSerapheim Dimitropoulos 	    sizeof (uint64_t), 1, &object);
1215*814dcd43SSerapheim Dimitropoulos 	if (error == ENOENT)
1216*814dcd43SSerapheim Dimitropoulos 		return (0);
1217*814dcd43SSerapheim Dimitropoulos 	else if (error != 0) {
1218*814dcd43SSerapheim Dimitropoulos 		spa_load_failed(spa, "spa_ld_unflushed_txgs(): failed at "
1219*814dcd43SSerapheim Dimitropoulos 		    "zap_lookup(vdev_top_zap=%llu) [error %d]",
1220*814dcd43SSerapheim Dimitropoulos 		    (u_longlong_t)vd->vdev_top_zap, error);
1221*814dcd43SSerapheim Dimitropoulos 		return (error);
1222*814dcd43SSerapheim Dimitropoulos 	}
1223*814dcd43SSerapheim Dimitropoulos 
1224*814dcd43SSerapheim Dimitropoulos 	for (uint64_t m = 0; m < vd->vdev_ms_count; m++) {
1225*814dcd43SSerapheim Dimitropoulos 		metaslab_t *ms = vd->vdev_ms[m];
1226*814dcd43SSerapheim Dimitropoulos 		ASSERT(ms != NULL);
1227*814dcd43SSerapheim Dimitropoulos 
1228*814dcd43SSerapheim Dimitropoulos 		metaslab_unflushed_phys_t entry;
1229*814dcd43SSerapheim Dimitropoulos 		uint64_t entry_size = sizeof (entry);
1230*814dcd43SSerapheim Dimitropoulos 		uint64_t entry_offset = ms->ms_id * entry_size;
1231*814dcd43SSerapheim Dimitropoulos 
1232*814dcd43SSerapheim Dimitropoulos 		error = dmu_read(mos, object,
1233*814dcd43SSerapheim Dimitropoulos 		    entry_offset, entry_size, &entry, 0);
1234*814dcd43SSerapheim Dimitropoulos 		if (error != 0) {
1235*814dcd43SSerapheim Dimitropoulos 			spa_load_failed(spa, "spa_ld_unflushed_txgs(): "
1236*814dcd43SSerapheim Dimitropoulos 			    "failed at dmu_read(obj=%llu) [error %d]",
1237*814dcd43SSerapheim Dimitropoulos 			    (u_longlong_t)object, error);
1238*814dcd43SSerapheim Dimitropoulos 			return (error);
1239*814dcd43SSerapheim Dimitropoulos 		}
1240*814dcd43SSerapheim Dimitropoulos 
1241*814dcd43SSerapheim Dimitropoulos 		ms->ms_unflushed_txg = entry.msp_unflushed_txg;
1242*814dcd43SSerapheim Dimitropoulos 		if (ms->ms_unflushed_txg != 0) {
1243*814dcd43SSerapheim Dimitropoulos 			mutex_enter(&spa->spa_flushed_ms_lock);
1244*814dcd43SSerapheim Dimitropoulos 			avl_add(&spa->spa_metaslabs_by_flushed, ms);
1245*814dcd43SSerapheim Dimitropoulos 			mutex_exit(&spa->spa_flushed_ms_lock);
1246*814dcd43SSerapheim Dimitropoulos 		}
1247*814dcd43SSerapheim Dimitropoulos 	}
1248*814dcd43SSerapheim Dimitropoulos 	return (0);
1249*814dcd43SSerapheim Dimitropoulos }
1250*814dcd43SSerapheim Dimitropoulos 
1251*814dcd43SSerapheim Dimitropoulos /*
1252*814dcd43SSerapheim Dimitropoulos  * Read all the log space map entries into their respective
1253*814dcd43SSerapheim Dimitropoulos  * metaslab unflushed trees and keep them sorted by TXG in the
1254*814dcd43SSerapheim Dimitropoulos  * SPA's metadata. In addition, setup all the metadata for the
1255*814dcd43SSerapheim Dimitropoulos  * memory and the block heuristics.
1256*814dcd43SSerapheim Dimitropoulos  */
1257*814dcd43SSerapheim Dimitropoulos int
1258*814dcd43SSerapheim Dimitropoulos spa_ld_log_spacemaps(spa_t *spa)
1259*814dcd43SSerapheim Dimitropoulos {
1260*814dcd43SSerapheim Dimitropoulos 	int error;
1261*814dcd43SSerapheim Dimitropoulos 
1262*814dcd43SSerapheim Dimitropoulos 	spa_log_sm_set_blocklimit(spa);
1263*814dcd43SSerapheim Dimitropoulos 
1264*814dcd43SSerapheim Dimitropoulos 	for (uint64_t c = 0; c < spa->spa_root_vdev->vdev_children; c++) {
1265*814dcd43SSerapheim Dimitropoulos 		vdev_t *vd = spa->spa_root_vdev->vdev_child[c];
1266*814dcd43SSerapheim Dimitropoulos 		error = spa_ld_unflushed_txgs(vd);
1267*814dcd43SSerapheim Dimitropoulos 		if (error != 0)
1268*814dcd43SSerapheim Dimitropoulos 			return (error);
1269*814dcd43SSerapheim Dimitropoulos 	}
1270*814dcd43SSerapheim Dimitropoulos 
1271*814dcd43SSerapheim Dimitropoulos 	error = spa_ld_log_sm_metadata(spa);
1272*814dcd43SSerapheim Dimitropoulos 	if (error != 0)
1273*814dcd43SSerapheim Dimitropoulos 		return (error);
1274*814dcd43SSerapheim Dimitropoulos 
1275*814dcd43SSerapheim Dimitropoulos 	/*
1276*814dcd43SSerapheim Dimitropoulos 	 * Note: we don't actually expect anything to change at this point
1277*814dcd43SSerapheim Dimitropoulos 	 * but we grab the config lock so we don't fail any assertions
1278*814dcd43SSerapheim Dimitropoulos 	 * when using vdev_lookup_top().
1279*814dcd43SSerapheim Dimitropoulos 	 */
1280*814dcd43SSerapheim Dimitropoulos 	spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
1281*814dcd43SSerapheim Dimitropoulos 	error = spa_ld_log_sm_data(spa);
1282*814dcd43SSerapheim Dimitropoulos 	spa_config_exit(spa, SCL_CONFIG, FTAG);
1283*814dcd43SSerapheim Dimitropoulos 
1284*814dcd43SSerapheim Dimitropoulos 	return (error);
1285*814dcd43SSerapheim Dimitropoulos }
1286