1814dcd43SSerapheim Dimitropoulos /*
2814dcd43SSerapheim Dimitropoulos * CDDL HEADER START
3814dcd43SSerapheim Dimitropoulos *
4814dcd43SSerapheim Dimitropoulos * The contents of this file are subject to the terms of the
5814dcd43SSerapheim Dimitropoulos * Common Development and Distribution License (the "License").
6814dcd43SSerapheim Dimitropoulos * You may not use this file except in compliance with the License.
7814dcd43SSerapheim Dimitropoulos *
8814dcd43SSerapheim Dimitropoulos * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9814dcd43SSerapheim Dimitropoulos * or http://www.opensolaris.org/os/licensing.
10814dcd43SSerapheim Dimitropoulos * See the License for the specific language governing permissions
11814dcd43SSerapheim Dimitropoulos * and limitations under the License.
12814dcd43SSerapheim Dimitropoulos *
13814dcd43SSerapheim Dimitropoulos * When distributing Covered Code, include this CDDL HEADER in each
14814dcd43SSerapheim Dimitropoulos * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15814dcd43SSerapheim Dimitropoulos * If applicable, add the following below this CDDL HEADER, with the
16814dcd43SSerapheim Dimitropoulos * fields enclosed by brackets "[]" replaced with your own identifying
17814dcd43SSerapheim Dimitropoulos * information: Portions Copyright [yyyy] [name of copyright owner]
18814dcd43SSerapheim Dimitropoulos *
19814dcd43SSerapheim Dimitropoulos * CDDL HEADER END
20814dcd43SSerapheim Dimitropoulos */
21814dcd43SSerapheim Dimitropoulos
22814dcd43SSerapheim Dimitropoulos /*
23814dcd43SSerapheim Dimitropoulos * Copyright (c) 2018, 2019 by Delphix. All rights reserved.
24814dcd43SSerapheim Dimitropoulos */
25814dcd43SSerapheim Dimitropoulos
26814dcd43SSerapheim Dimitropoulos #include <sys/dmu_objset.h>
27814dcd43SSerapheim Dimitropoulos #include <sys/metaslab.h>
28814dcd43SSerapheim Dimitropoulos #include <sys/metaslab_impl.h>
29814dcd43SSerapheim Dimitropoulos #include <sys/spa.h>
30814dcd43SSerapheim Dimitropoulos #include <sys/spa_impl.h>
31814dcd43SSerapheim Dimitropoulos #include <sys/spa_log_spacemap.h>
32814dcd43SSerapheim Dimitropoulos #include <sys/vdev_impl.h>
33814dcd43SSerapheim Dimitropoulos #include <sys/zap.h>
34814dcd43SSerapheim Dimitropoulos
35814dcd43SSerapheim Dimitropoulos /*
36814dcd43SSerapheim Dimitropoulos * Log Space Maps
37814dcd43SSerapheim Dimitropoulos *
38814dcd43SSerapheim Dimitropoulos * Log space maps are an optimization in ZFS metadata allocations for pools
39814dcd43SSerapheim Dimitropoulos * whose workloads are primarily random-writes. Random-write workloads are also
40814dcd43SSerapheim Dimitropoulos * typically random-free, meaning that they are freeing from locations scattered
41814dcd43SSerapheim Dimitropoulos * throughout the pool. This means that each TXG we will have to append some
42814dcd43SSerapheim Dimitropoulos * FREE records to almost every metaslab. With log space maps, we hold their
43814dcd43SSerapheim Dimitropoulos * changes in memory and log them altogether in one pool-wide space map on-disk
44814dcd43SSerapheim Dimitropoulos * for persistence. As more blocks are accumulated in the log space maps and
45814dcd43SSerapheim Dimitropoulos * more unflushed changes are accounted in memory, we flush a selected group
46814dcd43SSerapheim Dimitropoulos * of metaslabs every TXG to relieve memory pressure and potential overheads
47814dcd43SSerapheim Dimitropoulos * when loading the pool. Flushing a metaslab to disk relieves memory as we
48814dcd43SSerapheim Dimitropoulos * flush any unflushed changes from memory to disk (i.e. the metaslab's space
49814dcd43SSerapheim Dimitropoulos * map) and saves import time by making old log space maps obsolete and
50814dcd43SSerapheim Dimitropoulos * eventually destroying them. [A log space map is said to be obsolete when all
51814dcd43SSerapheim Dimitropoulos * its entries have made it to their corresponding metaslab space maps].
52814dcd43SSerapheim Dimitropoulos *
53814dcd43SSerapheim Dimitropoulos * == On disk data structures used ==
54814dcd43SSerapheim Dimitropoulos *
55814dcd43SSerapheim Dimitropoulos * - The pool has a new feature flag and a new entry in the MOS. The feature
56814dcd43SSerapheim Dimitropoulos * is activated when we create the first log space map and remains active
57814dcd43SSerapheim Dimitropoulos * for the lifetime of the pool. The new entry in the MOS Directory [refer
58814dcd43SSerapheim Dimitropoulos * to DMU_POOL_LOG_SPACEMAP_ZAP] is populated with a ZAP whose key-value
59814dcd43SSerapheim Dimitropoulos * pairs are of the form <key: txg, value: log space map object for that txg>.
60814dcd43SSerapheim Dimitropoulos * This entry is our on-disk reference of the log space maps that exist in
61814dcd43SSerapheim Dimitropoulos * the pool for each TXG and it is used during import to load all the
62814dcd43SSerapheim Dimitropoulos * metaslab unflushed changes in memory. To see how this structure is first
63814dcd43SSerapheim Dimitropoulos * created and later populated refer to spa_generate_syncing_log_sm(). To see
64814dcd43SSerapheim Dimitropoulos * how it is used during import time refer to spa_ld_log_sm_metadata().
65814dcd43SSerapheim Dimitropoulos *
66814dcd43SSerapheim Dimitropoulos * - Each vdev has a new entry in its vdev_top_zap (see field
67814dcd43SSerapheim Dimitropoulos * VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS) which holds the msp_unflushed_txg of
68814dcd43SSerapheim Dimitropoulos * each metaslab in this vdev. This field is the on-disk counterpart of the
69814dcd43SSerapheim Dimitropoulos * in-memory field ms_unflushed_txg which tells us from which TXG and onwards
 *   the metaslab hasn't had its changes flushed. During import, we use this
71814dcd43SSerapheim Dimitropoulos * to ignore any entries in the space map log that are for this metaslab but
72814dcd43SSerapheim Dimitropoulos * from a TXG before msp_unflushed_txg. At that point, we also populate its
73814dcd43SSerapheim Dimitropoulos * in-memory counterpart and from there both fields are updated every time
74814dcd43SSerapheim Dimitropoulos * we flush that metaslab.
75814dcd43SSerapheim Dimitropoulos *
76814dcd43SSerapheim Dimitropoulos * - A space map is created every TXG and, during that TXG, it is used to log
77814dcd43SSerapheim Dimitropoulos * all incoming changes (the log space map). When created, the log space map
78814dcd43SSerapheim Dimitropoulos * is referenced in memory by spa_syncing_log_sm and its object ID is inserted
79814dcd43SSerapheim Dimitropoulos * to the space map ZAP mentioned above. The log space map is closed at the
80814dcd43SSerapheim Dimitropoulos * end of the TXG and will be destroyed when it becomes fully obsolete. We
81814dcd43SSerapheim Dimitropoulos * know when a log space map has become obsolete by looking at the oldest
82814dcd43SSerapheim Dimitropoulos * (and smallest) ms_unflushed_txg in the pool. If the value of that is bigger
83814dcd43SSerapheim Dimitropoulos * than the log space map's TXG, then it means that there is no metaslab who
84814dcd43SSerapheim Dimitropoulos * doesn't have the changes from that log and we can therefore destroy it.
85814dcd43SSerapheim Dimitropoulos * [see spa_cleanup_old_sm_logs()].
86814dcd43SSerapheim Dimitropoulos *
87814dcd43SSerapheim Dimitropoulos * == Important in-memory structures ==
88814dcd43SSerapheim Dimitropoulos *
89814dcd43SSerapheim Dimitropoulos * - The per-spa field spa_metaslabs_by_flushed sorts all the metaslabs in
90814dcd43SSerapheim Dimitropoulos * the pool by their ms_unflushed_txg field. It is primarily used for three
91814dcd43SSerapheim Dimitropoulos * reasons. First of all, it is used during flushing where we try to flush
92814dcd43SSerapheim Dimitropoulos * metaslabs in-order from the oldest-flushed to the most recently flushed
93814dcd43SSerapheim Dimitropoulos * every TXG. Secondly, it helps us to lookup the ms_unflushed_txg of the
94814dcd43SSerapheim Dimitropoulos * oldest flushed metaslab to distinguish which log space maps have become
95814dcd43SSerapheim Dimitropoulos * obsolete and which ones are still relevant. Finally it tells us which
96814dcd43SSerapheim Dimitropoulos * metaslabs have unflushed changes in a pool where this feature was just
97814dcd43SSerapheim Dimitropoulos * enabled, as we don't immediately add all of the pool's metaslabs but we
98814dcd43SSerapheim Dimitropoulos * add them over time as they go through metaslab_sync(). The reason that
99814dcd43SSerapheim Dimitropoulos * we do that is to ease these pools into the behavior of the flushing
100814dcd43SSerapheim Dimitropoulos * algorithm (described later on).
101814dcd43SSerapheim Dimitropoulos *
 * - The per-spa field spa_sm_logs_by_txg can be thought of as the in-memory
103814dcd43SSerapheim Dimitropoulos * counterpart of the space map ZAP mentioned above. It's an AVL tree whose
104814dcd43SSerapheim Dimitropoulos * nodes represent the log space maps in the pool. This in-memory
105814dcd43SSerapheim Dimitropoulos * representation of log space maps in the pool sorts the log space maps by
106814dcd43SSerapheim Dimitropoulos * the TXG that they were created (which is also the TXG of their unflushed
107814dcd43SSerapheim Dimitropoulos * changes). It also contains the following extra information for each
108814dcd43SSerapheim Dimitropoulos * space map:
109814dcd43SSerapheim Dimitropoulos * [1] The number of metaslabs that were last flushed on that TXG. This is
110814dcd43SSerapheim Dimitropoulos * important because if that counter is zero and this is the oldest
111814dcd43SSerapheim Dimitropoulos * log then it means that it is also obsolete.
112814dcd43SSerapheim Dimitropoulos * [2] The number of blocks of that space map. This field is used by the
113814dcd43SSerapheim Dimitropoulos * block heuristic of our flushing algorithm (described later on).
114814dcd43SSerapheim Dimitropoulos * It represents how many blocks of metadata changes ZFS had to write
115814dcd43SSerapheim Dimitropoulos * to disk for that TXG.
116814dcd43SSerapheim Dimitropoulos *
117814dcd43SSerapheim Dimitropoulos * - The per-spa field spa_log_summary is a list of entries that summarizes
118814dcd43SSerapheim Dimitropoulos * the metaslab and block counts of all the nodes of the spa_sm_logs_by_txg
119814dcd43SSerapheim Dimitropoulos * AVL tree mentioned above. The reason this exists is that our flushing
120814dcd43SSerapheim Dimitropoulos * algorithm (described later) tries to estimate how many metaslabs to flush
121814dcd43SSerapheim Dimitropoulos * in each TXG by iterating over all the log space maps and looking at their
 *   block counts. Summarizing that information means that we don't have to
123814dcd43SSerapheim Dimitropoulos * iterate through each space map, minimizing the runtime overhead of the
124814dcd43SSerapheim Dimitropoulos * flushing algorithm which would be induced in syncing context. In terms of
125814dcd43SSerapheim Dimitropoulos * implementation the log summary is used as a queue:
126814dcd43SSerapheim Dimitropoulos * * we modify or pop entries from its head when we flush metaslabs
127814dcd43SSerapheim Dimitropoulos * * we modify or append entries to its tail when we sync changes.
128814dcd43SSerapheim Dimitropoulos *
129814dcd43SSerapheim Dimitropoulos * - Each metaslab has two new range trees that hold its unflushed changes,
130814dcd43SSerapheim Dimitropoulos * ms_unflushed_allocs and ms_unflushed_frees. These are always disjoint.
131814dcd43SSerapheim Dimitropoulos *
132814dcd43SSerapheim Dimitropoulos * == Flushing algorithm ==
133814dcd43SSerapheim Dimitropoulos *
 * The decision of how many metaslabs to flush on a given TXG is guided by
135814dcd43SSerapheim Dimitropoulos * two heuristics:
136814dcd43SSerapheim Dimitropoulos *
137814dcd43SSerapheim Dimitropoulos * [1] The memory heuristic -
138814dcd43SSerapheim Dimitropoulos * We keep track of the memory used by the unflushed trees from all the
139814dcd43SSerapheim Dimitropoulos * metaslabs [see sus_memused of spa_unflushed_stats] and we ensure that it
140814dcd43SSerapheim Dimitropoulos * stays below a certain threshold which is determined by an arbitrary hard
141814dcd43SSerapheim Dimitropoulos * limit and an arbitrary percentage of the system's memory [see
142814dcd43SSerapheim Dimitropoulos * spa_log_exceeds_memlimit()]. When we see that the memory usage of the
143814dcd43SSerapheim Dimitropoulos * unflushed changes are passing that threshold, we flush metaslabs, which
144814dcd43SSerapheim Dimitropoulos * empties their unflushed range trees, reducing the memory used.
145814dcd43SSerapheim Dimitropoulos *
146814dcd43SSerapheim Dimitropoulos * [2] The block heuristic -
147814dcd43SSerapheim Dimitropoulos * We try to keep the total number of blocks in the log space maps in check
148814dcd43SSerapheim Dimitropoulos * so the log doesn't grow indefinitely and we don't induce a lot of overhead
149814dcd43SSerapheim Dimitropoulos * when loading the pool. At the same time we don't want to flush a lot of
150814dcd43SSerapheim Dimitropoulos * metaslabs too often as this would defeat the purpose of the log space map.
151814dcd43SSerapheim Dimitropoulos * As a result we set a limit in the amount of blocks that we think it's
152814dcd43SSerapheim Dimitropoulos * acceptable for the log space maps to have and try not to cross it.
153814dcd43SSerapheim Dimitropoulos * [see sus_blocklimit from spa_unflushed_stats].
154814dcd43SSerapheim Dimitropoulos *
155814dcd43SSerapheim Dimitropoulos * In order to stay below the block limit every TXG we have to estimate how
156814dcd43SSerapheim Dimitropoulos * many metaslabs we need to flush based on the current rate of incoming blocks
157814dcd43SSerapheim Dimitropoulos * and our history of log space map blocks. The main idea here is to answer
158814dcd43SSerapheim Dimitropoulos * the question of how many metaslabs do we need to flush in order to get rid
 * of at least an X amount of log space map blocks. We can answer this question
160814dcd43SSerapheim Dimitropoulos * by iterating backwards from the oldest log space map to the newest one
161814dcd43SSerapheim Dimitropoulos * and looking at their metaslab and block counts. At this point the log summary
162814dcd43SSerapheim Dimitropoulos * mentioned above comes handy as it reduces the amount of things that we have
163814dcd43SSerapheim Dimitropoulos * to iterate (even though it may reduce the preciseness of our estimates due
164814dcd43SSerapheim Dimitropoulos * to its aggregation of data). So with that in mind, we project the incoming
165814dcd43SSerapheim Dimitropoulos * rate of the current TXG into the future and attempt to approximate how many
166814dcd43SSerapheim Dimitropoulos * metaslabs would we need to flush from now in order to avoid exceeding our
167814dcd43SSerapheim Dimitropoulos * block limit in different points in the future (granted that we would keep
168814dcd43SSerapheim Dimitropoulos * flushing the same number of metaslabs for every TXG). Then we take the
169814dcd43SSerapheim Dimitropoulos * maximum number from all these estimates to be on the safe side. For the
 * exact implementation details of the algorithm refer to
171814dcd43SSerapheim Dimitropoulos * spa_estimate_metaslabs_to_flush.
172814dcd43SSerapheim Dimitropoulos */
173814dcd43SSerapheim Dimitropoulos
/*
 * This is used as the block size for the space maps used for the
 * log space map feature. These space maps benefit from a bigger
 * block size as we expect to be writing a lot of data to them at
 * once.
 */
unsigned long zfs_log_sm_blksz = 1ULL << 17;
181814dcd43SSerapheim Dimitropoulos
/*
 * Percentage of the overall system's memory that ZFS allows to be
 * used for unflushed changes (e.g. the sum of size of all the nodes
 * in the unflushed trees).
 *
 * Note that this value is calculated over 1000000 for finer granularity
 * (thus the _ppm suffix; reads as "parts per million"). As an example,
 * the default of 1000 allows 0.1% of memory to be used.
 */
unsigned long zfs_unflushed_max_mem_ppm = 1000;
192814dcd43SSerapheim Dimitropoulos
/*
 * Specific hard-limit in memory that ZFS allows to be used for
 * unflushed changes.
 */
unsigned long zfs_unflushed_max_mem_amt = 1ULL << 30;
198814dcd43SSerapheim Dimitropoulos
/*
 * The following tunable determines the number of blocks that can be used for
 * the log space maps. It is expressed as a percentage of the total number of
 * metaslabs in the pool (i.e. the default of 400 means that the number of log
 * blocks is capped at 4 times the number of metaslabs).
 *
 * This value exists to tune our flushing algorithm, with higher values
 * flushing metaslabs less often (doing less I/Os) per TXG versus lower values
 * flushing metaslabs more aggressively with the upside of saving overheads
 * when loading the pool. Another factor in this tradeoff is that flushing
 * less often can potentially lead to better utilization of the metaslab space
 * map's block size as we accumulate more changes per flush.
 *
 * This tunable indirectly controls the flush rate (metaslabs flushed per
 * txg), which is why expressing it as a percentage of the number of
 * metaslabs in the pool makes sense here.
 *
 * As a rule of thumb we default this tunable to 400% based on the following:
 *
 * 1] Assuming a constant flush rate and a constant incoming rate of log blocks
 *    it is reasonable to expect that the amount of obsolete entries changes
 *    linearly from txg to txg (e.g. the oldest log should have the most
 *    obsolete entries, and the most recent one the least). With this we could
 *    say that, at any given time, about half of the entries in the whole space
 *    map log are obsolete. Thus for every two entries for a metaslab in the
 *    log space map, only one of them is valid and actually makes it to the
 *    metaslab's space map.
 *    [factor of 2]
 * 2] Each entry in the log space map is guaranteed to be two words while
 *    entries in metaslab space maps are generally single-word.
 *    [an extra factor of 2 - 400% overall]
 * 3] Even if [1] and [2] are slightly less than 2 each, we haven't taken into
 *    account any consolidation of segments from the log space map to the
 *    unflushed range trees nor their history (e.g. a segment being allocated,
 *    then freed, then allocated again means 3 log space map entries but 0
 *    metaslab space map entries). Depending on the workload, we've seen ~1.8
 *    non-obsolete log space map entries per metaslab entry, for a total of
 *    ~600%. Since most of these estimates though are workload dependent, we
 *    default to 400% to be conservative.
 *
 *    Thus we could say that even in the worst
 *    case of [1] and [2], the factor should end up being 4.
 *
 * That said, regardless of the number of metaslabs in the pool we need to
 * provide upper and lower bounds for the log block limit.
 * [see zfs_unflushed_log_block_{min,max}]
 */
unsigned long zfs_unflushed_log_block_pct = 400;
247814dcd43SSerapheim Dimitropoulos
/*
 * If the number of metaslabs is small and our incoming rate is high, we could
 * get into a situation that we are flushing all our metaslabs every TXG. Thus
 * we always allow at least this many log blocks.
 */
unsigned long zfs_unflushed_log_block_min = 1000;
254814dcd43SSerapheim Dimitropoulos
/*
 * If the log becomes too big, the import time of the pool can take a hit in
 * terms of performance. Thus we have a hard limit in the size of the log in
 * terms of blocks.
 */
unsigned long zfs_unflushed_log_block_max = (1ULL << 18);
261814dcd43SSerapheim Dimitropoulos
/*
 * Max # of rows allowed for the log_summary. The tradeoff here is accuracy and
 * stability of the flushing algorithm (longer summary) vs its runtime overhead
 * (smaller summary is faster to traverse).
 */
unsigned long zfs_max_logsm_summary_length = 10;
268814dcd43SSerapheim Dimitropoulos
/*
 * Tunable that sets the lower bound on the metaslabs to flush every TXG.
 *
 * Setting this to 0 has no effect since if the pool is idle we won't even be
 * creating log space maps and therefore we won't be flushing. On the other
 * hand if the pool has any incoming workload our block heuristic will start
 * flushing metaslabs anyway.
 *
 * The point of this tunable is to be used in extreme cases where we really
 * want to flush more metaslabs than our adaptable heuristic plans to flush.
 */
unsigned long zfs_min_metaslabs_to_flush = 1;
281814dcd43SSerapheim Dimitropoulos
/*
 * Tunable that specifies how far in the past do we want to look when trying to
 * estimate the incoming log blocks for the current TXG.
 *
 * Setting this too high may not only increase runtime but also minimize the
 * effect of the incoming rates from the most recent TXGs as we take the
 * average over all the blocks that we walk
 * [see spa_estimate_incoming_log_blocks].
 */
unsigned long zfs_max_log_walking = 5;
292814dcd43SSerapheim Dimitropoulos
/*
 * This tunable exists solely for testing purposes. It ensures that the log
 * spacemaps are not flushed and destroyed during export in order for the
 * relevant log spacemap import code paths to be tested (effectively simulating
 * a crash).
 */
int zfs_keep_log_spacemaps_at_export = 0;
300814dcd43SSerapheim Dimitropoulos
301814dcd43SSerapheim Dimitropoulos static uint64_t
spa_estimate_incoming_log_blocks(spa_t * spa)302814dcd43SSerapheim Dimitropoulos spa_estimate_incoming_log_blocks(spa_t *spa)
303814dcd43SSerapheim Dimitropoulos {
304814dcd43SSerapheim Dimitropoulos ASSERT3U(spa_sync_pass(spa), ==, 1);
305814dcd43SSerapheim Dimitropoulos uint64_t steps = 0, sum = 0;
306814dcd43SSerapheim Dimitropoulos
307814dcd43SSerapheim Dimitropoulos for (spa_log_sm_t *sls = avl_last(&spa->spa_sm_logs_by_txg);
308814dcd43SSerapheim Dimitropoulos sls != NULL && steps < zfs_max_log_walking;
309814dcd43SSerapheim Dimitropoulos sls = AVL_PREV(&spa->spa_sm_logs_by_txg, sls)) {
310814dcd43SSerapheim Dimitropoulos if (sls->sls_txg == spa_syncing_txg(spa)) {
311814dcd43SSerapheim Dimitropoulos /*
312814dcd43SSerapheim Dimitropoulos * skip the log created in this TXG as this would
313814dcd43SSerapheim Dimitropoulos * make our estimations inaccurate.
314814dcd43SSerapheim Dimitropoulos */
315814dcd43SSerapheim Dimitropoulos continue;
316814dcd43SSerapheim Dimitropoulos }
317814dcd43SSerapheim Dimitropoulos sum += sls->sls_nblocks;
318814dcd43SSerapheim Dimitropoulos steps++;
319814dcd43SSerapheim Dimitropoulos }
320814dcd43SSerapheim Dimitropoulos return ((steps > 0) ? DIV_ROUND_UP(sum, steps) : 0);
321814dcd43SSerapheim Dimitropoulos }
322814dcd43SSerapheim Dimitropoulos
323814dcd43SSerapheim Dimitropoulos uint64_t
spa_log_sm_blocklimit(spa_t * spa)324814dcd43SSerapheim Dimitropoulos spa_log_sm_blocklimit(spa_t *spa)
325814dcd43SSerapheim Dimitropoulos {
326814dcd43SSerapheim Dimitropoulos return (spa->spa_unflushed_stats.sus_blocklimit);
327814dcd43SSerapheim Dimitropoulos }
328814dcd43SSerapheim Dimitropoulos
329814dcd43SSerapheim Dimitropoulos void
spa_log_sm_set_blocklimit(spa_t * spa)330814dcd43SSerapheim Dimitropoulos spa_log_sm_set_blocklimit(spa_t *spa)
331814dcd43SSerapheim Dimitropoulos {
332814dcd43SSerapheim Dimitropoulos if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) {
333814dcd43SSerapheim Dimitropoulos ASSERT0(spa_log_sm_blocklimit(spa));
334814dcd43SSerapheim Dimitropoulos return;
335814dcd43SSerapheim Dimitropoulos }
336814dcd43SSerapheim Dimitropoulos
337814dcd43SSerapheim Dimitropoulos uint64_t calculated_limit =
338814dcd43SSerapheim Dimitropoulos (spa_total_metaslabs(spa) * zfs_unflushed_log_block_pct) / 100;
339814dcd43SSerapheim Dimitropoulos spa->spa_unflushed_stats.sus_blocklimit = MIN(MAX(calculated_limit,
340814dcd43SSerapheim Dimitropoulos zfs_unflushed_log_block_min), zfs_unflushed_log_block_max);
341814dcd43SSerapheim Dimitropoulos }
342814dcd43SSerapheim Dimitropoulos
343814dcd43SSerapheim Dimitropoulos uint64_t
spa_log_sm_nblocks(spa_t * spa)344814dcd43SSerapheim Dimitropoulos spa_log_sm_nblocks(spa_t *spa)
345814dcd43SSerapheim Dimitropoulos {
346814dcd43SSerapheim Dimitropoulos return (spa->spa_unflushed_stats.sus_nblocks);
347814dcd43SSerapheim Dimitropoulos }
348814dcd43SSerapheim Dimitropoulos
349814dcd43SSerapheim Dimitropoulos /*
350814dcd43SSerapheim Dimitropoulos * Ensure that the in-memory log space map structures and the summary
351814dcd43SSerapheim Dimitropoulos * have the same block and metaslab counts.
352814dcd43SSerapheim Dimitropoulos */
353814dcd43SSerapheim Dimitropoulos static void
spa_log_summary_verify_counts(spa_t * spa)354814dcd43SSerapheim Dimitropoulos spa_log_summary_verify_counts(spa_t *spa)
355814dcd43SSerapheim Dimitropoulos {
356814dcd43SSerapheim Dimitropoulos ASSERT(spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP));
357814dcd43SSerapheim Dimitropoulos
358814dcd43SSerapheim Dimitropoulos if ((zfs_flags & ZFS_DEBUG_LOG_SPACEMAP) == 0)
359814dcd43SSerapheim Dimitropoulos return;
360814dcd43SSerapheim Dimitropoulos
361814dcd43SSerapheim Dimitropoulos uint64_t ms_in_avl = avl_numnodes(&spa->spa_metaslabs_by_flushed);
362814dcd43SSerapheim Dimitropoulos
363814dcd43SSerapheim Dimitropoulos uint64_t ms_in_summary = 0, blk_in_summary = 0;
364814dcd43SSerapheim Dimitropoulos for (log_summary_entry_t *e = list_head(&spa->spa_log_summary);
365814dcd43SSerapheim Dimitropoulos e; e = list_next(&spa->spa_log_summary, e)) {
366814dcd43SSerapheim Dimitropoulos ms_in_summary += e->lse_mscount;
367814dcd43SSerapheim Dimitropoulos blk_in_summary += e->lse_blkcount;
368814dcd43SSerapheim Dimitropoulos }
369814dcd43SSerapheim Dimitropoulos
370814dcd43SSerapheim Dimitropoulos uint64_t ms_in_logs = 0, blk_in_logs = 0;
371814dcd43SSerapheim Dimitropoulos for (spa_log_sm_t *sls = avl_first(&spa->spa_sm_logs_by_txg);
372814dcd43SSerapheim Dimitropoulos sls; sls = AVL_NEXT(&spa->spa_sm_logs_by_txg, sls)) {
373814dcd43SSerapheim Dimitropoulos ms_in_logs += sls->sls_mscount;
374814dcd43SSerapheim Dimitropoulos blk_in_logs += sls->sls_nblocks;
375814dcd43SSerapheim Dimitropoulos }
376814dcd43SSerapheim Dimitropoulos
377814dcd43SSerapheim Dimitropoulos VERIFY3U(ms_in_logs, ==, ms_in_summary);
378814dcd43SSerapheim Dimitropoulos VERIFY3U(ms_in_logs, ==, ms_in_avl);
379814dcd43SSerapheim Dimitropoulos VERIFY3U(blk_in_logs, ==, blk_in_summary);
380814dcd43SSerapheim Dimitropoulos VERIFY3U(blk_in_logs, ==, spa_log_sm_nblocks(spa));
381814dcd43SSerapheim Dimitropoulos }
382814dcd43SSerapheim Dimitropoulos
383814dcd43SSerapheim Dimitropoulos static boolean_t
summary_entry_is_full(spa_t * spa,log_summary_entry_t * e)384814dcd43SSerapheim Dimitropoulos summary_entry_is_full(spa_t *spa, log_summary_entry_t *e)
385814dcd43SSerapheim Dimitropoulos {
386814dcd43SSerapheim Dimitropoulos uint64_t blocks_per_row = MAX(1,
387814dcd43SSerapheim Dimitropoulos DIV_ROUND_UP(spa_log_sm_blocklimit(spa),
388814dcd43SSerapheim Dimitropoulos zfs_max_logsm_summary_length));
389814dcd43SSerapheim Dimitropoulos
390814dcd43SSerapheim Dimitropoulos return (blocks_per_row <= e->lse_blkcount);
391814dcd43SSerapheim Dimitropoulos }
392814dcd43SSerapheim Dimitropoulos
393814dcd43SSerapheim Dimitropoulos /*
394814dcd43SSerapheim Dimitropoulos * Update the log summary information to reflect the fact that a metaslab
 * was flushed or destroyed (e.g. due to device removal or pool export/destroy).
396814dcd43SSerapheim Dimitropoulos *
397814dcd43SSerapheim Dimitropoulos * We typically flush the oldest flushed metaslab so the first (and oldest)
398814dcd43SSerapheim Dimitropoulos * entry of the summary is updated. However if that metaslab is getting loaded
399814dcd43SSerapheim Dimitropoulos * we may flush the second oldest one which may be part of an entry later in
400814dcd43SSerapheim Dimitropoulos * the summary. Moreover, if we call into this function from metaslab_fini()
401814dcd43SSerapheim Dimitropoulos * the metaslabs probably won't be ordered by ms_unflushed_txg. Thus we ask
402814dcd43SSerapheim Dimitropoulos * for a txg as an argument so we can locate the appropriate summary entry for
403814dcd43SSerapheim Dimitropoulos * the metaslab.
404814dcd43SSerapheim Dimitropoulos */
405814dcd43SSerapheim Dimitropoulos void
spa_log_summary_decrement_mscount(spa_t * spa,uint64_t txg)406814dcd43SSerapheim Dimitropoulos spa_log_summary_decrement_mscount(spa_t *spa, uint64_t txg)
407814dcd43SSerapheim Dimitropoulos {
408814dcd43SSerapheim Dimitropoulos /*
409814dcd43SSerapheim Dimitropoulos * We don't track summary data for read-only pools and this function
410814dcd43SSerapheim Dimitropoulos * can be called from metaslab_fini(). In that case return immediately.
411814dcd43SSerapheim Dimitropoulos */
412814dcd43SSerapheim Dimitropoulos if (!spa_writeable(spa))
413814dcd43SSerapheim Dimitropoulos return;
414814dcd43SSerapheim Dimitropoulos
415814dcd43SSerapheim Dimitropoulos log_summary_entry_t *target = NULL;
416814dcd43SSerapheim Dimitropoulos for (log_summary_entry_t *e = list_head(&spa->spa_log_summary);
417814dcd43SSerapheim Dimitropoulos e != NULL; e = list_next(&spa->spa_log_summary, e)) {
418814dcd43SSerapheim Dimitropoulos if (e->lse_start > txg)
419814dcd43SSerapheim Dimitropoulos break;
420814dcd43SSerapheim Dimitropoulos target = e;
421814dcd43SSerapheim Dimitropoulos }
422814dcd43SSerapheim Dimitropoulos
423814dcd43SSerapheim Dimitropoulos if (target == NULL || target->lse_mscount == 0) {
424814dcd43SSerapheim Dimitropoulos /*
425814dcd43SSerapheim Dimitropoulos * We didn't find a summary entry for this metaslab. We must be
426814dcd43SSerapheim Dimitropoulos * at the teardown of a spa_load() attempt that got an error
427814dcd43SSerapheim Dimitropoulos * while reading the log space maps.
428814dcd43SSerapheim Dimitropoulos */
429814dcd43SSerapheim Dimitropoulos VERIFY3S(spa_load_state(spa), ==, SPA_LOAD_ERROR);
430814dcd43SSerapheim Dimitropoulos return;
431814dcd43SSerapheim Dimitropoulos }
432814dcd43SSerapheim Dimitropoulos
433814dcd43SSerapheim Dimitropoulos target->lse_mscount--;
434814dcd43SSerapheim Dimitropoulos }
435814dcd43SSerapheim Dimitropoulos
436814dcd43SSerapheim Dimitropoulos /*
437814dcd43SSerapheim Dimitropoulos * Update the log summary information to reflect the fact that we destroyed
438814dcd43SSerapheim Dimitropoulos * old log space maps. Since we can only destroy the oldest log space maps,
439814dcd43SSerapheim Dimitropoulos * we decrement the block count of the oldest summary entry and potentially
440814dcd43SSerapheim Dimitropoulos * destroy it when that count hits 0.
441814dcd43SSerapheim Dimitropoulos *
442814dcd43SSerapheim Dimitropoulos * This function is called after a metaslab is flushed and typically that
443814dcd43SSerapheim Dimitropoulos * metaslab is the oldest flushed, which means that this function will
444814dcd43SSerapheim Dimitropoulos * typically decrement the block count of the first entry of the summary and
445814dcd43SSerapheim Dimitropoulos * potentially free it if the block count gets to zero (its metaslab count
446814dcd43SSerapheim Dimitropoulos * should be zero too at that point).
447814dcd43SSerapheim Dimitropoulos *
448814dcd43SSerapheim Dimitropoulos * There are certain scenarios though that don't work exactly like that so we
449814dcd43SSerapheim Dimitropoulos * need to account for them:
450814dcd43SSerapheim Dimitropoulos *
451814dcd43SSerapheim Dimitropoulos * Scenario [1]: It is possible that after we flushed the oldest flushed
452814dcd43SSerapheim Dimitropoulos * metaslab and we destroyed the oldest log space map, more recent logs had 0
453814dcd43SSerapheim Dimitropoulos * metaslabs pointing to them so we got rid of them too. This can happen due
454814dcd43SSerapheim Dimitropoulos * to metaslabs being destroyed through device removal, or because the oldest
455814dcd43SSerapheim Dimitropoulos * flushed metaslab was loading but we kept flushing more recently flushed
456814dcd43SSerapheim Dimitropoulos * metaslabs due to the memory pressure of unflushed changes. Because of that,
457814dcd43SSerapheim Dimitropoulos * we always iterate from the beginning of the summary and if blocks_gone is
458814dcd43SSerapheim Dimitropoulos * bigger than the block_count of the current entry we free that entry (we
459814dcd43SSerapheim Dimitropoulos * expect its metaslab count to be zero), we decrement blocks_gone and on to
460814dcd43SSerapheim Dimitropoulos * the next entry repeating this procedure until blocks_gone gets decremented
461814dcd43SSerapheim Dimitropoulos * to 0. Doing this also works for the typical case mentioned above.
462814dcd43SSerapheim Dimitropoulos *
463814dcd43SSerapheim Dimitropoulos * Scenario [2]: The oldest flushed metaslab isn't necessarily accounted by
464814dcd43SSerapheim Dimitropoulos * the first (and oldest) entry in the summary. If the first few entries of
465814dcd43SSerapheim Dimitropoulos * the summary were only accounting metaslabs from a device that was just
466814dcd43SSerapheim Dimitropoulos * removed, then the current oldest flushed metaslab could be accounted by an
467814dcd43SSerapheim Dimitropoulos * entry somewhere in the middle of the summary. Moreover flushing that
468814dcd43SSerapheim Dimitropoulos * metaslab will destroy all the log space maps older than its ms_unflushed_txg
469814dcd43SSerapheim Dimitropoulos * because they became obsolete after the removal. Thus, iterating as we did
470814dcd43SSerapheim Dimitropoulos * for scenario [1] works out for this case too.
471814dcd43SSerapheim Dimitropoulos *
472814dcd43SSerapheim Dimitropoulos * Scenario [3]: At times we decide to flush all the metaslabs in the pool
473814dcd43SSerapheim Dimitropoulos * in one TXG (either because we are exporting the pool or because our flushing
474814dcd43SSerapheim Dimitropoulos * heuristics decided to do so). When that happens all the log space maps get
475814dcd43SSerapheim Dimitropoulos * destroyed except the one created for the current TXG which doesn't have
476814dcd43SSerapheim Dimitropoulos * any log blocks yet. As log space maps get destroyed with every metaslab that
477814dcd43SSerapheim Dimitropoulos * we flush, entries in the summary are also destroyed. This brings a weird
478814dcd43SSerapheim Dimitropoulos * corner-case when we flush the last metaslab and the log space map of the
479814dcd43SSerapheim Dimitropoulos * current TXG is in the same summary entry with other log space maps that
480814dcd43SSerapheim Dimitropoulos * are older. When that happens we are eventually left with this one last
481814dcd43SSerapheim Dimitropoulos * summary entry whose blocks are gone (blocks_gone equals the entry's block
482814dcd43SSerapheim Dimitropoulos * count) but its metaslab count is non-zero (because it accounts all the
483814dcd43SSerapheim Dimitropoulos * metaslabs in the pool as they all got flushed). Under this scenario we can't
484814dcd43SSerapheim Dimitropoulos * free this last summary entry as it's referencing all the metaslabs in the
485814dcd43SSerapheim Dimitropoulos * pool and its block count will get incremented at the end of this sync (when
486814dcd43SSerapheim Dimitropoulos * we close the syncing log space map). Thus we just decrement its current
487814dcd43SSerapheim Dimitropoulos * block count and leave it alone. In the case that the pool gets exported,
488814dcd43SSerapheim Dimitropoulos * its metaslab count will be decremented over time as we call metaslab_fini()
489814dcd43SSerapheim Dimitropoulos * for all the metaslabs in the pool and the entry will be freed at
490814dcd43SSerapheim Dimitropoulos * spa_unload_log_sm_metadata().
491814dcd43SSerapheim Dimitropoulos */
492814dcd43SSerapheim Dimitropoulos void
spa_log_summary_decrement_blkcount(spa_t * spa,uint64_t blocks_gone)493814dcd43SSerapheim Dimitropoulos spa_log_summary_decrement_blkcount(spa_t *spa, uint64_t blocks_gone)
494814dcd43SSerapheim Dimitropoulos {
495814dcd43SSerapheim Dimitropoulos for (log_summary_entry_t *e = list_head(&spa->spa_log_summary);
496814dcd43SSerapheim Dimitropoulos e != NULL; e = list_head(&spa->spa_log_summary)) {
497814dcd43SSerapheim Dimitropoulos if (e->lse_blkcount > blocks_gone) {
498814dcd43SSerapheim Dimitropoulos /*
499814dcd43SSerapheim Dimitropoulos * Assert that we stopped at an entry that is not
500814dcd43SSerapheim Dimitropoulos * obsolete.
501814dcd43SSerapheim Dimitropoulos */
502814dcd43SSerapheim Dimitropoulos ASSERT(e->lse_mscount != 0);
503814dcd43SSerapheim Dimitropoulos
504814dcd43SSerapheim Dimitropoulos e->lse_blkcount -= blocks_gone;
505814dcd43SSerapheim Dimitropoulos blocks_gone = 0;
506814dcd43SSerapheim Dimitropoulos break;
507814dcd43SSerapheim Dimitropoulos } else if (e->lse_mscount == 0) {
508814dcd43SSerapheim Dimitropoulos /* remove obsolete entry */
509814dcd43SSerapheim Dimitropoulos blocks_gone -= e->lse_blkcount;
510814dcd43SSerapheim Dimitropoulos list_remove(&spa->spa_log_summary, e);
511814dcd43SSerapheim Dimitropoulos kmem_free(e, sizeof (log_summary_entry_t));
512814dcd43SSerapheim Dimitropoulos } else {
513814dcd43SSerapheim Dimitropoulos /* Verify that this is scenario [3] mentioned above. */
514814dcd43SSerapheim Dimitropoulos VERIFY3U(blocks_gone, ==, e->lse_blkcount);
515814dcd43SSerapheim Dimitropoulos
516814dcd43SSerapheim Dimitropoulos /*
517814dcd43SSerapheim Dimitropoulos * Assert that this is scenario [3] further by ensuring
518814dcd43SSerapheim Dimitropoulos * that this is the only entry in the summary.
519814dcd43SSerapheim Dimitropoulos */
520814dcd43SSerapheim Dimitropoulos VERIFY3P(e, ==, list_tail(&spa->spa_log_summary));
521814dcd43SSerapheim Dimitropoulos ASSERT3P(e, ==, list_head(&spa->spa_log_summary));
522814dcd43SSerapheim Dimitropoulos
523814dcd43SSerapheim Dimitropoulos blocks_gone = e->lse_blkcount = 0;
524814dcd43SSerapheim Dimitropoulos break;
525814dcd43SSerapheim Dimitropoulos }
526814dcd43SSerapheim Dimitropoulos }
527814dcd43SSerapheim Dimitropoulos
528814dcd43SSerapheim Dimitropoulos /*
529814dcd43SSerapheim Dimitropoulos * Ensure that there is no way we are trying to remove more blocks
530814dcd43SSerapheim Dimitropoulos * than the # of blocks in the summary.
531814dcd43SSerapheim Dimitropoulos */
532814dcd43SSerapheim Dimitropoulos ASSERT0(blocks_gone);
533814dcd43SSerapheim Dimitropoulos }
534814dcd43SSerapheim Dimitropoulos
535814dcd43SSerapheim Dimitropoulos void
spa_log_sm_decrement_mscount(spa_t * spa,uint64_t txg)536814dcd43SSerapheim Dimitropoulos spa_log_sm_decrement_mscount(spa_t *spa, uint64_t txg)
537814dcd43SSerapheim Dimitropoulos {
538814dcd43SSerapheim Dimitropoulos spa_log_sm_t target = { .sls_txg = txg };
539814dcd43SSerapheim Dimitropoulos spa_log_sm_t *sls = avl_find(&spa->spa_sm_logs_by_txg,
540814dcd43SSerapheim Dimitropoulos &target, NULL);
541814dcd43SSerapheim Dimitropoulos
542814dcd43SSerapheim Dimitropoulos if (sls == NULL) {
543814dcd43SSerapheim Dimitropoulos /*
544814dcd43SSerapheim Dimitropoulos * We must be at the teardown of a spa_load() attempt that
545814dcd43SSerapheim Dimitropoulos * got an error while reading the log space maps.
546814dcd43SSerapheim Dimitropoulos */
547814dcd43SSerapheim Dimitropoulos VERIFY3S(spa_load_state(spa), ==, SPA_LOAD_ERROR);
548814dcd43SSerapheim Dimitropoulos return;
549814dcd43SSerapheim Dimitropoulos }
550814dcd43SSerapheim Dimitropoulos
551814dcd43SSerapheim Dimitropoulos ASSERT(sls->sls_mscount > 0);
552814dcd43SSerapheim Dimitropoulos sls->sls_mscount--;
553814dcd43SSerapheim Dimitropoulos }
554814dcd43SSerapheim Dimitropoulos
555814dcd43SSerapheim Dimitropoulos void
spa_log_sm_increment_current_mscount(spa_t * spa)556814dcd43SSerapheim Dimitropoulos spa_log_sm_increment_current_mscount(spa_t *spa)
557814dcd43SSerapheim Dimitropoulos {
558814dcd43SSerapheim Dimitropoulos spa_log_sm_t *last_sls = avl_last(&spa->spa_sm_logs_by_txg);
559814dcd43SSerapheim Dimitropoulos
560814dcd43SSerapheim Dimitropoulos ASSERT3U(last_sls->sls_txg, ==, spa_syncing_txg(spa));
561814dcd43SSerapheim Dimitropoulos last_sls->sls_mscount++;
562814dcd43SSerapheim Dimitropoulos }
563814dcd43SSerapheim Dimitropoulos
564814dcd43SSerapheim Dimitropoulos static void
summary_add_data(spa_t * spa,uint64_t txg,uint64_t metaslabs_flushed,uint64_t nblocks)565814dcd43SSerapheim Dimitropoulos summary_add_data(spa_t *spa, uint64_t txg, uint64_t metaslabs_flushed,
566814dcd43SSerapheim Dimitropoulos uint64_t nblocks)
567814dcd43SSerapheim Dimitropoulos {
568814dcd43SSerapheim Dimitropoulos log_summary_entry_t *e = list_tail(&spa->spa_log_summary);
569814dcd43SSerapheim Dimitropoulos
570814dcd43SSerapheim Dimitropoulos if (e == NULL || summary_entry_is_full(spa, e)) {
571814dcd43SSerapheim Dimitropoulos e = kmem_zalloc(sizeof (log_summary_entry_t), KM_SLEEP);
572814dcd43SSerapheim Dimitropoulos e->lse_start = txg;
573814dcd43SSerapheim Dimitropoulos list_insert_tail(&spa->spa_log_summary, e);
574814dcd43SSerapheim Dimitropoulos }
575814dcd43SSerapheim Dimitropoulos
576814dcd43SSerapheim Dimitropoulos ASSERT3U(e->lse_start, <=, txg);
577814dcd43SSerapheim Dimitropoulos e->lse_mscount += metaslabs_flushed;
578814dcd43SSerapheim Dimitropoulos e->lse_blkcount += nblocks;
579814dcd43SSerapheim Dimitropoulos }
580814dcd43SSerapheim Dimitropoulos
581814dcd43SSerapheim Dimitropoulos static void
spa_log_summary_add_incoming_blocks(spa_t * spa,uint64_t nblocks)582814dcd43SSerapheim Dimitropoulos spa_log_summary_add_incoming_blocks(spa_t *spa, uint64_t nblocks)
583814dcd43SSerapheim Dimitropoulos {
584814dcd43SSerapheim Dimitropoulos summary_add_data(spa, spa_syncing_txg(spa), 0, nblocks);
585814dcd43SSerapheim Dimitropoulos }
586814dcd43SSerapheim Dimitropoulos
587814dcd43SSerapheim Dimitropoulos void
spa_log_summary_add_flushed_metaslab(spa_t * spa)588814dcd43SSerapheim Dimitropoulos spa_log_summary_add_flushed_metaslab(spa_t *spa)
589814dcd43SSerapheim Dimitropoulos {
590814dcd43SSerapheim Dimitropoulos summary_add_data(spa, spa_syncing_txg(spa), 1, 0);
591814dcd43SSerapheim Dimitropoulos }
592814dcd43SSerapheim Dimitropoulos
593814dcd43SSerapheim Dimitropoulos /*
594814dcd43SSerapheim Dimitropoulos * This function attempts to estimate how many metaslabs should
595814dcd43SSerapheim Dimitropoulos * we flush to satisfy our block heuristic for the log spacemap
596814dcd43SSerapheim Dimitropoulos * for the upcoming TXGs.
597814dcd43SSerapheim Dimitropoulos *
598814dcd43SSerapheim Dimitropoulos * Specifically, it first tries to estimate the number of incoming
599814dcd43SSerapheim Dimitropoulos * blocks in this TXG. Then by projecting that incoming rate to
600814dcd43SSerapheim Dimitropoulos * future TXGs and using the log summary, it figures out how many
601814dcd43SSerapheim Dimitropoulos * flushes we would need to do for future TXGs individually to
602814dcd43SSerapheim Dimitropoulos * stay below our block limit and returns the maximum number of
603814dcd43SSerapheim Dimitropoulos * flushes from those estimates.
604814dcd43SSerapheim Dimitropoulos */
605814dcd43SSerapheim Dimitropoulos static uint64_t
spa_estimate_metaslabs_to_flush(spa_t * spa)606814dcd43SSerapheim Dimitropoulos spa_estimate_metaslabs_to_flush(spa_t *spa)
607814dcd43SSerapheim Dimitropoulos {
608814dcd43SSerapheim Dimitropoulos ASSERT(spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP));
609814dcd43SSerapheim Dimitropoulos ASSERT3U(spa_sync_pass(spa), ==, 1);
610814dcd43SSerapheim Dimitropoulos ASSERT(spa_log_sm_blocklimit(spa) != 0);
611814dcd43SSerapheim Dimitropoulos
612814dcd43SSerapheim Dimitropoulos /*
613814dcd43SSerapheim Dimitropoulos * This variable contains the incoming rate that will be projected
614814dcd43SSerapheim Dimitropoulos * and used for our flushing estimates in the future.
615814dcd43SSerapheim Dimitropoulos */
616814dcd43SSerapheim Dimitropoulos uint64_t incoming = spa_estimate_incoming_log_blocks(spa);
617814dcd43SSerapheim Dimitropoulos
618814dcd43SSerapheim Dimitropoulos /*
619814dcd43SSerapheim Dimitropoulos * At any point in time this variable tells us how many
620814dcd43SSerapheim Dimitropoulos * TXGs in the future we are so we can make our estimations.
621814dcd43SSerapheim Dimitropoulos */
622814dcd43SSerapheim Dimitropoulos uint64_t txgs_in_future = 1;
623814dcd43SSerapheim Dimitropoulos
624814dcd43SSerapheim Dimitropoulos /*
625814dcd43SSerapheim Dimitropoulos * This variable tells us how much room do we have until we hit
626814dcd43SSerapheim Dimitropoulos * our limit. When it goes negative, it means that we've exceeded
627814dcd43SSerapheim Dimitropoulos * our limit and we need to flush.
628814dcd43SSerapheim Dimitropoulos *
629814dcd43SSerapheim Dimitropoulos * Note that since we start at the first TXG in the future (i.e.
630814dcd43SSerapheim Dimitropoulos * txgs_in_future starts from 1) we already decrement this
631814dcd43SSerapheim Dimitropoulos * variable by the incoming rate.
632814dcd43SSerapheim Dimitropoulos */
633814dcd43SSerapheim Dimitropoulos int64_t available_blocks =
634814dcd43SSerapheim Dimitropoulos spa_log_sm_blocklimit(spa) - spa_log_sm_nblocks(spa) - incoming;
635814dcd43SSerapheim Dimitropoulos
636814dcd43SSerapheim Dimitropoulos /*
637814dcd43SSerapheim Dimitropoulos * This variable tells us the total number of flushes needed to
638814dcd43SSerapheim Dimitropoulos * keep the log size within the limit when we reach txgs_in_future.
639814dcd43SSerapheim Dimitropoulos */
640814dcd43SSerapheim Dimitropoulos uint64_t total_flushes = 0;
641814dcd43SSerapheim Dimitropoulos
642814dcd43SSerapheim Dimitropoulos /* Holds the current maximum of our estimates so far. */
643814dcd43SSerapheim Dimitropoulos uint64_t max_flushes_pertxg =
644814dcd43SSerapheim Dimitropoulos MIN(avl_numnodes(&spa->spa_metaslabs_by_flushed),
645814dcd43SSerapheim Dimitropoulos zfs_min_metaslabs_to_flush);
646814dcd43SSerapheim Dimitropoulos
647814dcd43SSerapheim Dimitropoulos /*
648814dcd43SSerapheim Dimitropoulos * For our estimations we only look as far in the future
649814dcd43SSerapheim Dimitropoulos * as the summary allows us.
650814dcd43SSerapheim Dimitropoulos */
651814dcd43SSerapheim Dimitropoulos for (log_summary_entry_t *e = list_head(&spa->spa_log_summary);
652814dcd43SSerapheim Dimitropoulos e; e = list_next(&spa->spa_log_summary, e)) {
653814dcd43SSerapheim Dimitropoulos
654814dcd43SSerapheim Dimitropoulos /*
655814dcd43SSerapheim Dimitropoulos * If there is still room before we exceed our limit
656814dcd43SSerapheim Dimitropoulos * then keep skipping TXGs accumulating more blocks
657814dcd43SSerapheim Dimitropoulos * based on the incoming rate until we exceed it.
658814dcd43SSerapheim Dimitropoulos */
659814dcd43SSerapheim Dimitropoulos if (available_blocks >= 0) {
660814dcd43SSerapheim Dimitropoulos uint64_t skip_txgs = (available_blocks / incoming) + 1;
661814dcd43SSerapheim Dimitropoulos available_blocks -= (skip_txgs * incoming);
662814dcd43SSerapheim Dimitropoulos txgs_in_future += skip_txgs;
663814dcd43SSerapheim Dimitropoulos ASSERT3S(available_blocks, >=, -incoming);
664814dcd43SSerapheim Dimitropoulos }
665814dcd43SSerapheim Dimitropoulos
666814dcd43SSerapheim Dimitropoulos /*
667814dcd43SSerapheim Dimitropoulos * At this point we're far enough into the future where
668814dcd43SSerapheim Dimitropoulos * the limit was just exceeded and we flush metaslabs
669814dcd43SSerapheim Dimitropoulos * based on the current entry in the summary, updating
670814dcd43SSerapheim Dimitropoulos * our available_blocks.
671814dcd43SSerapheim Dimitropoulos */
672814dcd43SSerapheim Dimitropoulos ASSERT3S(available_blocks, <, 0);
673814dcd43SSerapheim Dimitropoulos available_blocks += e->lse_blkcount;
674814dcd43SSerapheim Dimitropoulos total_flushes += e->lse_mscount;
675814dcd43SSerapheim Dimitropoulos
676814dcd43SSerapheim Dimitropoulos /*
677814dcd43SSerapheim Dimitropoulos * Keep the running maximum of the total_flushes that
678814dcd43SSerapheim Dimitropoulos * we've done so far over the number of TXGs in the
679814dcd43SSerapheim Dimitropoulos * future that we are. The idea here is to estimate
680814dcd43SSerapheim Dimitropoulos * the average number of flushes that we should do
681814dcd43SSerapheim Dimitropoulos * every TXG so that when we are that many TXGs in the
682814dcd43SSerapheim Dimitropoulos * future we stay under the limit.
683814dcd43SSerapheim Dimitropoulos */
684814dcd43SSerapheim Dimitropoulos max_flushes_pertxg = MAX(max_flushes_pertxg,
685814dcd43SSerapheim Dimitropoulos DIV_ROUND_UP(total_flushes, txgs_in_future));
686814dcd43SSerapheim Dimitropoulos ASSERT3U(avl_numnodes(&spa->spa_metaslabs_by_flushed), >=,
687814dcd43SSerapheim Dimitropoulos max_flushes_pertxg);
688814dcd43SSerapheim Dimitropoulos }
689814dcd43SSerapheim Dimitropoulos return (max_flushes_pertxg);
690814dcd43SSerapheim Dimitropoulos }
691814dcd43SSerapheim Dimitropoulos
692814dcd43SSerapheim Dimitropoulos uint64_t
spa_log_sm_memused(spa_t * spa)693814dcd43SSerapheim Dimitropoulos spa_log_sm_memused(spa_t *spa)
694814dcd43SSerapheim Dimitropoulos {
695814dcd43SSerapheim Dimitropoulos return (spa->spa_unflushed_stats.sus_memused);
696814dcd43SSerapheim Dimitropoulos }
697814dcd43SSerapheim Dimitropoulos
698814dcd43SSerapheim Dimitropoulos static boolean_t
spa_log_exceeds_memlimit(spa_t * spa)699814dcd43SSerapheim Dimitropoulos spa_log_exceeds_memlimit(spa_t *spa)
700814dcd43SSerapheim Dimitropoulos {
701814dcd43SSerapheim Dimitropoulos if (spa_log_sm_memused(spa) > zfs_unflushed_max_mem_amt)
702814dcd43SSerapheim Dimitropoulos return (B_TRUE);
703814dcd43SSerapheim Dimitropoulos
704814dcd43SSerapheim Dimitropoulos uint64_t system_mem_allowed = ((physmem * PAGESIZE) *
705814dcd43SSerapheim Dimitropoulos zfs_unflushed_max_mem_ppm) / 1000000;
706814dcd43SSerapheim Dimitropoulos if (spa_log_sm_memused(spa) > system_mem_allowed)
707814dcd43SSerapheim Dimitropoulos return (B_TRUE);
708814dcd43SSerapheim Dimitropoulos
709814dcd43SSerapheim Dimitropoulos return (B_FALSE);
710814dcd43SSerapheim Dimitropoulos }
711814dcd43SSerapheim Dimitropoulos
712814dcd43SSerapheim Dimitropoulos boolean_t
spa_flush_all_logs_requested(spa_t * spa)713814dcd43SSerapheim Dimitropoulos spa_flush_all_logs_requested(spa_t *spa)
714814dcd43SSerapheim Dimitropoulos {
715814dcd43SSerapheim Dimitropoulos return (spa->spa_log_flushall_txg != 0);
716814dcd43SSerapheim Dimitropoulos }
717814dcd43SSerapheim Dimitropoulos
718814dcd43SSerapheim Dimitropoulos void
spa_flush_metaslabs(spa_t * spa,dmu_tx_t * tx)719814dcd43SSerapheim Dimitropoulos spa_flush_metaslabs(spa_t *spa, dmu_tx_t *tx)
720814dcd43SSerapheim Dimitropoulos {
721814dcd43SSerapheim Dimitropoulos uint64_t txg = dmu_tx_get_txg(tx);
722814dcd43SSerapheim Dimitropoulos
723814dcd43SSerapheim Dimitropoulos if (spa_sync_pass(spa) != 1)
724814dcd43SSerapheim Dimitropoulos return;
725814dcd43SSerapheim Dimitropoulos
726814dcd43SSerapheim Dimitropoulos if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP))
727814dcd43SSerapheim Dimitropoulos return;
728814dcd43SSerapheim Dimitropoulos
729814dcd43SSerapheim Dimitropoulos /*
730814dcd43SSerapheim Dimitropoulos * If we don't have any metaslabs with unflushed changes
731814dcd43SSerapheim Dimitropoulos * return immediately.
732814dcd43SSerapheim Dimitropoulos */
733814dcd43SSerapheim Dimitropoulos if (avl_numnodes(&spa->spa_metaslabs_by_flushed) == 0)
734814dcd43SSerapheim Dimitropoulos return;
735814dcd43SSerapheim Dimitropoulos
736814dcd43SSerapheim Dimitropoulos /*
737814dcd43SSerapheim Dimitropoulos * During SPA export we leave a few empty TXGs to go by [see
738814dcd43SSerapheim Dimitropoulos * spa_final_dirty_txg() to understand why]. For this specific
739814dcd43SSerapheim Dimitropoulos * case, it is important to not flush any metaslabs as that
740814dcd43SSerapheim Dimitropoulos * would dirty this TXG.
741814dcd43SSerapheim Dimitropoulos *
742814dcd43SSerapheim Dimitropoulos * That said, during one of these dirty TXGs that is less or
743814dcd43SSerapheim Dimitropoulos * equal to spa_final_dirty(), spa_unload() will request that
744814dcd43SSerapheim Dimitropoulos * we try to flush all the metaslabs for that TXG before
745814dcd43SSerapheim Dimitropoulos * exporting the pool, thus we ensure that we didn't get a
746814dcd43SSerapheim Dimitropoulos * request of flushing everything before we attempt to return
747814dcd43SSerapheim Dimitropoulos * immediately.
748814dcd43SSerapheim Dimitropoulos */
749814dcd43SSerapheim Dimitropoulos if (spa->spa_uberblock.ub_rootbp.blk_birth < txg &&
750814dcd43SSerapheim Dimitropoulos !dmu_objset_is_dirty(spa_meta_objset(spa), txg) &&
751814dcd43SSerapheim Dimitropoulos !spa_flush_all_logs_requested(spa))
752814dcd43SSerapheim Dimitropoulos return;
753814dcd43SSerapheim Dimitropoulos
754814dcd43SSerapheim Dimitropoulos /*
755814dcd43SSerapheim Dimitropoulos * We need to generate a log space map before flushing because this
756814dcd43SSerapheim Dimitropoulos * will set up the in-memory data (i.e. node in spa_sm_logs_by_txg)
757814dcd43SSerapheim Dimitropoulos * for this TXG's flushed metaslab count (aka sls_mscount which is
758814dcd43SSerapheim Dimitropoulos * manipulated in many ways down the metaslab_flush() codepath).
759814dcd43SSerapheim Dimitropoulos *
760814dcd43SSerapheim Dimitropoulos * That is not to say that we may generate a log space map when we
761814dcd43SSerapheim Dimitropoulos * don't need it. If we are flushing metaslabs, that means that we
762814dcd43SSerapheim Dimitropoulos * were going to write changes to disk anyway, so even if we were
763814dcd43SSerapheim Dimitropoulos * not flushing, a log space map would have been created anyway in
764814dcd43SSerapheim Dimitropoulos * metaslab_sync().
765814dcd43SSerapheim Dimitropoulos */
766814dcd43SSerapheim Dimitropoulos spa_generate_syncing_log_sm(spa, tx);
767814dcd43SSerapheim Dimitropoulos
768814dcd43SSerapheim Dimitropoulos /*
769814dcd43SSerapheim Dimitropoulos * This variable tells us how many metaslabs we want to flush based
770814dcd43SSerapheim Dimitropoulos * on the block-heuristic of our flushing algorithm (see block comment
771814dcd43SSerapheim Dimitropoulos * of log space map feature). We also decrement this as we flush
772814dcd43SSerapheim Dimitropoulos * metaslabs and attempt to destroy old log space maps.
773814dcd43SSerapheim Dimitropoulos */
774814dcd43SSerapheim Dimitropoulos uint64_t want_to_flush;
775814dcd43SSerapheim Dimitropoulos if (spa_flush_all_logs_requested(spa)) {
776814dcd43SSerapheim Dimitropoulos ASSERT3S(spa_state(spa), ==, POOL_STATE_EXPORTED);
777814dcd43SSerapheim Dimitropoulos want_to_flush = avl_numnodes(&spa->spa_metaslabs_by_flushed);
778814dcd43SSerapheim Dimitropoulos } else {
779814dcd43SSerapheim Dimitropoulos want_to_flush = spa_estimate_metaslabs_to_flush(spa);
780814dcd43SSerapheim Dimitropoulos }
781814dcd43SSerapheim Dimitropoulos
782814dcd43SSerapheim Dimitropoulos ASSERT3U(avl_numnodes(&spa->spa_metaslabs_by_flushed), >=,
783814dcd43SSerapheim Dimitropoulos want_to_flush);
784814dcd43SSerapheim Dimitropoulos
785814dcd43SSerapheim Dimitropoulos /* Used purely for verification purposes */
786814dcd43SSerapheim Dimitropoulos uint64_t visited = 0;
787814dcd43SSerapheim Dimitropoulos
788814dcd43SSerapheim Dimitropoulos /*
789814dcd43SSerapheim Dimitropoulos * Ideally we would only iterate through spa_metaslabs_by_flushed
790814dcd43SSerapheim Dimitropoulos * using only one variable (curr). We can't do that because
791814dcd43SSerapheim Dimitropoulos * metaslab_flush() mutates position of curr in the AVL when
792814dcd43SSerapheim Dimitropoulos * it flushes that metaslab by moving it to the end of the tree.
793814dcd43SSerapheim Dimitropoulos * Thus we always keep track of the original next node of the
794814dcd43SSerapheim Dimitropoulos * current node (curr) in another variable (next).
795814dcd43SSerapheim Dimitropoulos */
796814dcd43SSerapheim Dimitropoulos metaslab_t *next = NULL;
797814dcd43SSerapheim Dimitropoulos for (metaslab_t *curr = avl_first(&spa->spa_metaslabs_by_flushed);
798814dcd43SSerapheim Dimitropoulos curr != NULL; curr = next) {
799814dcd43SSerapheim Dimitropoulos next = AVL_NEXT(&spa->spa_metaslabs_by_flushed, curr);
800814dcd43SSerapheim Dimitropoulos
801814dcd43SSerapheim Dimitropoulos /*
802814dcd43SSerapheim Dimitropoulos * If this metaslab has been flushed this txg then we've done
803814dcd43SSerapheim Dimitropoulos * a full circle over the metaslabs.
804814dcd43SSerapheim Dimitropoulos */
805814dcd43SSerapheim Dimitropoulos if (metaslab_unflushed_txg(curr) == txg)
806814dcd43SSerapheim Dimitropoulos break;
807814dcd43SSerapheim Dimitropoulos
808814dcd43SSerapheim Dimitropoulos /*
809814dcd43SSerapheim Dimitropoulos * If we are done flushing for the block heuristic and the
810814dcd43SSerapheim Dimitropoulos * unflushed changes don't exceed the memory limit just stop.
811814dcd43SSerapheim Dimitropoulos */
812814dcd43SSerapheim Dimitropoulos if (want_to_flush == 0 && !spa_log_exceeds_memlimit(spa))
813814dcd43SSerapheim Dimitropoulos break;
814814dcd43SSerapheim Dimitropoulos
815814dcd43SSerapheim Dimitropoulos mutex_enter(&curr->ms_sync_lock);
816814dcd43SSerapheim Dimitropoulos mutex_enter(&curr->ms_lock);
817814dcd43SSerapheim Dimitropoulos boolean_t flushed = metaslab_flush(curr, tx);
818814dcd43SSerapheim Dimitropoulos mutex_exit(&curr->ms_lock);
819814dcd43SSerapheim Dimitropoulos mutex_exit(&curr->ms_sync_lock);
820814dcd43SSerapheim Dimitropoulos
821814dcd43SSerapheim Dimitropoulos /*
822814dcd43SSerapheim Dimitropoulos * If we failed to flush a metaslab (because it was loading),
823814dcd43SSerapheim Dimitropoulos * then we are done with the block heuristic as it's not
824814dcd43SSerapheim Dimitropoulos * possible to destroy any log space maps once you've skipped
825814dcd43SSerapheim Dimitropoulos * a metaslab. In that case we just set our counter to 0 but
826814dcd43SSerapheim Dimitropoulos * we continue looping in case there is still memory pressure
827814dcd43SSerapheim Dimitropoulos * due to unflushed changes. Note that, flushing a metaslab
828814dcd43SSerapheim Dimitropoulos * that is not the oldest flushed in the pool, will never
829814dcd43SSerapheim Dimitropoulos * destroy any log space maps [see spa_cleanup_old_sm_logs()].
830814dcd43SSerapheim Dimitropoulos */
831814dcd43SSerapheim Dimitropoulos if (!flushed) {
832814dcd43SSerapheim Dimitropoulos want_to_flush = 0;
833814dcd43SSerapheim Dimitropoulos } else if (want_to_flush > 0) {
834814dcd43SSerapheim Dimitropoulos want_to_flush--;
835814dcd43SSerapheim Dimitropoulos }
836814dcd43SSerapheim Dimitropoulos
837814dcd43SSerapheim Dimitropoulos visited++;
838814dcd43SSerapheim Dimitropoulos }
839814dcd43SSerapheim Dimitropoulos ASSERT3U(avl_numnodes(&spa->spa_metaslabs_by_flushed), >=, visited);
840814dcd43SSerapheim Dimitropoulos }
841814dcd43SSerapheim Dimitropoulos
842814dcd43SSerapheim Dimitropoulos /*
843814dcd43SSerapheim Dimitropoulos * Close the log space map for this TXG and update the block counts
844814dcd43SSerapheim Dimitropoulos * for the the log's in-memory structure and the summary.
845814dcd43SSerapheim Dimitropoulos */
846814dcd43SSerapheim Dimitropoulos void
spa_sync_close_syncing_log_sm(spa_t * spa)847814dcd43SSerapheim Dimitropoulos spa_sync_close_syncing_log_sm(spa_t *spa)
848814dcd43SSerapheim Dimitropoulos {
849814dcd43SSerapheim Dimitropoulos if (spa_syncing_log_sm(spa) == NULL)
850814dcd43SSerapheim Dimitropoulos return;
851814dcd43SSerapheim Dimitropoulos ASSERT(spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP));
852814dcd43SSerapheim Dimitropoulos
853814dcd43SSerapheim Dimitropoulos spa_log_sm_t *sls = avl_last(&spa->spa_sm_logs_by_txg);
854814dcd43SSerapheim Dimitropoulos ASSERT3U(sls->sls_txg, ==, spa_syncing_txg(spa));
855814dcd43SSerapheim Dimitropoulos
856814dcd43SSerapheim Dimitropoulos sls->sls_nblocks = space_map_nblocks(spa_syncing_log_sm(spa));
857814dcd43SSerapheim Dimitropoulos spa->spa_unflushed_stats.sus_nblocks += sls->sls_nblocks;
858814dcd43SSerapheim Dimitropoulos
859814dcd43SSerapheim Dimitropoulos /*
860814dcd43SSerapheim Dimitropoulos * Note that we can't assert that sls_mscount is not 0,
861814dcd43SSerapheim Dimitropoulos * because there is the case where the first metaslab
862814dcd43SSerapheim Dimitropoulos * in spa_metaslabs_by_flushed is loading and we were
863814dcd43SSerapheim Dimitropoulos * not able to flush any metaslabs the current TXG.
864814dcd43SSerapheim Dimitropoulos */
865814dcd43SSerapheim Dimitropoulos ASSERT(sls->sls_nblocks != 0);
866814dcd43SSerapheim Dimitropoulos
867814dcd43SSerapheim Dimitropoulos spa_log_summary_add_incoming_blocks(spa, sls->sls_nblocks);
868814dcd43SSerapheim Dimitropoulos spa_log_summary_verify_counts(spa);
869814dcd43SSerapheim Dimitropoulos
870814dcd43SSerapheim Dimitropoulos space_map_close(spa->spa_syncing_log_sm);
871814dcd43SSerapheim Dimitropoulos spa->spa_syncing_log_sm = NULL;
872814dcd43SSerapheim Dimitropoulos
873814dcd43SSerapheim Dimitropoulos /*
874814dcd43SSerapheim Dimitropoulos * At this point we tried to flush as many metaslabs as we
875814dcd43SSerapheim Dimitropoulos * can as the pool is getting exported. Reset the "flush all"
876814dcd43SSerapheim Dimitropoulos * so the last few TXGs before closing the pool can be empty
877814dcd43SSerapheim Dimitropoulos * (e.g. not dirty).
878814dcd43SSerapheim Dimitropoulos */
879814dcd43SSerapheim Dimitropoulos if (spa_flush_all_logs_requested(spa)) {
880814dcd43SSerapheim Dimitropoulos ASSERT3S(spa_state(spa), ==, POOL_STATE_EXPORTED);
881814dcd43SSerapheim Dimitropoulos spa->spa_log_flushall_txg = 0;
882814dcd43SSerapheim Dimitropoulos }
883814dcd43SSerapheim Dimitropoulos }
884814dcd43SSerapheim Dimitropoulos
885814dcd43SSerapheim Dimitropoulos void
spa_cleanup_old_sm_logs(spa_t * spa,dmu_tx_t * tx)886814dcd43SSerapheim Dimitropoulos spa_cleanup_old_sm_logs(spa_t *spa, dmu_tx_t *tx)
887814dcd43SSerapheim Dimitropoulos {
888814dcd43SSerapheim Dimitropoulos objset_t *mos = spa_meta_objset(spa);
889814dcd43SSerapheim Dimitropoulos
890814dcd43SSerapheim Dimitropoulos uint64_t spacemap_zap;
891814dcd43SSerapheim Dimitropoulos int error = zap_lookup(mos, DMU_POOL_DIRECTORY_OBJECT,
892814dcd43SSerapheim Dimitropoulos DMU_POOL_LOG_SPACEMAP_ZAP, sizeof (spacemap_zap), 1, &spacemap_zap);
893814dcd43SSerapheim Dimitropoulos if (error == ENOENT) {
894814dcd43SSerapheim Dimitropoulos ASSERT(avl_is_empty(&spa->spa_sm_logs_by_txg));
895814dcd43SSerapheim Dimitropoulos return;
896814dcd43SSerapheim Dimitropoulos }
897814dcd43SSerapheim Dimitropoulos VERIFY0(error);
898814dcd43SSerapheim Dimitropoulos
899814dcd43SSerapheim Dimitropoulos metaslab_t *oldest = avl_first(&spa->spa_metaslabs_by_flushed);
900814dcd43SSerapheim Dimitropoulos uint64_t oldest_flushed_txg = metaslab_unflushed_txg(oldest);
901814dcd43SSerapheim Dimitropoulos
902814dcd43SSerapheim Dimitropoulos /* Free all log space maps older than the oldest_flushed_txg. */
903814dcd43SSerapheim Dimitropoulos for (spa_log_sm_t *sls = avl_first(&spa->spa_sm_logs_by_txg);
904814dcd43SSerapheim Dimitropoulos sls && sls->sls_txg < oldest_flushed_txg;
905814dcd43SSerapheim Dimitropoulos sls = avl_first(&spa->spa_sm_logs_by_txg)) {
906814dcd43SSerapheim Dimitropoulos ASSERT0(sls->sls_mscount);
907814dcd43SSerapheim Dimitropoulos avl_remove(&spa->spa_sm_logs_by_txg, sls);
908814dcd43SSerapheim Dimitropoulos space_map_free_obj(mos, sls->sls_sm_obj, tx);
909814dcd43SSerapheim Dimitropoulos VERIFY0(zap_remove_int(mos, spacemap_zap, sls->sls_txg, tx));
910814dcd43SSerapheim Dimitropoulos spa->spa_unflushed_stats.sus_nblocks -= sls->sls_nblocks;
911814dcd43SSerapheim Dimitropoulos kmem_free(sls, sizeof (spa_log_sm_t));
912814dcd43SSerapheim Dimitropoulos }
913814dcd43SSerapheim Dimitropoulos }
914814dcd43SSerapheim Dimitropoulos
915814dcd43SSerapheim Dimitropoulos static spa_log_sm_t *
spa_log_sm_alloc(uint64_t sm_obj,uint64_t txg)916814dcd43SSerapheim Dimitropoulos spa_log_sm_alloc(uint64_t sm_obj, uint64_t txg)
917814dcd43SSerapheim Dimitropoulos {
918814dcd43SSerapheim Dimitropoulos spa_log_sm_t *sls = kmem_zalloc(sizeof (*sls), KM_SLEEP);
919814dcd43SSerapheim Dimitropoulos
920814dcd43SSerapheim Dimitropoulos sls->sls_sm_obj = sm_obj;
921814dcd43SSerapheim Dimitropoulos sls->sls_txg = txg;
922814dcd43SSerapheim Dimitropoulos return (sls);
923814dcd43SSerapheim Dimitropoulos }
924814dcd43SSerapheim Dimitropoulos
925814dcd43SSerapheim Dimitropoulos void
spa_generate_syncing_log_sm(spa_t * spa,dmu_tx_t * tx)926814dcd43SSerapheim Dimitropoulos spa_generate_syncing_log_sm(spa_t *spa, dmu_tx_t *tx)
927814dcd43SSerapheim Dimitropoulos {
928814dcd43SSerapheim Dimitropoulos uint64_t txg = dmu_tx_get_txg(tx);
929814dcd43SSerapheim Dimitropoulos objset_t *mos = spa_meta_objset(spa);
930814dcd43SSerapheim Dimitropoulos
931814dcd43SSerapheim Dimitropoulos if (spa_syncing_log_sm(spa) != NULL)
932814dcd43SSerapheim Dimitropoulos return;
933814dcd43SSerapheim Dimitropoulos
934814dcd43SSerapheim Dimitropoulos if (!spa_feature_is_enabled(spa, SPA_FEATURE_LOG_SPACEMAP))
935814dcd43SSerapheim Dimitropoulos return;
936814dcd43SSerapheim Dimitropoulos
937814dcd43SSerapheim Dimitropoulos uint64_t spacemap_zap;
938814dcd43SSerapheim Dimitropoulos int error = zap_lookup(mos, DMU_POOL_DIRECTORY_OBJECT,
939814dcd43SSerapheim Dimitropoulos DMU_POOL_LOG_SPACEMAP_ZAP, sizeof (spacemap_zap), 1, &spacemap_zap);
940814dcd43SSerapheim Dimitropoulos if (error == ENOENT) {
941814dcd43SSerapheim Dimitropoulos ASSERT(avl_is_empty(&spa->spa_sm_logs_by_txg));
942814dcd43SSerapheim Dimitropoulos
943814dcd43SSerapheim Dimitropoulos error = 0;
944814dcd43SSerapheim Dimitropoulos spacemap_zap = zap_create(mos,
945814dcd43SSerapheim Dimitropoulos DMU_OTN_ZAP_METADATA, DMU_OT_NONE, 0, tx);
946814dcd43SSerapheim Dimitropoulos VERIFY0(zap_add(mos, DMU_POOL_DIRECTORY_OBJECT,
947814dcd43SSerapheim Dimitropoulos DMU_POOL_LOG_SPACEMAP_ZAP, sizeof (spacemap_zap), 1,
948814dcd43SSerapheim Dimitropoulos &spacemap_zap, tx));
949814dcd43SSerapheim Dimitropoulos spa_feature_incr(spa, SPA_FEATURE_LOG_SPACEMAP, tx);
950814dcd43SSerapheim Dimitropoulos }
951814dcd43SSerapheim Dimitropoulos VERIFY0(error);
952814dcd43SSerapheim Dimitropoulos
953814dcd43SSerapheim Dimitropoulos uint64_t sm_obj;
954814dcd43SSerapheim Dimitropoulos ASSERT3U(zap_lookup_int_key(mos, spacemap_zap, txg, &sm_obj),
955814dcd43SSerapheim Dimitropoulos ==, ENOENT);
956814dcd43SSerapheim Dimitropoulos sm_obj = space_map_alloc(mos, zfs_log_sm_blksz, tx);
957814dcd43SSerapheim Dimitropoulos VERIFY0(zap_add_int_key(mos, spacemap_zap, txg, sm_obj, tx));
958814dcd43SSerapheim Dimitropoulos avl_add(&spa->spa_sm_logs_by_txg, spa_log_sm_alloc(sm_obj, txg));
959814dcd43SSerapheim Dimitropoulos
960814dcd43SSerapheim Dimitropoulos /*
961814dcd43SSerapheim Dimitropoulos * We pass UINT64_MAX as the space map's representation size
962814dcd43SSerapheim Dimitropoulos * and SPA_MINBLOCKSHIFT as the shift, to make the space map
963814dcd43SSerapheim Dimitropoulos * accept any sorts of segments since there's no real advantage
964814dcd43SSerapheim Dimitropoulos * to being more restrictive (given that we're already going
965814dcd43SSerapheim Dimitropoulos * to be using 2-word entries).
966814dcd43SSerapheim Dimitropoulos */
967814dcd43SSerapheim Dimitropoulos VERIFY0(space_map_open(&spa->spa_syncing_log_sm, mos, sm_obj,
968814dcd43SSerapheim Dimitropoulos 0, UINT64_MAX, SPA_MINBLOCKSHIFT));
969814dcd43SSerapheim Dimitropoulos
970814dcd43SSerapheim Dimitropoulos /*
971814dcd43SSerapheim Dimitropoulos * If the log space map feature was just enabled, the blocklimit
972814dcd43SSerapheim Dimitropoulos * has not yet been set.
973814dcd43SSerapheim Dimitropoulos */
974814dcd43SSerapheim Dimitropoulos if (spa_log_sm_blocklimit(spa) == 0)
975814dcd43SSerapheim Dimitropoulos spa_log_sm_set_blocklimit(spa);
976814dcd43SSerapheim Dimitropoulos }
977814dcd43SSerapheim Dimitropoulos
978814dcd43SSerapheim Dimitropoulos /*
979814dcd43SSerapheim Dimitropoulos * Find all the log space maps stored in the space map ZAP and sort
980814dcd43SSerapheim Dimitropoulos * them by their TXG in spa_sm_logs_by_txg.
981814dcd43SSerapheim Dimitropoulos */
982814dcd43SSerapheim Dimitropoulos static int
spa_ld_log_sm_metadata(spa_t * spa)983814dcd43SSerapheim Dimitropoulos spa_ld_log_sm_metadata(spa_t *spa)
984814dcd43SSerapheim Dimitropoulos {
985814dcd43SSerapheim Dimitropoulos int error;
986814dcd43SSerapheim Dimitropoulos uint64_t spacemap_zap;
987814dcd43SSerapheim Dimitropoulos
988814dcd43SSerapheim Dimitropoulos ASSERT(avl_is_empty(&spa->spa_sm_logs_by_txg));
989814dcd43SSerapheim Dimitropoulos
990814dcd43SSerapheim Dimitropoulos error = zap_lookup(spa_meta_objset(spa), DMU_POOL_DIRECTORY_OBJECT,
991814dcd43SSerapheim Dimitropoulos DMU_POOL_LOG_SPACEMAP_ZAP, sizeof (spacemap_zap), 1, &spacemap_zap);
992814dcd43SSerapheim Dimitropoulos if (error == ENOENT) {
993814dcd43SSerapheim Dimitropoulos /* the space map ZAP doesn't exist yet */
994814dcd43SSerapheim Dimitropoulos return (0);
995814dcd43SSerapheim Dimitropoulos } else if (error != 0) {
996814dcd43SSerapheim Dimitropoulos spa_load_failed(spa, "spa_ld_log_sm_metadata(): failed at "
997814dcd43SSerapheim Dimitropoulos "zap_lookup(DMU_POOL_DIRECTORY_OBJECT) [error %d]",
998814dcd43SSerapheim Dimitropoulos error);
999814dcd43SSerapheim Dimitropoulos return (error);
1000814dcd43SSerapheim Dimitropoulos }
1001814dcd43SSerapheim Dimitropoulos
1002814dcd43SSerapheim Dimitropoulos zap_cursor_t zc;
1003814dcd43SSerapheim Dimitropoulos zap_attribute_t za;
1004814dcd43SSerapheim Dimitropoulos for (zap_cursor_init(&zc, spa_meta_objset(spa), spacemap_zap);
1005814dcd43SSerapheim Dimitropoulos (error = zap_cursor_retrieve(&zc, &za)) == 0;
1006814dcd43SSerapheim Dimitropoulos zap_cursor_advance(&zc)) {
1007814dcd43SSerapheim Dimitropoulos uint64_t log_txg = zfs_strtonum(za.za_name, NULL);
1008814dcd43SSerapheim Dimitropoulos spa_log_sm_t *sls =
1009814dcd43SSerapheim Dimitropoulos spa_log_sm_alloc(za.za_first_integer, log_txg);
1010814dcd43SSerapheim Dimitropoulos avl_add(&spa->spa_sm_logs_by_txg, sls);
1011814dcd43SSerapheim Dimitropoulos }
1012814dcd43SSerapheim Dimitropoulos zap_cursor_fini(&zc);
1013814dcd43SSerapheim Dimitropoulos if (error != ENOENT) {
1014814dcd43SSerapheim Dimitropoulos spa_load_failed(spa, "spa_ld_log_sm_metadata(): failed at "
1015814dcd43SSerapheim Dimitropoulos "zap_cursor_retrieve(spacemap_zap) [error %d]",
1016814dcd43SSerapheim Dimitropoulos error);
1017814dcd43SSerapheim Dimitropoulos return (error);
1018814dcd43SSerapheim Dimitropoulos }
1019814dcd43SSerapheim Dimitropoulos
1020814dcd43SSerapheim Dimitropoulos for (metaslab_t *m = avl_first(&spa->spa_metaslabs_by_flushed);
1021814dcd43SSerapheim Dimitropoulos m; m = AVL_NEXT(&spa->spa_metaslabs_by_flushed, m)) {
1022814dcd43SSerapheim Dimitropoulos spa_log_sm_t target = { .sls_txg = metaslab_unflushed_txg(m) };
1023814dcd43SSerapheim Dimitropoulos spa_log_sm_t *sls = avl_find(&spa->spa_sm_logs_by_txg,
1024814dcd43SSerapheim Dimitropoulos &target, NULL);
1025814dcd43SSerapheim Dimitropoulos
1026814dcd43SSerapheim Dimitropoulos /*
1027814dcd43SSerapheim Dimitropoulos * At this point if sls is zero it means that a bug occurred
1028814dcd43SSerapheim Dimitropoulos * in ZFS the last time the pool was open or earlier in the
1029814dcd43SSerapheim Dimitropoulos * import code path. In general, we would have placed a
1030814dcd43SSerapheim Dimitropoulos * VERIFY() here or in this case just let the kernel panic
1031814dcd43SSerapheim Dimitropoulos * with NULL pointer dereference when incrementing sls_mscount,
1032814dcd43SSerapheim Dimitropoulos * but since this is the import code path we can be a bit more
1033814dcd43SSerapheim Dimitropoulos * lenient. Thus, for DEBUG bits we always cause a panic, while
1034814dcd43SSerapheim Dimitropoulos * in production we log the error and just fail the import.
1035814dcd43SSerapheim Dimitropoulos */
1036814dcd43SSerapheim Dimitropoulos ASSERT(sls != NULL);
1037814dcd43SSerapheim Dimitropoulos if (sls == NULL) {
1038814dcd43SSerapheim Dimitropoulos spa_load_failed(spa, "spa_ld_log_sm_metadata(): bug "
1039814dcd43SSerapheim Dimitropoulos "encountered: could not find log spacemap for "
1040814dcd43SSerapheim Dimitropoulos "TXG %ld [error %d]",
1041814dcd43SSerapheim Dimitropoulos metaslab_unflushed_txg(m), ENOENT);
1042814dcd43SSerapheim Dimitropoulos return (ENOENT);
1043814dcd43SSerapheim Dimitropoulos }
1044814dcd43SSerapheim Dimitropoulos sls->sls_mscount++;
1045814dcd43SSerapheim Dimitropoulos }
1046814dcd43SSerapheim Dimitropoulos
1047814dcd43SSerapheim Dimitropoulos return (0);
1048814dcd43SSerapheim Dimitropoulos }
1049814dcd43SSerapheim Dimitropoulos
1050814dcd43SSerapheim Dimitropoulos typedef struct spa_ld_log_sm_arg {
1051814dcd43SSerapheim Dimitropoulos spa_t *slls_spa;
1052814dcd43SSerapheim Dimitropoulos uint64_t slls_txg;
1053814dcd43SSerapheim Dimitropoulos } spa_ld_log_sm_arg_t;
1054814dcd43SSerapheim Dimitropoulos
1055814dcd43SSerapheim Dimitropoulos static int
spa_ld_log_sm_cb(space_map_entry_t * sme,void * arg)1056814dcd43SSerapheim Dimitropoulos spa_ld_log_sm_cb(space_map_entry_t *sme, void *arg)
1057814dcd43SSerapheim Dimitropoulos {
1058814dcd43SSerapheim Dimitropoulos uint64_t offset = sme->sme_offset;
1059814dcd43SSerapheim Dimitropoulos uint64_t size = sme->sme_run;
1060814dcd43SSerapheim Dimitropoulos uint32_t vdev_id = sme->sme_vdev;
1061814dcd43SSerapheim Dimitropoulos spa_ld_log_sm_arg_t *slls = arg;
1062814dcd43SSerapheim Dimitropoulos spa_t *spa = slls->slls_spa;
1063814dcd43SSerapheim Dimitropoulos
1064814dcd43SSerapheim Dimitropoulos vdev_t *vd = vdev_lookup_top(spa, vdev_id);
1065814dcd43SSerapheim Dimitropoulos
1066814dcd43SSerapheim Dimitropoulos /*
1067814dcd43SSerapheim Dimitropoulos * If the vdev has been removed (i.e. it is indirect or a hole)
1068814dcd43SSerapheim Dimitropoulos * skip this entry. The contents of this vdev have already moved
1069814dcd43SSerapheim Dimitropoulos * elsewhere.
1070814dcd43SSerapheim Dimitropoulos */
1071814dcd43SSerapheim Dimitropoulos if (!vdev_is_concrete(vd))
1072814dcd43SSerapheim Dimitropoulos return (0);
1073814dcd43SSerapheim Dimitropoulos
1074814dcd43SSerapheim Dimitropoulos metaslab_t *ms = vd->vdev_ms[offset >> vd->vdev_ms_shift];
1075814dcd43SSerapheim Dimitropoulos ASSERT(!ms->ms_loaded);
1076814dcd43SSerapheim Dimitropoulos
1077814dcd43SSerapheim Dimitropoulos /*
1078814dcd43SSerapheim Dimitropoulos * If we have already flushed entries for this TXG to this
1079814dcd43SSerapheim Dimitropoulos * metaslab's space map, then ignore it. Note that we flush
1080814dcd43SSerapheim Dimitropoulos * before processing any allocations/frees for that TXG, so
1081814dcd43SSerapheim Dimitropoulos * the metaslab's space map only has entries from *before*
1082814dcd43SSerapheim Dimitropoulos * the unflushed TXG.
1083814dcd43SSerapheim Dimitropoulos */
1084814dcd43SSerapheim Dimitropoulos if (slls->slls_txg < metaslab_unflushed_txg(ms))
1085814dcd43SSerapheim Dimitropoulos return (0);
1086814dcd43SSerapheim Dimitropoulos
1087814dcd43SSerapheim Dimitropoulos switch (sme->sme_type) {
1088814dcd43SSerapheim Dimitropoulos case SM_ALLOC:
1089814dcd43SSerapheim Dimitropoulos range_tree_remove_xor_add_segment(offset, offset + size,
1090814dcd43SSerapheim Dimitropoulos ms->ms_unflushed_frees, ms->ms_unflushed_allocs);
1091814dcd43SSerapheim Dimitropoulos break;
1092814dcd43SSerapheim Dimitropoulos case SM_FREE:
1093814dcd43SSerapheim Dimitropoulos range_tree_remove_xor_add_segment(offset, offset + size,
1094814dcd43SSerapheim Dimitropoulos ms->ms_unflushed_allocs, ms->ms_unflushed_frees);
1095814dcd43SSerapheim Dimitropoulos break;
1096814dcd43SSerapheim Dimitropoulos default:
1097814dcd43SSerapheim Dimitropoulos panic("invalid maptype_t");
1098814dcd43SSerapheim Dimitropoulos break;
1099814dcd43SSerapheim Dimitropoulos }
1100814dcd43SSerapheim Dimitropoulos return (0);
1101814dcd43SSerapheim Dimitropoulos }
1102814dcd43SSerapheim Dimitropoulos
1103814dcd43SSerapheim Dimitropoulos static int
spa_ld_log_sm_data(spa_t * spa)1104814dcd43SSerapheim Dimitropoulos spa_ld_log_sm_data(spa_t *spa)
1105814dcd43SSerapheim Dimitropoulos {
1106814dcd43SSerapheim Dimitropoulos int error = 0;
1107814dcd43SSerapheim Dimitropoulos
1108814dcd43SSerapheim Dimitropoulos /*
1109814dcd43SSerapheim Dimitropoulos * If we are not going to do any writes there is no need
1110814dcd43SSerapheim Dimitropoulos * to read the log space maps.
1111814dcd43SSerapheim Dimitropoulos */
1112814dcd43SSerapheim Dimitropoulos if (!spa_writeable(spa))
1113814dcd43SSerapheim Dimitropoulos return (0);
1114814dcd43SSerapheim Dimitropoulos
1115814dcd43SSerapheim Dimitropoulos ASSERT0(spa->spa_unflushed_stats.sus_nblocks);
1116814dcd43SSerapheim Dimitropoulos ASSERT0(spa->spa_unflushed_stats.sus_memused);
1117814dcd43SSerapheim Dimitropoulos
1118814dcd43SSerapheim Dimitropoulos hrtime_t read_logs_starttime = gethrtime();
1119814dcd43SSerapheim Dimitropoulos /* this is a no-op when we don't have space map logs */
1120814dcd43SSerapheim Dimitropoulos for (spa_log_sm_t *sls = avl_first(&spa->spa_sm_logs_by_txg);
1121814dcd43SSerapheim Dimitropoulos sls; sls = AVL_NEXT(&spa->spa_sm_logs_by_txg, sls)) {
1122814dcd43SSerapheim Dimitropoulos space_map_t *sm = NULL;
1123814dcd43SSerapheim Dimitropoulos error = space_map_open(&sm, spa_meta_objset(spa),
1124814dcd43SSerapheim Dimitropoulos sls->sls_sm_obj, 0, UINT64_MAX, SPA_MINBLOCKSHIFT);
1125814dcd43SSerapheim Dimitropoulos if (error != 0) {
1126814dcd43SSerapheim Dimitropoulos spa_load_failed(spa, "spa_ld_log_sm_data(): failed at "
1127814dcd43SSerapheim Dimitropoulos "space_map_open(obj=%llu) [error %d]",
1128814dcd43SSerapheim Dimitropoulos (u_longlong_t)sls->sls_sm_obj, error);
1129814dcd43SSerapheim Dimitropoulos goto out;
1130814dcd43SSerapheim Dimitropoulos }
1131814dcd43SSerapheim Dimitropoulos
1132814dcd43SSerapheim Dimitropoulos struct spa_ld_log_sm_arg vla = {
1133814dcd43SSerapheim Dimitropoulos .slls_spa = spa,
1134814dcd43SSerapheim Dimitropoulos .slls_txg = sls->sls_txg
1135814dcd43SSerapheim Dimitropoulos };
1136814dcd43SSerapheim Dimitropoulos error = space_map_iterate(sm, space_map_length(sm),
1137814dcd43SSerapheim Dimitropoulos spa_ld_log_sm_cb, &vla);
1138814dcd43SSerapheim Dimitropoulos if (error != 0) {
1139814dcd43SSerapheim Dimitropoulos space_map_close(sm);
1140814dcd43SSerapheim Dimitropoulos spa_load_failed(spa, "spa_ld_log_sm_data(): failed "
1141814dcd43SSerapheim Dimitropoulos "at space_map_iterate(obj=%llu) [error %d]",
1142814dcd43SSerapheim Dimitropoulos (u_longlong_t)sls->sls_sm_obj, error);
1143814dcd43SSerapheim Dimitropoulos goto out;
1144814dcd43SSerapheim Dimitropoulos }
1145814dcd43SSerapheim Dimitropoulos
1146814dcd43SSerapheim Dimitropoulos ASSERT0(sls->sls_nblocks);
1147814dcd43SSerapheim Dimitropoulos sls->sls_nblocks = space_map_nblocks(sm);
1148814dcd43SSerapheim Dimitropoulos spa->spa_unflushed_stats.sus_nblocks += sls->sls_nblocks;
1149814dcd43SSerapheim Dimitropoulos summary_add_data(spa, sls->sls_txg,
1150814dcd43SSerapheim Dimitropoulos sls->sls_mscount, sls->sls_nblocks);
1151814dcd43SSerapheim Dimitropoulos
1152814dcd43SSerapheim Dimitropoulos space_map_close(sm);
1153814dcd43SSerapheim Dimitropoulos }
1154814dcd43SSerapheim Dimitropoulos hrtime_t read_logs_endtime = gethrtime();
1155814dcd43SSerapheim Dimitropoulos spa_load_note(spa,
1156814dcd43SSerapheim Dimitropoulos "read %llu log space maps (%llu total blocks - blksz = %llu bytes) "
1157814dcd43SSerapheim Dimitropoulos "in %lld ms", (u_longlong_t)avl_numnodes(&spa->spa_sm_logs_by_txg),
1158814dcd43SSerapheim Dimitropoulos (u_longlong_t)spa_log_sm_nblocks(spa),
1159814dcd43SSerapheim Dimitropoulos (u_longlong_t)zfs_log_sm_blksz,
1160814dcd43SSerapheim Dimitropoulos (longlong_t)((read_logs_endtime - read_logs_starttime) / 1000000));
1161814dcd43SSerapheim Dimitropoulos
1162814dcd43SSerapheim Dimitropoulos out:
1163814dcd43SSerapheim Dimitropoulos /*
1164814dcd43SSerapheim Dimitropoulos * Now that the metaslabs contain their unflushed changes:
1165814dcd43SSerapheim Dimitropoulos * [1] recalculate their actual allocated space
1166814dcd43SSerapheim Dimitropoulos * [2] recalculate their weights
1167814dcd43SSerapheim Dimitropoulos * [3] sum up the memory usage of their unflushed range trees
1168814dcd43SSerapheim Dimitropoulos * [4] optionally load them, if debug_load is set
1169814dcd43SSerapheim Dimitropoulos *
1170814dcd43SSerapheim Dimitropoulos * Note that even in the case where we get here because of an
1171814dcd43SSerapheim Dimitropoulos * error (e.g. error != 0), we still want to update the fields
1172814dcd43SSerapheim Dimitropoulos * below in order to have a proper teardown in spa_unload().
1173814dcd43SSerapheim Dimitropoulos */
1174814dcd43SSerapheim Dimitropoulos for (metaslab_t *m = avl_first(&spa->spa_metaslabs_by_flushed);
1175814dcd43SSerapheim Dimitropoulos m != NULL; m = AVL_NEXT(&spa->spa_metaslabs_by_flushed, m)) {
1176814dcd43SSerapheim Dimitropoulos mutex_enter(&m->ms_lock);
1177814dcd43SSerapheim Dimitropoulos m->ms_allocated_space = space_map_allocated(m->ms_sm) +
1178814dcd43SSerapheim Dimitropoulos range_tree_space(m->ms_unflushed_allocs) -
1179814dcd43SSerapheim Dimitropoulos range_tree_space(m->ms_unflushed_frees);
1180814dcd43SSerapheim Dimitropoulos
1181814dcd43SSerapheim Dimitropoulos vdev_t *vd = m->ms_group->mg_vd;
1182814dcd43SSerapheim Dimitropoulos metaslab_space_update(vd, m->ms_group->mg_class,
1183814dcd43SSerapheim Dimitropoulos range_tree_space(m->ms_unflushed_allocs), 0, 0);
1184814dcd43SSerapheim Dimitropoulos metaslab_space_update(vd, m->ms_group->mg_class,
1185814dcd43SSerapheim Dimitropoulos -range_tree_space(m->ms_unflushed_frees), 0, 0);
1186814dcd43SSerapheim Dimitropoulos
1187814dcd43SSerapheim Dimitropoulos ASSERT0(m->ms_weight & METASLAB_ACTIVE_MASK);
1188814dcd43SSerapheim Dimitropoulos metaslab_recalculate_weight_and_sort(m);
1189814dcd43SSerapheim Dimitropoulos
1190814dcd43SSerapheim Dimitropoulos spa->spa_unflushed_stats.sus_memused +=
1191814dcd43SSerapheim Dimitropoulos metaslab_unflushed_changes_memused(m);
1192814dcd43SSerapheim Dimitropoulos
1193814dcd43SSerapheim Dimitropoulos if (metaslab_debug_load && m->ms_sm != NULL) {
1194814dcd43SSerapheim Dimitropoulos VERIFY0(metaslab_load(m));
1195*af1d63abSPaul Dagnelie metaslab_set_selected_txg(m, 0);
1196814dcd43SSerapheim Dimitropoulos }
1197814dcd43SSerapheim Dimitropoulos mutex_exit(&m->ms_lock);
1198814dcd43SSerapheim Dimitropoulos }
1199814dcd43SSerapheim Dimitropoulos
1200814dcd43SSerapheim Dimitropoulos return (error);
1201814dcd43SSerapheim Dimitropoulos }
1202814dcd43SSerapheim Dimitropoulos
1203814dcd43SSerapheim Dimitropoulos static int
spa_ld_unflushed_txgs(vdev_t * vd)1204814dcd43SSerapheim Dimitropoulos spa_ld_unflushed_txgs(vdev_t *vd)
1205814dcd43SSerapheim Dimitropoulos {
1206814dcd43SSerapheim Dimitropoulos spa_t *spa = vd->vdev_spa;
1207814dcd43SSerapheim Dimitropoulos objset_t *mos = spa_meta_objset(spa);
1208814dcd43SSerapheim Dimitropoulos
1209814dcd43SSerapheim Dimitropoulos if (vd->vdev_top_zap == 0)
1210814dcd43SSerapheim Dimitropoulos return (0);
1211814dcd43SSerapheim Dimitropoulos
1212814dcd43SSerapheim Dimitropoulos uint64_t object = 0;
1213814dcd43SSerapheim Dimitropoulos int error = zap_lookup(mos, vd->vdev_top_zap,
1214814dcd43SSerapheim Dimitropoulos VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS,
1215814dcd43SSerapheim Dimitropoulos sizeof (uint64_t), 1, &object);
1216814dcd43SSerapheim Dimitropoulos if (error == ENOENT)
1217814dcd43SSerapheim Dimitropoulos return (0);
1218814dcd43SSerapheim Dimitropoulos else if (error != 0) {
1219814dcd43SSerapheim Dimitropoulos spa_load_failed(spa, "spa_ld_unflushed_txgs(): failed at "
1220814dcd43SSerapheim Dimitropoulos "zap_lookup(vdev_top_zap=%llu) [error %d]",
1221814dcd43SSerapheim Dimitropoulos (u_longlong_t)vd->vdev_top_zap, error);
1222814dcd43SSerapheim Dimitropoulos return (error);
1223814dcd43SSerapheim Dimitropoulos }
1224814dcd43SSerapheim Dimitropoulos
1225814dcd43SSerapheim Dimitropoulos for (uint64_t m = 0; m < vd->vdev_ms_count; m++) {
1226814dcd43SSerapheim Dimitropoulos metaslab_t *ms = vd->vdev_ms[m];
1227814dcd43SSerapheim Dimitropoulos ASSERT(ms != NULL);
1228814dcd43SSerapheim Dimitropoulos
1229814dcd43SSerapheim Dimitropoulos metaslab_unflushed_phys_t entry;
1230814dcd43SSerapheim Dimitropoulos uint64_t entry_size = sizeof (entry);
1231814dcd43SSerapheim Dimitropoulos uint64_t entry_offset = ms->ms_id * entry_size;
1232814dcd43SSerapheim Dimitropoulos
1233814dcd43SSerapheim Dimitropoulos error = dmu_read(mos, object,
1234814dcd43SSerapheim Dimitropoulos entry_offset, entry_size, &entry, 0);
1235814dcd43SSerapheim Dimitropoulos if (error != 0) {
1236814dcd43SSerapheim Dimitropoulos spa_load_failed(spa, "spa_ld_unflushed_txgs(): "
1237814dcd43SSerapheim Dimitropoulos "failed at dmu_read(obj=%llu) [error %d]",
1238814dcd43SSerapheim Dimitropoulos (u_longlong_t)object, error);
1239814dcd43SSerapheim Dimitropoulos return (error);
1240814dcd43SSerapheim Dimitropoulos }
1241814dcd43SSerapheim Dimitropoulos
1242814dcd43SSerapheim Dimitropoulos ms->ms_unflushed_txg = entry.msp_unflushed_txg;
1243814dcd43SSerapheim Dimitropoulos if (ms->ms_unflushed_txg != 0) {
1244814dcd43SSerapheim Dimitropoulos mutex_enter(&spa->spa_flushed_ms_lock);
1245814dcd43SSerapheim Dimitropoulos avl_add(&spa->spa_metaslabs_by_flushed, ms);
1246814dcd43SSerapheim Dimitropoulos mutex_exit(&spa->spa_flushed_ms_lock);
1247814dcd43SSerapheim Dimitropoulos }
1248814dcd43SSerapheim Dimitropoulos }
1249814dcd43SSerapheim Dimitropoulos return (0);
1250814dcd43SSerapheim Dimitropoulos }
1251814dcd43SSerapheim Dimitropoulos
1252814dcd43SSerapheim Dimitropoulos /*
1253814dcd43SSerapheim Dimitropoulos * Read all the log space map entries into their respective
1254814dcd43SSerapheim Dimitropoulos * metaslab unflushed trees and keep them sorted by TXG in the
1255814dcd43SSerapheim Dimitropoulos * SPA's metadata. In addition, setup all the metadata for the
1256814dcd43SSerapheim Dimitropoulos * memory and the block heuristics.
1257814dcd43SSerapheim Dimitropoulos */
1258814dcd43SSerapheim Dimitropoulos int
spa_ld_log_spacemaps(spa_t * spa)1259814dcd43SSerapheim Dimitropoulos spa_ld_log_spacemaps(spa_t *spa)
1260814dcd43SSerapheim Dimitropoulos {
1261814dcd43SSerapheim Dimitropoulos int error;
1262814dcd43SSerapheim Dimitropoulos
1263814dcd43SSerapheim Dimitropoulos spa_log_sm_set_blocklimit(spa);
1264814dcd43SSerapheim Dimitropoulos
1265814dcd43SSerapheim Dimitropoulos for (uint64_t c = 0; c < spa->spa_root_vdev->vdev_children; c++) {
1266814dcd43SSerapheim Dimitropoulos vdev_t *vd = spa->spa_root_vdev->vdev_child[c];
1267814dcd43SSerapheim Dimitropoulos error = spa_ld_unflushed_txgs(vd);
1268814dcd43SSerapheim Dimitropoulos if (error != 0)
1269814dcd43SSerapheim Dimitropoulos return (error);
1270814dcd43SSerapheim Dimitropoulos }
1271814dcd43SSerapheim Dimitropoulos
1272814dcd43SSerapheim Dimitropoulos error = spa_ld_log_sm_metadata(spa);
1273814dcd43SSerapheim Dimitropoulos if (error != 0)
1274814dcd43SSerapheim Dimitropoulos return (error);
1275814dcd43SSerapheim Dimitropoulos
1276814dcd43SSerapheim Dimitropoulos /*
1277814dcd43SSerapheim Dimitropoulos * Note: we don't actually expect anything to change at this point
1278814dcd43SSerapheim Dimitropoulos * but we grab the config lock so we don't fail any assertions
1279814dcd43SSerapheim Dimitropoulos * when using vdev_lookup_top().
1280814dcd43SSerapheim Dimitropoulos */
1281814dcd43SSerapheim Dimitropoulos spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
1282814dcd43SSerapheim Dimitropoulos error = spa_ld_log_sm_data(spa);
1283814dcd43SSerapheim Dimitropoulos spa_config_exit(spa, SCL_CONFIG, FTAG);
1284814dcd43SSerapheim Dimitropoulos
1285814dcd43SSerapheim Dimitropoulos return (error);
1286814dcd43SSerapheim Dimitropoulos }
1287