1fa9e406ahrens/*
2fa9e406ahrens * CDDL HEADER START
3fa9e406ahrens *
4fa9e406ahrens * The contents of this file are subject to the terms of the
5ea8dc4beschrock * Common Development and Distribution License (the "License").
6ea8dc4beschrock * You may not use this file except in compliance with the License.
7fa9e406ahrens *
8fa9e406ahrens * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9fa9e406ahrens * or http://www.opensolaris.org/os/licensing.
10fa9e406ahrens * See the License for the specific language governing permissions
11fa9e406ahrens * and limitations under the License.
12fa9e406ahrens *
13fa9e406ahrens * When distributing Covered Code, include this CDDL HEADER in each
14fa9e406ahrens * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15fa9e406ahrens * If applicable, add the following below this CDDL HEADER, with the
16fa9e406ahrens * fields enclosed by brackets "[]" replaced with your own identifying
17fa9e406ahrens * information: Portions Copyright [yyyy] [name of copyright owner]
18fa9e406ahrens *
19fa9e406ahrens * CDDL HEADER END
20fa9e406ahrens */
21fa9e406ahrens/*
22d6e555bGeorge Wilson * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23fa9e406ahrens * Use is subject to license terms.
24fa9e406ahrens */
25fb09f5aMadhav Suresh/*
26814dcd4Serapheim Dimitropoulos * Copyright (c) 2012, 2019 by Delphix. All rights reserved.
27814dcd4Serapheim Dimitropoulos * Copyright 2019 Joyent, Inc.
28fb09f5aMadhav Suresh */
29fa9e406ahrens
30fa9e406ahrens#include <sys/zfs_context.h>
31fa9e406ahrens#include <sys/spa.h>
32fa9e406ahrens#include <sys/dmu.h>
330713e23George Wilson#include <sys/dmu_tx.h>
340713e23George Wilson#include <sys/dnode.h>
350713e23George Wilson#include <sys/dsl_pool.h>
36ecc2d60bonwick#include <sys/zio.h>
37fa9e406ahrens#include <sys/space_map.h>
38814dcd4Serapheim Dimitropoulos#include <sys/spa_log_spacemap.h>
390713e23George Wilson#include <sys/refcount.h>
400713e23George Wilson#include <sys/zfeature.h>
41fa9e406ahrens
/*
 * Note on space map block size:
 *
 * The data for a given space map can be kept on blocks of any size.
 * Larger blocks entail fewer I/O operations, but they also cause the
 * DMU to keep more data in-core, and also to waste more I/O bandwidth
 * when only a few blocks have changed since the last transaction group.
 */
50ecc2d60bonwick
51ecc2d60bonwick/*
5217f1128Serapheim Dimitropoulos * Enabled whenever we want to stress test the use of double-word
5317f1128Serapheim Dimitropoulos * space map entries.
5417f1128Serapheim Dimitropoulos */
5517f1128Serapheim Dimitropoulosboolean_t zfs_force_some_double_word_sm_entries = B_FALSE;
5617f1128Serapheim Dimitropoulos
/*
 * Indirect block shift used for space maps: 14, i.e. 16K (2^14 bytes),
 * instead of the default indirect block size of 128K.  Appending to a
 * space map typically has to write one data block (4KB) and one or two
 * indirect blocks, so the smaller indirect blocks (16K-32K in total,
 * rather than 128K) dramatically reduce write inflation.
 */
int space_map_ibs = 14;
64221813cMatthew Ahrens
6517f1128Serapheim Dimitropoulosboolean_t
6617f1128Serapheim Dimitropoulossm_entry_is_debug(uint64_t e)
6717f1128Serapheim Dimitropoulos{
6817f1128Serapheim Dimitropoulos	return (SM_PREFIX_DECODE(e) == SM_DEBUG_PREFIX);
6917f1128Serapheim Dimitropoulos}
7017f1128Serapheim Dimitropoulos
7117f1128Serapheim Dimitropoulosboolean_t
7217f1128Serapheim Dimitropoulossm_entry_is_single_word(uint64_t e)
7317f1128Serapheim Dimitropoulos{
7417f1128Serapheim Dimitropoulos	uint8_t prefix = SM_PREFIX_DECODE(e);
7517f1128Serapheim Dimitropoulos	return (prefix != SM_DEBUG_PREFIX && prefix != SM2_PREFIX);
7617f1128Serapheim Dimitropoulos}
7717f1128Serapheim Dimitropoulos
7817f1128Serapheim Dimitropoulosboolean_t
7917f1128Serapheim Dimitropoulossm_entry_is_double_word(uint64_t e)
8017f1128Serapheim Dimitropoulos{
8117f1128Serapheim Dimitropoulos	return (SM_PREFIX_DECODE(e) == SM2_PREFIX);
8217f1128Serapheim Dimitropoulos}
8317f1128Serapheim Dimitropoulos
8417f1128Serapheim Dimitropoulos/*
855cabbc6Prashanth Sreenivasa * Iterate through the space map, invoking the callback on each (non-debug)
86555d674Serapheim Dimitropoulos * space map entry. Stop after reading 'end' bytes of the space map.
87ecc2d60bonwick */
88fa9e406ahrensint
89555d674Serapheim Dimitropoulosspace_map_iterate(space_map_t *sm, uint64_t end, sm_cb_t callback, void *arg)
90fa9e406ahrens{
91555d674Serapheim Dimitropoulos	uint64_t blksz = sm->sm_blksz;
92555d674Serapheim Dimitropoulos
93555d674Serapheim Dimitropoulos	ASSERT3U(blksz, !=, 0);
94555d674Serapheim Dimitropoulos	ASSERT3U(end, <=, space_map_length(sm));
95555d674Serapheim Dimitropoulos	ASSERT0(P2PHASE(end, sizeof (uint64_t)));
9617f1128Serapheim Dimitropoulos
97555d674Serapheim Dimitropoulos	dmu_prefetch(sm->sm_os, space_map_object(sm), 0, 0, end,
9817f1128Serapheim Dimitropoulos	    ZIO_PRIORITY_SYNC_READ);
9917f1128Serapheim Dimitropoulos
1000a4e951gw	int error = 0;
101555d674Serapheim Dimitropoulos	for (uint64_t block_base = 0; block_base < end && error == 0;
10217f1128Serapheim Dimitropoulos	    block_base += blksz) {
10317f1128Serapheim Dimitropoulos		dmu_buf_t *db;
10417f1128Serapheim Dimitropoulos		error = dmu_buf_hold(sm->sm_os, space_map_object(sm),
10517f1128Serapheim Dimitropoulos		    block_base, FTAG, &db, DMU_READ_PREFETCH);
10617f1128Serapheim Dimitropoulos		if (error != 0)
10717f1128Serapheim Dimitropoulos			return (error);
108fa9e406ahrens
10917f1128Serapheim Dimitropoulos		uint64_t *block_start = db->db_data;
110555d674Serapheim Dimitropoulos		uint64_t block_length = MIN(end - block_base, blksz);
11117f1128Serapheim Dimitropoulos		uint64_t *block_end = block_start +
11217f1128Serapheim Dimitropoulos		    (block_length / sizeof (uint64_t));
113fa9e406ahrens
11417f1128Serapheim Dimitropoulos		VERIFY0(P2PHASE(block_length, sizeof (uint64_t)));
11517f1128Serapheim Dimitropoulos		VERIFY3U(block_length, !=, 0);
11617f1128Serapheim Dimitropoulos		ASSERT3U(blksz, ==, db->db_size);
117ecc2d60bonwick
11817f1128Serapheim Dimitropoulos		for (uint64_t *block_cursor = block_start;
11917f1128Serapheim Dimitropoulos		    block_cursor < block_end && error == 0; block_cursor++) {
12017f1128Serapheim Dimitropoulos			uint64_t e = *block_cursor;
121ecc2d60bonwick
12217f1128Serapheim Dimitropoulos			if (sm_entry_is_debug(e)) /* Skip debug entries */
12317f1128Serapheim Dimitropoulos				continue;
124fa9e406ahrens
12517f1128Serapheim Dimitropoulos			uint64_t raw_offset, raw_run, vdev_id;
12617f1128Serapheim Dimitropoulos			maptype_t type;
12717f1128Serapheim Dimitropoulos			if (sm_entry_is_single_word(e)) {
12817f1128Serapheim Dimitropoulos				type = SM_TYPE_DECODE(e);
12917f1128Serapheim Dimitropoulos				vdev_id = SM_NO_VDEVID;
13017f1128Serapheim Dimitropoulos				raw_offset = SM_OFFSET_DECODE(e);
13117f1128Serapheim Dimitropoulos				raw_run = SM_RUN_DECODE(e);
13217f1128Serapheim Dimitropoulos			} else {
13317f1128Serapheim Dimitropoulos				/* it is a two-word entry */
13417f1128Serapheim Dimitropoulos				ASSERT(sm_entry_is_double_word(e));
13517f1128Serapheim Dimitropoulos				raw_run = SM2_RUN_DECODE(e);
13617f1128Serapheim Dimitropoulos				vdev_id = SM2_VDEV_DECODE(e);
13717f1128Serapheim Dimitropoulos
13817f1128Serapheim Dimitropoulos				/* move on to the second word */
13917f1128Serapheim Dimitropoulos				block_cursor++;
14017f1128Serapheim Dimitropoulos				e = *block_cursor;
14117f1128Serapheim Dimitropoulos				VERIFY3P(block_cursor, <=, block_end);
14217f1128Serapheim Dimitropoulos
14317f1128Serapheim Dimitropoulos				type = SM2_TYPE_DECODE(e);
14417f1128Serapheim Dimitropoulos				raw_offset = SM2_OFFSET_DECODE(e);
14517f1128Serapheim Dimitropoulos			}
146ecc2d60bonwick
14717f1128Serapheim Dimitropoulos			uint64_t entry_offset = (raw_offset << sm->sm_shift) +
14817f1128Serapheim Dimitropoulos			    sm->sm_start;
14917f1128Serapheim Dimitropoulos			uint64_t entry_run = raw_run << sm->sm_shift;
150fa9e406ahrens
15117f1128Serapheim Dimitropoulos			VERIFY0(P2PHASE(entry_offset, 1ULL << sm->sm_shift));
15217f1128Serapheim Dimitropoulos			VERIFY0(P2PHASE(entry_run, 1ULL << sm->sm_shift));
15317f1128Serapheim Dimitropoulos			ASSERT3U(entry_offset, >=, sm->sm_start);
15417f1128Serapheim Dimitropoulos			ASSERT3U(entry_offset, <, sm->sm_start + sm->sm_size);
15517f1128Serapheim Dimitropoulos			ASSERT3U(entry_run, <=, sm->sm_size);
15617f1128Serapheim Dimitropoulos			ASSERT3U(entry_offset + entry_run, <=,
15717f1128Serapheim Dimitropoulos			    sm->sm_start + sm->sm_size);
158fa9e406ahrens
15917f1128Serapheim Dimitropoulos			space_map_entry_t sme = {
16017f1128Serapheim Dimitropoulos			    .sme_type = type,
16117f1128Serapheim Dimitropoulos			    .sme_vdev = vdev_id,
16217f1128Serapheim Dimitropoulos			    .sme_offset = entry_offset,
16317f1128Serapheim Dimitropoulos			    .sme_run = entry_run
16417f1128Serapheim Dimitropoulos			};
16517f1128Serapheim Dimitropoulos			error = callback(&sme, arg);
16617f1128Serapheim Dimitropoulos		}
16717f1128Serapheim Dimitropoulos		dmu_buf_rele(db, FTAG);
16817f1128Serapheim Dimitropoulos	}
16917f1128Serapheim Dimitropoulos	return (error);
17017f1128Serapheim Dimitropoulos}
171fa9e406ahrens
17217f1128Serapheim Dimitropoulos/*
17317f1128Serapheim Dimitropoulos * Reads the entries from the last block of the space map into
17417f1128Serapheim Dimitropoulos * buf in reverse order. Populates nwords with number of words
17517f1128Serapheim Dimitropoulos * in the last block.
17617f1128Serapheim Dimitropoulos *
17717f1128Serapheim Dimitropoulos * Refer to block comment within space_map_incremental_destroy()
17817f1128Serapheim Dimitropoulos * to understand why this function is needed.
17917f1128Serapheim Dimitropoulos */
18017f1128Serapheim Dimitropoulosstatic int
18117f1128Serapheim Dimitropoulosspace_map_reversed_last_block_entries(space_map_t *sm, uint64_t *buf,
18217f1128Serapheim Dimitropoulos    uint64_t bufsz, uint64_t *nwords)
18317f1128Serapheim Dimitropoulos{
18417f1128Serapheim Dimitropoulos	int error = 0;
18517f1128Serapheim Dimitropoulos	dmu_buf_t *db;
1860713e23George Wilson
18717f1128Serapheim Dimitropoulos	/*
18817f1128Serapheim Dimitropoulos	 * Find the offset of the last word in the space map and use
18917f1128Serapheim Dimitropoulos	 * that to read the last block of the space map with
19017f1128Serapheim Dimitropoulos	 * dmu_buf_hold().
19117f1128Serapheim Dimitropoulos	 */
19217f1128Serapheim Dimitropoulos	uint64_t last_word_offset =
193555d674Serapheim Dimitropoulos	    sm->sm_phys->smp_length - sizeof (uint64_t);
19417f1128Serapheim Dimitropoulos	error = dmu_buf_hold(sm->sm_os, space_map_object(sm), last_word_offset,
19517f1128Serapheim Dimitropoulos	    FTAG, &db, DMU_READ_NO_PREFETCH);
19617f1128Serapheim Dimitropoulos	if (error != 0)
19717f1128Serapheim Dimitropoulos		return (error);
19817f1128Serapheim Dimitropoulos
19917f1128Serapheim Dimitropoulos	ASSERT3U(sm->sm_object, ==, db->db_object);
20017f1128Serapheim Dimitropoulos	ASSERT3U(sm->sm_blksz, ==, db->db_size);
20117f1128Serapheim Dimitropoulos	ASSERT3U(bufsz, >=, db->db_size);
20217f1128Serapheim Dimitropoulos	ASSERT(nwords != NULL);
20317f1128Serapheim Dimitropoulos
20417f1128Serapheim Dimitropoulos	uint64_t *words = db->db_data;
20517f1128Serapheim Dimitropoulos	*nwords =
206555d674Serapheim Dimitropoulos	    (sm->sm_phys->smp_length - db->db_offset) / sizeof (uint64_t);
20717f1128Serapheim Dimitropoulos
20817f1128Serapheim Dimitropoulos	ASSERT3U(*nwords, <=, bufsz / sizeof (uint64_t));
20917f1128Serapheim Dimitropoulos
21017f1128Serapheim Dimitropoulos	uint64_t n = *nwords;
21117f1128Serapheim Dimitropoulos	uint64_t j = n - 1;
21217f1128Serapheim Dimitropoulos	for (uint64_t i = 0; i < n; i++) {
21317f1128Serapheim Dimitropoulos		uint64_t entry = words[i];
21417f1128Serapheim Dimitropoulos		if (sm_entry_is_double_word(entry)) {
21517f1128Serapheim Dimitropoulos			/*
21617f1128Serapheim Dimitropoulos			 * Since we are populating the buffer backwards
21717f1128Serapheim Dimitropoulos			 * we have to be extra careful and add the two
21817f1128Serapheim Dimitropoulos			 * words of the double-word entry in the right
21917f1128Serapheim Dimitropoulos			 * order.
22017f1128Serapheim Dimitropoulos			 */
22117f1128Serapheim Dimitropoulos			ASSERT3U(j, >, 0);
22217f1128Serapheim Dimitropoulos			buf[j - 1] = entry;
22317f1128Serapheim Dimitropoulos
22417f1128Serapheim Dimitropoulos			i++;
22517f1128Serapheim Dimitropoulos			ASSERT3U(i, <, n);
22617f1128Serapheim Dimitropoulos			entry = words[i];
22717f1128Serapheim Dimitropoulos			buf[j] = entry;
22817f1128Serapheim Dimitropoulos			j -= 2;
22917f1128Serapheim Dimitropoulos		} else {
23017f1128Serapheim Dimitropoulos			ASSERT(sm_entry_is_debug(entry) ||
23117f1128Serapheim Dimitropoulos			    sm_entry_is_single_word(entry));
23217f1128Serapheim Dimitropoulos			buf[j] = entry;
23317f1128Serapheim Dimitropoulos			j--;
234fa9e406ahrens		}
235fa9e406ahrens	}
236fa9e406ahrens
23717f1128Serapheim Dimitropoulos	/*
23817f1128Serapheim Dimitropoulos	 * Assert that we wrote backwards all the
23917f1128Serapheim Dimitropoulos	 * way to the beginning of the buffer.
24017f1128Serapheim Dimitropoulos	 */
24117f1128Serapheim Dimitropoulos	ASSERT3S(j, ==, -1);
24217f1128Serapheim Dimitropoulos
24317f1128Serapheim Dimitropoulos	dmu_buf_rele(db, FTAG);
2445cabbc6Prashanth Sreenivasa	return (error);
2455cabbc6Prashanth Sreenivasa}
2465cabbc6Prashanth Sreenivasa
2478671400Serapheim Dimitropoulos/*
2488671400Serapheim Dimitropoulos * Note: This function performs destructive actions - specifically
2498671400Serapheim Dimitropoulos * it deletes entries from the end of the space map. Thus, callers
2508671400Serapheim Dimitropoulos * should ensure that they are holding the appropriate locks for
2518671400Serapheim Dimitropoulos * the space map that they provide.
2528671400Serapheim Dimitropoulos */
2538671400Serapheim Dimitropoulosint
2548671400Serapheim Dimitropoulosspace_map_incremental_destroy(space_map_t *sm, sm_cb_t callback, void *arg,
2558671400Serapheim Dimitropoulos    dmu_tx_t *tx)
2568671400Serapheim Dimitropoulos{
25717f1128Serapheim Dimitropoulos	uint64_t bufsz = MAX(sm->sm_blksz, SPA_MINBLOCKSIZE);
25817f1128Serapheim Dimitropoulos	uint64_t *buf = zio_buf_alloc(bufsz);
2598671400Serapheim Dimitropoulos
2608671400Serapheim Dimitropoulos	dmu_buf_will_dirty(sm->sm_dbuf, tx);
2618671400Serapheim Dimitropoulos
2628671400Serapheim Dimitropoulos	/*
26317f1128Serapheim Dimitropoulos	 * Ideally we would want to iterate from the beginning of the
26417f1128Serapheim Dimitropoulos	 * space map to the end in incremental steps. The issue with this
26517f1128Serapheim Dimitropoulos	 * approach is that we don't have any field on-disk that points
26617f1128Serapheim Dimitropoulos	 * us where to start between each step. We could try zeroing out
26717f1128Serapheim Dimitropoulos	 * entries that we've destroyed, but this doesn't work either as
26817f1128Serapheim Dimitropoulos	 * an entry that is 0 is a valid one (ALLOC for range [0x0:0x200]).
26917f1128Serapheim Dimitropoulos	 *
27017f1128Serapheim Dimitropoulos	 * As a result, we destroy its entries incrementally starting from
27117f1128Serapheim Dimitropoulos	 * the end after applying the callback to each of them.
2728671400Serapheim Dimitropoulos	 *
27317f1128Serapheim Dimitropoulos	 * The problem with this approach is that we cannot literally
27417f1128Serapheim Dimitropoulos	 * iterate through the words in the space map backwards as we
27517f1128Serapheim Dimitropoulos	 * can't distinguish two-word space map entries from their second
27617f1128Serapheim Dimitropoulos	 * word. Thus we do the following:
2778671400Serapheim Dimitropoulos	 *
27817f1128Serapheim Dimitropoulos	 * 1] We get all the entries from the last block of the space map
27917f1128Serapheim Dimitropoulos	 *    and put them into a buffer in reverse order. This way the
28017f1128Serapheim Dimitropoulos	 *    last entry comes first in the buffer, the second to last is
28117f1128Serapheim Dimitropoulos	 *    second, etc.
28217f1128Serapheim Dimitropoulos	 * 2] We iterate through the entries in the buffer and we apply
28317f1128Serapheim Dimitropoulos	 *    the callback to each one. As we move from entry to entry we
28417f1128Serapheim Dimitropoulos	 *    we decrease the size of the space map, deleting effectively
28517f1128Serapheim Dimitropoulos	 *    each entry.
28617f1128Serapheim Dimitropoulos	 * 3] If there are no more entries in the space map or the callback
28717f1128Serapheim Dimitropoulos	 *    returns a value other than 0, we stop iterating over the
28817f1128Serapheim Dimitropoulos	 *    space map. If there are entries remaining and the callback
28917f1128Serapheim Dimitropoulos	 *    returned 0, we go back to step [1].
2908671400Serapheim Dimitropoulos	 */
29117f1128Serapheim Dimitropoulos	int error = 0;
29217f1128Serapheim Dimitropoulos	while (space_map_length(sm) > 0 && error == 0) {
29317f1128Serapheim Dimitropoulos		uint64_t nwords = 0;
29417f1128Serapheim Dimitropoulos		error = space_map_reversed_last_block_entries(sm, buf, bufsz,
29517f1128Serapheim Dimitropoulos		    &nwords);
2968671400Serapheim Dimitropoulos		if (error != 0)
2978671400Serapheim Dimitropoulos			break;
2988671400Serapheim Dimitropoulos
29917f1128Serapheim Dimitropoulos		ASSERT3U(nwords, <=, bufsz / sizeof (uint64_t));
3008671400Serapheim Dimitropoulos
30117f1128Serapheim Dimitropoulos		for (uint64_t i = 0; i < nwords; i++) {
30217f1128Serapheim Dimitropoulos			uint64_t e = buf[i];
3038671400Serapheim Dimitropoulos
30417f1128Serapheim Dimitropoulos			if (sm_entry_is_debug(e)) {
305555d674Serapheim Dimitropoulos				sm->sm_phys->smp_length -= sizeof (uint64_t);
3068671400Serapheim Dimitropoulos				continue;
3078671400Serapheim Dimitropoulos			}
3088671400Serapheim Dimitropoulos
30917f1128Serapheim Dimitropoulos			int words = 1;
31017f1128Serapheim Dimitropoulos			uint64_t raw_offset, raw_run, vdev_id;
31117f1128Serapheim Dimitropoulos			maptype_t type;
31217f1128Serapheim Dimitropoulos			if (sm_entry_is_single_word(e)) {
31317f1128Serapheim Dimitropoulos				type = SM_TYPE_DECODE(e);
31417f1128Serapheim Dimitropoulos				vdev_id = SM_NO_VDEVID;
31517f1128Serapheim Dimitropoulos				raw_offset = SM_OFFSET_DECODE(e);
31617f1128Serapheim Dimitropoulos				raw_run = SM_RUN_DECODE(e);
31717f1128Serapheim Dimitropoulos			} else {
31817f1128Serapheim Dimitropoulos				ASSERT(sm_entry_is_double_word(e));
31917f1128Serapheim Dimitropoulos				words = 2;
32017f1128Serapheim Dimitropoulos
32117f1128Serapheim Dimitropoulos				raw_run = SM2_RUN_DECODE(e);
32217f1128Serapheim Dimitropoulos				vdev_id = SM2_VDEV_DECODE(e);
32317f1128Serapheim Dimitropoulos
32417f1128Serapheim Dimitropoulos				/* move to the second word */
32517f1128Serapheim Dimitropoulos				i++;
32617f1128Serapheim Dimitropoulos				e = buf[i];
32717f1128Serapheim Dimitropoulos
32817f1128Serapheim Dimitropoulos				ASSERT3P(i, <=, nwords);
32917f1128Serapheim Dimitropoulos
33017f1128Serapheim Dimitropoulos				type = SM2_TYPE_DECODE(e);
33117f1128Serapheim Dimitropoulos				raw_offset = SM2_OFFSET_DECODE(e);
33217f1128Serapheim Dimitropoulos			}
33317f1128Serapheim Dimitropoulos
33417f1128Serapheim Dimitropoulos			uint64_t entry_offset =
33517f1128Serapheim Dimitropoulos			    (raw_offset << sm->sm_shift) + sm->sm_start;
33617f1128Serapheim Dimitropoulos			uint64_t entry_run = raw_run << sm->sm_shift;
3378671400Serapheim Dimitropoulos
3388671400Serapheim Dimitropoulos			VERIFY0(P2PHASE(entry_offset, 1ULL << sm->sm_shift));
33917f1128Serapheim Dimitropoulos			VERIFY0(P2PHASE(entry_run, 1ULL << sm->sm_shift));
3408671400Serapheim Dimitropoulos			VERIFY3U(entry_offset, >=, sm->sm_start);
34117f1128Serapheim Dimitropoulos			VERIFY3U(entry_offset, <, sm->sm_start + sm->sm_size);
34217f1128Serapheim Dimitropoulos			VERIFY3U(entry_run, <=, sm->sm_size);
34317f1128Serapheim Dimitropoulos			VERIFY3U(entry_offset + entry_run, <=,
3448671400Serapheim Dimitropoulos			    sm->sm_start + sm->sm_size);
3458671400Serapheim Dimitropoulos
34617f1128Serapheim Dimitropoulos			space_map_entry_t sme = {
34717f1128Serapheim Dimitropoulos			    .sme_type = type,
34817f1128Serapheim Dimitropoulos			    .sme_vdev = vdev_id,
34917f1128Serapheim Dimitropoulos			    .sme_offset = entry_offset,
35017f1128Serapheim Dimitropoulos			    .sme_run = entry_run
35117f1128Serapheim Dimitropoulos			};
35217f1128Serapheim Dimitropoulos			error = callback(&sme, arg);
3538671400Serapheim Dimitropoulos			if (error != 0)
3548671400Serapheim Dimitropoulos				break;
3558671400Serapheim Dimitropoulos
3568671400Serapheim Dimitropoulos			if (type == SM_ALLOC)
35717f1128Serapheim Dimitropoulos				sm->sm_phys->smp_alloc -= entry_run;
3588671400Serapheim Dimitropoulos			else
35917f1128Serapheim Dimitropoulos				sm->sm_phys->smp_alloc += entry_run;
360555d674Serapheim Dimitropoulos			sm->sm_phys->smp_length -= words * sizeof (uint64_t);
3618671400Serapheim Dimitropoulos		}
3628671400Serapheim Dimitropoulos	}
3638671400Serapheim Dimitropoulos
36417f1128Serapheim Dimitropoulos	if (space_map_length(sm) == 0) {
3658671400Serapheim Dimitropoulos		ASSERT0(error);
366555d674Serapheim Dimitropoulos		ASSERT0(space_map_allocated(sm));
3678671400Serapheim Dimitropoulos	}
3688671400Serapheim Dimitropoulos
36917f1128Serapheim Dimitropoulos	zio_buf_free(buf, bufsz);
3708671400Serapheim Dimitropoulos	return (error);
3718671400Serapheim Dimitropoulos}
3728671400Serapheim Dimitropoulos
3735cabbc6Prashanth Sreenivasatypedef struct space_map_load_arg {
3745cabbc6Prashanth Sreenivasa	space_map_t	*smla_sm;
3755cabbc6Prashanth Sreenivasa	range_tree_t	*smla_rt;
3765cabbc6Prashanth Sreenivasa	maptype_t	smla_type;
3775cabbc6Prashanth Sreenivasa} space_map_load_arg_t;
3785cabbc6Prashanth Sreenivasa
3795cabbc6Prashanth Sreenivasastatic int
38017f1128Serapheim Dimitropoulosspace_map_load_callback(space_map_entry_t *sme, void *arg)
3815cabbc6Prashanth Sreenivasa{
3825cabbc6Prashanth Sreenivasa	space_map_load_arg_t *smla = arg;
38317f1128Serapheim Dimitropoulos	if (sme->sme_type == smla->smla_type) {
38417f1128Serapheim Dimitropoulos		VERIFY3U(range_tree_space(smla->smla_rt) + sme->sme_run, <=,
3855cabbc6Prashanth Sreenivasa		    smla->smla_sm->sm_size);
38617f1128Serapheim Dimitropoulos		range_tree_add(smla->smla_rt, sme->sme_offset, sme->sme_run);
3875cabbc6Prashanth Sreenivasa	} else {
38817f1128Serapheim Dimitropoulos		range_tree_remove(smla->smla_rt, sme->sme_offset, sme->sme_run);
3895cabbc6Prashanth Sreenivasa	}
3905cabbc6Prashanth Sreenivasa
3915cabbc6Prashanth Sreenivasa	return (0);
3925cabbc6Prashanth Sreenivasa}
3935cabbc6Prashanth Sreenivasa
3945cabbc6Prashanth Sreenivasa/*
395555d674Serapheim Dimitropoulos * Load the spacemap into the rangetree, like space_map_load. But only
396555d674Serapheim Dimitropoulos * read the first 'length' bytes of the spacemap.
3975cabbc6Prashanth Sreenivasa */
3985cabbc6Prashanth Sreenivasaint
399555d674Serapheim Dimitropoulosspace_map_load_length(space_map_t *sm, range_tree_t *rt, maptype_t maptype,
400555d674Serapheim Dimitropoulos    uint64_t length)
4015cabbc6Prashanth Sreenivasa{
4025cabbc6Prashanth Sreenivasa	space_map_load_arg_t smla;
4035cabbc6Prashanth Sreenivasa
4045cabbc6Prashanth Sreenivasa	VERIFY0(range_tree_space(rt));
4055cabbc6Prashanth Sreenivasa
406555d674Serapheim Dimitropoulos	if (maptype == SM_FREE)
4075cabbc6Prashanth Sreenivasa		range_tree_add(rt, sm->sm_start, sm->sm_size);
4085cabbc6Prashanth Sreenivasa
4095cabbc6Prashanth Sreenivasa	smla.smla_rt = rt;
4105cabbc6Prashanth Sreenivasa	smla.smla_sm = sm;
4115cabbc6Prashanth Sreenivasa	smla.smla_type = maptype;
412555d674Serapheim Dimitropoulos	int err = space_map_iterate(sm, length,
413555d674Serapheim Dimitropoulos	    space_map_load_callback, &smla);
4145cabbc6Prashanth Sreenivasa
415555d674Serapheim Dimitropoulos	if (err != 0)
4160713e23George Wilson		range_tree_vacate(rt, NULL, NULL);
417b8493d5vl
4185cabbc6Prashanth Sreenivasa	return (err);
4190713e23George Wilson}
420ecc2d60bonwick
421555d674Serapheim Dimitropoulos/*
422555d674Serapheim Dimitropoulos * Load the space map disk into the specified range tree. Segments of maptype
423555d674Serapheim Dimitropoulos * are added to the range tree, other segment types are removed.
424555d674Serapheim Dimitropoulos */
425555d674Serapheim Dimitropoulosint
426555d674Serapheim Dimitropoulosspace_map_load(space_map_t *sm, range_tree_t *rt, maptype_t maptype)
427555d674Serapheim Dimitropoulos{
428555d674Serapheim Dimitropoulos	return (space_map_load_length(sm, rt, maptype, space_map_length(sm)));
429555d674Serapheim Dimitropoulos}
430555d674Serapheim Dimitropoulos
4310713e23George Wilsonvoid
4320713e23George Wilsonspace_map_histogram_clear(space_map_t *sm)
4330713e23George Wilson{
4340713e23George Wilson	if (sm->sm_dbuf->db_size != sizeof (space_map_phys_t))
4350713e23George Wilson		return;
436ecc2d60bonwick
4370713e23George Wilson	bzero(sm->sm_phys->smp_histogram, sizeof (sm->sm_phys->smp_histogram));
4380713e23George Wilson}
439ecc2d60bonwick
4400713e23George Wilsonboolean_t
4410713e23George Wilsonspace_map_histogram_verify(space_map_t *sm, range_tree_t *rt)
4420713e23George Wilson{
4430713e23George Wilson	/*
4440713e23George Wilson	 * Verify that the in-core range tree does not have any
4450713e23George Wilson	 * ranges smaller than our sm_shift size.
4460713e23George Wilson	 */
4470713e23George Wilson	for (int i = 0; i < sm->sm_shift; i++) {
4480713e23George Wilson		if (rt->rt_histogram[i] != 0)
4490713e23George Wilson			return (B_FALSE);
4500713e23George Wilson	}
4510713e23George Wilson	return (B_TRUE);
452fa9e406ahrens}
453fa9e406ahrens
454fa9e406ahrensvoid
4550713e23George Wilsonspace_map_histogram_add(space_map_t *sm, range_tree_t *rt, dmu_tx_t *tx)
456ecc2d60bonwick{
4570713e23George Wilson	int idx = 0;
4580713e23George Wilson
4590713e23George Wilson	ASSERT(dmu_tx_is_syncing(tx));
4600713e23George Wilson	VERIFY3U(space_map_object(sm), !=, 0);
461ecc2d60bonwick
4620713e23George Wilson	if (sm->sm_dbuf->db_size != sizeof (space_map_phys_t))
4630713e23George Wilson		return;
464ecc2d60bonwick
4650713e23George Wilson	dmu_buf_will_dirty(sm->sm_dbuf, tx);
466ecc2d60bonwick
4670713e23George Wilson	ASSERT(space_map_histogram_verify(sm, rt));
4680713e23George Wilson	/*
4690713e23George Wilson	 * Transfer the content of the range tree histogram to the space
4700713e23George Wilson	 * map histogram. The space map histogram contains 32 buckets ranging
4710713e23George Wilson	 * between 2^sm_shift to 2^(32+sm_shift-1). The range tree,
4720713e23George Wilson	 * however, can represent ranges from 2^0 to 2^63. Since the space
4730713e23George Wilson	 * map only cares about allocatable blocks (minimum of sm_shift) we
4740713e23George Wilson	 * can safely ignore all ranges in the range tree smaller than sm_shift.
4750713e23George Wilson	 */
4760713e23George Wilson	for (int i = sm->sm_shift; i < RANGE_TREE_HISTOGRAM_SIZE; i++) {
4770713e23George Wilson
4780713e23George Wilson		/*
4790713e23George Wilson		 * Since the largest histogram bucket in the space map is
4800713e23George Wilson		 * 2^(32+sm_shift-1), we need to normalize the values in
4810713e23George Wilson		 * the range tree for any bucket larger than that size. For
4820713e23George Wilson		 * example given an sm_shift of 9, ranges larger than 2^40
4830713e23George Wilson		 * would get normalized as if they were 1TB ranges. Assume
4840713e23George Wilson		 * the range tree had a count of 5 in the 2^44 (16TB) bucket,
4850713e23George Wilson		 * the calculation below would normalize this to 5 * 2^4 (16).
4860713e23George Wilson		 */
4870713e23George Wilson		ASSERT3U(i, >=, idx + sm->sm_shift);
4880713e23George Wilson		sm->sm_phys->smp_histogram[idx] +=
4890713e23George Wilson		    rt->rt_histogram[i] << (i - idx - sm->sm_shift);
4900713e23George Wilson
4910713e23George Wilson		/*
4920713e23George Wilson		 * Increment the space map's index as long as we haven't
4930713e23George Wilson		 * reached the maximum bucket size. Accumulate all ranges
4940713e23George Wilson		 * larger than the max bucket size into the last bucket.
4950713e23George Wilson		 */
4962e4c998George Wilson		if (idx < SPACE_MAP_HISTOGRAM_SIZE - 1) {
4970713e23George Wilson			ASSERT3U(idx + sm->sm_shift, ==, i);
4980713e23George Wilson			idx++;
4992e4c998George Wilson			ASSERT3U(idx, <, SPACE_MAP_HISTOGRAM_SIZE);
5000713e23George Wilson		}
5010713e23George Wilson	}
502d6e555bGeorge Wilson}
503d6e555bGeorge Wilson
50417f1128Serapheim Dimitropoulosstatic void
50517f1128Serapheim Dimitropoulosspace_map_write_intro_debug(space_map_t *sm, maptype_t maptype, dmu_tx_t *tx)
506ecc2d60bonwick{
50717f1128Serapheim Dimitropoulos	dmu_buf_will_dirty(sm->sm_dbuf, tx);
50817f1128Serapheim Dimitropoulos
50917f1128Serapheim Dimitropoulos	uint64_t dentry = SM_PREFIX_ENCODE(SM_DEBUG_PREFIX) |
51017f1128Serapheim Dimitropoulos	    SM_DEBUG_ACTION_ENCODE(maptype) |
51117f1128Serapheim Dimitropoulos	    SM_DEBUG_SYNCPASS_ENCODE(spa_sync_pass(tx->tx_pool->dp_spa)) |
51217f1128Serapheim Dimitropoulos	    SM_DEBUG_TXG_ENCODE(dmu_tx_get_txg(tx));
51317f1128Serapheim Dimitropoulos
514555d674Serapheim Dimitropoulos	dmu_write(sm->sm_os, space_map_object(sm), sm->sm_phys->smp_length,
51517f1128Serapheim Dimitropoulos	    sizeof (dentry), &dentry, tx);
51617f1128Serapheim Dimitropoulos
517555d674Serapheim Dimitropoulos	sm->sm_phys->smp_length += sizeof (dentry);
51817f1128Serapheim Dimitropoulos}
51917f1128Serapheim Dimitropoulos
52017f1128Serapheim Dimitropoulos/*
52117f1128Serapheim Dimitropoulos * Writes one or more entries given a segment.
52217f1128Serapheim Dimitropoulos *
52317f1128Serapheim Dimitropoulos * Note: The function may release the dbuf from the pointer initially
52417f1128Serapheim Dimitropoulos * passed to it, and return a different dbuf. Also, the space map's
52517f1128Serapheim Dimitropoulos * dbuf must be dirty for the changes in sm_phys to take effect.
52617f1128Serapheim Dimitropoulos */
52717f1128Serapheim Dimitropoulosstatic void
5284d7988dPaul Dagneliespace_map_write_seg(space_map_t *sm, uint64_t rstart, uint64_t rend,
5294d7988dPaul Dagnelie    maptype_t maptype, uint64_t vdev_id, uint8_t words, dmu_buf_t **dbp,
5304d7988dPaul Dagnelie    void *tag, dmu_tx_t *tx)
53117f1128Serapheim Dimitropoulos{
53217f1128Serapheim Dimitropoulos	ASSERT3U(words, !=, 0);
53317f1128Serapheim Dimitropoulos	ASSERT3U(words, <=, 2);
53417f1128Serapheim Dimitropoulos
53517f1128Serapheim Dimitropoulos	/* ensure the vdev_id can be represented by the space map */
53617f1128Serapheim Dimitropoulos	ASSERT3U(vdev_id, <=, SM_NO_VDEVID);
53717f1128Serapheim Dimitropoulos
53817f1128Serapheim Dimitropoulos	/*
53917f1128Serapheim Dimitropoulos	 * if this is a single word entry, ensure that no vdev was
54017f1128Serapheim Dimitropoulos	 * specified.
54117f1128Serapheim Dimitropoulos	 */
54217f1128Serapheim Dimitropoulos	IMPLY(words == 1, vdev_id == SM_NO_VDEVID);
54317f1128Serapheim Dimitropoulos
54417f1128Serapheim Dimitropoulos	dmu_buf_t *db = *dbp;
54517f1128Serapheim Dimitropoulos	ASSERT3U(db->db_size, ==, sm->sm_blksz);
54617f1128Serapheim Dimitropoulos
54717f1128Serapheim Dimitropoulos	uint64_t *block_base = db->db_data;
54817f1128Serapheim Dimitropoulos	uint64_t *block_end = block_base + (sm->sm_blksz / sizeof (uint64_t));
54917f1128Serapheim Dimitropoulos	uint64_t *block_cursor = block_base +
550555d674Serapheim Dimitropoulos	    (sm->sm_phys->smp_length - db->db_offset) / sizeof (uint64_t);
55117f1128Serapheim Dimitropoulos
55217f1128Serapheim Dimitropoulos	ASSERT3P(block_cursor, <=, block_end);
55317f1128Serapheim Dimitropoulos
5544d7988dPaul Dagnelie	uint64_t size = (rend - rstart) >> sm->sm_shift;
5554d7988dPaul Dagnelie	uint64_t start = (rstart - sm->sm_start) >> sm->sm_shift;
55617f1128Serapheim Dimitropoulos	uint64_t run_max = (words == 2) ? SM2_RUN_MAX : SM_RUN_MAX;
55717f1128Serapheim Dimitropoulos
5584d7988dPaul Dagnelie	ASSERT3U(rstart, >=, sm->sm_start);
5594d7988dPaul Dagnelie	ASSERT3U(rstart, <, sm->sm_start + sm->sm_size);
5604d7988dPaul Dagnelie	ASSERT3U(rend - rstart, <=, sm->sm_size);
5614d7988dPaul Dagnelie	ASSERT3U(rend, <=, sm->sm_start + sm->sm_size);
56217f1128Serapheim Dimitropoulos
56317f1128Serapheim Dimitropoulos	while (size != 0) {
56417f1128Serapheim Dimitropoulos		ASSERT3P(block_cursor, <=, block_end);
56517f1128Serapheim Dimitropoulos
56617f1128Serapheim Dimitropoulos		/*
56717f1128Serapheim Dimitropoulos		 * If we are at the end of this block, flush it and start
56817f1128Serapheim Dimitropoulos		 * writing again from the beginning.
56917f1128Serapheim Dimitropoulos		 */
57017f1128Serapheim Dimitropoulos		if (block_cursor == block_end) {
57117f1128Serapheim Dimitropoulos			dmu_buf_rele(db, tag);
572ecc2d60bonwick
573555d674Serapheim Dimitropoulos			uint64_t next_word_offset = sm->sm_phys->smp_length;
57417f1128Serapheim Dimitropoulos			VERIFY0(dmu_buf_hold(sm->sm_os,
57517f1128Serapheim Dimitropoulos			    space_map_object(sm), next_word_offset,
57617f1128Serapheim Dimitropoulos			    tag, &db, DMU_READ_PREFETCH));
57717f1128Serapheim Dimitropoulos			dmu_buf_will_dirty(db, tx);
57817f1128Serapheim Dimitropoulos
57917f1128Serapheim Dimitropoulos			/* update caller's dbuf */
58017f1128Serapheim Dimitropoulos			*dbp = db;
58117f1128Serapheim Dimitropoulos
58217f1128Serapheim Dimitropoulos			ASSERT3U(db->db_size, ==, sm->sm_blksz);
58317f1128Serapheim Dimitropoulos
58417f1128Serapheim Dimitropoulos			block_base = db->db_data;
58517f1128Serapheim Dimitropoulos			block_cursor = block_base;
58617f1128Serapheim Dimitropoulos			block_end = block_base +
58717f1128Serapheim Dimitropoulos			    (db->db_size / sizeof (uint64_t));
58817f1128Serapheim Dimitropoulos		}
58917f1128Serapheim Dimitropoulos
59017f1128Serapheim Dimitropoulos		/*
59117f1128Serapheim Dimitropoulos		 * If we are writing a two-word entry and we only have one
59217f1128Serapheim Dimitropoulos		 * word left on this block, just pad it with an empty debug
59317f1128Serapheim Dimitropoulos		 * entry and write the two-word entry in the next block.
59417f1128Serapheim Dimitropoulos		 */
59517f1128Serapheim Dimitropoulos		uint64_t *next_entry = block_cursor + 1;
59617f1128Serapheim Dimitropoulos		if (next_entry == block_end && words > 1) {
59717f1128Serapheim Dimitropoulos			ASSERT3U(words, ==, 2);
59817f1128Serapheim Dimitropoulos			*block_cursor = SM_PREFIX_ENCODE(SM_DEBUG_PREFIX) |
59917f1128Serapheim Dimitropoulos			    SM_DEBUG_ACTION_ENCODE(0) |
60017f1128Serapheim Dimitropoulos			    SM_DEBUG_SYNCPASS_ENCODE(0) |
60117f1128Serapheim Dimitropoulos			    SM_DEBUG_TXG_ENCODE(0);
60217f1128Serapheim Dimitropoulos			block_cursor++;
603555d674Serapheim Dimitropoulos			sm->sm_phys->smp_length += sizeof (uint64_t);
60417f1128Serapheim Dimitropoulos			ASSERT3P(block_cursor, ==, block_end);
60517f1128Serapheim Dimitropoulos			continue;
60617f1128Serapheim Dimitropoulos		}
60717f1128Serapheim Dimitropoulos
60817f1128Serapheim Dimitropoulos		uint64_t run_len = MIN(size, run_max);
60917f1128Serapheim Dimitropoulos		switch (words) {
61017f1128Serapheim Dimitropoulos		case 1:
61117f1128Serapheim Dimitropoulos			*block_cursor = SM_OFFSET_ENCODE(start) |
61217f1128Serapheim Dimitropoulos			    SM_TYPE_ENCODE(maptype) |
61317f1128Serapheim Dimitropoulos			    SM_RUN_ENCODE(run_len);
61417f1128Serapheim Dimitropoulos			block_cursor++;
61517f1128Serapheim Dimitropoulos			break;
61617f1128Serapheim Dimitropoulos		case 2:
61717f1128Serapheim Dimitropoulos			/* write the first word of the entry */
61817f1128Serapheim Dimitropoulos			*block_cursor = SM_PREFIX_ENCODE(SM2_PREFIX) |
61917f1128Serapheim Dimitropoulos			    SM2_RUN_ENCODE(run_len) |
62017f1128Serapheim Dimitropoulos			    SM2_VDEV_ENCODE(vdev_id);
62117f1128Serapheim Dimitropoulos			block_cursor++;
62217f1128Serapheim Dimitropoulos
62317f1128Serapheim Dimitropoulos			/* move on to the second word of the entry */
62417f1128Serapheim Dimitropoulos			ASSERT3P(block_cursor, <, block_end);
62517f1128Serapheim Dimitropoulos			*block_cursor = SM2_TYPE_ENCODE(maptype) |
62617f1128Serapheim Dimitropoulos			    SM2_OFFSET_ENCODE(start);
62717f1128Serapheim Dimitropoulos			block_cursor++;
62817f1128Serapheim Dimitropoulos			break;
62917f1128Serapheim Dimitropoulos		default:
63017f1128Serapheim Dimitropoulos			panic("%d-word space map entries are not supported",
63117f1128Serapheim Dimitropoulos			    words);
63217f1128Serapheim Dimitropoulos			break;
63317f1128Serapheim Dimitropoulos		}
634555d674Serapheim Dimitropoulos		sm->sm_phys->smp_length += words * sizeof (uint64_t);
63517f1128Serapheim Dimitropoulos
63617f1128Serapheim Dimitropoulos		start += run_len;
63717f1128Serapheim Dimitropoulos		size -= run_len;
63817f1128Serapheim Dimitropoulos	}
63917f1128Serapheim Dimitropoulos	ASSERT0(size);
64017f1128Serapheim Dimitropoulos
64117f1128Serapheim Dimitropoulos}
64217f1128Serapheim Dimitropoulos
64317f1128Serapheim Dimitropoulos/*
64417f1128Serapheim Dimitropoulos * Note: The space map's dbuf must be dirty for the changes in sm_phys to
64517f1128Serapheim Dimitropoulos * take effect.
64617f1128Serapheim Dimitropoulos */
64717f1128Serapheim Dimitropoulosstatic void
64817f1128Serapheim Dimitropoulosspace_map_write_impl(space_map_t *sm, range_tree_t *rt, maptype_t maptype,
64917f1128Serapheim Dimitropoulos    uint64_t vdev_id, dmu_tx_t *tx)
65017f1128Serapheim Dimitropoulos{
65117f1128Serapheim Dimitropoulos	spa_t *spa = tx->tx_pool->dp_spa;
65217f1128Serapheim Dimitropoulos	dmu_buf_t *db;
65317f1128Serapheim Dimitropoulos
65417f1128Serapheim Dimitropoulos	space_map_write_intro_debug(sm, maptype, tx);
65517f1128Serapheim Dimitropoulos
65617f1128Serapheim Dimitropoulos#ifdef DEBUG
6570713e23George Wilson	/*
65817f1128Serapheim Dimitropoulos	 * We do this right after we write the intro debug entry
65917f1128Serapheim Dimitropoulos	 * because the estimate does not take it into account.
6600713e23George Wilson	 */
661555d674Serapheim Dimitropoulos	uint64_t initial_objsize = sm->sm_phys->smp_length;
66217f1128Serapheim Dimitropoulos	uint64_t estimated_growth =
66317f1128Serapheim Dimitropoulos	    space_map_estimate_optimal_size(sm, rt, SM_NO_VDEVID);
66417f1128Serapheim Dimitropoulos	uint64_t estimated_final_objsize = initial_objsize + estimated_growth;
66517f1128Serapheim Dimitropoulos#endif
666ecc2d60bonwick
6670713e23George Wilson	/*
66817f1128Serapheim Dimitropoulos	 * Find the offset right after the last word in the space map
66917f1128Serapheim Dimitropoulos	 * and use that to get a hold of the last block, so we can
67017f1128Serapheim Dimitropoulos	 * start appending to it.
6710713e23George Wilson	 */
672555d674Serapheim Dimitropoulos	uint64_t next_word_offset = sm->sm_phys->smp_length;
67317f1128Serapheim Dimitropoulos	VERIFY0(dmu_buf_hold(sm->sm_os, space_map_object(sm),
67417f1128Serapheim Dimitropoulos	    next_word_offset, FTAG, &db, DMU_READ_PREFETCH));
67517f1128Serapheim Dimitropoulos	ASSERT3U(db->db_size, ==, sm->sm_blksz);
67617f1128Serapheim Dimitropoulos
67717f1128Serapheim Dimitropoulos	dmu_buf_will_dirty(db, tx);
67817f1128Serapheim Dimitropoulos
6794d7988dPaul Dagnelie	zfs_btree_t *t = &rt->rt_root;
6804d7988dPaul Dagnelie	zfs_btree_index_t where;
6814d7988dPaul Dagnelie	for (range_seg_t *rs = zfs_btree_first(t, &where); rs != NULL;
6824d7988dPaul Dagnelie	    rs = zfs_btree_next(t, &where, &where)) {
6834d7988dPaul Dagnelie		uint64_t offset = (rs_get_start(rs, rt) - sm->sm_start) >>
6844d7988dPaul Dagnelie		    sm->sm_shift;
6854d7988dPaul Dagnelie		uint64_t length = (rs_get_end(rs, rt) - rs_get_start(rs, rt)) >>
6864d7988dPaul Dagnelie		    sm->sm_shift;
68717f1128Serapheim Dimitropoulos		uint8_t words = 1;
68817f1128Serapheim Dimitropoulos
68917f1128Serapheim Dimitropoulos		/*
69017f1128Serapheim Dimitropoulos		 * We only write two-word entries when both of the following
69117f1128Serapheim Dimitropoulos		 * are true:
69217f1128Serapheim Dimitropoulos		 *
69317f1128Serapheim Dimitropoulos		 * [1] The feature is enabled.
69417f1128Serapheim Dimitropoulos		 * [2] The offset or run is too big for a single-word entry,
695221813cMatthew Ahrens		 *	or the vdev_id is set (meaning not equal to
696221813cMatthew Ahrens		 *	SM_NO_VDEVID).
69717f1128Serapheim Dimitropoulos		 *
69817f1128Serapheim Dimitropoulos		 * Note that for purposes of testing we've added the case that
69917f1128Serapheim Dimitropoulos		 * we write two-word entries occasionally when the feature is
70017f1128Serapheim Dimitropoulos		 * enabled and zfs_force_some_double_word_sm_entries has been
70117f1128Serapheim Dimitropoulos		 * set.
70217f1128Serapheim Dimitropoulos		 */
70317f1128Serapheim Dimitropoulos		if (spa_feature_is_active(spa, SPA_FEATURE_SPACEMAP_V2) &&
70417f1128Serapheim Dimitropoulos		    (offset >= (1ULL << SM_OFFSET_BITS) ||
70517f1128Serapheim Dimitropoulos		    length > SM_RUN_MAX ||
70617f1128Serapheim Dimitropoulos		    vdev_id != SM_NO_VDEVID ||
70717f1128Serapheim Dimitropoulos		    (zfs_force_some_double_word_sm_entries &&
70817f1128Serapheim Dimitropoulos		    spa_get_random(100) == 0)))
70917f1128Serapheim Dimitropoulos			words = 2;
71017f1128Serapheim Dimitropoulos
7114d7988dPaul Dagnelie		space_map_write_seg(sm, rs_get_start(rs, rt), rs_get_end(rs,
7124d7988dPaul Dagnelie		    rt), maptype, vdev_id, words, &db, FTAG, tx);
7130713e23George Wilson	}
71417f1128Serapheim Dimitropoulos
71517f1128Serapheim Dimitropoulos	dmu_buf_rele(db, FTAG);
71617f1128Serapheim Dimitropoulos
71717f1128Serapheim Dimitropoulos#ifdef DEBUG
71817f1128Serapheim Dimitropoulos	/*
71917f1128Serapheim Dimitropoulos	 * We expect our estimation to be based on the worst case
72017f1128Serapheim Dimitropoulos	 * scenario [see comment in space_map_estimate_optimal_size()].
72117f1128Serapheim Dimitropoulos	 * Therefore we expect the actual objsize to be equal or less
72217f1128Serapheim Dimitropoulos	 * than whatever we estimated it to be.
72317f1128Serapheim Dimitropoulos	 */
724555d674Serapheim Dimitropoulos	ASSERT3U(estimated_final_objsize, >=, sm->sm_phys->smp_length);
72517f1128Serapheim Dimitropoulos#endif
726ecc2d60bonwick}
727ecc2d60bonwick
72817f1128Serapheim Dimitropoulos/*
72917f1128Serapheim Dimitropoulos * Note: This function manipulates the state of the given space map but
73017f1128Serapheim Dimitropoulos * does not hold any locks implicitly. Thus the caller is responsible
73117f1128Serapheim Dimitropoulos * for synchronizing writes to the space map.
73217f1128Serapheim Dimitropoulos */
733ecc2d60bonwickvoid
7340713e23George Wilsonspace_map_write(space_map_t *sm, range_tree_t *rt, maptype_t maptype,
73517f1128Serapheim Dimitropoulos    uint64_t vdev_id, dmu_tx_t *tx)
736fa9e406ahrens{
7370713e23George Wilson	objset_t *os = sm->sm_os;
738fa9e406ahrens
7390713e23George Wilson	ASSERT(dsl_pool_sync_context(dmu_objset_pool(os)));
7400713e23George Wilson	VERIFY3U(space_map_object(sm), !=, 0);
74117f1128Serapheim Dimitropoulos
7420713e23George Wilson	dmu_buf_will_dirty(sm->sm_dbuf, tx);
743fa9e406ahrens
7440713e23George Wilson	/*
7450713e23George Wilson	 * This field is no longer necessary since the in-core space map
7460713e23George Wilson	 * now contains the object number but is maintained for backwards
7470713e23George Wilson	 * compatibility.
7480713e23George Wilson	 */
7490713e23George Wilson	sm->sm_phys->smp_object = sm->sm_object;
750fa9e406ahrens
7518671400Serapheim Dimitropoulos	if (range_tree_is_empty(rt)) {
7520713e23George Wilson		VERIFY3U(sm->sm_object, ==, sm->sm_phys->smp_object);
7530713e23George Wilson		return;
7540713e23George Wilson	}
755fa9e406ahrens
756ecc2d60bonwick	if (maptype == SM_ALLOC)
7570713e23George Wilson		sm->sm_phys->smp_alloc += range_tree_space(rt);
758ecc2d60bonwick	else
7590713e23George Wilson		sm->sm_phys->smp_alloc -= range_tree_space(rt);
760ecc2d60bonwick
7614d7988dPaul Dagnelie	uint64_t nodes = zfs_btree_numnodes(&rt->rt_root);
76217f1128Serapheim Dimitropoulos	uint64_t rt_space = range_tree_space(rt);
7630713e23George Wilson
76417f1128Serapheim Dimitropoulos	space_map_write_impl(sm, rt, maptype, vdev_id, tx);
765fa9e406ahrens
76601f55e4George Wilson	/*
76701f55e4George Wilson	 * Ensure that the space_map's accounting wasn't changed
76801f55e4George Wilson	 * while we were in the middle of writing it out.
76901f55e4George Wilson	 */
7704d7988dPaul Dagnelie	VERIFY3U(nodes, ==, zfs_btree_numnodes(&rt->rt_root));
7710713e23George Wilson	VERIFY3U(range_tree_space(rt), ==, rt_space);
772fa9e406ahrens}
773fa9e406ahrens
7740713e23George Wilsonstatic int
7750713e23George Wilsonspace_map_open_impl(space_map_t *sm)
776fa9e406ahrens{
7770713e23George Wilson	int error;
7780713e23George Wilson	u_longlong_t blocks;
779fa9e406ahrens
7800713e23George Wilson	error = dmu_bonus_hold(sm->sm_os, sm->sm_object, sm, &sm->sm_dbuf);
7810713e23George Wilson	if (error)
7820713e23George Wilson		return (error);
7830713e23George Wilson
7840713e23George Wilson	dmu_object_size_from_db(sm->sm_dbuf, &sm->sm_blksz, &blocks);
7850713e23George Wilson	sm->sm_phys = sm->sm_dbuf->db_data;
7860713e23George Wilson	return (0);
787fa9e406ahrens}
7888ad4d6dJeff Bonwick
7890713e23George Wilsonint
7900713e23George Wilsonspace_map_open(space_map_t **smp, objset_t *os, uint64_t object,
7915cabbc6Prashanth Sreenivasa    uint64_t start, uint64_t size, uint8_t shift)
7928ad4d6dJeff Bonwick{
7930713e23George Wilson	space_map_t *sm;
7940713e23George Wilson	int error;
7958ad4d6dJeff Bonwick
7960713e23George Wilson	ASSERT(*smp == NULL);
7970713e23George Wilson	ASSERT(os != NULL);
7980713e23George Wilson	ASSERT(object != 0);
7998ad4d6dJeff Bonwick
8000713e23George Wilson	sm = kmem_zalloc(sizeof (space_map_t), KM_SLEEP);
8018ad4d6dJeff Bonwick
8020713e23George Wilson	sm->sm_start = start;
8030713e23George Wilson	sm->sm_size = size;
8040713e23George Wilson	sm->sm_shift = shift;
8050713e23George Wilson	sm->sm_os = os;
8060713e23George Wilson	sm->sm_object = object;
8078ad4d6dJeff Bonwick
8080713e23George Wilson	error = space_map_open_impl(sm);
8090713e23George Wilson	if (error != 0) {
8100713e23George Wilson		space_map_close(sm);
8110713e23George Wilson		return (error);
8120713e23George Wilson	}
8130713e23George Wilson	*smp = sm;
8140713e23George Wilson
8150713e23George Wilson	return (0);
8168ad4d6dJeff Bonwick}
8178ad4d6dJeff Bonwick
8188ad4d6dJeff Bonwickvoid
8190713e23George Wilsonspace_map_close(space_map_t *sm)
8208ad4d6dJeff Bonwick{
8210713e23George Wilson	if (sm == NULL)
8220713e23George Wilson		return;
8238ad4d6dJeff Bonwick
824