1fa9e406ahrens/*
2fa9e406ahrens * CDDL HEADER START
3fa9e406ahrens *
4fa9e406ahrens * The contents of this file are subject to the terms of the
5f65e61cahrens * Common Development and Distribution License (the "License").
6f65e61cahrens * You may not use this file except in compliance with the License.
7fa9e406ahrens *
8fa9e406ahrens * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9fa9e406ahrens * or http://www.opensolaris.org/os/licensing.
10fa9e406ahrens * See the License for the specific language governing permissions
11fa9e406ahrens * and limitations under the License.
12fa9e406ahrens *
13fa9e406ahrens * When distributing Covered Code, include this CDDL HEADER in each
14fa9e406ahrens * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15fa9e406ahrens * If applicable, add the following below this CDDL HEADER, with the
16fa9e406ahrens * fields enclosed by brackets "[]" replaced with your own identifying
17fa9e406ahrens * information: Portions Copyright [yyyy] [name of copyright owner]
18fa9e406ahrens *
19fa9e406ahrens * CDDL HEADER END
20fa9e406ahrens */
21fa9e406ahrens/*
2247cb52dJeff Bonwick * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
2352abb70Matthew Ahrens * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
24bc9014eJustin Gibbs * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
25fa9e406ahrens */
26fa9e406ahrens
/*
 * This file contains the top half of the zfs directory structure
 * implementation. The bottom half is in zap_leaf.c.
 *
 * The zdir is an extendable hash data structure. There is a table of
 * pointers to buckets (zap_t->zd_data->zd_leafs). The buckets are
 * each a constant size and hold a variable number of directory entries.
 * The buckets (aka "leaf nodes") are implemented in zap_leaf.c.
 *
 * The pointer table holds a power of 2 number of pointers
 * (1<<zap_t->zd_data->zd_phys->zd_prefix_len).  The bucket pointed to
 * by the pointer at index i in the table holds entries whose hash value
 * has a zd_prefix_len-bit prefix equal to i.
 */
41fa9e406ahrens
42fa9e406ahrens#include <sys/spa.h>
43fa9e406ahrens#include <sys/dmu.h>
44fa9e406ahrens#include <sys/zfs_context.h>
45de8267etimh#include <sys/zfs_znode.h>
46478ed9aEric Taylor#include <sys/fs/zfs.h>
47fa9e406ahrens#include <sys/zap.h>
48ea8dc4beschrock#include <sys/refcount.h>
49fa9e406ahrens#include <sys/zap_impl.h>
50fa9e406ahrens#include <sys/zap_leaf.h>
51fa9e406ahrens
5252abb70Matthew Ahrens/*
5352abb70Matthew Ahrens * If zap_iterate_prefetch is set, we will prefetch the entire ZAP object
5452abb70Matthew Ahrens * (all leaf blocks) when we start iterating over it.
5552abb70Matthew Ahrens *
5652abb70Matthew Ahrens * For zap_cursor_init(), the callers all intend to iterate through all the
5752abb70Matthew Ahrens * entries.  There are a few cases where an error (typically i/o error) could
5852abb70Matthew Ahrens * cause it to bail out early.
5952abb70Matthew Ahrens *
6052abb70Matthew Ahrens * For zap_cursor_init_serialized(), there are callers that do the iteration
6152abb70Matthew Ahrens * outside of ZFS.  Typically they would iterate over everything, but we
6252abb70Matthew Ahrens * don't have control of that.  E.g. zfs_ioc_snapshot_list_next(),
6352abb70Matthew Ahrens * zcp_snapshots_iter(), and other iterators over things in the MOS - these
6452abb70Matthew Ahrens * are called by /sbin/zfs and channel programs.  The other example is
6552abb70Matthew Ahrens * zfs_readdir() which iterates over directory entries for the getdents()
6652abb70Matthew Ahrens * syscall.  /sbin/ls iterates to the end (unless it receives a signal), but
6752abb70Matthew Ahrens * userland doesn't have to.
6852abb70Matthew Ahrens *
6952abb70Matthew Ahrens * Given that the ZAP entries aren't returned in a specific order, the only
7052abb70Matthew Ahrens * legitimate use cases for partial iteration would be:
7152abb70Matthew Ahrens *
7252abb70Matthew Ahrens * 1. Pagination: e.g. you only want to display 100 entries at a time, so you
7352abb70Matthew Ahrens *    get the first 100 and then wait for the user to hit "next page", which
7452abb70Matthew Ahrens *    they may never do).
7552abb70Matthew Ahrens *
7652abb70Matthew Ahrens * 2. You want to know if there are more than X entries, without relying on
7752abb70Matthew Ahrens *    the zfs-specific implementation of the directory's st_size (which is
7852abb70Matthew Ahrens *    the number of entries).
7952abb70Matthew Ahrens */
8052abb70Matthew Ahrensboolean_t zap_iterate_prefetch = B_TRUE;
8152abb70Matthew Ahrens
82f65e61cahrensint fzap_default_block_shift = 14; /* 16k blocksize */
83fa9e406ahrens
84c137962Justin T. Gibbsextern inline zap_phys_t *zap_f_phys(zap_t *zap);
85c137962Justin T. Gibbs
8666328ddahrensstatic uint64_t zap_allocate_blocks(zap_t *zap, int nblocks);
87fa9e406ahrens
88fa9e406ahrensvoid
89fa9e406ahrensfzap_byteswap(void *vbuf, size_t size)
90fa9e406ahrens{
91bf26014Matthew Ahrens	uint64_t block_type = *(uint64_t *)vbuf;
92fa9e406ahrens
935ad8204nd	if (block_type == ZBT_LEAF || block_type == BSWAP_64(ZBT_LEAF))
94f65e61cahrens		zap_leaf_byteswap(vbuf, size);
955ad8204nd	else {
96fa9e406ahrens		/* it's a ptrtbl block */
97f65e61cahrens		byteswap_uint64_array(vbuf, size);
98fa9e406ahrens	}
99fa9e406ahrens}
100fa9e406ahrens
101fa9e406ahrensvoid
102b24ab67Jeff Bonwickfzap_upgrade(zap_t *zap, dmu_tx_t *tx, zap_flags_t flags)
103fa9e406ahrens{
104fa9e406ahrens	ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
105fa9e406ahrens	zap->zap_ismicro = FALSE;
106fa9e406ahrens
10740510e8Josef 'Jeff' Sipek	zap->zap_dbu.dbu_evict_func_sync = zap_evict_sync;
10840510e8Josef 'Jeff' Sipek	zap->zap_dbu.dbu_evict_func_async = NULL;
109fa9e406ahrens
110fa9e406ahrens	mutex_init(&zap->zap_f.zap_num_entries_mtx, 0, 0, 0);
111bf16b11Matthew Ahrens	zap->zap_f.zap_block_shift = highbit64(zap->zap_dbuf->db_size) - 1;
112fa9e406ahrens
113bf26014Matthew Ahrens	zap_phys_t *zp = zap_f_phys(zap);
114fa9e406ahrens	/*
115fa9e406ahrens	 * explicitly zero it since it might be coming from an
116fa9e406ahrens	 * initialized microzap
117fa9e406ahrens	 */
118f65e61cahrens	bzero(zap->zap_dbuf->db_data, zap->zap_dbuf->db_size);
119fa9e406ahrens	zp->zap_block_type = ZBT_HEADER;
120fa9e406ahrens	zp->zap_magic = ZAP_MAGIC;
121fa9e406ahrens
122f65e61cahrens	zp->zap_ptrtbl.zt_shift = ZAP_EMBEDDED_PTRTBL_SHIFT(zap);
123fa9e406ahrens
124fa9e406ahrens	zp->zap_freeblk = 2;		/* block 1 will be the first leaf */
125fa9e406ahrens	zp->zap_num_leafs = 1;
126fa9e406ahrens	zp->zap_num_entries = 0;
127fa9e406ahrens	zp->zap_salt = zap->zap_salt;
128da6c28aamw	zp->zap_normflags = zap->zap_normflags;
129b24ab67Jeff Bonwick	zp->zap_flags = flags;
130fa9e406ahrens
131f65e61cahrens	/* block 1 will be the first leaf */
132bf26014Matthew Ahrens	for (int i = 0; i < (1<<zp->zap_ptrtbl.zt_shift); i++)
133f65e61cahrens		ZAP_EMBEDDED_PTRTBL_ENT(zap, i) = 1;
134fa9e406ahrens
135fa9e406ahrens	/*
136fa9e406ahrens	 * set up block 1 - the first leaf
137fa9e406ahrens	 */
138bf26014Matthew Ahrens	dmu_buf_t *db;
139bf26014Matthew Ahrens	VERIFY0(dmu_buf_hold(zap->zap_objset, zap->zap_object,
14047cb52dJeff Bonwick	    1<<FZAP_BLOCK_SHIFT(zap), FTAG, &db, DMU_READ_NO_PREFETCH));
141fa9e406ahrens	dmu_buf_will_dirty(db, tx);
142fa9e406ahrens
143bf26014Matthew Ahrens	zap_leaf_t *l = kmem_zalloc(sizeof (zap_leaf_t), KM_SLEEP);
144fa9e406ahrens	l->l_dbuf = db;
145fa9e406ahrens
146de8267etimh	zap_leaf_init(l, zp->zap_normflags != 0);
147fa9e406ahrens
148fa9e406ahrens	kmem_free(l, sizeof (zap_leaf_t));
149ea8dc4beschrock	dmu_buf_rele(db, FTAG);
150fa9e406ahrens}
151fa9e406ahrens
152fa9e406ahrensstatic int
153fa9e406ahrenszap_tryupgradedir(zap_t *zap, dmu_tx_t *tx)
154fa9e406ahrens{
155fa9e406ahrens	if (RW_WRITE_HELD(&zap->zap_rwlock))
156fa9e406ahrens		return (1);
157fa9e406ahrens	if (rw_tryupgrade(&zap->zap_rwlock)) {
158fa9e406ahrens		dmu_buf_will_dirty(zap->zap_dbuf, tx);
159fa9e406ahrens		return (1);
160fa9e406ahrens	}
161fa9e406ahrens	return (0);
162fa9e406ahrens}
163fa9e406ahrens
164fa9e406ahrens/*
165fa9e406ahrens * Generic routines for dealing with the pointer & cookie tables.
166fa9e406ahrens */
167fa9e406ahrens
16866328ddahrensstatic int
169fa9e406ahrenszap_table_grow(zap_t *zap, zap_table_phys_t *tbl,
170fa9e406ahrens    void (*transfer_func)(const uint64_t *src, uint64_t *dst, int n),
171fa9e406ahrens    dmu_tx_t *tx)
172fa9e406ahrens{
173bf26014Matthew Ahrens	uint64_t newblk;
174f65e61cahrens	int bs = FZAP_BLOCK_SHIFT(zap);
175f65e61cahrens	int hepb = 1<<(bs-4);
176fa9e406ahrens	/* hepb = half the number of entries in a block */
177fa9e406ahrens
178fa9e406ahrens	ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
179fa9e406ahrens	ASSERT(tbl->zt_blk != 0);
180fa9e406ahrens	ASSERT(tbl->zt_numblks > 0);
181fa9e406ahrens
182fa9e406ahrens	if (tbl->zt_nextblk != 0) {
183fa9e406ahrens		newblk = tbl->zt_nextblk;
184fa9e406ahrens	} else {
18566328ddahrens		newblk = zap_allocate_blocks(zap, tbl->zt_numblks * 2);
186fa9e406ahrens		tbl->zt_nextblk = newblk;
187fb09f5aMadhav Suresh		ASSERT0(tbl->zt_blks_copied);
188a2cdcddPaul Dagnelie		dmu_prefetch(zap->zap_objset, zap->zap_object, 0,
189a2cdcddPaul Dagnelie		    tbl->zt_blk << bs, tbl->zt_numblks << bs,
190a2cdcddPaul Dagnelie		    ZIO_PRIORITY_SYNC_READ);
191fa9e406ahrens	}
192fa9e406ahrens
193fa9e406ahrens	/*
19466328ddahrens	 * Copy the ptrtbl from the old to new location.
195fa9e406ahrens	 */
196fa9e406ahrens
197bf26014Matthew Ahrens	uint64_t b = tbl->zt_blks_copied;
198bf26014Matthew Ahrens	dmu_buf_t *db_old;
199bf26014Matthew Ahrens	int err = dmu_buf_hold(zap->zap_objset, zap->zap_object,
20047cb52dJeff Bonwick	    (tbl->zt_blk + b) << bs, FTAG, &db_old, DMU_READ_NO_PREFETCH);
201bf26014Matthew Ahrens	if (err != 0)
20266328ddahrens		return (err);
203fa9e406ahrens
204fa9e406ahrens	/* first half of entries in old[b] go to new[2*b+0] */
205bf26014Matthew Ahrens	dmu_buf_t *db_new;
206bf26014Matthew Ahrens	VERIFY0(dmu_buf_hold(zap->zap_objset, zap->zap_object,
20747cb52dJeff Bonwick	    (newblk + 2*b+0) << bs, FTAG, &db_new, DMU_READ_NO_PREFETCH));
208fa9e406ahrens	dmu_buf_will_dirty(db_new, tx);
209fa9e406ahrens	transfer_func(db_old->db_data, db_new->db_data, hepb);
210ea8dc4beschrock	dmu_buf_rele(db_new, FTAG);
211fa9e406ahrens
212fa9e406ahrens	/* second half of entries in old[b] go to new[2*b+1] */
213bf26014Matthew Ahrens	VERIFY0(dmu_buf_hold(zap->zap_objset, zap->zap_object,
21447cb52dJeff Bonwick	    (newblk + 2*b+1) << bs, FTAG, &db_new, DMU_READ_NO_PREFETCH));
215fa9e406ahrens	dmu_buf_will_dirty(db_new, tx);
216fa9e406ahrens	transfer_func((uint64_t *)db_old->db_data + hepb,
217fa9e406ahrens	    db_new->db_data, hepb);
218ea8dc4beschrock	dmu_buf_rele(db_new, FTAG);
219fa9e406ahrens
220ea8dc4beschrock	dmu_buf_rele(db_old, FTAG);
221fa9e406ahrens
222fa9e406ahrens	tbl->zt_blks_copied++;
223fa9e406ahrens
224fa9e406ahrens	dprintf("copied block %llu of %llu\n",
225fa9e406ahrens	    tbl->zt_blks_copied, tbl->zt_numblks);
226fa9e406ahrens
227fa9e406ahrens	if (tbl->zt_blks_copied == tbl->zt_numblks) {
228ea8dc4beschrock		(void) dmu_free_range(zap->zap_objset, zap->zap_object,
229f65e61cahrens		    tbl->zt_blk << bs, tbl->zt_numblks << bs, tx);
230fa9e406ahrens
231fa9e406ahrens		tbl->zt_blk = newblk;
232fa9e406ahrens		tbl->zt_numblks *= 2;
233fa9e406ahrens		tbl->zt_shift++;
234fa9e406ahrens		tbl->zt_nextblk = 0;
235fa9e406ahrens		tbl->zt_blks_copied = 0;
236fa9e406ahrens
237fa9e406ahrens		dprintf("finished; numblocks now %llu (%lluk entries)\n",
238fa9e406ahrens		    tbl->zt_numblks, 1<<(tbl->zt_shift-10));
239fa9e406ahrens	}
24066328ddahrens
24166328ddahrens	return (0);
242fa9e406ahrens}
243fa9e406ahrens
244ea8dc4beschrockstatic int
245fa9e406ahrenszap_table_store(zap_t *zap, zap_table_phys_t *tbl, uint64_t idx, uint64_t val,
246fa9e406ahrens    dmu_tx_t *tx)
247fa9e406ahrens{
248f65e61cahrens	int bs = FZAP_BLOCK_SHIFT(zap);
249fa9e406ahrens
250fa9e406ahrens	ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
251fa9e406ahrens	ASSERT(tbl->zt_blk != 0);
252fa9e406ahrens
253fa9e406ahrens	dprintf("storing %llx at index %llx\n", val, idx);
254fa9e406ahrens
255bf26014Matthew Ahrens	uint64_t blk = idx >> (bs-3);
256bf26014Matthew Ahrens	uint64_t off = idx & ((1<<(bs-3))-1);
257fa9e406ahrens
258bf26014Matthew Ahrens	dmu_buf_t *db;
259bf26014Matthew Ahrens	int err = dmu_buf_hold(zap->zap_objset, zap->zap_object,
26047cb52dJeff Bonwick	    (tbl->zt_blk + blk) << bs, FTAG, &db, DMU_READ_NO_PREFETCH);
261bf26014Matthew Ahrens	if (err != 0)
262ea8dc4beschrock		return (err);
263fa9e406ahrens	dmu_buf_will_dirty(db, tx);
264fa9e406ahrens
265fa9e406ahrens	if (tbl->zt_nextblk != 0) {
266ea8dc4beschrock		uint64_t idx2 = idx * 2;
267ea8dc4beschrock		uint64_t blk2 = idx2 >> (bs-3);
268ea8dc4beschrock		uint64_t off2 = idx2 & ((1<<(bs-3))-1);
269ea8dc4beschrock		dmu_buf_t *db2;
270ea8dc4beschrock
271ea8dc4beschrock		err = dmu_buf_hold(zap->zap_objset, zap->zap_object,
27247cb52dJeff Bonwick		    (tbl->zt_nextblk + blk2) << bs, FTAG, &db2,
27347cb52dJeff Bonwick		    DMU_READ_NO_PREFETCH);
274bf26014Matthew Ahrens		if (err != 0) {
275ea8dc4beschrock			dmu_buf_rele(db, FTAG);
276ea8dc4beschrock			return (err);
277ea8dc4beschrock		}
278ea8dc4beschrock		dmu_buf_will_dirty(db2, tx);
279ea8dc4beschrock		((uint64_t *)db2->db_data)[off2] = val;
280ea8dc4beschrock		((uint64_t *)db2->db_data)[off2+1] = val;
281ea8dc4beschrock		dmu_buf_rele(db2, FTAG);
282fa9e406ahrens	}
283fa9e406ahrens
284ea8dc4beschrock	((uint64_t *)db->db_data)[off] = val;
285ea8dc4beschrock	dmu_buf_rele(db, FTAG);
286ea8dc4beschrock
287ea8dc4beschrock	return (0);
288fa9e406ahrens}
289fa9e406ahrens
290ea8dc4beschrockstatic int
291ea8dc4beschrockzap_table_load(zap_t *zap, zap_table_phys_t *tbl, uint64_t idx, uint64_t *valp)
292fa9e406ahrens{
293f65e61cahrens	int bs = FZAP_BLOCK_SHIFT(zap);
294fa9e406ahrens
295fa9e406ahrens	ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
296fa9e406ahrens
297bf26014Matthew Ahrens	uint64_t blk = idx >> (bs-3);
298bf26014Matthew Ahrens	uint64_t off = idx & ((1<<(bs-3))-1);
299fa9e406ahrens
30079d7283Matthew Ahrens	/*
30179d7283Matthew Ahrens	 * Note: this is equivalent to dmu_buf_hold(), but we use
30279d7283Matthew Ahrens	 * _dnode_enter / _by_dnode because it's faster because we don't
30379d7283Matthew Ahrens	 * have to hold the dnode.
30479d7283Matthew Ahrens	 */
305bf26014Matthew Ahrens	dnode_t *dn = dmu_buf_dnode_enter(zap->zap_dbuf);
306bf26014Matthew Ahrens	dmu_buf_t *db;
307bf26014Matthew Ahrens	int err = dmu_buf_hold_by_dnode(dn,
30847cb52dJeff Bonwick	    (tbl->zt_blk + blk) << bs, FTAG, &db, DMU_READ_NO_PREFETCH);
30979d7283Matthew Ahrens	dmu_buf_dnode_exit(zap->zap_dbuf);
310bf26014Matthew Ahrens	if (err != 0)
311ea8dc4beschrock		return (err);
312ea8dc4beschrock	*valp = ((uint64_t *)db->db_data)[off];
313ea8dc4beschrock	dmu_buf_rele(db, FTAG);
314ea8dc4beschrock
315ea8dc4beschrock	if (tbl->zt_nextblk != 0) {
316ea8dc4beschrock		/*
317ea8dc4beschrock		 * read the nextblk for the sake of i/o error checking,
318ea8dc4beschrock		 * so that zap_table_load() will catch errors for
319ea8dc4beschrock		 * zap_table_store.
320ea8dc4beschrock		 */
321ea8dc4beschrock		blk = (idx*2) >> (bs-3);
322ea8dc4beschrock
32379d7283Matthew Ahrens		dn = dmu_buf_dnode_enter(zap->zap_dbuf);
32479d7283Matthew Ahrens		err = dmu_buf_hold_by_dnode(dn,
32547cb52dJeff Bonwick		    (tbl->zt_nextblk + blk) << bs, FTAG, &db,
32647cb52dJeff Bonwick		    DMU_READ_NO_PREFETCH);
32779d7283Matthew Ahrens		dmu_buf_dnode_exit(zap->zap_dbuf);
328b287be1Will Andrews		if (err == 0)
329b287be1Will Andrews			dmu_buf_rele(db, FTAG);
330ea8dc4beschrock	}
331ea8dc4beschrock	return (err);
332fa9e406ahrens}
333fa9e406ahrens
334fa9e406ahrens/*
335fa9e406ahrens * Routines for growing the ptrtbl.
336fa9e406ahrens */
337fa9e406ahrens
/*
 * Expand `n` pointer-table entries from `src` into 2 * n entries in `dst`:
 * src[i] is written to both dst[2 * i] and dst[2 * i + 1].  Nothing is
 * written when n <= 0.
 */
static void
zap_ptrtbl_transfer(const uint64_t *src, uint64_t *dst, int n)
{
	uint64_t *out = dst;

	for (int i = 0; i < n; i++) {
		const uint64_t blk = src[i];

		*out++ = blk;
		*out++ = blk;
	}
}
347fa9e406ahrens
34866328ddahrensstatic int
349fa9e406ahrenszap_grow_ptrtbl(zap_t *zap, dmu_tx_t *tx)
350fa9e406ahrens{
351b24ab67Jeff Bonwick	/*
352b24ab67Jeff Bonwick	 * The pointer table should never use more hash bits than we
353b24ab67Jeff Bonwick	 * have (otherwise we'd be using useless zero bits to index it).
354b24ab67Jeff Bonwick	 * If we are within 2 bits of running out, stop growing, since
355b24ab67Jeff Bonwick	 * this is already an aberrant condition.
356b24ab67Jeff Bonwick	 */
357c137962Justin T. Gibbs	if (zap_f_phys(zap)->zap_ptrtbl.zt_shift >= zap_hashbits(zap) - 2)
358be6fd75Matthew Ahrens		return (SET_ERROR(ENOSPC));
359fa9e406ahrens
360c137962Justin T. Gibbs	if (zap_f_phys(zap)->zap_ptrtbl.zt_numblks == 0) {
361fa9e406ahrens		/*
362f65e61cahrens		 * We are outgrowing the "embedded" ptrtbl (the one
363f65e61cahrens		 * stored in the header block).  Give it its own entire
364f65e61cahrens		 * block, which will double the size of the ptrtbl.
365fa9e406ahrens		 */
366c137962Justin T. Gibbs		ASSERT3U(zap_f_phys(zap)->zap_ptrtbl.zt_shift, ==,
367f65e61cahrens		    ZAP_EMBEDDED_PTRTBL_SHIFT(zap));
368c137962Justin T. Gibbs		ASSERT0(zap_f_phys(zap)->zap_ptrtbl.zt_blk);
369fa9e406ahrens
370bf26014Matthew Ahrens		uint64_t newblk = zap_allocate_blocks(zap, 1);
371bf26014Matthew Ahrens		dmu_buf_t *db_new;
372bf26014Matthew Ahrens		int err = dmu_buf_hold(zap->zap_objset, zap->zap_object,
37347cb52dJeff Bonwick		    newblk << FZAP_BLOCK_SHIFT(zap), FTAG, &db_new,
37447cb52dJeff Bonwick		    DMU_READ_NO_PREFETCH);
375bf26014Matthew Ahrens		if (err != 0)
37666328ddahrens			return (err);
377fa9e406ahrens		dmu_buf_will_dirty(db_new, tx);
378f65e61cahrens		zap_ptrtbl_transfer(&ZAP_EMBEDDED_PTRTBL_ENT(zap, 0),
379f65e61cahrens		    db_new->db_data, 1 << ZAP_EMBEDDED_PTRTBL_SHIFT(zap));
380ea8dc4beschrock		dmu_buf_rele(db_new, FTAG);
381fa9e406ahrens
382c137962Justin T. Gibbs		zap_f_phys(zap)->zap_ptrtbl.zt_blk = newblk;
383c137962Justin T. Gibbs		zap_f_phys(zap)->zap_ptrtbl.zt_numblks = 1;
384c137962Justin T. Gibbs		zap_f_phys(zap)->zap_ptrtbl.zt_shift++;
385fa9e406ahrens
386c137962Justin T. Gibbs		ASSERT3U(1ULL << zap_f_phys(zap)->zap_ptrtbl.zt_shift, ==,
387c137962Justin T. Gibbs		    zap_f_phys(zap)->zap_ptrtbl.zt_numblks <<
388f65e61cahrens		    (FZAP_BLOCK_SHIFT(zap)-3));
38966328ddahrens
39066328ddahrens		return (0);
391fa9e406ahrens	} else {
392c137962Justin T. Gibbs		return (zap_table_grow(zap, &zap_f_phys(zap)->zap_ptrtbl,
39366328ddahrens		    zap_ptrtbl_transfer, tx));
394fa9e406ahrens	}
395fa9e406ahrens}
396fa9e406ahrens
397fa9e406ahrensstatic void
398fa9e406ahrenszap_increment_num_entries(zap_t *zap, int delta, dmu_tx_t *tx)
399fa9e406ahrens{
400fa9e406ahrens	dmu_buf_will_dirty(zap->zap_dbuf, tx);
401fa9e406ahrens	mutex_enter(&zap->zap_f.zap_num_entries_mtx);
402c137962Justin T. Gibbs	ASSERT(delta > 0 || zap_f_phys(zap)->zap_num_entries >= -delta);
403c137962Justin T. Gibbs	zap_f_phys(zap)->zap_num_entries += delta;
404fa9e406ahrens	mutex_exit(&zap->zap_f.zap_num_entries_mtx);
405fa9e406ahrens}
406fa9e406ahrens
40766328ddahrensstatic uint64_t
40866328ddahrenszap_allocate_blocks(zap_t *zap, int nblocks)
409fa9e406ahrens{
41066328ddahrens	ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
411bf26014Matthew Ahrens	uint64_t newblk = zap_f_phys(zap)->zap_freeblk;
412c137962Justin T. Gibbs	zap_f_phys(zap)->zap_freeblk += nblocks;
413fa9e406ahrens	return (newblk);
414fa9e406ahrens}
415fa9e406ahrens
416bc9014eJustin Gibbsstatic void
41740510e8Josef 'Jeff' Sipekzap_leaf_evict_sync(void *dbu)
418bc9014eJustin Gibbs{
419bc9014eJustin Gibbs	zap_leaf_t *l = dbu;
420bc9014eJustin Gibbs
421bc9014eJustin Gibbs	rw_destroy(&l->l_rwlock);
422bc9014eJustin Gibbs	kmem_free(l, sizeof (zap_leaf_t));
423bc9014eJustin Gibbs}
424bc9014eJustin Gibbs
42566328ddahrensstatic zap_leaf_t *
426fa9e406ahrenszap_create_leaf(zap_t *zap, dmu_tx_t *tx)
427fa9e406ahrens{
428bc9014eJustin Gibbs	zap_leaf_t *l = kmem_zalloc(sizeof (zap_leaf_t), KM_SLEEP);
429fa9e406ahrens
430fa9e406ahrens	ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
431fa9e406ahrens
432fa9e406ahrens	rw_init(&l->l_rwlock, 0, 0, 0);
433fa9e406ahrens	rw_enter(&l->l_rwlock, RW_WRITER);
43466328ddahrens	l->l_blkid = zap_allocate_blocks(zap, 1);
435fa9e406ahrens	l->l_dbuf = NULL;
436fa9e406ahrens
437bf26014Matthew Ahrens	VERIFY0(dmu_buf_hold(zap->zap_objset, zap->zap_object,
43847cb52dJeff Bonwick	    l->l_blkid << FZAP_BLOCK_SHIFT(zap), NULL, &l->l_dbuf,
43947cb52dJeff Bonwick	    DMU_READ_NO_PREFETCH));
44040510e8Josef 'Jeff' Sipek	dmu_buf_init_user(&l->l_dbu, zap_leaf_evict_sync, NULL, &l->l_dbuf);
441bf26014Matthew Ahrens	VERIFY3P(NULL, ==, dmu_buf_set_user(l->l_dbuf, &l->l_dbu));
442fa9e406ahrens	dmu_buf_will_dirty(l->l_dbuf, tx);
443fa9e406ahrens
444de8267etimh	zap_leaf_init(l, zap->zap_normflags != 0);
445fa9e406ahrens
446c137962Justin T. Gibbs	zap_f_phys(zap)->zap_num_leafs++;
447fa9e406ahrens
44866328ddahrens	return (l);
449fa9e406ahrens}
450fa9e406ahrens
451fa9e406ahrensint
452fa9e406ahrensfzap_count(zap_t *zap, uint64_t *count)
453fa9e406ahrens{
454fa9e406ahrens	ASSERT(!zap->zap_ismicro);
455fa9e406ahrens	mutex_enter(&zap->zap_f.zap_num_entries_mtx); /* unnecessary */
456c137962Justin T. Gibbs	*count = zap_f_phys(zap)->zap_num_entries;
457fa9e406ahrens	mutex_exit(&zap->zap_f.zap_num_entries_mtx);
458fa9e406ahrens	return (0);
459fa9e406ahrens}
460fa9e406ahrens
461fa9e406ahrens/*
462fa9e406ahrens * Routines for obtaining zap_leaf_t's
463fa9e406ahrens */
464fa9e406ahrens
46587e5029ahrensvoid
466fa9e406ahrenszap_put_leaf(zap_leaf_t *l)
467fa9e406ahrens{
468fa9e406ahrens	rw_exit(&l->l_rwlock);
469ea8dc4beschrock	dmu_buf_rele(l->l_dbuf, NULL);
470fa9e406ahrens}
471fa9e406ahrens
472fa9e406ahrensstatic zap_leaf_t *
473fa9e406ahrenszap_open_leaf(uint64_t blkid, dmu_buf_t *db)
474fa9e406ahrens{
475fa9e406ahrens	ASSERT(blkid != 0);
476fa9e406ahrens
477bf26014Matthew Ahrens	zap_leaf_t *l = kmem_zalloc(sizeof (zap_leaf_t), KM_SLEEP);
478fa9e406ahrens	rw_init(&l->l_rwlock, 0, 0, 0);
479fa9e406ahrens	rw_enter(&l->l_rwlock, RW_WRITER);
480fa9e406ahrens	l->l_blkid = blkid;
481bf16b11Matthew Ahrens	l->l_bs = highbit64(db->db_size) - 1;
482fa9e406ahrens	l->l_dbuf = db;
483fa9e406ahrens
48440510e8Josef 'Jeff' Sipek	dmu_buf_init_user(&l->l_dbu, zap_leaf_evict_sync, NULL, &l->l_dbuf);
485bf26014Matthew Ahrens	zap_leaf_t *winner = dmu_buf_set_user(db, &l->l_dbu);
486fa9e406ahrens
487fa9e406ahrens	rw_exit(&l->l_rwlock);
488fa9e406ahrens	if (winner != NULL) {
489fa9e406ahrens		/* someone else set it first */
49040510e8Josef 'Jeff' Sipek		zap_leaf_evict_sync(&l->l_dbu);
491fa9e406ahrens		l = winner;
492fa9e406ahrens	}
493fa9e406ahrens
494f65e61cahrens	/*
49566328ddahrens	 * lhr_pad was previously used for the next leaf in the leaf
49666328ddahrens	 * chain.  There should be no chained leafs (as we have removed
49766328ddahrens	 * support for them).
49866328ddahrens	 */
499c137962Justin T. Gibbs	ASSERT0(zap_leaf_phys(l)->l_hdr.lh_pad1);
50066328ddahrens
50166328ddahrens	/*
502f65e61cahrens	 * There should be more hash entries than there can be
503f65e61cahrens	 * chunks to put in the hash table
504f65e61cahrens	 */
505f65e61cahrens	ASSERT3U(ZAP_LEAF_HASH_NUMENTRIES(l), >, ZAP_LEAF_NUMCHUNKS(l) / 3);
506f65e61cahrens
507f65e61cahrens	/* The chunks should begin at the end of the hash table */
508f65e61cahrens	ASSERT3P(&ZAP_LEAF_CHUNK(l, 0), ==,
509c137962Justin T. Gibbs	    &zap_leaf_phys(l)->l_hash[ZAP_LEAF_HASH_NUMENTRIES(l)]);
510f65e61cahrens
511f65e61cahrens	/* The chunks should end at the end of the block */
512f65e61cahrens	ASSERT3U((uintptr_t)&ZAP_LEAF_CHUNK(l, ZAP_LEAF_NUMCHUNKS(l)) -
513c137962Justin T. Gibbs	    (uintptr_t)zap_leaf_phys(l), ==, l->l_dbuf->db_size);
514f65e61cahrens
515fa9e406ahrens	return (l);
516fa9e406ahrens}
517fa9e406ahrens
518ea8dc4beschrockstatic int
51966328ddahrenszap_get_leaf_byblk(zap_t *zap, uint64_t blkid, dmu_tx_t *tx, krw_t lt,
520ea8dc4beschrock    zap_leaf_t **lp)
521fa9e406ahrens{
522fa9e406ahrens	dmu_buf_t *db;
523fa9e406ahrens
524fa9e406ahrens	ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
525fa9e406ahrens
526bf26014Matthew Ahrens	int bs = FZAP_BLOCK_SHIFT(zap);
52779d7283Matthew Ahrens	dnode_t *dn = dmu_buf_dnode_enter(zap->zap_dbuf);
528bf26014Matthew Ahrens	int err = dmu_buf_hold_by_dnode(dn,
52947cb52dJeff Bonwick	    blkid << bs, NULL, &db, DMU_READ_NO_PREFETCH);
53079d7283Matthew Ahrens	dmu_buf_dnode_exit(zap->zap_dbuf);
531bf26014Matthew Ahrens	if (err != 0)
532ea8dc4beschrock		return (err);
533fa9e406ahrens
534fa9e406ahrens	ASSERT3U(db->db_object, ==, zap->zap_object);
535f65e61cahrens	ASSERT3U(db->db_offset, ==, blkid << bs);
536f65e61cahrens	ASSERT3U(db->db_size, ==, 1 << bs);
537fa9e406ahrens	ASSERT(blkid != 0);
538fa9e406ahrens
539bf26014Matthew Ahrens	zap_leaf_t *l = dmu_buf_get_user(db);
540fa9e406ahrens
541fa9e406ahrens	if (l == NULL)
542fa9e406ahrens		l = zap_open_leaf(blkid, db);
543fa9e406ahrens
544fa9e406ahrens	rw_enter(&l->l_rwlock, lt);
545fa9e406ahrens	/*
546c137962Justin T. Gibbs	 * Must lock before dirtying, otherwise zap_leaf_phys(l) could change,
547fa9e406ahrens	 * causing ASSERT below to fail.
548fa9e406ahrens	 */
549fa9e406ahrens	if (lt == RW_WRITER)
550fa9e406ahrens		dmu_buf_will_dirty(db, tx);
551fa9e406ahrens	ASSERT3U(l->l_blkid, ==, blkid);
552fa9e406ahrens	ASSERT3P(l->l_dbuf, ==, db);
553c137962Justin T. Gibbs	ASSERT3U(zap_leaf_phys(l)->l_hdr.lh_block_type, ==, ZBT_LEAF);
554c137962Justin T. Gibbs	ASSERT3U(zap_leaf_phys(l)->l_hdr.lh_magic, ==, ZAP_LEAF_MAGIC);
555fa9e406ahrens
556ea8dc4beschrock	*lp = l;
557ea8dc4beschrock	return (0);
558fa9e406ahrens}
559fa9e406ahrens
560ea8dc4beschrockstatic int
561ea8dc4beschrockzap_idx_to_blk(zap_t *zap, uint64_t idx, uint64_t *valp)
562fa9e406ahrens{
563fa9e406ahrens	ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
564fa9e406ahrens
565c137962Justin T. Gibbs	if (zap_f_phys(zap)->zap_ptrtbl.zt_numblks == 0) {
566fa9e406ahrens		ASSERT3U(idx, <,
567c137962Justin T. Gibbs		    (1ULL << zap_f_phys(zap)->zap_ptrtbl.zt_shift));
568ea8dc4beschrock		*valp = ZAP_EMBEDDED_PTRTBL_ENT(zap, idx);
569ea8dc4beschrock		return (0);
570fa9e406ahrens	} else {
571c137962Justin T. Gibbs		return (zap_table_load(zap, &zap_f_phys(zap)->zap_ptrtbl,
572ea8dc4beschrock		    idx, valp));
573fa9e406ahrens	}
574fa9e406ahrens}
575fa9e406ahrens
576ea8dc4beschrockstatic int
577fa9e406ahrenszap_set_idx_to_blk(zap_t *zap, uint64_t idx, uint64_t blk, dmu_tx_t *tx)
578fa9e406ahrens{
579fa9e406ahrens	ASSERT(tx != NULL);
580fa9e406ahrens	ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
581fa9e406ahrens
582c137962Justin T. Gibbs	if (zap_f_phys(zap)->zap_ptrtbl.zt_blk == 0) {
583f65e61cahrens		ZAP_EMBEDDED_PTRTBL_ENT(zap, idx) = blk;
584ea8dc4beschrock		return (0);
585fa9e406ahrens	} else {
586c137962Justin T. Gibbs		return (zap_table_store(zap, &zap_f_phys(zap)->zap_ptrtbl,
587ea8dc4beschrock		    idx, blk, tx));
588fa9e406ahrens	}
589fa9e406ahrens}
590fa9e406ahrens
591ea8dc4beschrockstatic int
592ea8dc4beschrockzap_deref_leaf(zap_t *zap, uint64_t h, dmu_tx_t *tx, krw_t lt, zap_leaf_t **lp)
593fa9e406ahrens{
594bf26014Matthew Ahrens	uint64_t blk;
595fa9e406ahrens
596fa9e406ahrens	ASSERT(zap->zap_dbuf == NULL ||
597c137962Justin T. Gibbs	    zap_f_phys(zap) == zap->zap_dbuf->db_data);
59802525cdChunwei Chen
59902525cdChunwei Chen	/* Reality check for corrupt zap objects (leaf or header). */
60002525cdChunwei Chen	if ((zap_f_phys(zap)->zap_block_type != ZBT_LEAF &&
60102525cdChunwei Chen	    zap_f_phys(zap)->zap_block_type != ZBT_HEADER) ||
60202525cdChunwei Chen	    zap_f_phys(zap)->zap_magic != ZAP_MAGIC) {
60302525cdChunwei Chen		return (SET_ERROR(EIO));
60402525cdChunwei Chen	}
60502525cdChunwei Chen
606bf26014Matthew Ahrens	uint64_t idx = ZAP_HASH_IDX(h, zap_f_phys(zap)->zap_ptrtbl.zt_shift);
607bf26014Matthew Ahrens	int err = zap_idx_to_blk(zap, idx, &blk);
608ea8dc4beschrock	if (err != 0)
609ea8dc4beschrock		return (err);
610ea8dc4beschrock	err = zap_get_leaf_byblk(zap, blk, tx, lt, lp);
611fa9e406ahrens
612c137962Justin T. Gibbs	ASSERT(err ||
613c137962Justin T. Gibbs	    ZAP_HASH_IDX(h, zap_leaf_phys(*lp)->l_hdr.lh_prefix_len) ==
614c137962Justin T. Gibbs	    zap_leaf_phys(*lp)->l_hdr.lh_prefix);
615ea8dc4beschrock	return (err);
616fa9e406ahrens}
617fa9e406ahrens
618ea8dc4beschrockstatic int
619ae97279Matthew Ahrenszap_expand_leaf(zap_name_t *zn, zap_leaf_t *l,
620ae97279Matthew Ahrens    void *tag, dmu_tx_t *tx, zap_leaf_t **lp)
621fa9e406ahrens{
622ad860c8bonwick	zap_t *zap = zn->zn_zap;
623ad860c8bonwick	uint64_t hash = zn->zn_hash;
624bf26014Matthew Ahrens	int err;
625c137962Justin T. Gibbs	int old_prefix_len = zap_leaf_phys(l)->l_hdr.lh_prefix_len;
626fa9e406ahrens
627c137962Justin T. Gibbs	ASSERT3U(old_prefix_len, <=, zap_f_phys(zap)->zap_ptrtbl.zt_shift);
628fa9e406ahrens	ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
629fa9e406ahrens
63066328ddahrens	ASSERT3U(ZAP_HASH_IDX(hash, old_prefix_len), ==,
631c137962Justin T. Gibbs	    zap_leaf_phys(l)->l_hdr.lh_prefix);
632fa9e406ahrens
63366328ddahrens	if (zap_tryupgradedir(zap, tx) == 0 ||
634c137962Justin T. Gibbs	    old_prefix_len == zap_f_phys(zap)->zap_ptrtbl.zt_shift) {
63566328ddahrens		/* We failed to upgrade, or need to grow the pointer table */
636fa9e406ahrens		objset_t *os = zap->zap_objset;
637fa9e406ahrens		uint64_t object = zap->zap_object;
638fa9e406ahrens
639fa9e406ahrens		zap_put_leaf(l);
640ae97279Matthew Ahrens		zap_unlockdir(zap, tag);
641c5f9e43ahrens		err = zap_lockdir(os, object, tx, RW_WRITER,
642ae97279Matthew Ahrens		    FALSE, FALSE, tag, &zn->zn_zap);
643ad860c8bonwick		zap = zn->zn_zap;
644bf26014Matthew Ahrens		if (err != 0)
64566328ddahrens			return (err);
646fa9e406ahrens		ASSERT(!zap->zap_ismicro);
647fa9e406ahrens
64866328ddahrens		while (old_prefix_len ==
649c137962Justin T. Gibbs		    zap_f_phys(zap)->zap_ptrtbl.zt_shift) {
65066328ddahrens			err = zap_grow_ptrtbl(zap, tx);
651bf26014Matthew Ahrens			if (err != 0)
65266328ddahrens				return (err);
65366328ddahrens		}
65466328ddahrens
65566328ddahrens		err = zap_deref_leaf(zap, hash, tx, RW_WRITER, &l);
656bf26014Matthew Ahrens		if (err != 0)
65766328ddahrens			return (err);
65866328ddahrens
659c137962Justin T. Gibbs		if (zap_leaf_phys(l)->l_hdr.lh_prefix_len != old_prefix_len) {
660fa9e406ahrens			/* it split while our locks were down */
661ea8dc4beschrock			*lp = l;
662ea8dc4beschrock			return (0);
663ea8dc4beschrock		}
664fa9e406ahrens	}
665fa9e406ahrens	ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
666c137962Justin T. Gibbs	ASSERT3U(old_prefix_len, <, zap_f_phys(zap)->zap_ptrtbl.zt_shift);
66766328ddahrens	ASSERT3U(ZAP_HASH_IDX(hash, old_prefix_len), ==,
668c137962Justin T. Gibbs	    zap_leaf_phys(l)->l_hdr.lh_prefix);
669fa9e406ahrens
670bf26014Matthew Ahrens	int prefix_diff = zap_f_phys(zap)->zap_ptrtbl.zt_shift -
67166328ddahrens	    (old_prefix_len + 1);
672bf26014Matthew Ahrens	uint64_t sibling =
673bf26014Matthew Ahrens	    (ZAP_HASH_IDX(hash, old_prefix_len + 1) | 1) << prefix_diff;
674ea8dc4beschrock
675ea8dc4beschrock	/* check for i/o errors before doing zap_leaf_split */
676bf26014Matthew Ahrens	for (int i = 0; i < (1ULL << prefix_diff); i++) {
677ea8dc4beschrock		uint64_t blk;
678bf26014Matthew Ahrens		err = zap_idx_to_blk(zap, sibling + i, &blk);
679bf26014Matthew Ahrens		if (err != 0)
680ea8dc4beschrock			return (err);
681ea8dc4beschrock		ASSERT3U(blk, ==, l->l_blkid);
682ea8dc4beschrock	}
683