1fa9e406ahrens/*
2fa9e406ahrens * CDDL HEADER START
3fa9e406ahrens *
4fa9e406ahrens * The contents of this file are subject to the terms of the
5f65e61cahrens * Common Development and Distribution License (the "License").
6f65e61cahrens * You may not use this file except in compliance with the License.
7fa9e406ahrens *
8fa9e406ahrens * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9fa9e406ahrens * or http://www.opensolaris.org/os/licensing.
10fa9e406ahrens * See the License for the specific language governing permissions
11fa9e406ahrens * and limitations under the License.
12fa9e406ahrens *
13fa9e406ahrens * When distributing Covered Code, include this CDDL HEADER in each
14fa9e406ahrens * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15fa9e406ahrens * If applicable, add the following below this CDDL HEADER, with the
16fa9e406ahrens * fields enclosed by brackets "[]" replaced with your own identifying
17fa9e406ahrens * information: Portions Copyright [yyyy] [name of copyright owner]
18fa9e406ahrens *
19fa9e406ahrens * CDDL HEADER END
20fa9e406ahrens */
211c17160Kevin Crowe
22fa9e406ahrens/*
2347cb52dJeff Bonwick * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
2452abb70Matthew Ahrens * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
25bc9014eJustin Gibbs * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
26c3d26abMatthew Ahrens * Copyright (c) 2014 Integros [integros.com]
271c17160Kevin Crowe * Copyright 2017 Nexenta Systems, Inc.
28fa9e406ahrens */
29fa9e406ahrens
30b24ab67Jeff Bonwick#include <sys/zio.h>
31fa9e406ahrens#include <sys/spa.h>
32fa9e406ahrens#include <sys/dmu.h>
33fa9e406ahrens#include <sys/zfs_context.h>
34fa9e406ahrens#include <sys/zap.h>
35ea8dc4beschrock#include <sys/refcount.h>
36fa9e406ahrens#include <sys/zap_impl.h>
3787e5029ahrens#include <sys/zap_leaf.h>
38fa9e406ahrens#include <sys/avl.h>
393f9d6adLin Ling#include <sys/arc.h>
40b515258Matthew Ahrens#include <sys/dmu_objset.h>
41de8267etimh
42de8267etimh#ifdef _KERNEL
43de8267etimh#include <sys/sunddi.h>
44de8267etimh#endif
45fa9e406ahrens
46c137962Justin T. Gibbsextern inline mzap_phys_t *zap_m_phys(zap_t *zap);
47c137962Justin T. Gibbs
48ae97279Matthew Ahrensstatic int mzap_upgrade(zap_t **zapp,
49ae97279Matthew Ahrens    void *tag, dmu_tx_t *tx, zap_flags_t flags);
50fa9e406ahrens
51b24ab67Jeff Bonwickuint64_t
52b24ab67Jeff Bonwickzap_getflags(zap_t *zap)
53b24ab67Jeff Bonwick{
54b24ab67Jeff Bonwick	if (zap->zap_ismicro)
55b24ab67Jeff Bonwick		return (0);
56c137962Justin T. Gibbs	return (zap_f_phys(zap)->zap_flags);
57b24ab67Jeff Bonwick}
58b24ab67Jeff Bonwick
59b24ab67Jeff Bonwickint
60b24ab67Jeff Bonwickzap_hashbits(zap_t *zap)
61b24ab67Jeff Bonwick{
62b24ab67Jeff Bonwick	if (zap_getflags(zap) & ZAP_FLAG_HASH64)
63b24ab67Jeff Bonwick		return (48);
64b24ab67Jeff Bonwick	else
65b24ab67Jeff Bonwick		return (28);
66b24ab67Jeff Bonwick}
67b24ab67Jeff Bonwick
68b24ab67Jeff Bonwickuint32_t
69b24ab67Jeff Bonwickzap_maxcd(zap_t *zap)
70b24ab67Jeff Bonwick{
71b24ab67Jeff Bonwick	if (zap_getflags(zap) & ZAP_FLAG_HASH64)
72b24ab67Jeff Bonwick		return ((1<<16)-1);
73b24ab67Jeff Bonwick	else
74b24ab67Jeff Bonwick		return (-1U);
75b24ab67Jeff Bonwick}
76fa9e406ahrens
77da6c28aamwstatic uint64_t
78b24ab67Jeff Bonwickzap_hash(zap_name_t *zn)
79da6c28aamw{
80b24ab67Jeff Bonwick	zap_t *zap = zn->zn_zap;
81b24ab67Jeff Bonwick	uint64_t h = 0;
82da6c28aamw
83b24ab67Jeff Bonwick	if (zap_getflags(zap) & ZAP_FLAG_PRE_HASHED_KEY) {
84b24ab67Jeff Bonwick		ASSERT(zap_getflags(zap) & ZAP_FLAG_UINT64_KEY);
85b24ab67Jeff Bonwick		h = *(uint64_t *)zn->zn_key_orig;
86b24ab67Jeff Bonwick	} else {
87b24ab67Jeff Bonwick		h = zap->zap_salt;
88b24ab67Jeff Bonwick		ASSERT(h != 0);
89b24ab67Jeff Bonwick		ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
90b24ab67Jeff Bonwick
91486ae71Matthew Ahrens		if (zap_getflags(zap) & ZAP_FLAG_UINT64_KEY) {
92486ae71Matthew Ahrens			const uint64_t *wp = zn->zn_key_norm;
93486ae71Matthew Ahrens
94486ae71Matthew Ahrens			ASSERT(zn->zn_key_intlen == 8);
95bf26014Matthew Ahrens			for (int i = 0; i < zn->zn_key_norm_numints;
96bf26014Matthew Ahrens			    wp++, i++) {
97486ae71Matthew Ahrens				uint64_t word = *wp;
98486ae71Matthew Ahrens
99bf26014Matthew Ahrens				for (int j = 0; j < zn->zn_key_intlen; j++) {
100486ae71Matthew Ahrens					h = (h >> 8) ^
101486ae71Matthew Ahrens					    zfs_crc64_table[(h ^ word) & 0xFF];
102486ae71Matthew Ahrens					word >>= NBBY;
103486ae71Matthew Ahrens				}
104486ae71Matthew Ahrens			}
105486ae71Matthew Ahrens		} else {
106486ae71Matthew Ahrens			const uint8_t *cp = zn->zn_key_norm;
107486ae71Matthew Ahrens
108486ae71Matthew Ahrens			/*
109486ae71Matthew Ahrens			 * We previously stored the terminating null on
110486ae71Matthew Ahrens			 * disk, but didn't hash it, so we need to
111486ae71Matthew Ahrens			 * continue to not hash it.  (The
112486ae71Matthew Ahrens			 * zn_key_*_numints includes the terminating
113486ae71Matthew Ahrens			 * null for non-binary keys.)
114486ae71Matthew Ahrens			 */
115bf26014Matthew Ahrens			int len = zn->zn_key_norm_numints - 1;
116486ae71Matthew Ahrens
117486ae71Matthew Ahrens			ASSERT(zn->zn_key_intlen == 1);
118bf26014Matthew Ahrens			for (int i = 0; i < len; cp++, i++) {
119486ae71Matthew Ahrens				h = (h >> 8) ^
120486ae71Matthew Ahrens				    zfs_crc64_table[(h ^ *cp) & 0xFF];
121486ae71Matthew Ahrens			}
122486ae71Matthew Ahrens		}
123b24ab67Jeff Bonwick	}
124da6c28aamw	/*
125b24ab67Jeff Bonwick	 * Don't use all 64 bits, since we need some in the cookie for
126b24ab67Jeff Bonwick	 * the collision differentiator.  We MUST use the high bits,
127b24ab67Jeff Bonwick	 * since those are the ones that we first pay attention to when
128da6c28aamw	 * chosing the bucket.
129da6c28aamw	 */
130b24ab67Jeff Bonwick	h &= ~((1ULL << (64 - zap_hashbits(zap))) - 1);
131da6c28aamw
132b24ab67Jeff Bonwick	return (h);
133da6c28aamw}
134da6c28aamw
135da6c28aamwstatic int
1361c17160Kevin Crowezap_normalize(zap_t *zap, const char *name, char *namenorm, int normflags)
137da6c28aamw{
138b24ab67Jeff Bonwick	ASSERT(!(zap_getflags(zap) & ZAP_FLAG_UINT64_KEY));
139b24ab67Jeff Bonwick
140bf26014Matthew Ahrens	size_t inlen = strlen(name) + 1;
141bf26014Matthew Ahrens	size_t outlen = ZAP_MAXNAMELEN;
142da6c28aamw
143bf26014Matthew Ahrens	int err = 0;
144da6c28aamw	(void) u8_textprep_str((char *)name, &inlen, namenorm, &outlen,
1451c17160Kevin Crowe	    normflags | U8_TEXTPREP_IGNORE_NULL | U8_TEXTPREP_IGNORE_INVALID,
1461c17160Kevin Crowe	    U8_UNICODE_LATEST, &err);
147da6c28aamw
148da6c28aamw	return (err);
149da6c28aamw}
150da6c28aamw
151da6c28aamwboolean_t
152da6c28aamwzap_match(zap_name_t *zn, const char *matchname)
153da6c28aamw{
154b24ab67Jeff Bonwick	ASSERT(!(zap_getflags(zn->zn_zap) & ZAP_FLAG_UINT64_KEY));
155b24ab67Jeff Bonwick
1561c17160Kevin Crowe	if (zn->zn_matchtype & MT_NORMALIZE) {
157da6c28aamw		char norm[ZAP_MAXNAMELEN];
158da6c28aamw
1591c17160Kevin Crowe		if (zap_normalize(zn->zn_zap, matchname, norm,
1601c17160Kevin Crowe		    zn->zn_normflags) != 0)
161da6c28aamw			return (B_FALSE);
162da6c28aamw
163b24ab67Jeff Bonwick		return (strcmp(zn->zn_key_norm, norm) == 0);
164da6c28aamw	} else {
165b24ab67Jeff Bonwick		return (strcmp(zn->zn_key_orig, matchname) == 0);
166da6c28aamw	}
167da6c28aamw}
168da6c28aamw
169da6c28aamwvoid
170da6c28aamwzap_name_free(zap_name_t *zn)
171da6c28aamw{
172da6c28aamw	kmem_free(zn, sizeof (zap_name_t));
173da6c28aamw}
174da6c28aamw
175da6c28aamwzap_name_t *
176b24ab67Jeff Bonwickzap_name_alloc(zap_t *zap, const char *key, matchtype_t mt)
177da6c28aamw{
178da6c28aamw	zap_name_t *zn = kmem_alloc(sizeof (zap_name_t), KM_SLEEP);
179da6c28aamw
180da6c28aamw	zn->zn_zap = zap;
181b24ab67Jeff Bonwick	zn->zn_key_intlen = sizeof (*key);
182b24ab67Jeff Bonwick	zn->zn_key_orig = key;
183486ae71Matthew Ahrens	zn->zn_key_orig_numints = strlen(zn->zn_key_orig) + 1;
184da6c28aamw	zn->zn_matchtype = mt;
1851c17160Kevin Crowe	zn->zn_normflags = zap->zap_normflags;
1861c17160Kevin Crowe
1871c17160Kevin Crowe	/*
1881c17160Kevin Crowe	 * If we're dealing with a case sensitive lookup on a mixed or
1891c17160Kevin Crowe	 * insensitive fs, remove U8_TEXTPREP_TOUPPER or the lookup
1901c17160Kevin Crowe	 * will fold case to all caps overriding the lookup request.
1911c17160Kevin Crowe	 */
1921c17160Kevin Crowe	if (mt & MT_MATCH_CASE)
1931c17160Kevin Crowe		zn->zn_normflags &= ~U8_TEXTPREP_TOUPPER;
1941c17160Kevin Crowe
195da6c28aamw	if (zap->zap_normflags) {
1961c17160Kevin Crowe		/*
1971c17160Kevin Crowe		 * We *must* use zap_normflags because this normalization is
1981c17160Kevin Crowe		 * what the hash is computed from.
1991c17160Kevin Crowe		 */
2001c17160Kevin Crowe		if (zap_normalize(zap, key, zn->zn_normbuf,
2011c17160Kevin Crowe		    zap->zap_normflags) != 0) {
202da6c28aamw			zap_name_free(zn);
203da6c28aamw			return (NULL);
204da6c28aamw		}
205b24ab67Jeff Bonwick		zn->zn_key_norm = zn->zn_normbuf;
206486ae71Matthew Ahrens		zn->zn_key_norm_numints = strlen(zn->zn_key_norm) + 1;
207da6c28aamw	} else {
2081c17160Kevin Crowe		if (mt != 0) {
209da6c28aamw			zap_name_free(zn);
210da6c28aamw			return (NULL);
211da6c28aamw		}
212b24ab67Jeff Bonwick		zn->zn_key_norm = zn->zn_key_orig;
213486ae71Matthew Ahrens		zn->zn_key_norm_numints = zn->zn_key_orig_numints;
214da6c28aamw	}
215da6c28aamw
216b24ab67Jeff Bonwick	zn->zn_hash = zap_hash(zn);
2171c17160Kevin Crowe
2181c17160Kevin Crowe	if (zap->zap_normflags != zn->zn_normflags) {
2191c17160Kevin Crowe		/*
2201c17160Kevin Crowe		 * We *must* use zn_normflags because this normalization is
2211c17160Kevin Crowe		 * what the matching is based on.  (Not the hash!)
2221c17160Kevin Crowe		 */
2231c17160Kevin Crowe		if (zap_normalize(zap, key, zn->zn_normbuf,
2241c17160Kevin Crowe		    zn->zn_normflags) != 0) {
2251c17160Kevin Crowe			zap_name_free(zn);
2261c17160Kevin Crowe			return (NULL);
2271c17160Kevin Crowe		}
2281c17160Kevin Crowe		zn->zn_key_norm_numints = strlen(zn->zn_key_norm) + 1;
2291c17160Kevin Crowe	}
2301c17160Kevin Crowe
231b24ab67Jeff Bonwick	return (zn);
232b24ab67Jeff Bonwick}
233b24ab67Jeff Bonwick
234b24ab67Jeff Bonwickzap_name_t *
235b24ab67Jeff Bonwickzap_name_alloc_uint64(zap_t *zap, const uint64_t *key, int numints)
236b24ab67Jeff Bonwick{
237b24ab67Jeff Bonwick	zap_name_t *zn = kmem_alloc(sizeof (zap_name_t), KM_SLEEP);
238b24ab67Jeff Bonwick
239b24ab67Jeff Bonwick	ASSERT(zap->zap_normflags == 0);
240b24ab67Jeff Bonwick	zn->zn_zap = zap;
241b24ab67Jeff Bonwick	zn->zn_key_intlen = sizeof (*key);
242b24ab67Jeff Bonwick	zn->zn_key_orig = zn->zn_key_norm = key;
243486ae71Matthew Ahrens	zn->zn_key_orig_numints = zn->zn_key_norm_numints = numints;
2441c17160Kevin Crowe	zn->zn_matchtype = 0;
245b24ab67Jeff Bonwick
246b24ab67Jeff Bonwick	zn->zn_hash = zap_hash(zn);
247da6c28aamw	return (zn);
248da6c28aamw}
249da6c28aamw
250fa9e406ahrensstatic void
251fa9e406ahrensmzap_byteswap(mzap_phys_t *buf, size_t size)
252fa9e406ahrens{
253fa9e406ahrens	buf->mz_block_type = BSWAP_64(buf->mz_block_type);
254fa9e406ahrens	buf->mz_salt = BSWAP_64(buf->mz_salt);
255ab04eb8timh	buf->mz_normflags = BSWAP_64(buf->mz_normflags);
256bf26014Matthew Ahrens	int max = (size / MZAP_ENT_LEN) - 1;
257bf26014Matthew Ahrens	for (int i = 0; i < max; i++) {
258fa9e406ahrens		buf->mz_chunk[i].mze_value =
259fa9e406ahrens		    BSWAP_64(buf->mz_chunk[i].mze_value);
260fa9e406ahrens		buf->mz_chunk[i].mze_cd =
261fa9e406ahrens		    BSWAP_32(buf->mz_chunk[i].mze_cd);
262fa9e406ahrens	}
263fa9e406ahrens}
264fa9e406ahrens
265fa9e406ahrensvoid
266fa9e406ahrenszap_byteswap(void *buf, size_t size)
267fa9e406ahrens{
268bf26014Matthew Ahrens	uint64_t block_type = *(uint64_t *)buf;
269fa9e406ahrens
2705ad8204nd	if (block_type == ZBT_MICRO || block_type == BSWAP_64(ZBT_MICRO)) {
271fa9e406ahrens		/* ASSERT(magic == ZAP_LEAF_MAGIC); */
272fa9e406ahrens		mzap_byteswap(buf, size);
2735ad8204nd	} else {
274fa9e406ahrens		fzap_byteswap(buf, size);
275fa9e406ahrens	}
276fa9e406ahrens}
277fa9e406ahrens
278fa9e406ahrensstatic int
279fa9e406ahrensmze_compare(const void *arg1, const void *arg2)
280fa9e406ahrens{
281fa9e406ahrens	const mzap_ent_t *mze1 = arg1;
282fa9e406ahrens	const mzap_ent_t *mze2 = arg2;
283fa9e406ahrens
284c4ab0d3Gvozden Neskovic	int cmp = AVL_CMP(mze1->mze_hash, mze2->mze_hash);
285c4ab0d3Gvozden Neskovic	if (likely(cmp))
286c4ab0d3Gvozden Neskovic		return (cmp);
287c4ab0d3Gvozden Neskovic
288c4ab0d3Gvozden Neskovic	return (AVL_CMP(mze1->mze_cd, mze2->mze_cd));
289fa9e406ahrens}
290fa9e406ahrens
291fa9e406ahrensstatic void
2923f9d6adLin Lingmze_insert(zap_t *zap, int chunkid, uint64_t hash)
293fa9e406ahrens{
294fa9e406ahrens	ASSERT(zap->zap_ismicro);
295fa9e406ahrens	ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
296fa9e406ahrens
297bf26014Matthew Ahrens	mzap_ent_t *mze = kmem_alloc(sizeof (mzap_ent_t), KM_SLEEP);
298fa9e406ahrens	mze->mze_chunkid = chunkid;
299fa9e406ahrens	mze->mze_hash = hash;
3003f9d6adLin Ling	mze->mze_cd = MZE_PHYS(zap, mze)->mze_cd;
3013f9d6adLin Ling	ASSERT(MZE_PHYS(zap, mze)->mze_name[0] != 0);
302fa9e406ahrens	avl_add(&zap->zap_m.zap_avl, mze);
303fa9e406ahrens}
304fa9e406ahrens
305fa9e406ahrensstatic mzap_ent_t *
306da6c28aamwmze_find(zap_name_t *zn)
307fa9e406ahrens{
308fa9e406ahrens	mzap_ent_t mze_tofind;
309fa9e406ahrens	mzap_ent_t *mze;
310fa9e406ahrens	avl_index_t idx;
311da6c28aamw	avl_tree_t *avl = &zn->zn_zap->zap_m.zap_avl;
312fa9e406ahrens
313da6c28aamw	ASSERT(zn->zn_zap->zap_ismicro);
314da6c28aamw	ASSERT(RW_LOCK_HELD(&zn->zn_zap->zap_rwlock));
315fa9e406ahrens
316da6c28aamw	mze_tofind.mze_hash = zn->zn_hash;
3173f9d6adLin Ling	mze_tofind.mze_cd = 0;
318fa9e406ahrens
319fa9e406ahrens	mze = avl_find(avl, &mze_tofind, &idx);
320fa9e406ahrens	if (mze == NULL)
321fa9e406ahrens		mze = avl_nearest(avl, idx, AVL_AFTER);
322da6c28aamw	for (; mze && mze->mze_hash == zn->zn_hash; mze = AVL_NEXT(avl, mze)) {
3233f9d6adLin Ling		ASSERT3U(mze->mze_cd, ==, MZE_PHYS(zn->zn_zap, mze)->mze_cd);
3243f9d6adLin Ling		if (zap_match(zn, MZE_PHYS(zn->zn_zap, mze)->mze_name))
325fa9e406ahrens			return (mze);
326fa9e406ahrens	}
3271c17160Kevin Crowe
328fa9e406ahrens	return (NULL);
329fa9e406ahrens}
330fa9e406ahrens
331fa9e406ahrensstatic uint32_t
332fa9e406ahrensmze_find_unused_cd(zap_t *zap, uint64_t hash)
333fa9e406ahrens{
334fa9e406ahrens	mzap_ent_t mze_tofind;
335fa9e406ahrens	avl_index_t idx;
336fa9e406ahrens	avl_tree_t *avl = &zap->zap_m.zap_avl;
337fa9e406ahrens
338fa9e406ahrens	ASSERT(zap->zap_ismicro);
339fa9e406ahrens	ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
340fa9e406ahrens
341fa9e406ahrens	mze_tofind.mze_hash = hash;
3423f9d6adLin Ling	mze_tofind.mze_cd = 0;
343fa9e406ahrens
344bf26014Matthew Ahrens	uint32_t cd = 0;
345bf26014Matthew Ahrens	for (mzap_ent_t *mze = avl_find(avl, &mze_tofind, &idx);
346fa9e406ahrens	    mze && mze->mze_hash == hash; mze = AVL_NEXT(avl, mze)) {
3473f9d6adLin Ling		if (mze->mze_cd != cd)
348fa9e406ahrens			break;
349fa9e406ahrens		cd++;
350fa9e406ahrens	}
351fa9e406ahrens
352fa9e406ahrens	return (cd);
353fa9e406ahrens}
354fa9e406ahrens
355fa9e406ahrensstatic void
356fa9e406ahrensmze_remove(zap_t *zap, mzap_ent_t *mze)
357fa9e406ahrens{
358fa9e406ahrens	ASSERT(zap->zap_ismicro);
359fa9e406ahrens	ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
360fa9e406ahrens
361fa9e406ahrens	avl_remove(&zap->zap_m.zap_avl, mze);
362fa9e406ahrens	kmem_free(mze, sizeof (mzap_ent_t));
363fa9e406ahrens}
364fa9e406ahrens
365fa9e406ahrensstatic void
366fa9e406ahrensmze_destroy(zap_t *zap)
367fa9e406ahrens{
368fa9e406ahrens	mzap_ent_t *mze;
369fa9e406ahrens	void *avlcookie = NULL;
370fa9e406ahrens
371fa9e406ahrens	while (mze = avl_destroy_nodes(&zap->zap_m.zap_avl, &avlcookie))
372fa9e406ahrens		kmem_free(mze, sizeof (mzap_ent_t));
373fa9e406ahrens	avl_destroy(&zap->zap_m.zap_avl);
374fa9e406ahrens}
375fa9e406ahrens
376fa9e406ahrensstatic zap_t *
377fa9e406ahrensmzap_open(objset_t *os, uint64_t obj, dmu_buf_t *db)
378fa9e406ahrens{
379fa9e406ahrens	zap_t *winner;
38002525cdChunwei Chen	uint64_t *zap_hdr = (uint64_t *)db->db_data;
38102525cdChunwei Chen	uint64_t zap_block_type = zap_hdr[0];
38202525cdChunwei Chen	uint64_t zap_magic = zap_hdr[1];
383fa9e406ahrens
384fa9e406ahrens	ASSERT3U(MZAP_ENT_LEN, ==, sizeof (mzap_ent_phys_t));
385fa9e406ahrens
386bf26014Matthew Ahrens	zap_t *zap = kmem_zalloc(sizeof (zap_t), KM_SLEEP);
387fa9e406ahrens	rw_init(&zap->zap_rwlock, 0, 0, 0);
388fa9e406ahrens	rw_enter(&zap->zap_rwlock, RW_WRITER);
389fa9e406ahrens	zap->zap_objset = os;
390fa9e406ahrens	zap->zap_object = obj;
391fa9e406ahrens	zap->zap_dbuf = db;
392fa9e406ahrens
39302525cdChunwei Chen	if (zap_block_type != ZBT_MICRO) {
394fa9e406ahrens		mutex_init(&zap->zap_f.zap_num_entries_mtx, 0, 0, 0);
395bf16b11Matthew Ahrens		zap->zap_f.zap_block_shift = highbit64(db->db_size) - 1;
39602525cdChunwei Chen		if (zap_block_type != ZBT_HEADER || zap_magic != ZAP_MAGIC) {
39702525cdChunwei Chen			winner = NULL;	/* No actual winner here... */
39802525cdChunwei Chen			goto handle_winner;
39902525cdChunwei Chen		}
400fa9e406ahrens	} else {
401fa9e406ahrens		zap->zap_ismicro = TRUE;
402fa9e406ahrens	}
403fa9e406ahrens
404fa9e406ahrens	/*
405fa9e406ahrens	 * Make sure that zap_ismicro is set before we let others see
406fa9e406ahrens	 * it, because zap_lockdir() checks zap_ismicro without the lock
407fa9e406ahrens	 * held.
408fa9e406ahrens	 */
40940510e8Josef 'Jeff' Sipek	dmu_buf_init_user(&zap->zap_dbu, zap_evict_sync, NULL, &zap->zap_dbuf);
410bc9014eJustin Gibbs	winner = dmu_buf_set_user(db, &zap->zap_dbu);
411fa9e406ahrens
41202525cdChunwei Chen	if (winner != NULL)
41302525cdChunwei Chen		goto handle_winner;
414fa9e406ahrens
415fa9e406ahrens	if (zap->zap_ismicro) {
416c137962Justin T. Gibbs		zap->zap_salt = zap_m_phys(zap)->mz_salt;
417c137962Justin T. Gibbs		zap->zap_normflags = zap_m_phys(zap)->mz_normflags;
418fa9e406ahrens		zap->zap_m.zap_num_chunks = db->db_size / MZAP_ENT_LEN - 1;
419fa9e406ahrens		avl_create(&zap->zap_m.zap_avl, mze_compare,
420fa9e406ahrens		    sizeof (mzap_ent_t), offsetof(mzap_ent_t, mze_node));
421fa9e406ahrens
422bf26014Matthew Ahrens		for (int i = 0; i < zap->zap_m.zap_num_chunks; i++) {
423fa9e406ahrens			mzap_ent_phys_t *mze =
424c137962Justin T. Gibbs			    &zap_m_phys(zap)->mz_chunk[i];
425fa9e406ahrens			if (mze->mze_name[0]) {
426da6c28aamw				zap_name_t *zn;
427da6c28aamw
428fa9e406ahrens				zap->zap_m.zap_num_entries++;
4291c17160Kevin Crowe				zn = zap_name_alloc(zap, mze->mze_name, 0);
4303f9d6adLin Ling				mze_insert(zap, i, zn->zn_hash);
431da6c28aamw				zap_name_free(zn);
432fa9e406ahrens			}
433fa9e406ahrens		}
434fa9e406ahrens	} else {
435c137962Justin T. Gibbs		zap->zap_salt = zap_f_phys(zap)->zap_salt;
436c137962Justin T. Gibbs		zap->zap_normflags = zap_f_phys(zap)->zap_normflags;
437f65e61cahrens
438f65e61cahrens		ASSERT3U(sizeof (struct zap_leaf_header), ==,
439f65e61cahrens		    2*ZAP_LEAF_CHUNKSIZE);
440f65e61cahrens
441f65e61cahrens		/*
442f65e61cahrens		 * The embedded pointer table should not overlap the
443f65e61cahrens		 * other members.
444f65e61cahrens		 */
445f65e61cahrens		ASSERT3P(&ZAP_EMBEDDED_PTRTBL_ENT(zap, 0), >,
446c137962Justin T. Gibbs		    &zap_f_phys(zap)->zap_salt);
447f65e61cahrens
448f65e61cahrens		/*
449f65e61cahrens		 * The embedded pointer table should end at the end of
450f65e61cahrens		 * the block
451f65e61cahrens		 */
452f65e61cahrens		ASSERT3U((uintptr_t)&ZAP_EMBEDDED_PTRTBL_ENT(zap,
453f65e61cahrens		    1<<ZAP_EMBEDDED_PTRTBL_SHIFT(zap)) -
454c137962Justin T. Gibbs		    (uintptr_t)zap_f_phys(zap), ==,
455f65e61cahrens		    zap->zap_dbuf->db_size);
456fa9e406ahrens	}
457fa9e406ahrens	rw_exit(&zap->zap_rwlock);
458fa9e406ahrens	return (zap);
45902525cdChunwei Chen
46002525cdChunwei Chenhandle_winner:
46102525cdChunwei Chen	rw_exit(&zap->zap_rwlock);
46202525cdChunwei Chen	rw_destroy(&zap->zap_rwlock);
46302525cdChunwei Chen	if (!zap->zap_ismicro)
46402525cdChunwei Chen		mutex_destroy(&zap->zap_f.zap_num_entries_mtx);
46502525cdChunwei Chen	kmem_free(zap, sizeof (zap_t));
46602525cdChunwei Chen	return (winner);
467fa9e406ahrens}
468fa9e406ahrens
469bf26014Matthew Ahrens/*
470bf26014Matthew Ahrens * This routine "consumes" the caller's hold on the dbuf, which must
471bf26014Matthew Ahrens * have the specified tag.
472bf26014Matthew Ahrens */
473ae97279Matthew Ahrensstatic int
474ae97279Matthew Ahrenszap_lockdir_impl(dmu_buf_t *db, void *tag, dmu_tx_t *tx,
475c5f9e43ahrens    krw_t lti, boolean_t fatreader, boolean_t adding, zap_t **zapp)
476fa9e406ahrens{
477ae97279Matthew Ahrens	ASSERT0(db->db_offset);
478ae97279Matthew Ahrens	objset_t *os = dmu_buf_get_objset(db);
479ae97279Matthew Ahrens	uint64_t obj = db->db_object;
480fa9e406ahrens
481ae97279Matthew Ahrens	*zapp = NULL;
482fa9e406ahrens
483bf26014Matthew Ahrens	zap_t *zap = dmu_buf_get_user(db);
48402525cdChunwei Chen	if (zap == NULL) {
485fa9e406ahrens		zap = mzap_open(os, obj, db);
48602525cdChunwei Chen		if (zap == NULL) {
48702525cdChunwei Chen			/*
48802525cdChunwei Chen			 * mzap_open() didn't like what it saw on-disk.
48902525cdChunwei Chen			 * Check for corruption!
49002525cdChunwei Chen			 */
49102525cdChunwei Chen			return (SET_ERROR(EIO));
49202525cdChunwei Chen		}
49302525cdChunwei Chen	}
494fa9e406ahrens
495fa9e406ahrens	/*
496fa9e406ahrens	 * We're checking zap_ismicro without the lock held, in order to
497fa9e406ahrens	 * tell what type of lock we want.  Once we have some sort of
498fa9e406ahrens	 * lock, see if it really is the right type.  In practice this
499fa9e406ahrens	 * can only be different if it was upgraded from micro to fat,
500fa9e406ahrens	 * and micro wanted WRITER but fat only needs READER.
501fa9e406ahrens	 */
502bf26014Matthew Ahrens	krw_t lt = (!zap->zap_ismicro && fatreader) ? RW_READER : lti;
503fa9e406ahrens	rw_enter(&zap->zap_rwlock, lt);
504fa9e406ahrens	if (lt != ((!zap->zap_ismicro && fatreader) ? RW_READER : lti)) {
505fa9e406ahrens		/* it was upgraded, now we only need reader */
506fa9e406ahrens		ASSERT(lt == RW_WRITER);
507fa9e406ahrens		ASSERT(RW_READER ==
508fa9e406ahrens		    (!zap->zap_ismicro && fatreader) ? RW_READER : lti);
509fa9e406ahrens		rw_downgrade(&zap->zap_rwlock);
510fa9e406ahrens		lt = RW_READER;
511fa9e406ahrens	}
512fa9e406ahrens
513fa9e406ahrens	zap->zap_objset = os;
514fa9e406ahrens
515fa9e406ahrens	if (lt == RW_WRITER)
516fa9e406ahrens		dmu_buf_will_dirty(db, tx);
517fa9e406ahrens
518fa9e406ahrens	ASSERT3P(zap->zap_dbuf, ==, db);
519fa9e406ahrens
520fa9e406ahrens	ASSERT(!zap->zap_ismicro ||
521fa9e406ahrens	    zap->zap_m.zap_num_entries <= zap->zap_m.zap_num_chunks);
522c5f9e43ahrens	if (zap->zap_ismicro && tx && adding &&
523fa9e406ahrens	    zap->zap_m.zap_num_entries == zap->zap_m.zap_num_chunks) {
524fa9e406ahrens		uint64_t newsz = db->db_size + SPA_MINBLOCKSIZE;
525fa9e406ahrens		if (newsz > MZAP_MAX_BLKSZ) {
526fa9e406ahrens			dprintf("upgrading obj %llu: num_entries=%u\n",
527fa9e406ahrens			    obj, zap->zap_m.zap_num_entries);
528fa9e406ahrens			*zapp = zap;
529ae97279Matthew Ahrens			int err = mzap_upgrade(zapp, tag, tx, 0);
530ae97279Matthew Ahrens			if (err != 0)
531ae97279Matthew Ahrens				rw_exit(&zap->zap_rwlock);
532ae97279Matthew Ahrens			return (err);
533fa9e406ahrens		}
534ae97279Matthew Ahrens		VERIFY0(dmu_object_set_blocksize(os, obj, newsz, 0, tx));
535fa9e406ahrens		zap->zap_m.zap_num_chunks =
536fa9e406ahrens		    db->db_size / MZAP_ENT_LEN - 1;
537fa9e406ahrens	}
538fa9e406ahrens
539fa9e406ahrens	*zapp = zap;
540fa9e406ahrens	return (0);
541fa9e406ahrens}
542fa9e406ahrens
54379d7283Matthew Ahrensstatic int
54479d7283Matthew Ahrenszap_lockdir_by_dnode(dnode_t *dn, dmu_tx_t *tx,
54579d7283Matthew Ahrens    krw_t lti, boolean_t fatreader, boolean_t adding, void *tag, zap_t **zapp)
54679d7283Matthew Ahrens{
54779d7283Matthew Ahrens	dmu_buf_t *db;
54879d7283Matthew Ahrens
549bf26014Matthew Ahrens	int err = dmu_buf_hold_by_dnode(dn, 0, tag, &db, DMU_READ_NO_PREFETCH);
55079d7283Matthew Ahrens	if (err != 0) {
55179d7283Matthew Ahrens		return (err);
55279d7283Matthew Ahrens	}
553bf26014Matthew Ahrens#ifdef ZFS_DEBUG
554bf26014Matthew Ahrens	{
555bf26014Matthew Ahrens		dmu_object_info_t doi;
556bf26014Matthew Ahrens		dmu_object_info_from_db(db, &doi);
557bf26014Matthew Ahrens		ASSERT3U(DMU_OT_BYTESWAP(doi.doi_type), ==, DMU_BSWAP_ZAP);
558bf26014Matthew Ahrens	}
559bf26014Matthew Ahrens#endif
560bf26014Matthew Ahrens
56179d7283Matthew Ahrens	err = zap_lockdir_impl(db, tag, tx, lti, fatreader, adding, zapp);
56279d7283Matthew Ahrens	if (err != 0) {
56379d7283Matthew Ahrens		dmu_buf_rele(db, tag);
56479d7283Matthew Ahrens	}
56579d7283Matthew Ahrens	return (err);
56679d7283Matthew Ahrens}
56779d7283Matthew Ahrens
568ae97279Matthew Ahrensint
569ae97279Matthew Ahrenszap_lockdir(objset_t *os, uint64_t obj, dmu_tx_t *tx,
570ae97279Matthew Ahrens    krw_t lti, boolean_t fatreader, boolean_t adding, void *tag, zap_t **zapp)
571ae97279Matthew Ahrens{
572ae97279Matthew Ahrens	dmu_buf_t *db;
573ae97279Matthew Ahrens
574bf26014Matthew Ahrens	int err = dmu_buf_hold(os, obj, 0, tag, &db, DMU_READ_NO_PREFETCH);
575ae97279Matthew Ahrens	if (err != 0)
576ae97279Matthew Ahrens		return (err);
577bf26014Matthew Ahrens#ifdef ZFS_DEBUG
578bf26014Matthew Ahrens	{
579bf26014Matthew Ahrens		dmu_object_info_t doi;
580bf26014Matthew Ahrens		dmu_object_info_from_db(db, &doi);
581bf26014Matthew Ahrens		ASSERT3U(DMU_OT_BYTESWAP(doi.doi_type), ==, DMU_BSWAP_ZAP);
582bf26014Matthew Ahrens	}
583bf26014Matthew Ahrens#endif
584ae97279Matthew Ahrens	err = zap_lockdir_impl(db, tag, tx, lti, fatreader, adding, zapp);
585ae97279Matthew Ahrens	if (err != 0)
586ae97279Matthew Ahrens		dmu_buf_rele(db, tag);
587ae97279Matthew Ahrens	return (err);
588ae97279Matthew Ahrens}
589ae97279Matthew Ahrens
590fa9e406ahrensvoid
591ae97279Matthew Ahrenszap_unlockdir(zap_t *zap, void *tag)
592fa9e406ahrens{
593fa9e406ahrens	rw_exit(&zap->zap_rwlock);
594ae97279Matthew Ahrens	dmu_buf_rele(zap->zap_dbuf, tag);
595fa9e406ahrens}
596fa9e406ahrens
597ad860c8bonwickstatic int
598ae97279Matthew Ahrensmzap_upgrade(zap_t **zapp, void *tag, dmu_tx_t *tx, zap_flags_t flags)
599fa9e406ahrens{
600b24ab67Jeff Bonwick	int err = 0;
601ad860c8bonwick	zap_t *zap = *zapp;
602fa9e406ahrens
603fa9e406ahrens	ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
604fa9e406ahrens
605bf26014Matthew Ahrens	int sz = zap->zap_dbuf->db_size;
606bf26014Matthew Ahrens	mzap_phys_t *mzp = zio_buf_alloc(sz);
607fa9e406ahrens	bcopy(zap->zap_dbuf->db_data, mzp, sz);
608bf26014Matthew Ahrens	int nchunks = zap->zap_m.zap_num_chunks;
609fa9e406ahrens
610b24ab67Jeff Bonwick	if (!flags) {
611b24ab67Jeff Bonwick		err = dmu_object_set_blocksize(zap->zap_objset, zap->zap_object,
612b24ab67Jeff Bonwick		    1ULL << fzap_default_block_shift, 0, tx);
613bf26014Matthew Ahrens		if (err != 0) {
614be3e2abBrian Behlendorf			zio_buf_free(mzp, sz);
615b24ab67Jeff Bonwick			return (err);
616b24ab67Jeff Bonwick		}
617ad860c8bonwick	}
618fa9e406ahrens
619fa9e406ahrens	dprintf("upgrading obj=%llu with %u chunks\n",
620fa9e406ahrens	    zap->zap_object, nchunks);
621da6c28aamw	/* XXX destroy the avl later, so we can use the stored hash value */
622fa9e406ahrens	mze_destroy(zap);
623fa9e406ahrens
624b24ab67Jeff Bonwick	fzap_upgrade(zap, tx, flags);
625fa9e406ahrens
626bf26014Matthew Ahrens	for (int i = 0; i < nchunks; i++) {
627fa9e406ahrens		mzap_ent_phys_t *mze = &mzp->mz_chunk[i];
628fa9e406ahrens		if (mze->mze_name[0] == 0)
629fa9e406ahrens			continue;
630fa9e406ahrens		dprintf("adding %s=%llu\n",
631fa9e406ahrens		    mze->mze_name, mze->mze_value);
632bf26014Matthew Ahrens		zap_name_t *zn = zap_name_alloc(zap, mze->mze_name, 0);
633ae97279Matthew Ahrens		err = fzap_add_cd(zn, 8, 1, &mze->mze_value, mze->mze_cd,
634ae97279Matthew Ahrens		    tag, tx);
635ad860c8bonwick		zap = zn->zn_zap;	/* fzap_add_cd() may change zap */
636da6c28aamw		zap_name_free(zn);
637bf26014Matthew Ahrens		if (err != 0)
638ad860c8bonwick			break;
639fa9e406ahrens	}
640be3e2abBrian Behlendorf	zio_buf_free(mzp, sz);
641ad860c8bonwick	*zapp = zap;
642ad860c8bonwick	return (err);
643fa9e406ahrens}
644fa9e406ahrens
6451c17160Kevin Crowe/*
6461c17160Kevin Crowe * The "normflags" determine the behavior of the matchtype_t which is
647