xref: /illumos-gate/usr/src/uts/common/fs/zfs/zap.c (revision fa9e4066f08beec538e775443c5be79dd423fcab)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 /*
23  * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 
30 /*
31  * This file contains the top half of the zfs directory structure
32  * implementation. The bottom half is in zap_leaf.c.
33  *
34  * The zdir is an extendable hash data structure. There is a table of
35  * pointers to buckets (zap_t->zd_data->zd_leafs). The buckets are
36  * each a constant size and hold a variable number of directory entries.
37  * The buckets (aka "leaf nodes") are implemented in zap_leaf.c.
38  *
39  * The pointer table holds a power of 2 number of pointers.
40  * (1<<zap_t->zd_data->zd_phys->zd_prefix_len).  The bucket pointed to
41  * by the pointer at index i in the table holds entries whose hash value
42  * has a zd_prefix_len - bit prefix
43  */
44 
45 #include <sys/spa.h>
46 #include <sys/dmu.h>
47 #include <sys/zfs_context.h>
48 #include <sys/zap.h>
49 #include <sys/zap_impl.h>
50 #include <sys/zap_leaf.h>
51 
52 #define	MIN_FREE (ZAP_LEAF_NUMCHUNKS*9/10)
53 
54 static void zap_grow_ptrtbl(zap_t *zap, dmu_tx_t *tx);
55 static int zap_tryupgradedir(zap_t *zap, dmu_tx_t *tx);
56 static zap_leaf_t *zap_get_leaf_byblk(zap_t *zap, uint64_t blkid,
57     dmu_tx_t *tx, krw_t lt);
58 static void zap_put_leaf(zap_leaf_t *l);
59 static void zap_leaf_pageout(dmu_buf_t *db, void *vl);
60 
61 
62 void
63 fzap_byteswap(void *vbuf, size_t size)
64 {
65 	uint64_t block_type;
66 
67 	ASSERT(size == (1<<ZAP_BLOCK_SHIFT));
68 	block_type = *(uint64_t *)vbuf;
69 
70 	switch (block_type) {
71 	case ZBT_LEAF:
72 	case BSWAP_64(ZBT_LEAF):
73 		zap_leaf_byteswap(vbuf);
74 		return;
75 	case ZBT_HEADER:
76 	case BSWAP_64(ZBT_HEADER):
77 	default:
78 		/* it's a ptrtbl block */
79 		byteswap_uint64_array(vbuf, 1<<ZAP_BLOCK_SHIFT);
80 		return;
81 	}
82 }
83 
84 void
85 fzap_upgrade(zap_t *zap, dmu_tx_t *tx)
86 {
87 	dmu_buf_t *db;
88 	zap_leaf_t *l;
89 	int i;
90 	zap_phys_t *zp;
91 
92 	ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
93 	zap->zap_ismicro = FALSE;
94 
95 	(void) dmu_buf_update_user(zap->zap_dbuf, zap, zap,
96 	    &zap->zap_f.zap_phys, zap_pageout);
97 
98 	mutex_init(&zap->zap_f.zap_num_entries_mtx, 0, 0, 0);
99 
100 	zp = zap->zap_f.zap_phys;
101 	/*
102 	 * explicitly zero it since it might be coming from an
103 	 * initialized microzap
104 	 */
105 	ASSERT3U(sizeof (zap_phys_t), ==, zap->zap_dbuf->db_size);
106 	bzero(zp, sizeof (zap_phys_t));
107 	zp->zap_block_type = ZBT_HEADER;
108 	zp->zap_magic = ZAP_MAGIC;
109 
110 	zp->zap_ptrtbl.zt_shift = ZAP_PTRTBL_MIN_SHIFT;
111 
112 	zp->zap_freeblk = 2;		/* block 1 will be the first leaf */
113 	zp->zap_num_leafs = 1;
114 	zp->zap_num_entries = 0;
115 	zp->zap_salt = zap->zap_salt;
116 
117 	for (i = 0; i < (1<<ZAP_PTRTBL_MIN_SHIFT); i++)
118 		zp->zap_leafs[i] = 1;	/* block 1 will be the first leaf */
119 
120 	/*
121 	 * set up block 1 - the first leaf
122 	 */
123 	db = dmu_buf_hold(zap->zap_objset, zap->zap_object,
124 	    1<<ZAP_BLOCK_SHIFT);
125 	dmu_buf_will_dirty(db, tx);
126 
127 	l = kmem_zalloc(sizeof (zap_leaf_t), KM_SLEEP);
128 	l->l_dbuf = db;
129 	l->l_phys = db->db_data;
130 
131 	zap_leaf_init(l);
132 
133 	kmem_free(l, sizeof (zap_leaf_t));
134 	dmu_buf_rele(db);
135 }
136 
137 static int
138 zap_tryupgradedir(zap_t *zap, dmu_tx_t *tx)
139 {
140 	if (RW_WRITE_HELD(&zap->zap_rwlock))
141 		return (1);
142 	if (rw_tryupgrade(&zap->zap_rwlock)) {
143 		dmu_buf_will_dirty(zap->zap_dbuf, tx);
144 		return (1);
145 	}
146 	return (0);
147 }
148 
149 /*
150  * Generic routines for dealing with the pointer & cookie tables.
151  */
152 
153 static void
154 zap_table_grow(zap_t *zap, zap_table_phys_t *tbl,
155     void (*transfer_func)(const uint64_t *src, uint64_t *dst, int n),
156     dmu_tx_t *tx)
157 {
158 	uint64_t b, newblk;
159 	dmu_buf_t *db_old, *db_new;
160 	int hepb = 1<<(ZAP_BLOCK_SHIFT-4);
161 	/* hepb = half the number of entries in a block */
162 
163 	ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
164 	ASSERT(tbl->zt_blk != 0);
165 	ASSERT(tbl->zt_numblks > 0);
166 
167 	if (tbl->zt_nextblk != 0) {
168 		newblk = tbl->zt_nextblk;
169 	} else {
170 		newblk = zap_allocate_blocks(zap, tbl->zt_numblks * 2, tx);
171 		tbl->zt_nextblk = newblk;
172 		ASSERT3U(tbl->zt_blks_copied, ==, 0);
173 		dmu_prefetch(zap->zap_objset, zap->zap_object,
174 		    tbl->zt_blk << ZAP_BLOCK_SHIFT, tbl->zt_numblks <<
175 		    ZAP_BLOCK_SHIFT);
176 	}
177 
178 	/*
179 	 * Copy the ptrtbl from the old to new location, leaving the odd
180 	 * entries blank as we go.
181 	 */
182 
183 	b = tbl->zt_blks_copied;
184 	db_old = dmu_buf_hold(zap->zap_objset, zap->zap_object,
185 	    (tbl->zt_blk + b) << ZAP_BLOCK_SHIFT);
186 	dmu_buf_read(db_old);
187 
188 	/* first half of entries in old[b] go to new[2*b+0] */
189 	db_new = dmu_buf_hold(zap->zap_objset, zap->zap_object,
190 	    (newblk + 2*b+0) << ZAP_BLOCK_SHIFT);
191 	dmu_buf_will_dirty(db_new, tx);
192 	transfer_func(db_old->db_data, db_new->db_data, hepb);
193 	dmu_buf_rele(db_new);
194 
195 	/* second half of entries in old[b] go to new[2*b+1] */
196 	db_new = dmu_buf_hold(zap->zap_objset, zap->zap_object,
197 	    (newblk + 2*b+1) << ZAP_BLOCK_SHIFT);
198 	dmu_buf_will_dirty(db_new, tx);
199 	transfer_func((uint64_t *)db_old->db_data + hepb,
200 	    db_new->db_data, hepb);
201 	dmu_buf_rele(db_new);
202 
203 	dmu_buf_rele(db_old);
204 
205 	tbl->zt_blks_copied++;
206 
207 	dprintf("copied block %llu of %llu\n",
208 	    tbl->zt_blks_copied, tbl->zt_numblks);
209 
210 	if (tbl->zt_blks_copied == tbl->zt_numblks) {
211 		dmu_free_range(zap->zap_objset, zap->zap_object,
212 		    tbl->zt_blk << ZAP_BLOCK_SHIFT,
213 		    tbl->zt_numblks << ZAP_BLOCK_SHIFT, tx);
214 
215 		tbl->zt_blk = newblk;
216 		tbl->zt_numblks *= 2;
217 		tbl->zt_shift++;
218 		tbl->zt_nextblk = 0;
219 		tbl->zt_blks_copied = 0;
220 
221 		dprintf("finished; numblocks now %llu (%lluk entries)\n",
222 		    tbl->zt_numblks, 1<<(tbl->zt_shift-10));
223 	}
224 }
225 
226 static uint64_t
227 zap_table_store(zap_t *zap, zap_table_phys_t *tbl, uint64_t idx, uint64_t val,
228     dmu_tx_t *tx)
229 {
230 	uint64_t blk, off, oldval;
231 	dmu_buf_t *db;
232 
233 	ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
234 	ASSERT(tbl->zt_blk != 0);
235 
236 	dprintf("storing %llx at index %llx\n", val, idx);
237 
238 	blk = idx >> (ZAP_BLOCK_SHIFT-3);
239 	off = idx & ((1<<(ZAP_BLOCK_SHIFT-3))-1);
240 
241 	db = dmu_buf_hold(zap->zap_objset, zap->zap_object,
242 	    (tbl->zt_blk + blk) << ZAP_BLOCK_SHIFT);
243 	dmu_buf_will_dirty(db, tx);
244 	oldval = ((uint64_t *)db->db_data)[off];
245 	((uint64_t *)db->db_data)[off] = val;
246 	dmu_buf_rele(db);
247 
248 	if (tbl->zt_nextblk != 0) {
249 		idx *= 2;
250 		blk = idx >> (ZAP_BLOCK_SHIFT-3);
251 		off = idx & ((1<<(ZAP_BLOCK_SHIFT-3))-1);
252 
253 		db = dmu_buf_hold(zap->zap_objset, zap->zap_object,
254 		    (tbl->zt_nextblk + blk) << ZAP_BLOCK_SHIFT);
255 		dmu_buf_will_dirty(db, tx);
256 		((uint64_t *)db->db_data)[off] = val;
257 		((uint64_t *)db->db_data)[off+1] = val;
258 		dmu_buf_rele(db);
259 	}
260 
261 	return (oldval);
262 }
263 
264 static uint64_t
265 zap_table_load(zap_t *zap, zap_table_phys_t *tbl, uint64_t idx)
266 {
267 	uint64_t blk, off, val;
268 	dmu_buf_t *db;
269 
270 	ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
271 
272 	blk = idx >> (ZAP_BLOCK_SHIFT-3);
273 	off = idx & ((1<<(ZAP_BLOCK_SHIFT-3))-1);
274 
275 	db = dmu_buf_hold(zap->zap_objset, zap->zap_object,
276 	    (tbl->zt_blk + blk) << ZAP_BLOCK_SHIFT);
277 	dmu_buf_read(db);
278 	val = ((uint64_t *)db->db_data)[off];
279 	dmu_buf_rele(db);
280 	return (val);
281 }
282 
283 /*
284  * Routines for growing the ptrtbl.
285  */
286 
287 static void
288 zap_ptrtbl_transfer(const uint64_t *src, uint64_t *dst, int n)
289 {
290 	int i;
291 	for (i = 0; i < n; i++) {
292 		uint64_t lb = src[i];
293 		dst[2*i+0] = lb;
294 		dst[2*i+1] = lb;
295 	}
296 }
297 
298 static void
299 zap_grow_ptrtbl(zap_t *zap, dmu_tx_t *tx)
300 {
301 	if (zap->zap_f.zap_phys->zap_ptrtbl.zt_shift == 32)
302 		return;
303 
304 	if (zap->zap_f.zap_phys->zap_ptrtbl.zt_numblks == 0) {
305 		/*
306 		 * The ptrtbl can no longer be contained in the
307 		 * header block.  Give it its own entire block, which
308 		 * will quadruple the size of the ptrtbl.
309 		 */
310 		uint64_t newblk;
311 		dmu_buf_t *db_new;
312 
313 		ASSERT3U(zap->zap_f.zap_phys->zap_ptrtbl.zt_shift, ==,
314 		    ZAP_PTRTBL_MIN_SHIFT);
315 		ASSERT3U(zap->zap_f.zap_phys->zap_ptrtbl.zt_blk, ==, 0);
316 
317 		newblk = zap_allocate_blocks(zap, 1, tx);
318 		db_new = dmu_buf_hold(zap->zap_objset, zap->zap_object,
319 		    newblk << ZAP_BLOCK_SHIFT);
320 
321 		dmu_buf_will_dirty(db_new, tx);
322 		zap_ptrtbl_transfer(zap->zap_f.zap_phys->zap_leafs,
323 		    db_new->db_data, 1 << ZAP_PTRTBL_MIN_SHIFT);
324 		dmu_buf_rele(db_new);
325 
326 		zap->zap_f.zap_phys->zap_ptrtbl.zt_blk = newblk;
327 		zap->zap_f.zap_phys->zap_ptrtbl.zt_numblks = 1;
328 		zap->zap_f.zap_phys->zap_ptrtbl.zt_shift++;
329 
330 		ASSERT3U(1ULL << zap->zap_f.zap_phys->zap_ptrtbl.zt_shift, ==,
331 		    zap->zap_f.zap_phys->zap_ptrtbl.zt_numblks <<
332 		    (ZAP_BLOCK_SHIFT-3));
333 	} else {
334 		zap_table_grow(zap, &zap->zap_f.zap_phys->zap_ptrtbl,
335 		    zap_ptrtbl_transfer, tx);
336 	}
337 }
338 
339 static void
340 zap_increment_num_entries(zap_t *zap, int delta, dmu_tx_t *tx)
341 {
342 	dmu_buf_will_dirty(zap->zap_dbuf, tx);
343 	mutex_enter(&zap->zap_f.zap_num_entries_mtx);
344 
345 	ASSERT(delta > 0 || zap->zap_f.zap_phys->zap_num_entries >= -delta);
346 
347 	zap->zap_f.zap_phys->zap_num_entries += delta;
348 
349 	mutex_exit(&zap->zap_f.zap_num_entries_mtx);
350 }
351 
352 uint64_t
353 zap_allocate_blocks(zap_t *zap, int nblocks, dmu_tx_t *tx)
354 {
355 	uint64_t newblk;
356 	ASSERT(tx != NULL);
357 	if (!RW_WRITE_HELD(&zap->zap_rwlock)) {
358 		dmu_buf_will_dirty(zap->zap_dbuf, tx);
359 	}
360 	newblk = atomic_add_64_nv(&zap->zap_f.zap_phys->zap_freeblk, nblocks) -
361 	    nblocks;
362 	return (newblk);
363 }
364 
365 
366 /*
367  * This function doesn't increment zap_num_leafs because it's used to
368  * allocate a leaf chain, which doesn't count against zap_num_leafs.
369  * The directory must be held exclusively for this tx.
370  */
371 zap_leaf_t *
372 zap_create_leaf(zap_t *zap, dmu_tx_t *tx)
373 {
374 	void *winner;
375 	zap_leaf_t *l = kmem_alloc(sizeof (zap_leaf_t), KM_SLEEP);
376 
377 	ASSERT(tx != NULL);
378 	ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
379 	/* hence we already dirtied zap->zap_dbuf */
380 
381 	rw_init(&l->l_rwlock, 0, 0, 0);
382 	rw_enter(&l->l_rwlock, RW_WRITER);
383 	l->l_blkid = zap_allocate_blocks(zap, 1, tx);
384 	l->l_next = NULL;
385 	l->l_dbuf = NULL;
386 	l->l_phys = NULL;
387 
388 	l->l_dbuf = dmu_buf_hold(zap->zap_objset, zap->zap_object,
389 	    l->l_blkid << ZAP_BLOCK_SHIFT);
390 	winner = dmu_buf_set_user(l->l_dbuf, l, &l->l_phys, zap_leaf_pageout);
391 	ASSERT(winner == NULL);
392 	dmu_buf_will_dirty(l->l_dbuf, tx);
393 
394 	zap_leaf_init(l);
395 
396 	return (l);
397 }
398 
399 /* ARGSUSED */
400 void
401 zap_destroy_leaf(zap_t *zap, zap_leaf_t *l, dmu_tx_t *tx)
402 {
403 	/* uint64_t offset = l->l_blkid << ZAP_BLOCK_SHIFT; */
404 	rw_exit(&l->l_rwlock);
405 	dmu_buf_rele(l->l_dbuf);
406 	/* XXX there are still holds on this block, so we can't free it? */
407 	/* dmu_free_range(zap->zap_objset, zap->zap_object, */
408 	    /* offset,  1<<ZAP_BLOCK_SHIFT, tx); */
409 }
410 
411 int
412 fzap_count(zap_t *zap, uint64_t *count)
413 {
414 	ASSERT(!zap->zap_ismicro);
415 	mutex_enter(&zap->zap_f.zap_num_entries_mtx); /* unnecessary */
416 	*count = zap->zap_f.zap_phys->zap_num_entries;
417 	mutex_exit(&zap->zap_f.zap_num_entries_mtx);
418 	return (0);
419 }
420 
421 /*
422  * Routines for obtaining zap_leaf_t's
423  */
424 
425 static void
426 zap_put_leaf(zap_leaf_t *l)
427 {
428 	zap_leaf_t *nl = l->l_next;
429 	while (nl) {
430 		zap_leaf_t *nnl = nl->l_next;
431 		rw_exit(&nl->l_rwlock);
432 		dmu_buf_rele(nl->l_dbuf);
433 		nl = nnl;
434 	}
435 	rw_exit(&l->l_rwlock);
436 	dmu_buf_rele(l->l_dbuf);
437 }
438 
439 _NOTE(ARGSUSED(0))
440 static void
441 zap_leaf_pageout(dmu_buf_t *db, void *vl)
442 {
443 	zap_leaf_t *l = vl;
444 
445 	rw_destroy(&l->l_rwlock);
446 	kmem_free(l, sizeof (zap_leaf_t));
447 }
448 
449 static zap_leaf_t *
450 zap_open_leaf(uint64_t blkid, dmu_buf_t *db)
451 {
452 	zap_leaf_t *l, *winner;
453 
454 	ASSERT(blkid != 0);
455 
456 	l = kmem_alloc(sizeof (zap_leaf_t), KM_SLEEP);
457 	rw_init(&l->l_rwlock, 0, 0, 0);
458 	rw_enter(&l->l_rwlock, RW_WRITER);
459 	l->l_blkid = blkid;
460 	l->l_next = NULL;
461 	l->l_dbuf = db;
462 	l->l_phys = NULL;
463 
464 	winner = dmu_buf_set_user(db, l, &l->l_phys, zap_leaf_pageout);
465 
466 	rw_exit(&l->l_rwlock);
467 	if (winner != NULL) {
468 		/* someone else set it first */
469 		zap_leaf_pageout(NULL, l);
470 		l = winner;
471 	}
472 
473 	return (l);
474 }
475 
476 static zap_leaf_t *
477 zap_get_leaf_byblk_impl(zap_t *zap, uint64_t blkid, dmu_tx_t *tx, krw_t lt)
478 {
479 	dmu_buf_t *db;
480 	zap_leaf_t *l;
481 
482 	ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
483 
484 	db = dmu_buf_hold(zap->zap_objset, zap->zap_object,
485 	    blkid << ZAP_BLOCK_SHIFT);
486 
487 	ASSERT3U(db->db_object, ==, zap->zap_object);
488 	ASSERT3U(db->db_offset, ==, blkid << ZAP_BLOCK_SHIFT);
489 	ASSERT3U(db->db_size, ==, 1 << ZAP_BLOCK_SHIFT);
490 	ASSERT(blkid != 0);
491 
492 	dmu_buf_read(db);
493 	l = dmu_buf_get_user(db);
494 
495 	if (l == NULL)
496 		l = zap_open_leaf(blkid, db);
497 
498 	rw_enter(&l->l_rwlock, lt);
499 	/*
500 	 * Must lock before dirtying, otherwise l->l_phys could change,
501 	 * causing ASSERT below to fail.
502 	 */
503 	if (lt == RW_WRITER)
504 		dmu_buf_will_dirty(db, tx);
505 	ASSERT3U(l->l_blkid, ==, blkid);
506 	ASSERT3P(l->l_dbuf, ==, db);
507 	ASSERT3P(l->l_phys, ==, l->l_dbuf->db_data);
508 	ASSERT3U(l->lh_block_type, ==, ZBT_LEAF);
509 	ASSERT3U(l->lh_magic, ==, ZAP_LEAF_MAGIC);
510 
511 	return (l);
512 }
513 
514 static zap_leaf_t *
515 zap_get_leaf_byblk(zap_t *zap, uint64_t blkid, dmu_tx_t *tx, krw_t lt)
516 {
517 	zap_leaf_t *l, *nl;
518 
519 	l = zap_get_leaf_byblk_impl(zap, blkid, tx, lt);
520 
521 	nl = l;
522 	while (nl->lh_next != 0) {
523 		zap_leaf_t *nnl;
524 		nnl = zap_get_leaf_byblk_impl(zap, nl->lh_next, tx, lt);
525 		nl->l_next = nnl;
526 		nl = nnl;
527 	}
528 
529 	return (l);
530 }
531 
532 static uint64_t
533 zap_idx_to_blk(zap_t *zap, uint64_t idx)
534 {
535 	ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
536 
537 	if (zap->zap_f.zap_phys->zap_ptrtbl.zt_numblks == 0) {
538 		ASSERT3U(idx, <,
539 		    (1ULL << zap->zap_f.zap_phys->zap_ptrtbl.zt_shift));
540 		return (zap->zap_f.zap_phys->zap_leafs[idx]);
541 	} else {
542 		return (zap_table_load(zap, &zap->zap_f.zap_phys->zap_ptrtbl,
543 		    idx));
544 	}
545 }
546 
547 static void
548 zap_set_idx_to_blk(zap_t *zap, uint64_t idx, uint64_t blk, dmu_tx_t *tx)
549 {
550 	ASSERT(tx != NULL);
551 	ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
552 
553 	if (zap->zap_f.zap_phys->zap_ptrtbl.zt_blk == 0) {
554 		zap->zap_f.zap_phys->zap_leafs[idx] = blk;
555 	} else {
556 		(void) zap_table_store(zap, &zap->zap_f.zap_phys->zap_ptrtbl,
557 		    idx, blk, tx);
558 	}
559 }
560 
561 static zap_leaf_t *
562 zap_deref_leaf(zap_t *zap, uint64_t h, dmu_tx_t *tx, krw_t lt)
563 {
564 	uint64_t idx;
565 	zap_leaf_t *l;
566 
567 	ASSERT(zap->zap_dbuf == NULL ||
568 	    zap->zap_f.zap_phys == zap->zap_dbuf->db_data);
569 	ASSERT3U(zap->zap_f.zap_phys->zap_magic, ==, ZAP_MAGIC);
570 	idx = ZAP_HASH_IDX(h, zap->zap_f.zap_phys->zap_ptrtbl.zt_shift);
571 	l = zap_get_leaf_byblk(zap, zap_idx_to_blk(zap, idx), tx, lt);
572 
573 	ASSERT3U(ZAP_HASH_IDX(h, l->lh_prefix_len), ==, l->lh_prefix);
574 
575 	return (l);
576 }
577 
578 
579 static zap_leaf_t *
580 zap_expand_leaf(zap_t *zap, zap_leaf_t *l, uint64_t hash, dmu_tx_t *tx)
581 {
582 	zap_leaf_t *nl;
583 	int prefix_diff, i, err;
584 	uint64_t sibling;
585 
586 	ASSERT3U(l->lh_prefix_len, <=,
587 	    zap->zap_f.zap_phys->zap_ptrtbl.zt_shift);
588 	ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
589 
590 	ASSERT3U(ZAP_HASH_IDX(hash, l->lh_prefix_len), ==, l->lh_prefix);
591 
592 	if (zap_tryupgradedir(zap, tx) == 0) {
593 		/* failed to upgrade */
594 		int old_prefix_len = l->lh_prefix_len;
595 		objset_t *os = zap->zap_objset;
596 		uint64_t object = zap->zap_object;
597 
598 		zap_put_leaf(l);
599 		zap_unlockdir(zap);
600 		err = zap_lockdir(os, object, tx, RW_WRITER, FALSE, &zap);
601 		ASSERT3U(err, ==, 0);
602 		ASSERT(!zap->zap_ismicro);
603 		l = zap_deref_leaf(zap, hash, tx, RW_WRITER);
604 
605 		if (l->lh_prefix_len != old_prefix_len)
606 			/* it split while our locks were down */
607 			return (l);
608 	}
609 	ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
610 
611 	if (l->lh_prefix_len == zap->zap_f.zap_phys->zap_ptrtbl.zt_shift) {
612 		/* There's only one pointer to us. Chain on another leaf blk. */
613 		(void) zap_leaf_chainmore(l, zap_create_leaf(zap, tx));
614 		dprintf("chaining leaf %x/%d\n", l->lh_prefix,
615 		    l->lh_prefix_len);
616 		return (l);
617 	}
618 
619 	ASSERT3U(ZAP_HASH_IDX(hash, l->lh_prefix_len), ==, l->lh_prefix);
620 
621 	/* There's more than one pointer to us. Split this leaf. */
622 	nl = zap_leaf_split(zap, l, tx);
623 
624 	/* set sibling pointers */
625 	prefix_diff =
626 	    zap->zap_f.zap_phys->zap_ptrtbl.zt_shift - l->lh_prefix_len;
627 	sibling = (ZAP_HASH_IDX(hash, l->lh_prefix_len) | 1) << prefix_diff;
628 	for (i = 0; i < (1ULL<<prefix_diff); i++) {
629 		ASSERT3U(zap_idx_to_blk(zap, sibling+i), ==, l->l_blkid);
630 		zap_set_idx_to_blk(zap, sibling+i, nl->l_blkid, tx);
631 		/* dprintf("set %d to %u %x\n", sibling+i, nl->l_blkid, nl); */
632 	}
633 
634 	zap->zap_f.zap_phys->zap_num_leafs++;
635 
636 	if (hash & (1ULL << (64 - l->lh_prefix_len))) {
637 		/* we want the sibling */
638 		zap_put_leaf(l);
639 		l = nl;
640 	} else {
641 		zap_put_leaf(nl);
642 	}
643 
644 	return (l);
645 }
646 
647 static void
648 zap_put_leaf_maybe_grow_ptrtbl(zap_t *zap,
649     zap_leaf_t *l, dmu_tx_t *tx)
650 {
651 	int shift, err;
652 
653 again:
654 	shift = zap->zap_f.zap_phys->zap_ptrtbl.zt_shift;
655 
656 	if (l->lh_prefix_len == shift &&
657 	    (l->l_next != NULL || l->lh_nfree < MIN_FREE)) {
658 		/* this leaf will soon make us grow the pointer table */
659 
660 		if (zap_tryupgradedir(zap, tx) == 0) {
661 			objset_t *os = zap->zap_objset;
662 			uint64_t zapobj = zap->zap_object;
663 			uint64_t blkid = l->l_blkid;
664 
665 			zap_put_leaf(l);
666 			zap_unlockdir(zap);
667 			err = zap_lockdir(os, zapobj, tx,
668 			    RW_WRITER, FALSE, &zap);
669 			ASSERT3U(err, ==, 0);
670 			l = zap_get_leaf_byblk(zap, blkid, tx, RW_READER);
671 			goto again;
672 		}
673 
674 		zap_put_leaf(l);
675 		zap_grow_ptrtbl(zap, tx);
676 	} else {
677 		zap_put_leaf(l);
678 	}
679 }
680 
681 
682 static int
683 fzap_checksize(uint64_t integer_size, uint64_t num_integers)
684 {
685 	/* Only integer sizes supported by C */
686 	switch (integer_size) {
687 	case 1:
688 	case 2:
689 	case 4:
690 	case 8:
691 		break;
692 	default:
693 		return (EINVAL);
694 	}
695 
696 	/* Make sure we won't overflow */
697 	if (integer_size * num_integers < num_integers)
698 		return (EINVAL);
699 	if (integer_size * num_integers > DMU_MAX_ACCESS)
700 		return (EINVAL);
701 
702 	return (0);
703 }
704 
705 /*
706  * Routines for maniplulating attributes.
707  */
708 int
709 fzap_lookup(zap_t *zap, const char *name,
710     uint64_t integer_size, uint64_t num_integers, void *buf)
711 {
712 	zap_leaf_t *l;
713 	int err;
714 	uint64_t hash;
715 	zap_entry_handle_t zeh;
716 
717 	err = fzap_checksize(integer_size, num_integers);
718 	if (err != 0)
719 		return (err);
720 
721 	hash = zap_hash(zap, name);
722 	l = zap_deref_leaf(zap, hash, NULL, RW_READER);
723 	err = zap_leaf_lookup(l, name, hash, &zeh);
724 	if (err != 0)
725 		goto out;
726 	err = zap_entry_read(&zeh, integer_size, num_integers, buf);
727 out:
728 	zap_put_leaf(l);
729 	return (err);
730 }
731 
732 int
733 fzap_add_cd(zap_t *zap, const char *name,
734     uint64_t integer_size, uint64_t num_integers,
735     const void *val, uint32_t cd, dmu_tx_t *tx, zap_leaf_t **lp)
736 {
737 	zap_leaf_t *l;
738 	uint64_t hash;
739 	int err;
740 	zap_entry_handle_t zeh;
741 
742 	ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
743 	ASSERT(!zap->zap_ismicro);
744 	ASSERT(fzap_checksize(integer_size, num_integers) == 0);
745 
746 	hash = zap_hash(zap, name);
747 	l = zap_deref_leaf(zap, hash, tx, RW_WRITER);
748 retry:
749 	err = zap_leaf_lookup(l, name, hash, &zeh);
750 	if (err == 0) {
751 		err = EEXIST;
752 		goto out;
753 	}
754 	ASSERT(err == ENOENT);
755 
756 	/* XXX If this leaf is chained, split it if we can. */
757 	err = zap_entry_create(l, name, hash, cd,
758 	    integer_size, num_integers, val, &zeh);
759 
760 	if (err == 0) {
761 		zap_increment_num_entries(zap, 1, tx);
762 	} else if (err == EAGAIN) {
763 		l = zap_expand_leaf(zap, l, hash, tx);
764 		goto retry;
765 	}
766 
767 out:
768 	if (lp)
769 		*lp = l;
770 	else
771 		zap_put_leaf(l);
772 	return (err);
773 }
774 
775 int
776 fzap_add(zap_t *zap, const char *name,
777     uint64_t integer_size, uint64_t num_integers,
778     const void *val, dmu_tx_t *tx)
779 {
780 	int err;
781 	zap_leaf_t *l;
782 
783 	err = fzap_checksize(integer_size, num_integers);
784 	if (err != 0)
785 		return (err);
786 
787 	err = fzap_add_cd(zap, name, integer_size, num_integers,
788 	    val, ZAP_MAXCD, tx, &l);
789 
790 	zap_put_leaf_maybe_grow_ptrtbl(zap, l, tx);
791 	return (err);
792 }
793 
794 int
795 fzap_update(zap_t *zap, const char *name,
796     int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx)
797 {
798 	zap_leaf_t *l;
799 	uint64_t hash;
800 	int err, create;
801 	zap_entry_handle_t zeh;
802 
803 	ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
804 	err = fzap_checksize(integer_size, num_integers);
805 	if (err != 0)
806 		return (err);
807 
808 	hash = zap_hash(zap, name);
809 	l = zap_deref_leaf(zap, hash, tx, RW_WRITER);
810 retry:
811 	err = zap_leaf_lookup(l, name, hash, &zeh);
812 	create = (err == ENOENT);
813 	ASSERT(err == 0 || err == ENOENT);
814 
815 	/* XXX If this leaf is chained, split it if we can. */
816 
817 	if (create) {
818 		err = zap_entry_create(l, name, hash, ZAP_MAXCD,
819 		    integer_size, num_integers, val, &zeh);
820 		if (err == 0)
821 			zap_increment_num_entries(zap, 1, tx);
822 	} else {
823 		err = zap_entry_update(&zeh, integer_size, num_integers, val);
824 	}
825 
826 	if (err == EAGAIN) {
827 		l = zap_expand_leaf(zap, l, hash, tx);
828 		goto retry;
829 	}
830 
831 	zap_put_leaf_maybe_grow_ptrtbl(zap, l, tx);
832 	return (err);
833 }
834 
835 int
836 fzap_length(zap_t *zap, const char *name,
837     uint64_t *integer_size, uint64_t *num_integers)
838 {
839 	zap_leaf_t *l;
840 	int err;
841 	uint64_t hash;
842 	zap_entry_handle_t zeh;
843 
844 	hash = zap_hash(zap, name);
845 	l = zap_deref_leaf(zap, hash, NULL, RW_READER);
846 	err = zap_leaf_lookup(l, name, hash, &zeh);
847 	if (err != 0)
848 		goto out;
849 
850 	if (integer_size)
851 		*integer_size = zeh.zeh_integer_size;
852 	if (num_integers)
853 		*num_integers = zeh.zeh_num_integers;
854 out:
855 	zap_put_leaf(l);
856 	return (err);
857 }
858 
859 int
860 fzap_remove(zap_t *zap, const char *name, dmu_tx_t *tx)
861 {
862 	zap_leaf_t *l;
863 	uint64_t hash;
864 	int err;
865 	zap_entry_handle_t zeh;
866 
867 	hash = zap_hash(zap, name);
868 	l = zap_deref_leaf(zap, hash, tx, RW_WRITER);
869 	err = zap_leaf_lookup(l, name, hash, &zeh);
870 	if (err == 0) {
871 		zap_entry_remove(&zeh);
872 		zap_increment_num_entries(zap, -1, tx);
873 	}
874 	zap_put_leaf(l);
875 	dprintf("fzap_remove: ds=%p obj=%llu name=%s err=%d\n",
876 	    zap->zap_objset, zap->zap_object, name, err);
877 	return (err);
878 }
879 
880 int
881 zap_value_search(objset_t *os, uint64_t zapobj, uint64_t value, char *name)
882 {
883 	zap_cursor_t zc;
884 	zap_attribute_t *za;
885 	int err;
886 
887 	za = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP);
888 	for (zap_cursor_init(&zc, os, zapobj);
889 	    (err = zap_cursor_retrieve(&zc, za)) == 0;
890 	    zap_cursor_advance(&zc)) {
891 		if (za->za_first_integer == value) {
892 			(void) strcpy(name, za->za_name);
893 			break;
894 		}
895 	}
896 	kmem_free(za, sizeof (zap_attribute_t));
897 	return (err);
898 }
899 
900 
901 /*
902  * Routines for iterating over the attributes.
903  */
904 
905 int
906 fzap_cursor_retrieve(zap_t *zap, zap_cursor_t *zc, zap_attribute_t *za)
907 {
908 	int err = ENOENT;
909 	zap_entry_handle_t zeh;
910 	zap_leaf_t *l;
911 
912 	/* retrieve the next entry at or after zc_hash/zc_cd */
913 	/* if no entry, return ENOENT */
914 
915 again:
916 	l = zap_deref_leaf(zap, zc->zc_hash, NULL, RW_READER);
917 	err = zap_leaf_lookup_closest(l, zc->zc_hash, zc->zc_cd, &zeh);
918 
919 	if (err == ENOENT) {
920 		uint64_t nocare = (1ULL << (64 - l->lh_prefix_len)) - 1;
921 		zc->zc_hash = (zc->zc_hash & ~nocare) + nocare + 1;
922 		zc->zc_cd = 0;
923 		if (l->lh_prefix_len == 0 || zc->zc_hash == 0) {
924 			zc->zc_hash = -1ULL;
925 		} else {
926 			zap_put_leaf(l);
927 			goto again;
928 		}
929 	}
930 
931 	if (err == 0) {
932 		zc->zc_hash = zeh.zeh_hash;
933 		zc->zc_cd = zeh.zeh_cd;
934 		za->za_integer_length = zeh.zeh_integer_size;
935 		za->za_num_integers = zeh.zeh_num_integers;
936 		if (zeh.zeh_num_integers == 0) {
937 			za->za_first_integer = 0;
938 		} else {
939 			err = zap_entry_read(&zeh, 8, 1, &za->za_first_integer);
940 			ASSERT(err == 0 || err == EOVERFLOW);
941 		}
942 		err = zap_entry_read_name(&zeh,
943 		    sizeof (za->za_name), za->za_name);
944 		ASSERT(err == 0);
945 	}
946 	zap_put_leaf(l);
947 	return (err);
948 }
949 
950 
951 static void
952 zap_stats_ptrtbl(zap_t *zap, uint64_t *tbl, int len, zap_stats_t *zs)
953 {
954 	int i;
955 	uint64_t lastblk = 0;
956 
957 	/*
958 	 * NB: if a leaf has more pointers than an entire ptrtbl block
959 	 * can hold, then it'll be accounted for more than once, since
960 	 * we won't have lastblk.
961 	 */
962 	for (i = 0; i < len; i++) {
963 		zap_leaf_t *l;
964 
965 		if (tbl[i] == lastblk)
966 			continue;
967 		lastblk = tbl[i];
968 
969 		l = zap_get_leaf_byblk(zap, tbl[i], NULL, RW_READER);
970 
971 		zap_stats_leaf(zap, l, zs);
972 		zap_put_leaf(l);
973 	}
974 }
975 
976 void
977 fzap_get_stats(zap_t *zap, zap_stats_t *zs)
978 {
979 	zs->zs_ptrtbl_len = 1ULL << zap->zap_f.zap_phys->zap_ptrtbl.zt_shift;
980 	zs->zs_blocksize = 1ULL << ZAP_BLOCK_SHIFT;
981 	zs->zs_num_leafs = zap->zap_f.zap_phys->zap_num_leafs;
982 	zs->zs_num_entries = zap->zap_f.zap_phys->zap_num_entries;
983 	zs->zs_num_blocks = zap->zap_f.zap_phys->zap_freeblk;
984 
985 	if (zap->zap_f.zap_phys->zap_ptrtbl.zt_numblks == 0) {
986 		/* the ptrtbl is entirely in the header block. */
987 		zap_stats_ptrtbl(zap, zap->zap_f.zap_phys->zap_leafs,
988 		    1 << ZAP_PTRTBL_MIN_SHIFT, zs);
989 	} else {
990 		int b;
991 
992 		dmu_prefetch(zap->zap_objset, zap->zap_object,
993 		    zap->zap_f.zap_phys->zap_ptrtbl.zt_blk << ZAP_BLOCK_SHIFT,
994 		    zap->zap_f.zap_phys->zap_ptrtbl.zt_numblks <<
995 			ZAP_BLOCK_SHIFT);
996 
997 		for (b = 0; b < zap->zap_f.zap_phys->zap_ptrtbl.zt_numblks;
998 		    b++) {
999 			dmu_buf_t *db;
1000 
1001 			db = dmu_buf_hold(zap->zap_objset, zap->zap_object,
1002 			    (zap->zap_f.zap_phys->zap_ptrtbl.zt_blk + b) <<
1003 			    ZAP_BLOCK_SHIFT);
1004 			dmu_buf_read(db);
1005 			zap_stats_ptrtbl(zap, db->db_data,
1006 			    1<<(ZAP_BLOCK_SHIFT-3), zs);
1007 			dmu_buf_rele(db);
1008 		}
1009 	}
1010 }
1011