xref: /illumos-gate/usr/src/uts/common/fs/zfs/zap_leaf.c (revision 87e5029a)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 /*
23  * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 /*
30  * The 512-byte leaf is broken into 32 16-byte chunks.
31  * chunk number n means l_chunk[n], even though the header precedes it.
32  * the names are stored null-terminated.
33  */
34 
35 #include <sys/zfs_context.h>
36 #include <sys/zap.h>
37 #include <sys/zap_impl.h>
38 #include <sys/zap_leaf.h>
39 #include <sys/spa.h>
40 #include <sys/dmu.h>
41 
42 #define	CHAIN_END 0xffff /* end of the chunk chain */
43 
44 /* somewhat arbitrary, could go up to around 100k ... */
45 #define	MAX_ARRAY_BYTES (8<<10)
46 
47 #define	NCHUNKS(bytes) (((bytes)+ZAP_LEAF_ARRAY_BYTES-1)/ZAP_LEAF_ARRAY_BYTES)
48 
49 /*
50  * XXX This will >> by a negative number when
51  * lh_prefix_len > 64-ZAP_LEAF_HASH_SHIFT.
52  */
53 #define	LEAF_HASH(l, h) \
54 	((ZAP_LEAF_HASH_NUMENTRIES-1) & \
55 		((h) >> (64 - ZAP_LEAF_HASH_SHIFT-(l)->lh_prefix_len)))
56 
57 #define	LEAF_HASH_ENTPTR(l, h) (&(l)->l_phys->l_hash[LEAF_HASH(l, h)])
58 
59 /* #define	MEMCHECK */
60 
61 
/*
 * Fill the n bytes starting at a with the byte value c.
 * Local, dependency-free replacement for memset().
 */
static void
zap_memset(void *a, int c, size_t n)
{
	char *p = a;
	size_t i;

	for (i = 0; i < n; i++)
		p[i] = c;
}
71 
/*
 * Store value at addr as a native-endian integer of len bytes
 * (len must be 1, 2, 4, or 8; anything else trips the assertion).
 */
static void
stv(int len, void *addr, uint64_t value)
{
	if (len == 1) {
		*(uint8_t *)addr = value;
		return;
	}
	if (len == 2) {
		*(uint16_t *)addr = value;
		return;
	}
	if (len == 4) {
		*(uint32_t *)addr = value;
		return;
	}
	if (len == 8) {
		*(uint64_t *)addr = value;
		return;
	}
	ASSERT(!"bad int len");
}
91 
/*
 * Load a native-endian integer of len bytes from addr and return it
 * widened to 64 bits (len must be 1, 2, 4, or 8).
 *
 * The casts retain the const qualifier of addr; the original casts to
 * non-const pointer types silently discarded it (-Wcast-qual).  The
 * poison return value carries an explicit ULL suffix so its type does
 * not depend on the implementation's integer-constant ranking.
 */
static uint64_t
ldv(int len, const void *addr)
{
	switch (len) {
	case 1:
		return (*(const uint8_t *)addr);
	case 2:
		return (*(const uint16_t *)addr);
	case 4:
		return (*(const uint32_t *)addr);
	case 8:
		return (*(const uint64_t *)addr);
	}
	ASSERT(!"bad int len");
	/* NOTREACHED in correct callers; recognizable poison value */
	return (0xFEEDFACEDEADBEEFULL);
}
108 
/*
 * Byteswap an entire leaf block in place: the header, the hash table,
 * and each chunk according to its type.  Used when a leaf written on a
 * host of one endianness is read on a host of the other.
 */
void
zap_leaf_byteswap(zap_leaf_phys_t *buf)
{
	int i;

	buf->l_hdr.lhr_block_type = 	BSWAP_64(buf->l_hdr.lhr_block_type);
	buf->l_hdr.lhr_next = 		BSWAP_64(buf->l_hdr.lhr_next);
	buf->l_hdr.lhr_prefix = 	BSWAP_64(buf->l_hdr.lhr_prefix);
	buf->l_hdr.lhr_magic = 		BSWAP_32(buf->l_hdr.lhr_magic);
	buf->l_hdr.lhr_nfree = 		BSWAP_16(buf->l_hdr.lhr_nfree);
	buf->l_hdr.lhr_nentries = 	BSWAP_16(buf->l_hdr.lhr_nentries);
	buf->l_hdr.lhr_prefix_len = 	BSWAP_16(buf->l_hdr.lhr_prefix_len);
	buf->l_hdr.lh_freelist = 	BSWAP_16(buf->l_hdr.lh_freelist);

	/* hash table holds 16-bit chunk numbers (or CHAIN_END) */
	for (i = 0; i < ZAP_LEAF_HASH_NUMENTRIES; i++)
		buf->l_hash[i] = BSWAP_16(buf->l_hash[i]);

	/*
	 * The chunk type byte (lf_type/la_type/le_type) is a single byte
	 * at the same offset in every chunk variant, so it can be read
	 * before the chunk is swapped.
	 */
	for (i = 0; i < ZAP_LEAF_NUMCHUNKS; i++) {
		struct zap_leaf_entry *le;

		switch (buf->l_chunk[i].l_free.lf_type) {
		case ZAP_LEAF_ENTRY:
			le = &buf->l_chunk[i].l_entry;

			le->le_type = BSWAP_8(le->le_type);
			le->le_int_size = BSWAP_8(le->le_int_size);
			le->le_next = BSWAP_16(le->le_next);
			le->le_name_chunk = BSWAP_16(le->le_name_chunk);
			le->le_name_length = BSWAP_16(le->le_name_length);
			le->le_value_chunk = BSWAP_16(le->le_value_chunk);
			le->le_value_length = BSWAP_16(le->le_value_length);
			le->le_cd = BSWAP_32(le->le_cd);
			le->le_hash = BSWAP_64(le->le_hash);
			break;
		case ZAP_LEAF_FREE:
			buf->l_chunk[i].l_free.lf_type =
			    BSWAP_8(buf->l_chunk[i].l_free.lf_type);
			buf->l_chunk[i].l_free.lf_next =
			    BSWAP_16(buf->l_chunk[i].l_free.lf_next);
			break;
		case ZAP_LEAF_ARRAY:
			/* zap_leaf_array */
			buf->l_chunk[i].l_array.la_type =
			    BSWAP_8(buf->l_chunk[i].l_array.la_type);
			buf->l_chunk[i].l_array.la_next =
			    BSWAP_16(buf->l_chunk[i].l_array.la_next);
			/* la_array doesn't need swapping */
			break;
		default:
			ASSERT(!"bad leaf type");
		}
	}
}
162 
/*
 * Initialize a freshly-created leaf: zero the header, mark every hash
 * bucket empty, and string all chunks together on the free list.
 */
void
zap_leaf_init(zap_leaf_t *l)
{
	int i;

	ASSERT3U(sizeof (zap_leaf_phys_t), ==, l->l_dbuf->db_size);
	zap_memset(&l->l_phys->l_hdr, 0, sizeof (struct zap_leaf_header));
	/*
	 * CHAIN_END is 0xffff, so filling each byte of the uint16_t hash
	 * slots with 0xff produces CHAIN_END in every bucket.
	 */
	zap_memset(&l->l_phys->l_hash, CHAIN_END, sizeof (l->l_phys->l_hash));
	for (i = 0; i < ZAP_LEAF_NUMCHUNKS; i++) {
		l->l_phys->l_chunk[i].l_free.lf_type = ZAP_LEAF_FREE;
		l->l_phys->l_chunk[i].l_free.lf_next = i+1;
	}
	/* terminate the free list at the last chunk */
	l->l_phys->l_chunk[ZAP_LEAF_NUMCHUNKS-1].l_free.lf_next = CHAIN_END;
	l->lh_block_type = ZBT_LEAF;
	l->lh_magic = ZAP_LEAF_MAGIC;
	l->lh_nfree = ZAP_LEAF_NUMCHUNKS;
}
180 
/*
 * Splice leaf nl into the chain immediately after l, both in-core
 * (l_next pointers) and on disk (lh_next block ids).  nl inherits l's
 * hash prefix.  Returns nl.  Statement order matters: l's old links
 * are copied into nl before l is pointed at nl.
 */
zap_leaf_t *
zap_leaf_chainmore(zap_leaf_t *l, zap_leaf_t *nl)
{
	nl->lh_prefix = l->lh_prefix;
	nl->lh_prefix_len = l->lh_prefix_len;
	nl->l_next = l->l_next;
	l->l_next = nl;
	nl->lh_next = l->lh_next;
	l->lh_next = nl->l_blkid;
	return (nl);
}
192 
193 /*
194  * Routines which manipulate leaf chunks (l_chunk[]).
195  */
196 
/*
 * Pop the head chunk off the leaf's free list and return its index.
 * The caller must already know the leaf has free chunks (lh_nfree > 0).
 */
static uint16_t
zap_leaf_chunk_alloc(zap_leaf_t *l)
{
	int chunk;

	ASSERT(l->lh_nfree > 0);

	chunk = l->l_phys->l_hdr.lh_freelist;
	ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS);
	ASSERT3U(l->l_phys->l_chunk[chunk].l_free.lf_type, ==, ZAP_LEAF_FREE);

	/* advance the free list past the chunk we are handing out */
	l->l_phys->l_hdr.lh_freelist = l->l_phys->l_chunk[chunk].l_free.lf_next;

#ifdef MEMCHECK
	/* poison freshly-allocated chunks to catch use of stale data */
	zap_memset(&l->l_phys->l_chunk[chunk], 0xa1,
	    sizeof (l->l_phys->l_chunk[chunk]));
#endif

	l->lh_nfree--;

	return (chunk);
}
219 
/*
 * Return a chunk to the leaf's free list (pushed on the head).
 * The chunk must not already be free (double-free is asserted against).
 */
static void
zap_leaf_chunk_free(zap_leaf_t *l, uint16_t chunk)
{
	struct zap_leaf_free *zlf = &l->l_phys->l_chunk[chunk].l_free;
	ASSERT3U(l->lh_nfree, <, ZAP_LEAF_NUMCHUNKS);
	ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS);
	ASSERT(zlf->lf_type != ZAP_LEAF_FREE);

#ifdef MEMCHECK
	/* poison freed chunks to catch use-after-free */
	zap_memset(&l->l_phys->l_chunk[chunk], 0xf4,
	    sizeof (l->l_phys->l_chunk[chunk]));
#endif

	zlf->lf_type = ZAP_LEAF_FREE;
	zlf->lf_next = l->l_phys->l_hdr.lh_freelist;
	bzero(zlf->lf_pad, sizeof (zlf->lf_pad)); /* help it to compress */
	l->l_phys->l_hdr.lh_freelist = chunk;

	l->lh_nfree++;
}
240 
241 
242 /*
243  * Routines which manipulate leaf arrays (zap_leaf_array type chunks).
244  */
245 
246 static uint16_t
247 zap_leaf_array_create(const zap_entry_handle_t *zeh, const char *buf,
248 	int integer_size, int num_integers)
249 {
250 	uint16_t chunk_head;
251 	uint16_t *chunkp = &chunk_head;
252 	int byten = 0;
253 	uint64_t value;
254 	int shift = (integer_size-1)*8;
255 	int len = num_integers;
256 	zap_leaf_t *l = zeh->zeh_found_leaf;
257 
258 	ASSERT3U(num_integers * integer_size, <, MAX_ARRAY_BYTES);
259 
260 	while (len > 0) {
261 		uint16_t chunk = zap_leaf_chunk_alloc(l);
262 		struct zap_leaf_array *la = &l->l_phys->l_chunk[chunk].l_array;
263 		int i;
264 
265 		la->la_type = ZAP_LEAF_ARRAY;
266 		for (i = 0; i < ZAP_LEAF_ARRAY_BYTES; i++) {
267 			if (byten == 0)
268 				value = ldv(integer_size, buf);
269 			la->la_array[i] = (value & (0xff << shift)) >> shift;
270 			value <<= 8;
271 			if (++byten == integer_size) {
272 				byten = 0;
273 				buf += integer_size;
274 				if (--len == 0)
275 					break;
276 			}
277 		}
278 
279 		*chunkp = chunk;
280 		chunkp = &la->la_next;
281 	}
282 	*chunkp = CHAIN_END;
283 
284 	return (chunk_head);
285 }
286 
287 static void
288 zap_leaf_array_free(zap_entry_handle_t *zeh, uint16_t *chunkp)
289 {
290 	uint16_t chunk = *chunkp;
291 	zap_leaf_t *l = zeh->zeh_found_leaf;
292 
293 	*chunkp = CHAIN_END;
294 
295 	while (chunk != CHAIN_END) {
296 		int nextchunk = l->l_phys->l_chunk[chunk].l_array.la_next;
297 		ASSERT3U(l->l_phys->l_chunk[chunk].l_array.la_type, ==,
298 		    ZAP_LEAF_ARRAY);
299 		zap_leaf_chunk_free(l, chunk);
300 		chunk = nextchunk;
301 	}
302 }
303 
/*
 * Read an array chunk chain into buf, converting from array_int_len-byte
 * big-endian integers (as stored by zap_leaf_array_create) into
 * buf_int_len-byte native integers.
 * array_len and buf_len are in integers, not bytes.
 */
static void
zap_leaf_array_read(const zap_entry_handle_t *zeh, uint16_t chunk,
    int array_int_len, int array_len, int buf_int_len, uint64_t buf_len,
    char *buf)
{
	int len = MIN(array_len, buf_len);
	int byten = 0;
	uint64_t value = 0;
	zap_leaf_t *l = zeh->zeh_found_leaf;

	ASSERT3U(array_int_len, <=, buf_int_len);

	/* Fast path for one 8-byte integer */
	if (array_int_len == 8 && buf_int_len == 8 && len == 1) {
		struct zap_leaf_array *la = &l->l_phys->l_chunk[chunk].l_array;
		uint64_t *buf64 = (uint64_t *)buf;
		/*
		 * NOTE(review): this dereference assumes la_array (and buf)
		 * are suitably aligned for a uint64_t load/store — confirm
		 * against the zap_leaf_array layout.
		 */
		uint64_t val = *(uint64_t *)la->la_array;
		*buf64 = BE_64(val);
		return;
	}

	/* Fast path for an array of 1-byte integers (eg. the entry name) */
	if (array_int_len == 1 && buf_int_len == 1 &&
	    buf_len > array_len + ZAP_LEAF_ARRAY_BYTES) {
		/*
		 * The slack requirement lets us copy whole chunks without
		 * worrying about overrunning buf on the final partial chunk.
		 */
		while (chunk != CHAIN_END) {
			struct zap_leaf_array *la =
			    &l->l_phys->l_chunk[chunk].l_array;
			bcopy(la->la_array, buf, ZAP_LEAF_ARRAY_BYTES);
			buf += ZAP_LEAF_ARRAY_BYTES;
			chunk = la->la_next;
		}
		return;
	}

	/* General path: accumulate bytes MSB-first into each output int */
	while (len > 0) {
		struct zap_leaf_array *la = &l->l_phys->l_chunk[chunk].l_array;
		int i;

		ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS);
		for (i = 0; i < ZAP_LEAF_ARRAY_BYTES && len > 0; i++) {
			value = (value << 8) | la->la_array[i];
			byten++;
			if (byten == array_int_len) {
				stv(buf_int_len, buf, value);
				byten = 0;
				len--;
				if (len == 0)
					return;
				buf += buf_int_len;
			}
		}
		chunk = la->la_next;
	}
}
359 
/*
 * Compare the contents of an array chunk chain against buf.
 * Only to be used on 8-bit arrays.
 * array_len is actual len in bytes (not encoded le_value_length).
 * buf is null-terminated.
 * Returns nonzero iff all array_len bytes match.
 */
static int
zap_leaf_array_equal(const zap_entry_handle_t *zeh, int chunk,
    int array_len, const char *buf)
{
	int bseen = 0;
	zap_leaf_t *l = zeh->zeh_found_leaf;

	while (bseen < array_len) {
		struct zap_leaf_array *la = &l->l_phys->l_chunk[chunk].l_array;
		int toread = MIN(array_len - bseen, ZAP_LEAF_ARRAY_BYTES);
		ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS);
		if (bcmp(la->la_array, buf + bseen, toread))
			break;	/* mismatch: bseen < array_len on exit */
		chunk = la->la_next;
		bseen += toread;
	}
	return (bseen == array_len);
}
383 
384 /*
385  * Routines which manipulate leaf entries.
386  */
387 
/*
 * Look up the entry with the given name and hash in leaf l (and any
 * chained leaves).  On success, fill in *zeh and return 0; return
 * ENOENT if no matching entry exists in the chain.
 */
int
zap_leaf_lookup(zap_leaf_t *l,
    const char *name, uint64_t h, zap_entry_handle_t *zeh)
{
	uint16_t *chunkp;
	struct zap_leaf_entry *le;

	zeh->zeh_head_leaf = l;

again:
	ASSERT3U(l->lh_magic, ==, ZAP_LEAF_MAGIC);

	/* walk the hash bucket's chain of entry chunks */
	for (chunkp = LEAF_HASH_ENTPTR(l, h);
	    *chunkp != CHAIN_END; chunkp = &le->le_next) {
		uint16_t chunk = *chunkp;
		le = &l->l_phys->l_chunk[chunk].l_entry;

		ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS);
		ASSERT3U(le->le_type, ==, ZAP_LEAF_ENTRY);

		if (le->le_hash != h)
			continue;

		/* zeh_found_leaf must be set before the array compare */
		zeh->zeh_found_leaf = l;
		if (zap_leaf_array_equal(zeh, le->le_name_chunk,
		    le->le_name_length, name)) {
			zeh->zeh_num_integers = le->le_value_length;
			zeh->zeh_integer_size = le->le_int_size;
			zeh->zeh_cd = le->le_cd;
			zeh->zeh_hash = le->le_hash;
			zeh->zeh_chunkp = chunkp;
			zeh->zeh_found_leaf = l;
			return (0);
		}
	}

	/* not in this leaf; continue into the chained leaf, if any */
	if (l->l_next) {
		l = l->l_next;
		goto again;
	}

	return (ENOENT);
}
431 
432 /* Return (h1,cd1 >= h2,cd2) */
433 #define	HCD_GTEQ(h1, cd1, h2, cd2) \
434 	((h1 > h2) ? TRUE : ((h1 == h2 && cd1 >= cd2) ? TRUE : FALSE))
435 
/*
 * Find the entry with the smallest (hash, cd) that is >= (h, cd),
 * searching leaf l and its chained leaves.  Used for ordered cursor
 * iteration.  Fills in *zeh (with a fake chunk pointer, since the
 * entry may live in a different bucket chain) and returns 0, or
 * ENOENT if no entry is >= (h, cd).
 */
int
zap_leaf_lookup_closest(zap_leaf_t *l,
    uint64_t h, uint32_t cd, zap_entry_handle_t *zeh)
{
	uint16_t chunk;
	uint64_t besth = -1ULL;
	uint32_t bestcd = ZAP_MAXCD;
	uint16_t bestlh = ZAP_LEAF_HASH_NUMENTRIES-1;
	uint16_t lh;
	struct zap_leaf_entry *le;

	zeh->zeh_head_leaf = l;

again:
	ASSERT3U(l->lh_magic, ==, ZAP_LEAF_MAGIC);

	/*
	 * Buckets are ordered by hash, so once a candidate is found we
	 * only need to scan buckets up to the candidate's bucket.
	 */
	for (lh = LEAF_HASH(l, h); lh <= bestlh; lh++) {
		for (chunk = l->l_phys->l_hash[lh];
		    chunk != CHAIN_END; chunk = le->le_next) {
			le = &l->l_phys->l_chunk[chunk].l_entry;

			ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS);
			ASSERT3U(le->le_type, ==, ZAP_LEAF_ENTRY);

			/* entry >= (h,cd) and entry <= current best? */
			if (HCD_GTEQ(le->le_hash, le->le_cd, h, cd) &&
			    HCD_GTEQ(besth, bestcd, le->le_hash, le->le_cd)) {
				ASSERT3U(bestlh, >=, lh);
				bestlh = lh;
				besth = le->le_hash;
				bestcd = le->le_cd;

				zeh->zeh_num_integers = le->le_value_length;
				zeh->zeh_integer_size = le->le_int_size;
				zeh->zeh_cd = le->le_cd;
				zeh->zeh_hash = le->le_hash;
				/*
				 * The entry was not found via its bucket
				 * chain, so hand back a fake chunk pointer.
				 */
				zeh->zeh_fakechunk = chunk;
				zeh->zeh_chunkp = &zeh->zeh_fakechunk;
				zeh->zeh_found_leaf = l;
			}
		}
	}

	/* also consider entries in chained leaves */
	if (l->l_next) {
		l = l->l_next;
		goto again;
	}

	return (bestcd == ZAP_MAXCD ? ENOENT : 0);
}
485 
/*
 * Read the entry's value into buf as num_integers integers of
 * integer_size bytes each.  Returns EINVAL if the stored integers are
 * wider than integer_size, EOVERFLOW if the value has more integers
 * than buf can hold (buf still receives the first num_integers of
 * them), or 0 on success.
 */
int
zap_entry_read(const zap_entry_handle_t *zeh,
    uint8_t integer_size, uint64_t num_integers, void *buf)
{
	struct zap_leaf_entry *le;

	le = &zeh->zeh_found_leaf->l_phys->l_chunk[*zeh->zeh_chunkp].l_entry;
	ASSERT3U(le->le_type, ==, ZAP_LEAF_ENTRY);

	if (le->le_int_size > integer_size)
		return (EINVAL);

	/* array_read clamps to the smaller of the value and buf lengths */
	zap_leaf_array_read(zeh, le->le_value_chunk, le->le_int_size,
	    le->le_value_length, integer_size, num_integers, buf);

	if (zeh->zeh_num_integers > num_integers)
		return (EOVERFLOW);
	return (0);

}
506 
507 int
508 zap_entry_read_name(const zap_entry_handle_t *zeh, uint16_t buflen, char *buf)
509 {
510 	struct zap_leaf_entry *le;
511 
512 	le = &zeh->zeh_found_leaf->l_phys->l_chunk[*zeh->zeh_chunkp].l_entry;
513 	ASSERT3U(le->le_type, ==, ZAP_LEAF_ENTRY);
514 
515 	zap_leaf_array_read(zeh, le->le_name_chunk, 1,
516 	    le->le_name_length, 1, buflen, buf);
517 	if (le->le_name_length > buflen)
518 		return (EOVERFLOW);
519 	return (0);
520 }
521 
/*
 * Replace the entry's value with the given integer array.  Returns
 * EAGAIN if this leaf does not have enough free chunks for the new
 * value (the old value is untouched in that case), otherwise 0.
 */
int
zap_entry_update(zap_entry_handle_t *zeh,
	uint8_t integer_size, uint64_t num_integers, const void *buf)
{
	int delta_chunks;
	struct zap_leaf_entry *le;
	le = &zeh->zeh_found_leaf->l_phys->l_chunk[*zeh->zeh_chunkp].l_entry;

	/* net chunk cost of swapping the old value for the new one */
	delta_chunks = NCHUNKS(num_integers * integer_size) -
	    NCHUNKS(le->le_value_length * le->le_int_size);

	if (zeh->zeh_found_leaf->lh_nfree < delta_chunks)
		return (EAGAIN);

	/*
	 * We should search other chained leaves (via
	 * zap_entry_remove,create?) otherwise returning EAGAIN will
	 * just send us into an infinite loop if we have to chain
	 * another leaf block, rather than being able to split this
	 * block.
	 */

	zap_leaf_array_free(zeh, &le->le_value_chunk);
	le->le_value_chunk =
	    zap_leaf_array_create(zeh, buf, integer_size, num_integers);
	/*
	 * le_value_length holds the integer count, or the sentinel
	 * MAX_ARRAY_BYTES+1 when the value is too large to encode.
	 */
	le->le_value_length = (num_integers*integer_size > MAX_ARRAY_BYTES) ?
	    (MAX_ARRAY_BYTES + 1) : (num_integers);
	le->le_int_size = integer_size;
	return (0);
}
552 
/*
 * Remove the entry from its leaf: free its name and value chunk chains,
 * unlink it from its hash bucket, and free the entry chunk itself.
 * Must not be called on a handle from zap_leaf_lookup_closest (whose
 * chunk pointer is fake and cannot be used to unlink).
 */
void
zap_entry_remove(zap_entry_handle_t *zeh)
{
	uint16_t entry_chunk;
	struct zap_leaf_entry *le;
	zap_leaf_t *l = zeh->zeh_found_leaf;

	ASSERT3P(zeh->zeh_chunkp, !=, &zeh->zeh_fakechunk);

	entry_chunk = *zeh->zeh_chunkp;
	le = &l->l_phys->l_chunk[entry_chunk].l_entry;
	ASSERT3U(le->le_type, ==, ZAP_LEAF_ENTRY);

	zap_leaf_array_free(zeh, &le->le_name_chunk);
	zap_leaf_array_free(zeh, &le->le_value_chunk);

	/* unlink from the bucket chain, then free the entry chunk */
	*zeh->zeh_chunkp = le->le_next;
	zap_leaf_chunk_free(l, entry_chunk);

	l->lh_nentries--;
}
574 
/*
 * Create a new entry with the given name, hash, collision differentiator
 * (cd), and value in leaf l's chain.  If cd == ZAP_MAXCD, allocate the
 * lowest cd not already used by an entry with the same hash.  On success
 * fill in *zeh and return 0.  Returns ENAMETOOLONG, E2BIG (entry can
 * never fit in a leaf), ENOSPC (all cd values taken), or EAGAIN (no
 * leaf in the chain has room; caller should split or chain).
 */
int
zap_entry_create(zap_leaf_t *l, const char *name, uint64_t h, uint32_t cd,
    uint8_t integer_size, uint64_t num_integers, const void *buf,
    zap_entry_handle_t *zeh)
{
	uint16_t chunk;
	uint16_t *chunkp;
	struct zap_leaf_entry *le;
	uint64_t namelen, valuelen;
	int numchunks;

	valuelen = integer_size * num_integers;
	namelen = strlen(name) + 1;	/* include the terminating NUL */
	ASSERT(namelen >= 2);

	zeh->zeh_head_leaf = l;

	if (namelen > MAXNAMELEN)
		return (ENAMETOOLONG);
	/* find the first leaf in the chain that has sufficient free space */
	numchunks = 1 + NCHUNKS(namelen) + NCHUNKS(valuelen);
	if (numchunks > ZAP_LEAF_NUMCHUNKS)
		return (E2BIG);

	if (cd == ZAP_MAXCD) {
		/* try each cd until one is unused by same-hash entries */
		for (cd = 0; cd < ZAP_MAXCD; cd++) {
			zap_leaf_t *ll;
			for (ll = l; ll; ll = ll->l_next) {
				for (chunk = *LEAF_HASH_ENTPTR(ll, h);
				    chunk != CHAIN_END; chunk = le->le_next) {
					le = &ll->l_phys->l_chunk
					    [chunk].l_entry;
					if (le->le_hash == h &&
					    le->le_cd == cd) {
						break;
					}
				}
				/*
				 * if this cd is in use, no need to
				 * check more chained leafs
				 */
				if (chunk != CHAIN_END)
					break;
			}
			/* If this cd is not in use, we are good. */
			if (chunk == CHAIN_END)
				break;
		}
		/* If we tried all the cd's, we lose. */
		if (cd == ZAP_MAXCD)
			return (ENOSPC);
	}

	/* pick the first leaf in the chain with room for the whole entry */
	for (; l; l = l->l_next)
		if (l->lh_nfree >= numchunks)
			break;
	if (l == NULL)
		return (EAGAIN);

	zeh->zeh_found_leaf = l;

	/* make the entry */
	chunk = zap_leaf_chunk_alloc(l);
	le = &l->l_phys->l_chunk[chunk].l_entry;
	le->le_type = ZAP_LEAF_ENTRY;
	le->le_name_chunk = zap_leaf_array_create(zeh, name, 1, namelen);
	le->le_name_length = namelen;
	le->le_value_chunk =
	    zap_leaf_array_create(zeh, buf, integer_size, num_integers);
	/*
	 * le_value_length holds the integer count, or the sentinel
	 * MAX_ARRAY_BYTES+1 when the value is too large to encode.
	 */
	le->le_value_length = (num_integers*integer_size > MAX_ARRAY_BYTES) ?
	    (MAX_ARRAY_BYTES + 1) : (num_integers);
	le->le_int_size = integer_size;
	le->le_hash = h;
	le->le_cd = cd;

	/* link it into the hash chain */
	chunkp = LEAF_HASH_ENTPTR(l, h);
	le->le_next = *chunkp;
	*chunkp = chunk;

	l->lh_nentries++;

	zeh->zeh_num_integers = num_integers;
	zeh->zeh_integer_size = le->le_int_size;
	zeh->zeh_cd = le->le_cd;
	zeh->zeh_hash = le->le_hash;
	zeh->zeh_chunkp = chunkp;

	return (0);
}
665 
666 /*
667  * Routines for transferring entries between leafs.
668  */
669 
670 static void
671 zap_leaf_rehash_entry(zap_leaf_t *l, uint16_t entry)
672 {
673 	struct zap_leaf_entry *le = &l->l_phys->l_chunk[entry].l_entry;
674 	uint16_t *ptr = LEAF_HASH_ENTPTR(l, le->le_hash);
675 	le->le_next = *ptr;
676 	*ptr = entry;
677 }
678 
679 static void
680 zap_leaf_rehash_entries(zap_leaf_t *l)
681 {
682 	int i;
683 
684 	if (l->lh_nentries == 0)
685 		return;
686 
687 	/* break existing hash chains */
688 	zap_memset(l->l_phys->l_hash, CHAIN_END, sizeof (l->l_phys->l_hash));
689 
690 	for (i = 0; i < ZAP_LEAF_NUMCHUNKS; i++) {
691 		struct zap_leaf_entry *le = &l->l_phys->l_chunk[i].l_entry;
692 		if (le->le_type != ZAP_LEAF_ENTRY)
693 			continue;
694 		zap_leaf_rehash_entry(l, i);
695 	}
696 }
697 
/*
 * Move an array chunk chain from leaf l to leaf nl, freeing the source
 * chunks as we go.  Returns the head chunk number of the new chain in
 * nl.  The caller must ensure nl has enough free chunks.
 */
static uint16_t
zap_leaf_transfer_array(zap_leaf_t *l, uint16_t chunk, zap_leaf_t *nl)
{
	uint16_t new_chunk;
	uint16_t *nchunkp = &new_chunk;

	while (chunk != CHAIN_END) {
		uint16_t nchunk = zap_leaf_chunk_alloc(nl);
		struct zap_leaf_array *nla =
		    &nl->l_phys->l_chunk[nchunk].l_array;
		struct zap_leaf_array *la =
		    &l->l_phys->l_chunk[chunk].l_array;
		/* save la_next before the source chunk is freed */
		int nextchunk = la->la_next;

		ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS);
		ASSERT3U(nchunk, <, ZAP_LEAF_NUMCHUNKS);

		*nla = *la;

		zap_leaf_chunk_free(l, chunk);
		chunk = nextchunk;
		/* link the copied chunk onto the tail of the new chain */
		*nchunkp = nchunk;
		nchunkp = &nla->la_next;
	}
	*nchunkp = CHAIN_END;
	return (new_chunk);
}
725 
/*
 * Move one entry (chunk index 'entry' in leaf l) into the leaf chain
 * headed by nhl, chaining a new leaf onto nhl if none in the chain has
 * room.  The entry chunk, its name chain, and its value chain are all
 * transferred and the source chunks freed.
 */
static void
zap_leaf_transfer_entry(zap_t *zap, zap_leaf_t *l, int entry, zap_leaf_t *nhl,
    dmu_tx_t *tx)
{
	zap_leaf_t *nl;
	struct zap_leaf_entry *le, *nle;
	uint16_t chunk, nchunks;

	le = &l->l_phys->l_chunk[entry].l_entry;
	ASSERT3U(le->le_type, ==, ZAP_LEAF_ENTRY);

	/* find a leaf in the destination leaf chain with enough free space */
	nchunks = 1 + NCHUNKS(le->le_name_length) +
	    NCHUNKS(le->le_value_length * le->le_int_size);
	for (nl = nhl; nl; nl = nl->l_next)
		if (nl->lh_nfree >= nchunks)
			break;
	if (nl == NULL) {
		/* no room anywhere in the chain; grow it by one leaf */
		nl = zap_leaf_chainmore(nhl, zap_create_leaf(zap, tx));
		dprintf("transfer_entry: chaining leaf %x/%d\n",
		    nl->lh_prefix, nl->lh_prefix_len);
	}

	/* copy the entry chunk itself, then hash it into nl */
	chunk = zap_leaf_chunk_alloc(nl);
	nle = &nl->l_phys->l_chunk[chunk].l_entry;
	*nle = *le;

	zap_leaf_rehash_entry(nl, chunk);

	/* move the name and value chains; updates nle's chunk pointers */
	nle->le_name_chunk = zap_leaf_transfer_array(l, le->le_name_chunk, nl);
	nle->le_value_chunk =
	    zap_leaf_transfer_array(l, le->le_value_chunk, nl);

	zap_leaf_chunk_free(l, entry);

	l->lh_nentries--;
	nl->lh_nentries++;
}
764 
765 /*
766  * Transfer entries whose hash bit 'bit' is 1 to nl1, and 0 to nl0.
767  * Ignore leaf chaining in source (l), but chain in destinations.
768  * We'll re-chain all the entries in l as we go along.
769  */
/*
 * Distribute l's entries between nl0 and nl1 according to hash bit
 * 'bit': entries with the bit set go to nl1, the rest to nl0.  If
 * nl0 == l, those entries stay put and are merely rehashed.  See the
 * block comment above for the chaining rules.
 */
static void
zap_leaf_transfer_entries(zap_t *zap, zap_leaf_t *l,
    zap_leaf_t *nl0, zap_leaf_t *nl1, int bit, dmu_tx_t *tx)
{
	int i;

	ASSERT(bit < 64 && bit >= 0);
	/* break existing hash chains */
	zap_memset(l->l_phys->l_hash, CHAIN_END, sizeof (l->l_phys->l_hash));

	/* rebuild destination hash tables unless they are l itself */
	if (nl0 != l)
		zap_leaf_rehash_entries(nl0);
	if (nl1 != nl0)
		zap_leaf_rehash_entries(nl1);

	for (i = 0; i < ZAP_LEAF_NUMCHUNKS; i++) {
		struct zap_leaf_entry *le = &l->l_phys->l_chunk[i].l_entry;
		if (le->le_type != ZAP_LEAF_ENTRY)
			continue;

		/*
		 * We could find entries via hashtable instead. That
		 * would be O(hashents+numents) rather than
		 * O(numblks+numents), but this accesses memory more
		 * sequentially, and when we're called, the block is
		 * usually pretty full.
		 */

		if (le->le_hash & (1ULL << bit)) {
			zap_leaf_transfer_entry(zap, l, i, nl1, tx);
		} else {
			if (nl0 == l)
				zap_leaf_rehash_entry(l, i);
			else
				zap_leaf_transfer_entry(zap, l, i, nl0, tx);
		}
	}

}
809 
810 /*
811  * nl will contain the entries whose hash prefix ends in 1
812  * handles leaf chaining
813  */
/*
 * Split leaf chain hl in two by extending the hash prefix one bit:
 * entries whose next prefix bit is 1 move to a new leaf (returned);
 * entries with 0 stay in (or are gathered back into) hl.  Any chained
 * leaves hanging off hl are drained into the two leaves and destroyed.
 */
zap_leaf_t *
zap_leaf_split(zap_t *zap, zap_leaf_t *hl, dmu_tx_t *tx)
{
	zap_leaf_t *l = hl;
	/* the hash bit that distinguishes the two new prefixes */
	int bit = 64 - 1 - hl->lh_prefix_len;
	zap_leaf_t *nl = zap_create_leaf(zap, tx);

	/* set new prefix and prefix_len */
	hl->lh_prefix <<= 1;
	hl->lh_prefix_len++;
	nl->lh_prefix = hl->lh_prefix | 1;
	nl->lh_prefix_len = hl->lh_prefix_len;

	/* transfer odd entries from first leaf in hl chain to nl */
	zap_leaf_transfer_entries(zap, hl, hl, nl, bit, tx);

	/* take rest of chain off hl */
	l = hl->l_next;
	hl->l_next = NULL;
	hl->lh_next = 0;

	/* transfer even entries from hl chain back to hl, odd entries to nl */
	while (l) {
		zap_leaf_t *next = l->l_next;
		zap_leaf_transfer_entries(zap, l, hl, nl, bit, tx);
		zap_destroy_leaf(zap, l, tx);
		l = next;
	}

	return (nl);
}
845 
/*
 * Accumulate histogram statistics for leaf l (and its chained leaves)
 * into *zs: pointer fan-out, entries per block, block fullness,
 * chunks per entry, bucket chain lengths, and chain depth.
 */
void
zap_stats_leaf(zap_t *zap, zap_leaf_t *l, zap_stats_t *zs)
{
	int n, nchained = 0;

	/* how many pointer-table entries point at this leaf (log2) */
	n = zap->zap_f.zap_phys->zap_ptrtbl.zt_shift - l->lh_prefix_len;
	n = MIN(n, ZAP_HISTOGRAM_SIZE-1);
	zs->zs_leafs_with_2n_pointers[n]++;

	do {
		int i;

		/* entries per block, bucketed in groups of 5 */
		n = l->lh_nentries/5;
		n = MIN(n, ZAP_HISTOGRAM_SIZE-1);
		zs->zs_blocks_with_n5_entries[n]++;

		/* block fullness in tenths, estimated from free chunks */
		n = ((1<<ZAP_BLOCK_SHIFT) -
		    l->lh_nfree * (ZAP_LEAF_ARRAY_BYTES+1))*10 /
		    (1<<ZAP_BLOCK_SHIFT);
		n = MIN(n, ZAP_HISTOGRAM_SIZE-1);
		zs->zs_blocks_n_tenths_full[n]++;

		for (i = 0; i < ZAP_LEAF_HASH_NUMENTRIES; i++) {
			int nentries = 0;
			int chunk = l->l_phys->l_hash[i];

			/* walk this bucket's chain of entries */
			while (chunk != CHAIN_END) {
				struct zap_leaf_entry *le =
				    &l->l_phys->l_chunk[chunk].l_entry;

				/* chunks consumed by this entry */
				n = 1 + NCHUNKS(le->le_name_length) +
				    NCHUNKS(le->le_value_length *
					le->le_int_size);
				n = MIN(n, ZAP_HISTOGRAM_SIZE-1);
				zs->zs_entries_using_n_chunks[n]++;

				chunk = le->le_next;
				nentries++;
			}

			n = nentries;
			n = MIN(n, ZAP_HISTOGRAM_SIZE-1);
			zs->zs_buckets_with_n_entries[n]++;
		}

		nchained++;
		l = l->l_next;
	} while (l);

	n = nchained-1;
	n = MIN(n, ZAP_HISTOGRAM_SIZE-1);
	zs->zs_leafs_with_n_chained[n]++;
}
899