1/*-
2 * See the file LICENSE for redistribution information.
3 *
4 * Copyright (c) 1996, 1997, 1998
5 *	Sleepycat Software.  All rights reserved.
6 *
7 *	@(#)db_page.h	10.18 (Sleepycat) 12/2/98
8 */
9
10#ifndef _DB_PAGE_H_
11#define	_DB_PAGE_H_
12
13/*
14 * DB page formats.
15 *
16 * This implementation requires that values within the following structures
17 * NOT be padded -- note, ANSI C permits random padding within structures.
18 * If your compiler pads randomly you can just forget ever making DB run on
19 * your system.  In addition, no data type can require larger alignment than
20 * its own size, e.g., a 4-byte data element may not require 8-byte alignment.
21 *
22 * Note that key/data lengths are often stored in db_indx_t's -- this is
23 * not accidental, nor does it limit the key/data size.  If the key/data
24 * item fits on a page, it's guaranteed to be small enough to fit into a
25 * db_indx_t, and storing it in one saves space.
26 */
27
28#define	PGNO_METADATA	0	/* Metadata page number. */
29#define	PGNO_INVALID	0	/* Metadata page number, therefore illegal. */
30#define	PGNO_ROOT	1	/* Root is page #1. */
31
32/*
33 * When we create pages in mpool, we ask mpool to clear some number of bytes
34 * in the header.  This number must be at least as big as the regular page
35 * headers and cover enough of the btree and hash meta-data pages to obliterate
36 * the magic and version numbers.
37 */
38#define	DB_PAGE_CLEAR_LEN	32
39
40/************************************************************************
41 BTREE METADATA PAGE LAYOUT
42 ************************************************************************/
43
44/*
45 * Btree metadata page layout:
46 */
47typedef struct _btmeta {
48	DB_LSN	  lsn;		/* 00-07: LSN. */
49	db_pgno_t pgno;		/* 08-11: Current page number. */
50	u_int32_t magic;	/* 12-15: Magic number. */
51	u_int32_t version;	/* 16-19: Version. */
52	u_int32_t pagesize;	/* 20-23: Pagesize. */
53	u_int32_t maxkey;	/* 24-27: Btree: Maxkey. */
54	u_int32_t minkey;	/* 28-31: Btree: Minkey. */
55	u_int32_t free;		/* 32-35: Free list page number. */
56#define	BTM_DUP		0x001	/* 	  Duplicates. */
57#define	BTM_RECNO	0x002	/*	  Recno tree. */
58#define	BTM_RECNUM	0x004	/*	  Btree: maintain record count. */
59#define	BTM_FIXEDLEN	0x008	/*	  Recno: fixed length records. */
60#define	BTM_RENUMBER	0x010	/*	  Recno: renumber on insert/delete. */
61#define	BTM_MASK	0x01f
62	u_int32_t flags;	/* 36-39: Flags. */
63	u_int32_t re_len;	/* 40-43: Recno: fixed-length record length. */
64	u_int32_t re_pad;	/* 44-47: Recno: fixed-length record pad. */
65				/* 48-67: Unique file ID. */
66	u_int8_t  uid[DB_FILE_ID_LEN];
67} BTMETA;
68
69/************************************************************************
70 HASH METADATA PAGE LAYOUT
71 ************************************************************************/
72
73/*
74 * Hash metadata page layout:
75 */
76/* Hash Table Information */
77typedef struct hashhdr {	/* Disk resident portion */
78	DB_LSN	lsn;		/* 00-07: LSN of the header page */
79	db_pgno_t pgno;		/* 08-11: Page number (btree compatibility). */
80	u_int32_t magic;	/* 12-15: Magic NO for hash tables */
81	u_int32_t version;	/* 16-19: Version ID */
82	u_int32_t pagesize;	/* 20-23: Bucket/Page Size */
83	u_int32_t ovfl_point;	/* 24-27: Overflow page allocation location */
84	u_int32_t last_freed;	/* 28-31: Last freed overflow page pgno */
85	u_int32_t max_bucket;	/* 32-35: ID of Maximum bucket in use */
86	u_int32_t high_mask;	/* 36-39: Modulo mask into table */
87	u_int32_t low_mask;	/* 40-43: Modulo mask into table lower half */
88	u_int32_t ffactor;	/* 44-47: Fill factor */
89	u_int32_t nelem;	/* 48-51: Number of keys in hash table */
90	u_int32_t h_charkey;	/* 52-55: Value of hash(CHARKEY) */
91#define	DB_HASH_DUP	0x01
92	u_int32_t flags;	/* 56-59: Allow duplicates. */
93#define NCACHED	32		/* number of spare points */
94				/* 60-187: Spare pages for overflow */
95	u_int32_t spares[NCACHED];
96				/* 188-207: Unique file ID. */
97	u_int8_t  uid[DB_FILE_ID_LEN];
98
99	/*
100	 * Minimum page size is 256.
101	 */
102} HASHHDR;
103
104/************************************************************************
105 MAIN PAGE LAYOUT
106 ************************************************************************/
107
108/*
109 *	+-----------------------------------+
110 *	|    lsn    |   pgno    | prev pgno |
111 *	+-----------------------------------+
112 *	| next pgno |  entries  | hf offset |
113 *	+-----------------------------------+
114 *	|   level   |   type    |   index   |
115 *	+-----------------------------------+
116 *	|   index   | free -->              |
117 *	+-----------+-----------------------+
118 *	|   	 F R E E A R E A            |
119 *	+-----------------------------------+
120 *	|              <-- free |   item    |
121 *	+-----------------------------------+
122 *	|   item    |   item    |   item    |
123 *	+-----------------------------------+
124 *
125 * sizeof(PAGE) == 26 bytes, and the following indices are guaranteed to be
126 * two-byte aligned.
127 *
128 * For hash and btree leaf pages, index items are paired, e.g., inp[0] is the
129 * key for inp[1]'s data.  All other types of pages only contain single items.
130 */
131typedef struct _db_page {
132	DB_LSN	  lsn;		/* 00-07: Log sequence number. */
133	db_pgno_t pgno;		/* 08-11: Current page number. */
134	db_pgno_t prev_pgno;	/* 12-15: Previous page number. */
135	db_pgno_t next_pgno;	/* 16-19: Next page number. */
136	db_indx_t entries;	/* 20-21: Number of item pairs on the page. */
137	db_indx_t hf_offset;	/* 22-23: High free byte page offset. */
138
139	/*
140	 * The btree levels are numbered from the leaf to the root, starting
141	 * with 1, so the leaf is level 1, its parent is level 2, and so on.
142	 * We maintain this level on all btree pages, but the only place that
143	 * we actually need it is on the root page.  It would not be difficult
144	 * to hide the byte on the root page once it becomes an internal page,
145	 * so we could get this byte back if we needed it for something else.
146	 */
147#define	LEAFLEVEL	  1
148#define	MAXBTREELEVEL	255
149	u_int8_t  level;	/*    24: Btree tree level. */
150
151#define	P_INVALID	0	/*	  Invalid page type. */
152#define	P_DUPLICATE	1	/*        Duplicate. */
153#define	P_HASH		2	/*        Hash. */
154#define	P_IBTREE	3	/*        Btree internal. */
155#define	P_IRECNO	4	/*        Recno internal. */
156#define	P_LBTREE	5	/*        Btree leaf. */
157#define	P_LRECNO	6	/*        Recno leaf. */
158#define	P_OVERFLOW	7	/*        Overflow. */
159	u_int8_t  type;		/*    25: Page type. */
160	db_indx_t inp[1];	/* Variable length index of items. */
161} PAGE;
162
163/* Element macros. */
164#define	LSN(p)		(((PAGE *)p)->lsn)
165#define	PGNO(p)		(((PAGE *)p)->pgno)
166#define	PREV_PGNO(p)	(((PAGE *)p)->prev_pgno)
167#define	NEXT_PGNO(p)	(((PAGE *)p)->next_pgno)
168#define	NUM_ENT(p)	(((PAGE *)p)->entries)
169#define	HOFFSET(p)	(((PAGE *)p)->hf_offset)
170#define	LEVEL(p)	(((PAGE *)p)->level)
171#define	TYPE(p)		(((PAGE *)p)->type)
172
173/*
174 * !!!
175 * The next_pgno and prev_pgno fields are not maintained for btree and recno
176 * internal pages.  It's a minor performance improvement, and more, it's
177 * hard to do when deleting internal pages, and it decreases the chance of
178 * deadlock during deletes and splits.
179 *
180 * !!!
181 * The btree/recno access method needs db_recno_t bytes of space on the root
182 * page to specify how many records are stored in the tree.  (The alternative
183 * is to store the number of records in the meta-data page, which will create
184 * a second hot spot in trees being actively modified, or recalculate it from
185 * the BINTERNAL fields on each access.)  Overload the prev_pgno field.
186 */
187#define	RE_NREC(p)							\
188	(TYPE(p) == P_LBTREE ? NUM_ENT(p) / 2 :				\
189	    TYPE(p) == P_LRECNO ? NUM_ENT(p) : PREV_PGNO(p))
190#define	RE_NREC_ADJ(p, adj)						\
191	PREV_PGNO(p) += adj;
192#define	RE_NREC_SET(p, num)						\
193	PREV_PGNO(p) = num;
194
195/*
196 * Initialize a page.
197 *
198 * !!!
199 * Don't modify the page's LSN, code depends on it being unchanged after a
200 * P_INIT call.
201 */
202#define	P_INIT(pg, pg_size, n, pg_prev, pg_next, btl, pg_type) do {	\
203	PGNO(pg) = n;							\
204	PREV_PGNO(pg) = pg_prev;					\
205	NEXT_PGNO(pg) = pg_next;					\
206	NUM_ENT(pg) = 0;						\
207	HOFFSET(pg) = pg_size;						\
208	LEVEL(pg) = btl;						\
209	TYPE(pg) = pg_type;						\
210} while (0)
211
212/* Page header length (offset to first index). */
213#define P_OVERHEAD		(SSZA(PAGE, inp))
214
215/* First free byte. */
216#define	LOFFSET(pg)		(P_OVERHEAD + NUM_ENT(pg) * sizeof(db_indx_t))
217
218/* Free space on the page. */
219#define	P_FREESPACE(pg)		(HOFFSET(pg) - LOFFSET(pg))
220
221/* Get a pointer to the bytes at a specific index. */
222#define	P_ENTRY(pg, indx)	((u_int8_t *)pg + ((PAGE *)pg)->inp[indx])
223
224/************************************************************************
225 OVERFLOW PAGE LAYOUT
226 ************************************************************************/
227
228/*
229 * Overflow items are referenced by HOFFPAGE and BOVERFLOW structures, which
230 * store a page number (the first page of the overflow item) and a length
231 * (the total length of the overflow item).  The overflow item consists of
232 * some number of overflow pages, linked by the next_pgno field of the page.
233 * A next_pgno field of PGNO_INVALID flags the end of the overflow item.
234 *
235 * Overflow page overloads:
236 *	The amount of overflow data stored on each page is stored in the
237 *	hf_offset field.
238 *
239 *	The implementation reference counts overflow items as it's possible
240 *	for them to be promoted onto btree internal pages.  The reference
241 *	count is stored in the entries field.
242 */
243#define	OV_LEN(p)	(((PAGE *)p)->hf_offset)
244#define	OV_REF(p)	(((PAGE *)p)->entries)
245
246/* Maximum number of bytes that you can put on an overflow page. */
247#define	P_MAXSPACE(psize)	((psize) - P_OVERHEAD)
248
249/************************************************************************
250 HASH PAGE LAYOUT
251 ************************************************************************/
252
253/* Each index references a group of bytes on the page. */
254#define	H_KEYDATA	1	/* Key/data item. */
255#define	H_DUPLICATE	2	/* Duplicate key/data item. */
256#define	H_OFFPAGE	3	/* Overflow key/data item. */
257#define	H_OFFDUP	4	/* Overflow page of duplicates. */
258
259/*
260 * !!!
261 * Items on hash pages are (potentially) unaligned, so we can never cast the
262 * (page + offset) pointer to an HKEYDATA, HOFFPAGE or HOFFDUP structure, as
263 * we do with B+tree on-page structures.  Because we frequently want the type
264 * field, it requires no alignment, and it's in the same location in all three
265 * structures, there's a pair of macros.
266 */
267#define	HPAGE_PTYPE(p)		(*(u_int8_t *)p)
268#define	HPAGE_TYPE(pg, indx)	(*P_ENTRY(pg, indx))
269
270/*
271 * The first and second types are H_KEYDATA and H_DUPLICATE, represented
272 * by the HKEYDATA structure:
273 *
274 *	+-----------------------------------+
275 *	|    type   | key/data ...          |
276 *	+-----------------------------------+
277 *
278 * For duplicates, the data field encodes duplicate elements in the data
279 * field:
280 *
281 *	+---------------------------------------------------------------+
282 *	|    type   | len1 | element1 | len1 | len2 | element2 | len2   |
283 *	+---------------------------------------------------------------+
284 *
285 * Thus, by keeping track of the offset in the element, we can do both
286 * backward and forward traversal.
287 */
288typedef struct _hkeydata {
289	u_int8_t  type;		/*    00: Page type. */
290	u_int8_t  data[1];	/* Variable length key/data item. */
291} HKEYDATA;
292#define	HKEYDATA_DATA(p)	(((u_int8_t *)p) + SSZA(HKEYDATA, data))
293
294/*
295 * The length of any HKEYDATA item. Note that indx is an element index,
296 * not a PAIR index.
297 */
298#define	LEN_HITEM(pg, pgsize, indx)					\
299	(((indx) == 0 ? pgsize : pg->inp[indx - 1]) - pg->inp[indx])
300
301#define	LEN_HKEYDATA(pg, psize, indx)					\
302	(((indx) == 0 ? psize : pg->inp[indx - 1]) -			\
303	pg->inp[indx] - HKEYDATA_SIZE(0))
304
305/*
306 * Page space required to add a new HKEYDATA item to the page, with and
307 * without the index value.
308 */
309#define	HKEYDATA_SIZE(len)						\
310	((len) + SSZA(HKEYDATA, data))
311#define	HKEYDATA_PSIZE(len)						\
312	(HKEYDATA_SIZE(len) + sizeof(db_indx_t))
313
314/* Put a HKEYDATA item at the location referenced by a page entry. */
315#define	PUT_HKEYDATA(pe, kd, len, type) {				\
316	((HKEYDATA *)pe)->type = type;					\
317	memcpy((u_int8_t *)pe + sizeof(u_int8_t), kd, len);		\
318}
319
320/*
321 * Macros the describe the page layout in terms of key-data pairs.
322 * The use of "pindex" indicates that the argument is the index
323 * expressed in pairs instead of individual elements.
324 */
325#define H_NUMPAIRS(pg)			(NUM_ENT(pg) / 2)
326#define	H_KEYINDEX(pindx)		(2 * (pindx))
327#define	H_DATAINDEX(pindx)		((2 * (pindx)) + 1)
328#define	H_PAIRKEY(pg, pindx)		P_ENTRY(pg, H_KEYINDEX(pindx))
329#define	H_PAIRDATA(pg, pindx)		P_ENTRY(pg, H_DATAINDEX(pindx))
330#define H_PAIRSIZE(pg, psize, pindx)					\
331	(LEN_HITEM(pg, psize, H_KEYINDEX(pindx)) +			\
332	LEN_HITEM(pg, psize, H_DATAINDEX(pindx)))
333#define LEN_HDATA(p, psize, pindx) LEN_HKEYDATA(p, psize, H_DATAINDEX(pindx))
334#define LEN_HKEY(p, psize, pindx) LEN_HKEYDATA(p, psize, H_KEYINDEX(pindx))
335
336/*
337 * The third type is the H_OFFPAGE, represented by the HOFFPAGE structure:
338 */
339typedef struct _hoffpage {
340	u_int8_t  type;		/*    00: Page type and delete flag. */
341	u_int8_t  unused[3];	/* 01-03: Padding, unused. */
342	db_pgno_t pgno;		/* 04-07: Offpage page number. */
343	u_int32_t tlen;		/* 08-11: Total length of item. */
344} HOFFPAGE;
345
346#define	HOFFPAGE_PGNO(p)	(((u_int8_t *)p) + SSZ(HOFFPAGE, pgno))
347#define	HOFFPAGE_TLEN(p)	(((u_int8_t *)p) + SSZ(HOFFPAGE, tlen))
348
349/*
350 * Page space required to add a new HOFFPAGE item to the page, with and
351 * without the index value.
352 */
353#define	HOFFPAGE_SIZE		(sizeof(HOFFPAGE))
354#define	HOFFPAGE_PSIZE		(HOFFPAGE_SIZE + sizeof(db_indx_t))
355
356/*
357 * The fourth type is H_OFFDUP represented by the HOFFDUP structure:
358 */
359typedef struct _hoffdup {
360	u_int8_t  type;		/*    00: Page type and delete flag. */
361	u_int8_t  unused[3];	/* 01-03: Padding, unused. */
362	db_pgno_t pgno;		/* 04-07: Offpage page number. */
363} HOFFDUP;
364#define	HOFFDUP_PGNO(p)		(((u_int8_t *)p) + SSZ(HOFFDUP, pgno))
365
366/*
367 * Page space required to add a new HOFFDUP item to the page, with and
368 * without the index value.
369 */
370#define	HOFFDUP_SIZE		(sizeof(HOFFDUP))
371#define	HOFFDUP_PSIZE		(HOFFDUP_SIZE + sizeof(db_indx_t))
372
373/************************************************************************
374 BTREE PAGE LAYOUT
375 ************************************************************************/
376
377/* Each index references a group of bytes on the page. */
378#define	B_KEYDATA	1	/* Key/data item. */
379#define	B_DUPLICATE	2	/* Duplicate key/data item. */
380#define	B_OVERFLOW	3	/* Overflow key/data item. */
381
382/*
383 * We have to store a deleted entry flag in the page.   The reason is complex,
384 * but the simple version is that we can't delete on-page items referenced by
385 * a cursor -- the return order of subsequent insertions might be wrong.  The
386 * delete flag is an overload of the top bit of the type byte.
387 */
388#define	B_DELETE	(0x80)
389#define	B_DCLR(t)	(t) &= ~B_DELETE
390#define	B_DSET(t)	(t) |= B_DELETE
391#define	B_DISSET(t)	((t) & B_DELETE)
392
393#define	B_TYPE(t)	((t) & ~B_DELETE)
394#define	B_TSET(t, type, deleted) {					\
395	(t) = (type);							\
396	if (deleted)							\
397		B_DSET(t);						\
398}
399
400/*
401 * The first type is B_KEYDATA, represented by the BKEYDATA structure:
402 */
403typedef struct _bkeydata {
404	db_indx_t len;		/* 00-01: Key/data item length. */
405	u_int8_t  type;		/*    02: Page type AND DELETE FLAG. */
406	u_int8_t  data[1];	/* Variable length key/data item. */
407} BKEYDATA;
408
409/* Get a BKEYDATA item for a specific index. */
410#define	GET_BKEYDATA(pg, indx)						\
411	((BKEYDATA *)P_ENTRY(pg, indx))
412
413/*
414 * Page space required to add a new BKEYDATA item to the page, with and
415 * without the index value.
416 */
417#define	BKEYDATA_SIZE(len)						\
418	ALIGN((len) + SSZA(BKEYDATA, data), 4)
419#define	BKEYDATA_PSIZE(len)						\
420	(BKEYDATA_SIZE(len) + sizeof(db_indx_t))
421
422/*
423 * The second and third types are B_DUPLICATE and B_OVERFLOW, represented
424 * by the BOVERFLOW structure.
425 */
426typedef struct _boverflow {
427	db_indx_t unused1;	/* 00-01: Padding, unused. */
428	u_int8_t  type;		/*    02: Page type AND DELETE FLAG. */
429	u_int8_t  unused2;	/*    03: Padding, unused. */
430	db_pgno_t pgno;		/* 04-07: Next page number. */
431	u_int32_t tlen;		/* 08-11: Total length of item. */
432} BOVERFLOW;
433
434/* Get a BOVERFLOW item for a specific index. */
435#define	GET_BOVERFLOW(pg, indx)						\
436	((BOVERFLOW *)P_ENTRY(pg, indx))
437
438/*
439 * Page space required to add a new BOVERFLOW item to the page, with and
440 * without the index value.
441 */
442#define	BOVERFLOW_SIZE							\
443	ALIGN(sizeof(BOVERFLOW), 4)
444#define	BOVERFLOW_PSIZE							\
445	(BOVERFLOW_SIZE + sizeof(db_indx_t))
446
447/*
448 * Btree leaf and hash page layouts group indices in sets of two, one
449 * for the key and one for the data.  Everything else does it in sets
450 * of one to save space.  I use the following macros so that it's real
451 * obvious what's going on...
452 */
453#define	O_INDX	1
454#define	P_INDX	2
455
456/************************************************************************
457 BTREE INTERNAL PAGE LAYOUT
458 ************************************************************************/
459
460/*
461 * Btree internal entry.
462 */
463typedef struct _binternal {
464	db_indx_t  len;		/* 00-01: Key/data item length. */
465	u_int8_t   type;	/*    02: Page type AND DELETE FLAG. */
466	u_int8_t   unused;	/*    03: Padding, unused. */
467	db_pgno_t  pgno;	/* 04-07: Page number of referenced page. */
468	db_recno_t nrecs;	/* 08-11: Subtree record count. */
469	u_int8_t   data[1];	/* Variable length key item. */
470} BINTERNAL;
471
472/* Get a BINTERNAL item for a specific index. */
473#define	GET_BINTERNAL(pg, indx)						\
474	((BINTERNAL *)P_ENTRY(pg, indx))
475
476/*
477 * Page space required to add a new BINTERNAL item to the page, with and
478 * without the index value.
479 */
480#define	BINTERNAL_SIZE(len)						\
481	ALIGN((len) + SSZA(BINTERNAL, data), 4)
482#define	BINTERNAL_PSIZE(len)						\
483	(BINTERNAL_SIZE(len) + sizeof(db_indx_t))
484
485/************************************************************************
486 RECNO INTERNAL PAGE LAYOUT
487 ************************************************************************/
488
489/*
490 * The recno internal entry.
491 *
492 * XXX
493 * Why not fold this into the db_indx_t structure, it's fixed length?
494 */
495typedef struct _rinternal {
496	db_pgno_t  pgno;	/* 00-03: Page number of referenced page. */
497	db_recno_t nrecs;	/* 04-07: Subtree record count. */
498} RINTERNAL;
499
500/* Get a RINTERNAL item for a specific index. */
501#define	GET_RINTERNAL(pg, indx)						\
502	((RINTERNAL *)P_ENTRY(pg, indx))
503
504/*
505 * Page space required to add a new RINTERNAL item to the page, with and
506 * without the index value.
507 */
508#define	RINTERNAL_SIZE							\
509	ALIGN(sizeof(RINTERNAL), 4)
510#define	RINTERNAL_PSIZE							\
511	(RINTERNAL_SIZE + sizeof(db_indx_t))
512#endif /* _DB_PAGE_H_ */
513