1*7c478bd9Sstevel@tonic-gate /*-
2*7c478bd9Sstevel@tonic-gate  * See the file LICENSE for redistribution information.
3*7c478bd9Sstevel@tonic-gate  *
4*7c478bd9Sstevel@tonic-gate  * Copyright (c) 1996, 1997, 1998
5*7c478bd9Sstevel@tonic-gate  *	Sleepycat Software.  All rights reserved.
6*7c478bd9Sstevel@tonic-gate  */
7*7c478bd9Sstevel@tonic-gate /*
8*7c478bd9Sstevel@tonic-gate  * Copyright (c) 1990, 1993, 1994, 1995, 1996
9*7c478bd9Sstevel@tonic-gate  *	Keith Bostic.  All rights reserved.
10*7c478bd9Sstevel@tonic-gate  */
11*7c478bd9Sstevel@tonic-gate /*
12*7c478bd9Sstevel@tonic-gate  * Copyright (c) 1990, 1993, 1994, 1995
13*7c478bd9Sstevel@tonic-gate  *	The Regents of the University of California.  All rights reserved.
14*7c478bd9Sstevel@tonic-gate  *
15*7c478bd9Sstevel@tonic-gate  * This code is derived from software contributed to Berkeley by
16*7c478bd9Sstevel@tonic-gate  * Mike Olson.
17*7c478bd9Sstevel@tonic-gate  *
18*7c478bd9Sstevel@tonic-gate  * Redistribution and use in source and binary forms, with or without
19*7c478bd9Sstevel@tonic-gate  * modification, are permitted provided that the following conditions
20*7c478bd9Sstevel@tonic-gate  * are met:
21*7c478bd9Sstevel@tonic-gate  * 1. Redistributions of source code must retain the above copyright
22*7c478bd9Sstevel@tonic-gate  *    notice, this list of conditions and the following disclaimer.
23*7c478bd9Sstevel@tonic-gate  * 2. Redistributions in binary form must reproduce the above copyright
24*7c478bd9Sstevel@tonic-gate  *    notice, this list of conditions and the following disclaimer in the
25*7c478bd9Sstevel@tonic-gate  *    documentation and/or other materials provided with the distribution.
26*7c478bd9Sstevel@tonic-gate  * 3. All advertising materials mentioning features or use of this software
27*7c478bd9Sstevel@tonic-gate  *    must display the following acknowledgement:
28*7c478bd9Sstevel@tonic-gate  *	This product includes software developed by the University of
29*7c478bd9Sstevel@tonic-gate  *	California, Berkeley and its contributors.
30*7c478bd9Sstevel@tonic-gate  * 4. Neither the name of the University nor the names of its contributors
31*7c478bd9Sstevel@tonic-gate  *    may be used to endorse or promote products derived from this software
32*7c478bd9Sstevel@tonic-gate  *    without specific prior written permission.
33*7c478bd9Sstevel@tonic-gate  *
34*7c478bd9Sstevel@tonic-gate  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
35*7c478bd9Sstevel@tonic-gate  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
36*7c478bd9Sstevel@tonic-gate  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
37*7c478bd9Sstevel@tonic-gate  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
38*7c478bd9Sstevel@tonic-gate  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
39*7c478bd9Sstevel@tonic-gate  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
40*7c478bd9Sstevel@tonic-gate  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
41*7c478bd9Sstevel@tonic-gate  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
42*7c478bd9Sstevel@tonic-gate  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
43*7c478bd9Sstevel@tonic-gate  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
44*7c478bd9Sstevel@tonic-gate  * SUCH DAMAGE.
45*7c478bd9Sstevel@tonic-gate  *
46*7c478bd9Sstevel@tonic-gate  *	@(#)btree.h	10.26 (Sleepycat) 12/16/98
47*7c478bd9Sstevel@tonic-gate  */
48*7c478bd9Sstevel@tonic-gate 
49*7c478bd9Sstevel@tonic-gate /* Forward structure declarations. */
50*7c478bd9Sstevel@tonic-gate struct __btree;		typedef struct __btree BTREE;
51*7c478bd9Sstevel@tonic-gate struct __cursor;	typedef struct __cursor CURSOR;
52*7c478bd9Sstevel@tonic-gate struct __epg;		typedef struct __epg EPG;
53*7c478bd9Sstevel@tonic-gate struct __recno;		typedef struct __recno RECNO;
54*7c478bd9Sstevel@tonic-gate 
55*7c478bd9Sstevel@tonic-gate #define	DEFMINKEYPAGE	 (2)
56*7c478bd9Sstevel@tonic-gate 
57*7c478bd9Sstevel@tonic-gate #define	ISINTERNAL(p)	(TYPE(p) == P_IBTREE || TYPE(p) == P_IRECNO)
58*7c478bd9Sstevel@tonic-gate #define	ISLEAF(p)	(TYPE(p) == P_LBTREE || TYPE(p) == P_LRECNO)
59*7c478bd9Sstevel@tonic-gate 
60*7c478bd9Sstevel@tonic-gate /*
61*7c478bd9Sstevel@tonic-gate  * If doing transactions we have to hold the locks associated with a data item
62*7c478bd9Sstevel@tonic-gate  * from a page for the entire transaction.  However, we don't have to hold the
63*7c478bd9Sstevel@tonic-gate  * locks associated with walking the tree.  Distinguish between the two so that
64*7c478bd9Sstevel@tonic-gate  * we don't tie up the internal pages of the tree longer than necessary.
65*7c478bd9Sstevel@tonic-gate  */
66*7c478bd9Sstevel@tonic-gate #define	__BT_LPUT(dbc, lock)						\
67*7c478bd9Sstevel@tonic-gate 	(F_ISSET((dbc)->dbp, DB_AM_LOCKING) ?				\
68*7c478bd9Sstevel@tonic-gate 	    lock_put((dbc)->dbp->dbenv->lk_info, lock) : 0)
69*7c478bd9Sstevel@tonic-gate #define	__BT_TLPUT(dbc, lock)						\
70*7c478bd9Sstevel@tonic-gate 	(F_ISSET((dbc)->dbp, DB_AM_LOCKING) && (dbc)->txn == NULL ?	\
71*7c478bd9Sstevel@tonic-gate 	    lock_put((dbc)->dbp->dbenv->lk_info, lock) : 0)
72*7c478bd9Sstevel@tonic-gate 
73*7c478bd9Sstevel@tonic-gate /*
74*7c478bd9Sstevel@tonic-gate  * Flags to __bam_search() and __bam_rsearch().
75*7c478bd9Sstevel@tonic-gate  *
76*7c478bd9Sstevel@tonic-gate  * Note, internal page searches must find the largest record less than key in
77*7c478bd9Sstevel@tonic-gate  * the tree so that descents work.  Leaf page searches must find the smallest
78*7c478bd9Sstevel@tonic-gate  * record greater than key so that the returned index is the record's correct
79*7c478bd9Sstevel@tonic-gate  * position for insertion.
80*7c478bd9Sstevel@tonic-gate  *
81*7c478bd9Sstevel@tonic-gate  * The flags parameter to the search routines describes three aspects of the
82*7c478bd9Sstevel@tonic-gate  * search: the type of locking required (including if we're locking a pair of
83*7c478bd9Sstevel@tonic-gate  * pages), the item to return in the presence of duplicates and whether or not
84*7c478bd9Sstevel@tonic-gate  * to return deleted entries.  To simplify both the mnemonic representation
85*7c478bd9Sstevel@tonic-gate  * and the code that checks for various cases, we construct a set of bitmasks.
86*7c478bd9Sstevel@tonic-gate  */
87*7c478bd9Sstevel@tonic-gate #define	S_READ		0x00001		/* Read locks. */
88*7c478bd9Sstevel@tonic-gate #define	S_WRITE		0x00002		/* Write locks. */
89*7c478bd9Sstevel@tonic-gate 
90*7c478bd9Sstevel@tonic-gate #define	S_APPEND	0x00040		/* Append to the tree. */
91*7c478bd9Sstevel@tonic-gate #define	S_DELNO		0x00080		/* Don't return deleted items. */
92*7c478bd9Sstevel@tonic-gate #define	S_DUPFIRST	0x00100		/* Return first duplicate. */
93*7c478bd9Sstevel@tonic-gate #define	S_DUPLAST	0x00200		/* Return last duplicate. */
94*7c478bd9Sstevel@tonic-gate #define	S_EXACT		0x00400		/* Exact items only. */
95*7c478bd9Sstevel@tonic-gate #define	S_PARENT	0x00800		/* Lock page pair. */
96*7c478bd9Sstevel@tonic-gate #define	S_STACK		0x01000		/* Need a complete stack. */
97*7c478bd9Sstevel@tonic-gate #define	S_PAST_EOF	0x02000		/* If doing insert search (or keyfirst
98*7c478bd9Sstevel@tonic-gate 					 * or keylast operations), or a split
99*7c478bd9Sstevel@tonic-gate 					 * on behalf of an insert, it's okay to
100*7c478bd9Sstevel@tonic-gate 					 * return an entry one past end-of-page.
101*7c478bd9Sstevel@tonic-gate 					 */
102*7c478bd9Sstevel@tonic-gate 
103*7c478bd9Sstevel@tonic-gate #define	S_DELETE	(S_WRITE | S_DUPFIRST | S_DELNO | S_EXACT | S_STACK)
104*7c478bd9Sstevel@tonic-gate #define	S_FIND		(S_READ | S_DUPFIRST | S_DELNO)
105*7c478bd9Sstevel@tonic-gate #define	S_FIND_WR	(S_WRITE | S_DUPFIRST | S_DELNO)
106*7c478bd9Sstevel@tonic-gate #define	S_INSERT	(S_WRITE | S_DUPLAST | S_PAST_EOF | S_STACK)
107*7c478bd9Sstevel@tonic-gate #define	S_KEYFIRST	(S_WRITE | S_DUPFIRST | S_PAST_EOF | S_STACK)
108*7c478bd9Sstevel@tonic-gate #define	S_KEYLAST	(S_WRITE | S_DUPLAST | S_PAST_EOF | S_STACK)
109*7c478bd9Sstevel@tonic-gate #define	S_WRPAIR	(S_WRITE | S_DUPLAST | S_PAST_EOF | S_PARENT)
110*7c478bd9Sstevel@tonic-gate 
111*7c478bd9Sstevel@tonic-gate /*
112*7c478bd9Sstevel@tonic-gate  * Flags to __bam_iitem().
113*7c478bd9Sstevel@tonic-gate  */
114*7c478bd9Sstevel@tonic-gate #define	BI_DELETED	0x01		/* Key/data pair only placeholder. */
115*7c478bd9Sstevel@tonic-gate #define	BI_DOINCR	0x02		/* Increment the record count. */
116*7c478bd9Sstevel@tonic-gate #define	BI_NEWKEY	0x04		/* New key. */
117*7c478bd9Sstevel@tonic-gate 
118*7c478bd9Sstevel@tonic-gate /*
119*7c478bd9Sstevel@tonic-gate  * Various routines pass around page references.  A page reference can be a
120*7c478bd9Sstevel@tonic-gate  * pointer to the page or a page number; for either, an indx can designate
121*7c478bd9Sstevel@tonic-gate  * an item on the page.
122*7c478bd9Sstevel@tonic-gate  */
123*7c478bd9Sstevel@tonic-gate struct __epg {
124*7c478bd9Sstevel@tonic-gate 	PAGE	 *page;			/* The page. */
125*7c478bd9Sstevel@tonic-gate 	db_indx_t indx;			/* The index on the page. */
126*7c478bd9Sstevel@tonic-gate 	DB_LOCK	  lock;			/* The page's lock. */
127*7c478bd9Sstevel@tonic-gate };
128*7c478bd9Sstevel@tonic-gate 
129*7c478bd9Sstevel@tonic-gate /*
130*7c478bd9Sstevel@tonic-gate  * We maintain a stack of the pages that we're locking in the tree.  Btree's
131*7c478bd9Sstevel@tonic-gate  * (currently) only save two levels of the tree at a time, so the default
132*7c478bd9Sstevel@tonic-gate  * stack is always large enough.  Recno trees have to lock the entire tree to
133*7c478bd9Sstevel@tonic-gate  * do inserts/deletes, however.  Grow the stack as necessary.
134*7c478bd9Sstevel@tonic-gate  */
135*7c478bd9Sstevel@tonic-gate #define	BT_STK_CLR(c)							\
136*7c478bd9Sstevel@tonic-gate 	((c)->csp = (c)->sp)
137*7c478bd9Sstevel@tonic-gate 
138*7c478bd9Sstevel@tonic-gate #define	BT_STK_ENTER(c, pagep, page_indx, lock, ret) do {		\
139*7c478bd9Sstevel@tonic-gate 	if ((ret =							\
140*7c478bd9Sstevel@tonic-gate 	    (c)->csp == (c)->esp ? __bam_stkgrow(c) : 0) == 0) {	\
141*7c478bd9Sstevel@tonic-gate 		(c)->csp->page = pagep;					\
142*7c478bd9Sstevel@tonic-gate 		(c)->csp->indx = page_indx;				\
143*7c478bd9Sstevel@tonic-gate 		(c)->csp->lock = lock;					\
144*7c478bd9Sstevel@tonic-gate 	}								\
145*7c478bd9Sstevel@tonic-gate } while (0)
146*7c478bd9Sstevel@tonic-gate 
147*7c478bd9Sstevel@tonic-gate #define	BT_STK_PUSH(c, pagep, page_indx, lock, ret) do {		\
148*7c478bd9Sstevel@tonic-gate 	BT_STK_ENTER(c, pagep, page_indx, lock, ret);			\
149*7c478bd9Sstevel@tonic-gate 	++(c)->csp;							\
150*7c478bd9Sstevel@tonic-gate } while (0)
151*7c478bd9Sstevel@tonic-gate 
152*7c478bd9Sstevel@tonic-gate #define	BT_STK_POP(c)							\
153*7c478bd9Sstevel@tonic-gate 	((c)->csp == (c)->stack ? NULL : --(c)->csp)
154*7c478bd9Sstevel@tonic-gate 
155*7c478bd9Sstevel@tonic-gate /*
156*7c478bd9Sstevel@tonic-gate  * Arguments passed to __bam_ca_replace().
157*7c478bd9Sstevel@tonic-gate  */
158*7c478bd9Sstevel@tonic-gate typedef enum {
159*7c478bd9Sstevel@tonic-gate 	REPLACE_SETUP,
160*7c478bd9Sstevel@tonic-gate 	REPLACE_SUCCESS,
161*7c478bd9Sstevel@tonic-gate 	REPLACE_FAILED
162*7c478bd9Sstevel@tonic-gate } ca_replace_arg;
163*7c478bd9Sstevel@tonic-gate 
164*7c478bd9Sstevel@tonic-gate /* Arguments passed to __ram_ca(). */
165*7c478bd9Sstevel@tonic-gate typedef enum {
166*7c478bd9Sstevel@tonic-gate 	CA_DELETE,
167*7c478bd9Sstevel@tonic-gate 	CA_IAFTER,
168*7c478bd9Sstevel@tonic-gate 	CA_IBEFORE
169*7c478bd9Sstevel@tonic-gate } ca_recno_arg;
170*7c478bd9Sstevel@tonic-gate 
171*7c478bd9Sstevel@tonic-gate #define	RECNO_OOB	0		/* Illegal record number. */
172*7c478bd9Sstevel@tonic-gate 
173*7c478bd9Sstevel@tonic-gate /* Btree/Recno cursor. */
174*7c478bd9Sstevel@tonic-gate struct __cursor {
175*7c478bd9Sstevel@tonic-gate 	DBC		*dbc;		/* Enclosing DBC. */
176*7c478bd9Sstevel@tonic-gate 
177*7c478bd9Sstevel@tonic-gate 	/* Per-thread information: shared by btree/recno. */
178*7c478bd9Sstevel@tonic-gate 	EPG		*sp;		/* Stack pointer. */
179*7c478bd9Sstevel@tonic-gate 	EPG	 	*csp;		/* Current stack entry. */
180*7c478bd9Sstevel@tonic-gate 	EPG		*esp;		/* End stack pointer. */
181*7c478bd9Sstevel@tonic-gate 	EPG		 stack[5];
182*7c478bd9Sstevel@tonic-gate 
183*7c478bd9Sstevel@tonic-gate 	/* Per-thread information: btree private. */
184*7c478bd9Sstevel@tonic-gate 	PAGE		*page;		/* Cursor page. */
185*7c478bd9Sstevel@tonic-gate 
186*7c478bd9Sstevel@tonic-gate 	db_pgno_t	 pgno;		/* Page. */
187*7c478bd9Sstevel@tonic-gate 	db_indx_t	 indx;		/* Page item ref'd by the cursor. */
188*7c478bd9Sstevel@tonic-gate 
189*7c478bd9Sstevel@tonic-gate 	db_pgno_t	 dpgno;		/* Duplicate page. */
190*7c478bd9Sstevel@tonic-gate 	db_indx_t	 dindx;		/* Page item ref'd by the cursor. */
191*7c478bd9Sstevel@tonic-gate 
192*7c478bd9Sstevel@tonic-gate 	DB_LOCK		 lock;		/* Cursor read lock. */
193*7c478bd9Sstevel@tonic-gate 	db_lockmode_t	 mode;		/* Lock mode. */
194*7c478bd9Sstevel@tonic-gate 
195*7c478bd9Sstevel@tonic-gate 	/* Per-thread information: recno private. */
196*7c478bd9Sstevel@tonic-gate 	db_recno_t	 recno;		/* Current record number. */
197*7c478bd9Sstevel@tonic-gate 
198*7c478bd9Sstevel@tonic-gate 	/*
199*7c478bd9Sstevel@tonic-gate 	 * Btree:
200*7c478bd9Sstevel@tonic-gate 	 * We set a flag in the cursor structure if the underlying object has
201*7c478bd9Sstevel@tonic-gate 	 * been deleted.  It's not strictly necessary, we could get the same
202*7c478bd9Sstevel@tonic-gate 	 * information by looking at the page itself.
203*7c478bd9Sstevel@tonic-gate 	 *
204*7c478bd9Sstevel@tonic-gate 	 * Recno:
205*7c478bd9Sstevel@tonic-gate 	 * When renumbering recno databases during deletes, cursors referencing
206*7c478bd9Sstevel@tonic-gate 	 * "deleted" records end up positioned between two records, and so must
207*7c478bd9Sstevel@tonic-gate 	 * be specially adjusted on the next operation.
208*7c478bd9Sstevel@tonic-gate 	 */
209*7c478bd9Sstevel@tonic-gate #define	C_DELETED	0x0001		/* Record was deleted. */
210*7c478bd9Sstevel@tonic-gate 	u_int32_t	 flags;
211*7c478bd9Sstevel@tonic-gate };
212*7c478bd9Sstevel@tonic-gate 
213*7c478bd9Sstevel@tonic-gate /*
214*7c478bd9Sstevel@tonic-gate  * The in-memory recno data structure.
215*7c478bd9Sstevel@tonic-gate  *
216*7c478bd9Sstevel@tonic-gate  * !!!
217*7c478bd9Sstevel@tonic-gate  * These fields are ignored as far as multi-threading is concerned.  There
218*7c478bd9Sstevel@tonic-gate  * are no transaction semantics associated with backing files, nor is there
219*7c478bd9Sstevel@tonic-gate  * any thread protection.
220*7c478bd9Sstevel@tonic-gate  */
221*7c478bd9Sstevel@tonic-gate struct __recno {
222*7c478bd9Sstevel@tonic-gate 	int		 re_delim;	/* Variable-length delimiting byte. */
223*7c478bd9Sstevel@tonic-gate 	int		 re_pad;	/* Fixed-length padding byte. */
224*7c478bd9Sstevel@tonic-gate 	u_int32_t	 re_len;	/* Length for fixed-length records. */
225*7c478bd9Sstevel@tonic-gate 
226*7c478bd9Sstevel@tonic-gate 	char		*re_source;	/* Source file name. */
227*7c478bd9Sstevel@tonic-gate 	int		 re_fd;		/* Source file descriptor */
228*7c478bd9Sstevel@tonic-gate 	db_recno_t	 re_last;	/* Last record number read. */
229*7c478bd9Sstevel@tonic-gate 	void		*re_cmap;	/* Current point in mapped space. */
230*7c478bd9Sstevel@tonic-gate 	void		*re_smap;	/* Start of mapped space. */
231*7c478bd9Sstevel@tonic-gate 	void		*re_emap;	/* End of mapped space. */
232*7c478bd9Sstevel@tonic-gate 	size_t		 re_msize;	/* Size of mapped region. */
233*7c478bd9Sstevel@tonic-gate 					/* Recno input function. */
234*7c478bd9Sstevel@tonic-gate 	int (*re_irec) __P((DBC *, db_recno_t));
235*7c478bd9Sstevel@tonic-gate 
236*7c478bd9Sstevel@tonic-gate #define	RECNO_EOF	0x0001		/* EOF on backing source file. */
237*7c478bd9Sstevel@tonic-gate #define	RECNO_MODIFIED	0x0002		/* Tree was modified. */
238*7c478bd9Sstevel@tonic-gate 	u_int32_t	 flags;
239*7c478bd9Sstevel@tonic-gate };
240*7c478bd9Sstevel@tonic-gate 
241*7c478bd9Sstevel@tonic-gate /*
242*7c478bd9Sstevel@tonic-gate  * The in-memory, per-tree btree data structure.
243*7c478bd9Sstevel@tonic-gate  */
244*7c478bd9Sstevel@tonic-gate struct __btree {
245*7c478bd9Sstevel@tonic-gate 	db_pgno_t	 bt_lpgno;	/* Last insert location. */
246*7c478bd9Sstevel@tonic-gate 
247*7c478bd9Sstevel@tonic-gate 	db_indx_t 	 bt_maxkey;	/* Maximum keys per page. */
248*7c478bd9Sstevel@tonic-gate 	db_indx_t 	 bt_minkey;	/* Minimum keys per page. */
249*7c478bd9Sstevel@tonic-gate 
250*7c478bd9Sstevel@tonic-gate 	int (*bt_compare)		/* Comparison function. */
251*7c478bd9Sstevel@tonic-gate 	    __P((const DBT *, const DBT *));
252*7c478bd9Sstevel@tonic-gate 	size_t(*bt_prefix)		/* Prefix function. */
253*7c478bd9Sstevel@tonic-gate 	    __P((const DBT *, const DBT *));
254*7c478bd9Sstevel@tonic-gate 
255*7c478bd9Sstevel@tonic-gate 	db_indx_t	 bt_ovflsize;	/* Maximum key/data on-page size. */
256*7c478bd9Sstevel@tonic-gate 
257*7c478bd9Sstevel@tonic-gate 	RECNO		*recno;		/* Private recno structure. */
258*7c478bd9Sstevel@tonic-gate };
259*7c478bd9Sstevel@tonic-gate 
260*7c478bd9Sstevel@tonic-gate #include "btree_auto.h"
261*7c478bd9Sstevel@tonic-gate #include "btree_ext.h"
262*7c478bd9Sstevel@tonic-gate #include "db_am.h"
263*7c478bd9Sstevel@tonic-gate #include "common_ext.h"
264