1/*-
2 * See the file LICENSE for redistribution information.
3 *
4 * Copyright (c) 1996, 1997, 1998
5 *	Sleepycat Software.  All rights reserved.
6 */
7/*
8 * Copyright (c) 1990, 1993, 1994, 1995, 1996
9 *	Keith Bostic.  All rights reserved.
10 */
11/*
12 * Copyright (c) 1990, 1993, 1994, 1995
13 *	The Regents of the University of California.  All rights reserved.
14 *
15 * This code is derived from software contributed to Berkeley by
16 * Mike Olson.
17 *
18 * Redistribution and use in source and binary forms, with or without
19 * modification, are permitted provided that the following conditions
20 * are met:
21 * 1. Redistributions of source code must retain the above copyright
22 *    notice, this list of conditions and the following disclaimer.
23 * 2. Redistributions in binary form must reproduce the above copyright
24 *    notice, this list of conditions and the following disclaimer in the
25 *    documentation and/or other materials provided with the distribution.
26 * 3. All advertising materials mentioning features or use of this software
27 *    must display the following acknowledgement:
28 *	This product includes software developed by the University of
29 *	California, Berkeley and its contributors.
30 * 4. Neither the name of the University nor the names of its contributors
31 *    may be used to endorse or promote products derived from this software
32 *    without specific prior written permission.
33 *
34 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
35 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
36 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
37 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
38 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
39 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
40 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
41 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
42 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
43 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
44 * SUCH DAMAGE.
45 */
46
47#include "config.h"
48
49#ifndef lint
50static const char sccsid[] = "@(#)bt_search.c	10.25 (Sleepycat) 12/16/98";
51#endif /* not lint */
52
53#ifndef NO_SYSTEM_INCLUDES
54#include <sys/types.h>
55
56#include <errno.h>
57#include <string.h>
58#endif
59
60#include "db_int.h"
61#include "db_page.h"
62#include "btree.h"
63
64/*
65 * __bam_search --
66 *	Search a btree for a key.
67 *
68 * PUBLIC: int __bam_search __P((DBC *,
69 * PUBLIC:     const DBT *, u_int32_t, int, db_recno_t *, int *));
70 */
71int
72__bam_search(dbc, key, flags, stop, recnop, exactp)
73	DBC *dbc;
74	const DBT *key;
75	u_int32_t flags;
76	int stop, *exactp;
77	db_recno_t *recnop;
78{
79	BTREE *t;
80	CURSOR *cp;
81	DB *dbp;
82	DB_LOCK lock;
83	PAGE *h;
84	db_indx_t base, i, indx, lim;
85	db_pgno_t pg;
86	db_recno_t recno;
87	int cmp, jump, ret, stack;
88
89	dbp = dbc->dbp;
90	cp = dbc->internal;
91	t = dbp->internal;
92	recno = 0;
93
94	BT_STK_CLR(cp);
95
96	/*
97	 * There are several ways we search a btree tree.  The flags argument
98	 * specifies if we're acquiring read or write locks, if we position
99	 * to the first or last item in a set of duplicates, if we return
100	 * deleted items, and if we are locking pairs of pages.  In addition,
101	 * if we're modifying record numbers, we have to lock the entire tree
102	 * regardless.  See btree.h for more details.
103	 *
104	 * If write-locking pages, we need to know whether or not to acquire a
105	 * write lock on a page before getting it.  This depends on how deep it
106	 * is in tree, which we don't know until we acquire the root page.  So,
107	 * if we need to lock the root page we may have to upgrade it later,
108	 * because we won't get the correct lock initially.
109	 *
110	 * Retrieve the root page.
111	 */
112	pg = PGNO_ROOT;
113	stack = F_ISSET(dbp, DB_BT_RECNUM) && LF_ISSET(S_STACK);
114	if ((ret = __bam_lget(dbc,
115	    0, pg, stack ? DB_LOCK_WRITE : DB_LOCK_READ, &lock)) != 0)
116		return (ret);
117	if ((ret = memp_fget(dbp->mpf, &pg, 0, &h)) != 0) {
118		(void)__BT_LPUT(dbc, lock);
119		return (ret);
120	}
121
122	/*
123	 * Decide if we need to save this page; if we do, write lock it.
124	 * We deliberately don't lock-couple on this call.  If the tree
125	 * is tiny, i.e., one page, and two threads are busily updating
126	 * the root page, we're almost guaranteed deadlocks galore, as
127	 * each one gets a read lock and then blocks the other's attempt
128	 * for a write lock.
129	 */
130	if (!stack &&
131	    ((LF_ISSET(S_PARENT) && (u_int8_t)(stop + 1) >= h->level) ||
132	    (LF_ISSET(S_WRITE) && h->level == LEAFLEVEL))) {
133		(void)memp_fput(dbp->mpf, h, 0);
134		(void)__BT_LPUT(dbc, lock);
135		if ((ret = __bam_lget(dbc, 0, pg, DB_LOCK_WRITE, &lock)) != 0)
136			return (ret);
137		if ((ret = memp_fget(dbp->mpf, &pg, 0, &h)) != 0) {
138			(void)__BT_LPUT(dbc, lock);
139			return (ret);
140		}
141		stack = 1;
142	}
143
144	for (;;) {
145		/*
146		 * Do a binary search on the current page.  If we're searching
147		 * a leaf page, we have to manipulate the indices in groups of
148		 * two.  If we're searching an internal page, they're an index
149		 * per page item.  If we find an exact match on a leaf page,
150		 * we're done.
151		 */
152		jump = TYPE(h) == P_LBTREE ? P_INDX : O_INDX;
153		for (base = 0,
154		    lim = NUM_ENT(h) / (db_indx_t)jump; lim != 0; lim >>= 1) {
155			indx = base + ((lim >> 1) * jump);
156			if ((cmp =
157			    __bam_cmp(dbp, key, h, indx, t->bt_compare)) == 0) {
158				if (TYPE(h) == P_LBTREE)
159					goto match;
160				goto next;
161			}
162			if (cmp > 0) {
163				base = indx + jump;
164				--lim;
165			}
166		}
167
168		/*
169		 * No match found.  Base is the smallest index greater than
170		 * key and may be zero or a last + O_INDX index.
171		 *
172		 * If it's a leaf page, return base as the "found" value.
173		 * Delete only deletes exact matches.
174		 */
175		if (TYPE(h) == P_LBTREE) {
176			*exactp = 0;
177
178			if (LF_ISSET(S_EXACT))
179				goto notfound;
180
181			/*
182			 * !!!
183			 * Possibly returning a deleted record -- DB_SET_RANGE,
184			 * DB_KEYFIRST and DB_KEYLAST don't require an exact
185			 * match, and we don't want to walk multiple pages here
186			 * to find an undeleted record.  This is handled in the
187			 * __bam_c_search() routine.
188			 */
189			BT_STK_ENTER(cp, h, base, lock, ret);
190			return (ret);
191		}
192
193		/*
194		 * If it's not a leaf page, record the internal page (which is
195		 * a parent page for the key).  Decrement the base by 1 if it's
196		 * non-zero so that if a split later occurs, the inserted page
197		 * will be to the right of the saved page.
198		 */
199		indx = base > 0 ? base - O_INDX : base;
200
201		/*
202		 * If we're trying to calculate the record number, sum up
203		 * all the record numbers on this page up to the indx point.
204		 */
205		if (recnop != NULL)
206			for (i = 0; i < indx; ++i)
207				recno += GET_BINTERNAL(h, i)->nrecs;
208
209next:		pg = GET_BINTERNAL(h, indx)->pgno;
210		if (stack) {
211			/* Return if this is the lowest page wanted. */
212			if (LF_ISSET(S_PARENT) && stop == h->level) {
213				BT_STK_ENTER(cp, h, indx, lock, ret);
214				return (ret);
215			}
216			BT_STK_PUSH(cp, h, indx, lock, ret);
217			if (ret != 0)
218				goto err;
219
220			if ((ret =
221			    __bam_lget(dbc, 0, pg, DB_LOCK_WRITE, &lock)) != 0)
222				goto err;
223		} else {
224			/*
225			 * Decide if we want to return a reference to the next
226			 * page in the return stack.  If so, lock it and never
227			 * unlock it.
228			 */
229			if ((LF_ISSET(S_PARENT) &&
230			    (u_int8_t)(stop + 1) >= (u_int8_t)(h->level - 1)) ||
231			    (h->level - 1) == LEAFLEVEL)
232				stack = 1;
233
234			(void)memp_fput(dbp->mpf, h, 0);
235
236			if ((ret =
237			    __bam_lget(dbc, 1, pg, stack && LF_ISSET(S_WRITE) ?
238			    DB_LOCK_WRITE : DB_LOCK_READ, &lock)) != 0)
239				goto err;
240		}
241		if ((ret = memp_fget(dbp->mpf, &pg, 0, &h)) != 0)
242			goto err;
243	}
244	/* NOTREACHED */
245
246match:	*exactp = 1;
247
248	/*
249	 * If we're trying to calculate the record number, add in the
250	 * offset on this page and correct for the fact that records
251	 * in the tree are 0-based.
252	 */
253	if (recnop != NULL)
254		*recnop = recno + (indx / P_INDX) + 1;
255
256	/*
257	 * If we got here, we know that we have a btree leaf page.
258	 *
259	 * If there are duplicates, go to the first/last one.  This is
260	 * safe because we know that we're not going to leave the page,
261	 * all duplicate sets that are not on overflow pages exist on a
262	 * single leaf page.
263	 */
264	if (LF_ISSET(S_DUPLAST))
265		while (indx < (db_indx_t)(NUM_ENT(h) - P_INDX) &&
266		    h->inp[indx] == h->inp[indx + P_INDX])
267			indx += P_INDX;
268	else
269		while (indx > 0 &&
270		    h->inp[indx] == h->inp[indx - P_INDX])
271			indx -= P_INDX;
272
273	/*
274	 * Now check if we are allowed to return deleted items; if not
275	 * find the next (or previous) non-deleted item.
276	 */
277	if (LF_ISSET(S_DELNO)) {
278		if (LF_ISSET(S_DUPLAST))
279			while (B_DISSET(GET_BKEYDATA(h, indx + O_INDX)->type) &&
280			    indx > 0 &&
281			    h->inp[indx] == h->inp[indx - P_INDX])
282				indx -= P_INDX;
283		else
284			while (B_DISSET(GET_BKEYDATA(h, indx + O_INDX)->type) &&
285			    indx < (db_indx_t)(NUM_ENT(h) - P_INDX) &&
286			    h->inp[indx] == h->inp[indx + P_INDX])
287				indx += P_INDX;
288
289		if (B_DISSET(GET_BKEYDATA(h, indx + O_INDX)->type))
290			goto notfound;
291	}
292
293	BT_STK_ENTER(cp, h, indx, lock, ret);
294	return (ret);
295
296notfound:
297	(void)memp_fput(dbp->mpf, h, 0);
298	(void)__BT_LPUT(dbc, lock);
299	ret = DB_NOTFOUND;
300
301err:	if (cp->csp > cp->sp) {
302		BT_STK_POP(cp);
303		__bam_stkrel(dbc, 0);
304	}
305	return (ret);
306}
307
308/*
309 * __bam_stkrel --
310 *	Release all pages currently held in the stack.
311 *
312 * PUBLIC: int __bam_stkrel __P((DBC *, int));
313 */
314int
315__bam_stkrel(dbc, nolocks)
316	DBC *dbc;
317	int nolocks;
318{
319	CURSOR *cp;
320	DB *dbp;
321	EPG *epg;
322
323	dbp = dbc->dbp;
324	cp = dbc->internal;
325
326	/* Release inner pages first. */
327	for (epg = cp->sp; epg <= cp->csp; ++epg) {
328		if (epg->page != NULL)
329			(void)memp_fput(dbp->mpf, epg->page, 0);
330		if (epg->lock != LOCK_INVALID)
331			if (nolocks)
332				(void)__BT_LPUT(dbc, epg->lock);
333			else
334				(void)__BT_TLPUT(dbc, epg->lock);
335	}
336
337	/* Clear the stack, all pages have been released. */
338	BT_STK_CLR(cp);
339
340	return (0);
341}
342
343/*
344 * __bam_stkgrow --
345 *	Grow the stack.
346 *
347 * PUBLIC: int __bam_stkgrow __P((CURSOR *));
348 */
349int
350__bam_stkgrow(cp)
351	CURSOR *cp;
352{
353	EPG *p;
354	size_t entries;
355	int ret;
356
357	entries = cp->esp - cp->sp;
358
359	if ((ret = __os_calloc(entries * 2, sizeof(EPG), &p)) != 0)
360		return (ret);
361	memcpy(p, cp->sp, entries * sizeof(EPG));
362	if (cp->sp != cp->stack)
363		__os_free(cp->sp, entries * sizeof(EPG));
364	cp->sp = p;
365	cp->csp = p + entries;
366	cp->esp = p + entries * 2;
367	return (0);
368}
369