1/*-
2 * See the file LICENSE for redistribution information.
3 *
4 * Copyright (c) 1996, 1997, 1998
5 *	Sleepycat Software.  All rights reserved.
6 */
7/*
8 * Copyright (c) 1990, 1993, 1994, 1995, 1996
9 *	Keith Bostic.  All rights reserved.
10 */
11/*
12 * Copyright (c) 1990, 1993
13 *	The Regents of the University of California.  All rights reserved.
14 *
15 * Redistribution and use in source and binary forms, with or without
16 * modification, are permitted provided that the following conditions
17 * are met:
18 * 1. Redistributions of source code must retain the above copyright
19 *    notice, this list of conditions and the following disclaimer.
20 * 2. Redistributions in binary form must reproduce the above copyright
21 *    notice, this list of conditions and the following disclaimer in the
22 *    documentation and/or other materials provided with the distribution.
23 * 3. All advertising materials mentioning features or use of this software
24 *    must display the following acknowledgement:
25 *	This product includes software developed by the University of
26 *	California, Berkeley and its contributors.
27 * 4. Neither the name of the University nor the names of its contributors
28 *    may be used to endorse or promote products derived from this software
29 *    without specific prior written permission.
30 *
31 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
32 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
33 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
34 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
35 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
36 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
37 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
38 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
39 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
40 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
41 * SUCH DAMAGE.
42 */
43
44#include "config.h"
45
46#ifndef lint
47static const char sccsid[] = "@(#)bt_rsearch.c	10.21 (Sleepycat) 12/2/98";
48#endif /* not lint */
49
50#ifndef NO_SYSTEM_INCLUDES
51#include <sys/types.h>
52#endif
53
54#include "db_int.h"
55#include "db_page.h"
56#include "btree.h"
57
58/*
59 * __bam_rsearch --
60 *	Search a btree for a record number.
61 *
62 * PUBLIC: int __bam_rsearch __P((DBC *, db_recno_t *, u_int32_t, int, int *));
63 */
64int
65__bam_rsearch(dbc, recnop, flags, stop, exactp)
66	DBC *dbc;
67	db_recno_t *recnop;
68	u_int32_t flags;
69	int stop, *exactp;
70{
71	BINTERNAL *bi;
72	CURSOR *cp;
73	DB *dbp;
74	DB_LOCK lock;
75	PAGE *h;
76	RINTERNAL *ri;
77	db_indx_t indx, top;
78	db_pgno_t pg;
79	db_recno_t i, recno, total;
80	int ret, stack;
81
82	dbp = dbc->dbp;
83	cp = dbc->internal;
84
85	BT_STK_CLR(cp);
86
87	/*
88	 * There are several ways we search a btree tree.  The flags argument
89	 * specifies if we're acquiring read or write locks and if we are
90	 * locking pairs of pages.  In addition, if we're adding or deleting
91	 * an item, we have to lock the entire tree, regardless.  See btree.h
92	 * for more details.
93	 *
94	 * If write-locking pages, we need to know whether or not to acquire a
95	 * write lock on a page before getting it.  This depends on how deep it
96	 * is in tree, which we don't know until we acquire the root page.  So,
97	 * if we need to lock the root page we may have to upgrade it later,
98	 * because we won't get the correct lock initially.
99	 *
100	 * Retrieve the root page.
101	 */
102	pg = PGNO_ROOT;
103	stack = LF_ISSET(S_STACK);
104	if ((ret = __bam_lget(dbc,
105	    0, pg, stack ? DB_LOCK_WRITE : DB_LOCK_READ, &lock)) != 0)
106		return (ret);
107	if ((ret = memp_fget(dbp->mpf, &pg, 0, &h)) != 0) {
108		(void)__BT_LPUT(dbc, lock);
109		return (ret);
110	}
111
112	/*
113	 * Decide if we need to save this page; if we do, write lock it.
114	 * We deliberately don't lock-couple on this call.  If the tree
115	 * is tiny, i.e., one page, and two threads are busily updating
116	 * the root page, we're almost guaranteed deadlocks galore, as
117	 * each one gets a read lock and then blocks the other's attempt
118	 * for a write lock.
119	 */
120	if (!stack &&
121	    ((LF_ISSET(S_PARENT) && (u_int8_t)(stop + 1) >= h->level) ||
122	    (LF_ISSET(S_WRITE) && h->level == LEAFLEVEL))) {
123		(void)memp_fput(dbp->mpf, h, 0);
124		(void)__BT_LPUT(dbc, lock);
125		if ((ret = __bam_lget(dbc, 0, pg, DB_LOCK_WRITE, &lock)) != 0)
126			return (ret);
127		if ((ret = memp_fget(dbp->mpf, &pg, 0, &h)) != 0) {
128			(void)__BT_LPUT(dbc, lock);
129			return (ret);
130		}
131		stack = 1;
132	}
133
134	/*
135	 * If appending to the tree, set the record number now -- we have the
136	 * root page locked.
137	 *
138	 * Delete only deletes exact matches, read only returns exact matches.
139	 * Note, this is different from __bam_search(), which returns non-exact
140	 * matches for read.
141	 *
142	 * The record may not exist.  We can only return the correct location
143	 * for the record immediately after the last record in the tree, so do
144	 * a fast check now.
145	 */
146	total = RE_NREC(h);
147	if (LF_ISSET(S_APPEND)) {
148		*exactp = 0;
149		*recnop = recno = total + 1;
150	} else {
151		recno = *recnop;
152		if (recno <= total)
153			*exactp = 1;
154		else {
155			*exactp = 0;
156			if (!LF_ISSET(S_PAST_EOF) || recno > total + 1) {
157				(void)memp_fput(dbp->mpf, h, 0);
158				(void)__BT_LPUT(dbc, lock);
159				return (DB_NOTFOUND);
160			}
161		}
162	}
163
164	/*
165	 * !!!
166	 * Record numbers in the tree are 0-based, but the recno is
167	 * 1-based.  All of the calculations below have to take this
168	 * into account.
169	 */
170	for (total = 0;;) {
171		switch (TYPE(h)) {
172		case P_LBTREE:
173			recno -= total;
174
175			/*
176			 * There may be logically deleted records on the page,
177			 * walk the page correcting for them.  The record may
178			 * not exist if there are enough deleted records in the
179			 * page.
180			 */
181			if (recno <= (db_recno_t)NUM_ENT(h) / P_INDX)
182				for (i = recno - 1;; --i) {
183					if (B_DISSET(GET_BKEYDATA(h,
184					    i * P_INDX + O_INDX)->type))
185						++recno;
186					if (i == 0)
187						break;
188				}
189			if (recno > (db_recno_t)NUM_ENT(h) / P_INDX) {
190				*exactp = 0;
191				if (!LF_ISSET(S_PAST_EOF) || recno >
192				    (db_recno_t)(NUM_ENT(h) / P_INDX + 1)) {
193					ret = DB_NOTFOUND;
194					goto err;
195				}
196
197			}
198
199			/* Correct from 1-based to 0-based for a page offset. */
200			--recno;
201			BT_STK_ENTER(cp, h, recno * P_INDX, lock, ret);
202			return (ret);
203		case P_IBTREE:
204			for (indx = 0, top = NUM_ENT(h);;) {
205				bi = GET_BINTERNAL(h, indx);
206				if (++indx == top || total + bi->nrecs >= recno)
207					break;
208				total += bi->nrecs;
209			}
210			pg = bi->pgno;
211			break;
212		case P_LRECNO:
213			recno -= total;
214
215			/* Correct from 1-based to 0-based for a page offset. */
216			--recno;
217			BT_STK_ENTER(cp, h, recno, lock, ret);
218			return (ret);
219		case P_IRECNO:
220			for (indx = 0, top = NUM_ENT(h);;) {
221				ri = GET_RINTERNAL(h, indx);
222				if (++indx == top || total + ri->nrecs >= recno)
223					break;
224				total += ri->nrecs;
225			}
226			pg = ri->pgno;
227			break;
228		default:
229			return (__db_pgfmt(dbp, h->pgno));
230		}
231		--indx;
232
233		if (stack) {
234			/* Return if this is the lowest page wanted. */
235			if (LF_ISSET(S_PARENT) && stop == h->level) {
236				BT_STK_ENTER(cp, h, indx, lock, ret);
237				return (ret);
238			}
239			BT_STK_PUSH(cp, h, indx, lock, ret);
240			if (ret != 0)
241				goto err;
242
243			if ((ret =
244			    __bam_lget(dbc, 0, pg, DB_LOCK_WRITE, &lock)) != 0)
245				goto err;
246		} else {
247			/*
248			 * Decide if we want to return a pointer to the next
249			 * page in the stack.  If we do, write lock it and
250			 * never unlock it.
251			 */
252			if ((LF_ISSET(S_PARENT) &&
253			    (u_int8_t)(stop + 1) >= (u_int8_t)(h->level - 1)) ||
254			    (h->level - 1) == LEAFLEVEL)
255				stack = 1;
256
257			(void)memp_fput(dbp->mpf, h, 0);
258
259			if ((ret =
260			    __bam_lget(dbc, 1, pg, stack && LF_ISSET(S_WRITE) ?
261			    DB_LOCK_WRITE : DB_LOCK_READ, &lock)) != 0)
262				goto err;
263		}
264
265		if ((ret = memp_fget(dbp->mpf, &pg, 0, &h)) != 0)
266			goto err;
267	}
268	/* NOTREACHED */
269
270err:	BT_STK_POP(cp);
271	__bam_stkrel(dbc, 0);
272	return (ret);
273}
274
275/*
276 * __bam_adjust --
277 *	Adjust the tree after adding or deleting a record.
278 *
279 * PUBLIC: int __bam_adjust __P((DBC *, int32_t));
280 */
281int
282__bam_adjust(dbc, adjust)
283	DBC *dbc;
284	int32_t adjust;
285{
286	CURSOR *cp;
287	DB *dbp;
288	EPG *epg;
289	PAGE *h;
290	int ret;
291
292	dbp = dbc->dbp;
293	cp = dbc->internal;
294
295	/* Update the record counts for the tree. */
296	for (epg = cp->sp; epg <= cp->csp; ++epg) {
297		h = epg->page;
298		if (TYPE(h) == P_IBTREE || TYPE(h) == P_IRECNO) {
299			if (DB_LOGGING(dbc) &&
300			    (ret = __bam_cadjust_log(dbp->dbenv->lg_info,
301			    dbc->txn, &LSN(h), 0, dbp->log_fileid,
302			    PGNO(h), &LSN(h), (u_int32_t)epg->indx,
303			    adjust, 1)) != 0)
304				return (ret);
305
306			if (TYPE(h) == P_IBTREE)
307				GET_BINTERNAL(h, epg->indx)->nrecs += adjust;
308			else
309				GET_RINTERNAL(h, epg->indx)->nrecs += adjust;
310
311			if (PGNO(h) == PGNO_ROOT)
312				RE_NREC_ADJ(h, adjust);
313
314			if ((ret = memp_fset(dbp->mpf, h, DB_MPOOL_DIRTY)) != 0)
315				return (ret);
316		}
317	}
318	return (0);
319}
320
321/*
322 * __bam_nrecs --
323 *	Return the number of records in the tree.
324 *
325 * PUBLIC: int __bam_nrecs __P((DBC *, db_recno_t *));
326 */
327int
328__bam_nrecs(dbc, rep)
329	DBC *dbc;
330	db_recno_t *rep;
331{
332	DB *dbp;
333	DB_LOCK lock;
334	PAGE *h;
335	db_pgno_t pgno;
336	int ret;
337
338	dbp = dbc->dbp;
339
340	pgno = PGNO_ROOT;
341	if ((ret = __bam_lget(dbc, 0, pgno, DB_LOCK_READ, &lock)) != 0)
342		return (ret);
343	if ((ret = memp_fget(dbp->mpf, &pgno, 0, &h)) != 0)
344		return (ret);
345
346	*rep = RE_NREC(h);
347
348	(void)memp_fput(dbp->mpf, h, 0);
349	(void)__BT_TLPUT(dbc, lock);
350
351	return (0);
352}
353
354/*
355 * __bam_total --
356 *	Return the number of records below a page.
357 *
358 * PUBLIC: db_recno_t __bam_total __P((PAGE *));
359 */
360db_recno_t
361__bam_total(h)
362	PAGE *h;
363{
364	db_recno_t nrecs;
365	db_indx_t indx, top;
366
367	nrecs = 0;
368	top = NUM_ENT(h);
369
370	switch (TYPE(h)) {
371	case P_LBTREE:
372		/* Check for logically deleted records. */
373		for (indx = 0; indx < top; indx += P_INDX)
374			if (!B_DISSET(GET_BKEYDATA(h, indx + O_INDX)->type))
375				++nrecs;
376		break;
377	case P_IBTREE:
378		for (indx = 0; indx < top; indx += O_INDX)
379			nrecs += GET_BINTERNAL(h, indx)->nrecs;
380		break;
381	case P_LRECNO:
382		nrecs = NUM_ENT(h);
383		break;
384	case P_IRECNO:
385		for (indx = 0; indx < top; indx += O_INDX)
386			nrecs += GET_RINTERNAL(h, indx)->nrecs;
387		break;
388	}
389
390	return (nrecs);
391}
392