1 /*-
2  * See the file LICENSE for redistribution information.
3  *
4  * Copyright (c) 1996, 1997, 1998
5  *	Sleepycat Software.  All rights reserved.
6  */
7 /*
8  * Copyright (c) 1990, 1993, 1994, 1995, 1996
9  *	Keith Bostic.  All rights reserved.
10  */
11 /*
12  * Copyright (c) 1990, 1993
13  *	The Regents of the University of California.  All rights reserved.
14  *
15  * Redistribution and use in source and binary forms, with or without
16  * modification, are permitted provided that the following conditions
17  * are met:
18  * 1. Redistributions of source code must retain the above copyright
19  *    notice, this list of conditions and the following disclaimer.
20  * 2. Redistributions in binary form must reproduce the above copyright
21  *    notice, this list of conditions and the following disclaimer in the
22  *    documentation and/or other materials provided with the distribution.
23  * 3. All advertising materials mentioning features or use of this software
24  *    must display the following acknowledgement:
25  *	This product includes software developed by the University of
26  *	California, Berkeley and its contributors.
27  * 4. Neither the name of the University nor the names of its contributors
28  *    may be used to endorse or promote products derived from this software
29  *    without specific prior written permission.
30  *
31  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
32  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
33  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
34  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
35  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
36  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
37  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
38  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
39  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
40  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
41  * SUCH DAMAGE.
42  */
43 
44 #include "config.h"
45 
46 #ifndef lint
47 static const char sccsid[] = "@(#)bt_rsearch.c	10.21 (Sleepycat) 12/2/98";
48 #endif /* not lint */
49 
50 #ifndef NO_SYSTEM_INCLUDES
51 #include <sys/types.h>
52 #endif
53 
54 #include "db_int.h"
55 #include "db_page.h"
56 #include "btree.h"
57 
58 /*
59  * __bam_rsearch --
60  *	Search a btree for a record number.
61  *
62  * PUBLIC: int __bam_rsearch __P((DBC *, db_recno_t *, u_int32_t, int, int *));
63  */
64 int
__bam_rsearch(dbc,recnop,flags,stop,exactp)65 __bam_rsearch(dbc, recnop, flags, stop, exactp)
66 	DBC *dbc;
67 	db_recno_t *recnop;
68 	u_int32_t flags;
69 	int stop, *exactp;
70 {
71 	BINTERNAL *bi;
72 	CURSOR *cp;
73 	DB *dbp;
74 	DB_LOCK lock;
75 	PAGE *h;
76 	RINTERNAL *ri;
77 	db_indx_t indx, top;
78 	db_pgno_t pg;
79 	db_recno_t i, recno, total;
80 	int ret, stack;
81 
82 	dbp = dbc->dbp;
83 	cp = dbc->internal;
84 
85 	BT_STK_CLR(cp);
86 
87 	/*
88 	 * There are several ways we search a btree tree.  The flags argument
89 	 * specifies if we're acquiring read or write locks and if we are
90 	 * locking pairs of pages.  In addition, if we're adding or deleting
91 	 * an item, we have to lock the entire tree, regardless.  See btree.h
92 	 * for more details.
93 	 *
94 	 * If write-locking pages, we need to know whether or not to acquire a
95 	 * write lock on a page before getting it.  This depends on how deep it
96 	 * is in tree, which we don't know until we acquire the root page.  So,
97 	 * if we need to lock the root page we may have to upgrade it later,
98 	 * because we won't get the correct lock initially.
99 	 *
100 	 * Retrieve the root page.
101 	 */
102 	pg = PGNO_ROOT;
103 	stack = LF_ISSET(S_STACK);
104 	if ((ret = __bam_lget(dbc,
105 	    0, pg, stack ? DB_LOCK_WRITE : DB_LOCK_READ, &lock)) != 0)
106 		return (ret);
107 	if ((ret = memp_fget(dbp->mpf, &pg, 0, &h)) != 0) {
108 		(void)__BT_LPUT(dbc, lock);
109 		return (ret);
110 	}
111 
112 	/*
113 	 * Decide if we need to save this page; if we do, write lock it.
114 	 * We deliberately don't lock-couple on this call.  If the tree
115 	 * is tiny, i.e., one page, and two threads are busily updating
116 	 * the root page, we're almost guaranteed deadlocks galore, as
117 	 * each one gets a read lock and then blocks the other's attempt
118 	 * for a write lock.
119 	 */
120 	if (!stack &&
121 	    ((LF_ISSET(S_PARENT) && (u_int8_t)(stop + 1) >= h->level) ||
122 	    (LF_ISSET(S_WRITE) && h->level == LEAFLEVEL))) {
123 		(void)memp_fput(dbp->mpf, h, 0);
124 		(void)__BT_LPUT(dbc, lock);
125 		if ((ret = __bam_lget(dbc, 0, pg, DB_LOCK_WRITE, &lock)) != 0)
126 			return (ret);
127 		if ((ret = memp_fget(dbp->mpf, &pg, 0, &h)) != 0) {
128 			(void)__BT_LPUT(dbc, lock);
129 			return (ret);
130 		}
131 		stack = 1;
132 	}
133 
134 	/*
135 	 * If appending to the tree, set the record number now -- we have the
136 	 * root page locked.
137 	 *
138 	 * Delete only deletes exact matches, read only returns exact matches.
139 	 * Note, this is different from __bam_search(), which returns non-exact
140 	 * matches for read.
141 	 *
142 	 * The record may not exist.  We can only return the correct location
143 	 * for the record immediately after the last record in the tree, so do
144 	 * a fast check now.
145 	 */
146 	total = RE_NREC(h);
147 	if (LF_ISSET(S_APPEND)) {
148 		*exactp = 0;
149 		*recnop = recno = total + 1;
150 	} else {
151 		recno = *recnop;
152 		if (recno <= total)
153 			*exactp = 1;
154 		else {
155 			*exactp = 0;
156 			if (!LF_ISSET(S_PAST_EOF) || recno > total + 1) {
157 				(void)memp_fput(dbp->mpf, h, 0);
158 				(void)__BT_LPUT(dbc, lock);
159 				return (DB_NOTFOUND);
160 			}
161 		}
162 	}
163 
164 	/*
165 	 * !!!
166 	 * Record numbers in the tree are 0-based, but the recno is
167 	 * 1-based.  All of the calculations below have to take this
168 	 * into account.
169 	 */
170 	for (total = 0;;) {
171 		switch (TYPE(h)) {
172 		case P_LBTREE:
173 			recno -= total;
174 
175 			/*
176 			 * There may be logically deleted records on the page,
177 			 * walk the page correcting for them.  The record may
178 			 * not exist if there are enough deleted records in the
179 			 * page.
180 			 */
181 			if (recno <= (db_recno_t)NUM_ENT(h) / P_INDX)
182 				for (i = recno - 1;; --i) {
183 					if (B_DISSET(GET_BKEYDATA(h,
184 					    i * P_INDX + O_INDX)->type))
185 						++recno;
186 					if (i == 0)
187 						break;
188 				}
189 			if (recno > (db_recno_t)NUM_ENT(h) / P_INDX) {
190 				*exactp = 0;
191 				if (!LF_ISSET(S_PAST_EOF) || recno >
192 				    (db_recno_t)(NUM_ENT(h) / P_INDX + 1)) {
193 					ret = DB_NOTFOUND;
194 					goto err;
195 				}
196 
197 			}
198 
199 			/* Correct from 1-based to 0-based for a page offset. */
200 			--recno;
201 			BT_STK_ENTER(cp, h, recno * P_INDX, lock, ret);
202 			return (ret);
203 		case P_IBTREE:
204 			for (indx = 0, top = NUM_ENT(h);;) {
205 				bi = GET_BINTERNAL(h, indx);
206 				if (++indx == top || total + bi->nrecs >= recno)
207 					break;
208 				total += bi->nrecs;
209 			}
210 			pg = bi->pgno;
211 			break;
212 		case P_LRECNO:
213 			recno -= total;
214 
215 			/* Correct from 1-based to 0-based for a page offset. */
216 			--recno;
217 			BT_STK_ENTER(cp, h, recno, lock, ret);
218 			return (ret);
219 		case P_IRECNO:
220 			for (indx = 0, top = NUM_ENT(h);;) {
221 				ri = GET_RINTERNAL(h, indx);
222 				if (++indx == top || total + ri->nrecs >= recno)
223 					break;
224 				total += ri->nrecs;
225 			}
226 			pg = ri->pgno;
227 			break;
228 		default:
229 			return (__db_pgfmt(dbp, h->pgno));
230 		}
231 		--indx;
232 
233 		if (stack) {
234 			/* Return if this is the lowest page wanted. */
235 			if (LF_ISSET(S_PARENT) && stop == h->level) {
236 				BT_STK_ENTER(cp, h, indx, lock, ret);
237 				return (ret);
238 			}
239 			BT_STK_PUSH(cp, h, indx, lock, ret);
240 			if (ret != 0)
241 				goto err;
242 
243 			if ((ret =
244 			    __bam_lget(dbc, 0, pg, DB_LOCK_WRITE, &lock)) != 0)
245 				goto err;
246 		} else {
247 			/*
248 			 * Decide if we want to return a pointer to the next
249 			 * page in the stack.  If we do, write lock it and
250 			 * never unlock it.
251 			 */
252 			if ((LF_ISSET(S_PARENT) &&
253 			    (u_int8_t)(stop + 1) >= (u_int8_t)(h->level - 1)) ||
254 			    (h->level - 1) == LEAFLEVEL)
255 				stack = 1;
256 
257 			(void)memp_fput(dbp->mpf, h, 0);
258 
259 			if ((ret =
260 			    __bam_lget(dbc, 1, pg, stack && LF_ISSET(S_WRITE) ?
261 			    DB_LOCK_WRITE : DB_LOCK_READ, &lock)) != 0)
262 				goto err;
263 		}
264 
265 		if ((ret = memp_fget(dbp->mpf, &pg, 0, &h)) != 0)
266 			goto err;
267 	}
268 	/* NOTREACHED */
269 
270 err:	BT_STK_POP(cp);
271 	__bam_stkrel(dbc, 0);
272 	return (ret);
273 }
274 
275 /*
276  * __bam_adjust --
277  *	Adjust the tree after adding or deleting a record.
278  *
279  * PUBLIC: int __bam_adjust __P((DBC *, int32_t));
280  */
281 int
__bam_adjust(dbc,adjust)282 __bam_adjust(dbc, adjust)
283 	DBC *dbc;
284 	int32_t adjust;
285 {
286 	CURSOR *cp;
287 	DB *dbp;
288 	EPG *epg;
289 	PAGE *h;
290 	int ret;
291 
292 	dbp = dbc->dbp;
293 	cp = dbc->internal;
294 
295 	/* Update the record counts for the tree. */
296 	for (epg = cp->sp; epg <= cp->csp; ++epg) {
297 		h = epg->page;
298 		if (TYPE(h) == P_IBTREE || TYPE(h) == P_IRECNO) {
299 			if (DB_LOGGING(dbc) &&
300 			    (ret = __bam_cadjust_log(dbp->dbenv->lg_info,
301 			    dbc->txn, &LSN(h), 0, dbp->log_fileid,
302 			    PGNO(h), &LSN(h), (u_int32_t)epg->indx,
303 			    adjust, 1)) != 0)
304 				return (ret);
305 
306 			if (TYPE(h) == P_IBTREE)
307 				GET_BINTERNAL(h, epg->indx)->nrecs += adjust;
308 			else
309 				GET_RINTERNAL(h, epg->indx)->nrecs += adjust;
310 
311 			if (PGNO(h) == PGNO_ROOT)
312 				RE_NREC_ADJ(h, adjust);
313 
314 			if ((ret = memp_fset(dbp->mpf, h, DB_MPOOL_DIRTY)) != 0)
315 				return (ret);
316 		}
317 	}
318 	return (0);
319 }
320 
321 /*
322  * __bam_nrecs --
323  *	Return the number of records in the tree.
324  *
325  * PUBLIC: int __bam_nrecs __P((DBC *, db_recno_t *));
326  */
327 int
__bam_nrecs(dbc,rep)328 __bam_nrecs(dbc, rep)
329 	DBC *dbc;
330 	db_recno_t *rep;
331 {
332 	DB *dbp;
333 	DB_LOCK lock;
334 	PAGE *h;
335 	db_pgno_t pgno;
336 	int ret;
337 
338 	dbp = dbc->dbp;
339 
340 	pgno = PGNO_ROOT;
341 	if ((ret = __bam_lget(dbc, 0, pgno, DB_LOCK_READ, &lock)) != 0)
342 		return (ret);
343 	if ((ret = memp_fget(dbp->mpf, &pgno, 0, &h)) != 0)
344 		return (ret);
345 
346 	*rep = RE_NREC(h);
347 
348 	(void)memp_fput(dbp->mpf, h, 0);
349 	(void)__BT_TLPUT(dbc, lock);
350 
351 	return (0);
352 }
353 
354 /*
355  * __bam_total --
356  *	Return the number of records below a page.
357  *
358  * PUBLIC: db_recno_t __bam_total __P((PAGE *));
359  */
360 db_recno_t
__bam_total(h)361 __bam_total(h)
362 	PAGE *h;
363 {
364 	db_recno_t nrecs;
365 	db_indx_t indx, top;
366 
367 	nrecs = 0;
368 	top = NUM_ENT(h);
369 
370 	switch (TYPE(h)) {
371 	case P_LBTREE:
372 		/* Check for logically deleted records. */
373 		for (indx = 0; indx < top; indx += P_INDX)
374 			if (!B_DISSET(GET_BKEYDATA(h, indx + O_INDX)->type))
375 				++nrecs;
376 		break;
377 	case P_IBTREE:
378 		for (indx = 0; indx < top; indx += O_INDX)
379 			nrecs += GET_BINTERNAL(h, indx)->nrecs;
380 		break;
381 	case P_LRECNO:
382 		nrecs = NUM_ENT(h);
383 		break;
384 	case P_IRECNO:
385 		for (indx = 0; indx < top; indx += O_INDX)
386 			nrecs += GET_RINTERNAL(h, indx)->nrecs;
387 		break;
388 	}
389 
390 	return (nrecs);
391 }
392