1/*
2 * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
3 * Use is subject to license terms.
4 */
5
6#pragma ident	"%Z%%M%	%I%	%E% SMI"
7
8/*
9 * Copyright (C) 2002 by the Massachusetts Institute of Technology.
10 * All rights reserved.
11 *
12 * Export of this software from the United States of America may
13 *   require a specific license from the United States Government.
14 *   It is the responsibility of any person or organization contemplating
15 *   export to obtain such a license before exporting.
16 *
17 * WITHIN THAT CONSTRAINT, permission to use, copy, modify, and
18 * distribute this software and its documentation for any purpose and
19 * without fee is hereby granted, provided that the above copyright
20 * notice appear in all copies and that both that copyright notice and
21 * this permission notice appear in supporting documentation, and that
22 * the name of M.I.T. not be used in advertising or publicity pertaining
23 * to distribution of the software without specific, written prior
24 * permission.  Furthermore if you modify this software you must label
25 * your software as modified software and not distribute it in such a
26 * fashion that it might be confused with the original M.I.T. software.
27 * M.I.T. makes no representations about the suitability of
28 * this software for any purpose.  It is provided "as is" without express
29 * or implied warranty.
30 */
31
32/*-
33 * Copyright (c) 1990, 1993, 1994
34 *	The Regents of the University of California.  All rights reserved.
35 *
36 * This code is derived from software contributed to Berkeley by
37 * Mike Olson.
38 *
39 * Redistribution and use in source and binary forms, with or without
40 * modification, are permitted provided that the following conditions
41 * are met:
42 * 1. Redistributions of source code must retain the above copyright
43 *    notice, this list of conditions and the following disclaimer.
44 * 2. Redistributions in binary form must reproduce the above copyright
45 *    notice, this list of conditions and the following disclaimer in the
46 *    documentation and/or other materials provided with the distribution.
47 * 3. All advertising materials mentioning features or use of this software
48 *    must display the following acknowledgement:
49 *	This product includes software developed by the University of
50 *	California, Berkeley and its contributors.
51 * 4. Neither the name of the University nor the names of its contributors
52 *    may be used to endorse or promote products derived from this software
53 *    without specific prior written permission.
54 *
55 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
56 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
57 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
58 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
59 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
60 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
61 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
62 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
63 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
64 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
65 * SUCH DAMAGE.
66 */
67
68#if defined(LIBC_SCCS) && !defined(lint)
69static char sccsid[] = "@(#)bt_seq.c	8.9 (Berkeley) 6/20/95";
70#endif /* LIBC_SCCS and not lint */
71
72#include <sys/types.h>
73
74#include <errno.h>
75#include <stddef.h>
76#include <stdio.h>
77#include <stdlib.h>
78#include <string.h>
79
80#include "db-int.h"
81#include "btree.h"
82
83static int __bt_first __P((BTREE *, const DBT *, EPG *, int *));
84static int __bt_seqadv __P((BTREE *, EPG *, int));
85static int __bt_seqset __P((BTREE *, EPG *, DBT *, int));
86
87/*
88 * Sequential scan support.
89 *
90 * The tree can be scanned sequentially, starting from either end of the
91 * tree or from any specific key.  A scan request before any scanning is
92 * done is initialized as starting from the least node.
93 */
94
95/*
96 * __bt_seq --
97 *	Btree sequential scan interface.
98 *
99 * Parameters:
100 *	dbp:	pointer to access method
101 *	key:	key for positioning and return value
102 *	data:	data return value
103 *	flags:	R_CURSOR, R_FIRST, R_LAST, R_NEXT, R_PREV.
104 *
105 * Returns:
106 *	RET_ERROR, RET_SUCCESS or RET_SPECIAL if there's no next key.
107 */
108int
109__bt_seq(dbp, key, data, flags)
110	const DB *dbp;
111	DBT *key, *data;
112	u_int flags;
113{
114	BTREE *t;
115	EPG e;
116	int status;
117
118	t = dbp->internal;
119
120	/* Toss any page pinned across calls. */
121	if (t->bt_pinned != NULL) {
122		mpool_put(t->bt_mp, t->bt_pinned, 0);
123		t->bt_pinned = NULL;
124	}
125
126	/*
127	 * If scan unitialized as yet, or starting at a specific record, set
128	 * the scan to a specific key.  Both __bt_seqset and __bt_seqadv pin
129	 * the page the cursor references if they're successful.
130	 */
131	switch (flags) {
132	case R_NEXT:
133	case R_PREV:
134		if (F_ISSET(&t->bt_cursor, CURS_INIT)) {
135			status = __bt_seqadv(t, &e, flags);
136			break;
137		}
138		/* FALLTHROUGH */
139	case R_FIRST:
140	case R_LAST:
141	case R_CURSOR:
142		status = __bt_seqset(t, &e, key, flags);
143		break;
144	default:
145		errno = EINVAL;
146		return (RET_ERROR);
147	}
148
149	if (status == RET_SUCCESS) {
150		__bt_setcur(t, e.page->pgno, e.index);
151
152		status =
153		    __bt_ret(t, &e, key, &t->bt_rkey, data, &t->bt_rdata, 0);
154
155		/*
156		 * If the user is doing concurrent access, we copied the
157		 * key/data, toss the page.
158		 */
159		if (F_ISSET(t, B_DB_LOCK))
160			mpool_put(t->bt_mp, e.page, 0);
161		else
162			t->bt_pinned = e.page;
163	}
164	return (status);
165}
166
167/*
168 * __bt_seqset --
169 *	Set the sequential scan to a specific key.
170 *
171 * Parameters:
172 *	t:	tree
173 *	ep:	storage for returned key
174 *	key:	key for initial scan position
175 *	flags:	R_CURSOR, R_FIRST, R_LAST, R_NEXT, R_PREV
176 *
177 * Side effects:
178 *	Pins the page the cursor references.
179 *
180 * Returns:
181 *	RET_ERROR, RET_SUCCESS or RET_SPECIAL if there's no next key.
182 */
183static int
184__bt_seqset(t, ep, key, flags)
185	BTREE *t;
186	EPG *ep;
187	DBT *key;
188	int flags;
189{
190	PAGE *h;
191	db_pgno_t pg;
192	int exact;
193
194	/*
195	 * Find the first, last or specific key in the tree and point the
196	 * cursor at it.  The cursor may not be moved until a new key has
197	 * been found.
198	 */
199	switch (flags) {
200	case R_CURSOR:				/* Keyed scan. */
201		/*
202		 * Find the first instance of the key or the smallest key
203		 * which is greater than or equal to the specified key.
204		 */
205		if (key->data == NULL || key->size == 0) {
206			errno = EINVAL;
207			return (RET_ERROR);
208		}
209		return (__bt_first(t, key, ep, &exact));
210	case R_FIRST:				/* First record. */
211	case R_NEXT:
212		/* Walk down the left-hand side of the tree. */
213		for (pg = P_ROOT;;) {
214			if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL)
215				return (RET_ERROR);
216
217			/* Check for an empty tree. */
218			if (NEXTINDEX(h) == 0) {
219				mpool_put(t->bt_mp, h, 0);
220				return (RET_SPECIAL);
221			}
222
223			if (h->flags & (P_BLEAF | P_RLEAF))
224				break;
225			pg = GETBINTERNAL(h, 0)->pgno;
226			mpool_put(t->bt_mp, h, 0);
227		}
228		ep->page = h;
229		ep->index = 0;
230		break;
231	case R_LAST:				/* Last record. */
232	case R_PREV:
233		/* Walk down the right-hand side of the tree. */
234		for (pg = P_ROOT;;) {
235			if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL)
236				return (RET_ERROR);
237
238			/* Check for an empty tree. */
239			if (NEXTINDEX(h) == 0) {
240				mpool_put(t->bt_mp, h, 0);
241				return (RET_SPECIAL);
242			}
243
244			if (h->flags & (P_BLEAF | P_RLEAF))
245				break;
246			pg = GETBINTERNAL(h, NEXTINDEX(h) - 1)->pgno;
247			mpool_put(t->bt_mp, h, 0);
248		}
249
250		ep->page = h;
251		ep->index = NEXTINDEX(h) - 1;
252		break;
253	}
254	return (RET_SUCCESS);
255}
256
257/*
258 * __bt_seqadvance --
259 *	Advance the sequential scan.
260 *
261 * Parameters:
262 *	t:	tree
263 *	flags:	R_NEXT, R_PREV
264 *
265 * Side effects:
266 *	Pins the page the new key/data record is on.
267 *
268 * Returns:
269 *	RET_ERROR, RET_SUCCESS or RET_SPECIAL if there's no next key.
270 */
271static int
272__bt_seqadv(t, ep, flags)
273	BTREE *t;
274	EPG *ep;
275	int flags;
276{
277	CURSOR *c;
278	PAGE *h;
279	indx_t idx;
280	db_pgno_t pg;
281	int exact, rval;
282
283	/*
284	 * There are a couple of states that we can be in.  The cursor has
285	 * been initialized by the time we get here, but that's all we know.
286	 */
287	c = &t->bt_cursor;
288
289	/*
290	 * The cursor was deleted and there weren't any duplicate records,
291	 * so the cursor's key was saved.  Find out where that key would
292	 * be in the current tree.  If the returned key is an exact match,
293	 * it means that a key/data pair was inserted into the tree after
294	 * the delete.  We could reasonably return the key, but the problem
295	 * is that this is the access pattern we'll see if the user is
296	 * doing seq(..., R_NEXT)/put(..., 0) pairs, i.e. the put deletes
297	 * the cursor record and then replaces it, so the cursor was saved,
298	 * and we'll simply return the same "new" record until the user
299	 * notices and doesn't do a put() of it.  Since the key is an exact
300	 * match, we could as easily put the new record before the cursor,
301	 * and we've made no guarantee to return it.  So, move forward or
302	 * back a record if it's an exact match.
303	 *
304	 * XXX
305	 * In the current implementation, put's to the cursor are done with
306	 * delete/add pairs.  This has two consequences.  First, it means
307	 * that seq(..., R_NEXT)/put(..., R_CURSOR) pairs are going to exhibit
308	 * the same behavior as above.  Second, you can return the same key
309	 * twice if you have duplicate records.  The scenario is that the
310	 * cursor record is deleted, moving the cursor forward or backward
311	 * to a duplicate.  The add then inserts the new record at a location
312	 * ahead of the cursor because duplicates aren't sorted in any way,
313	 * and the new record is later returned.  This has to be fixed at some
314	 * point.
315	 */
316	if (F_ISSET(c, CURS_ACQUIRE)) {
317		if ((rval = __bt_first(t, &c->key, ep, &exact)) == RET_ERROR)
318			return (RET_ERROR);
319		if (!exact)
320			return (rval);
321		/*
322		 * XXX
323		 * Kluge -- get, release, get the page.
324		 */
325		c->pg.pgno = ep->page->pgno;
326		c->pg.index = ep->index;
327		mpool_put(t->bt_mp, ep->page, 0);
328	}
329
330	/* Get the page referenced by the cursor. */
331	if ((h = mpool_get(t->bt_mp, c->pg.pgno, 0)) == NULL)
332		return (RET_ERROR);
333
334	/*
335 	 * Find the next/previous record in the tree and point the cursor at
336	 * it.  The cursor may not be moved until a new key has been found.
337	 */
338	switch (flags) {
339	case R_NEXT:			/* Next record. */
340		/*
341		 * The cursor was deleted in duplicate records, and moved
342		 * forward to a record that has yet to be returned.  Clear
343		 * that flag, and return the record.
344		 */
345		if (F_ISSET(c, CURS_AFTER))
346			goto usecurrent;
347		idx = c->pg.index;
348		if (++idx == NEXTINDEX(h)) {
349			pg = h->nextpg;
350			mpool_put(t->bt_mp, h, 0);
351			if (pg == P_INVALID)
352				return (RET_SPECIAL);
353			if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL)
354				return (RET_ERROR);
355			idx = 0;
356		}
357		break;
358	case R_PREV:			/* Previous record. */
359		/*
360		 * The cursor was deleted in duplicate records, and moved
361		 * backward to a record that has yet to be returned.  Clear
362		 * that flag, and return the record.
363		 */
364		if (F_ISSET(c, CURS_BEFORE)) {
365usecurrent:		F_CLR(c, CURS_AFTER | CURS_BEFORE);
366			ep->page = h;
367			ep->index = c->pg.index;
368			return (RET_SUCCESS);
369		}
370		idx = c->pg.index;
371		if (idx == 0) {
372			pg = h->prevpg;
373			mpool_put(t->bt_mp, h, 0);
374			if (pg == P_INVALID)
375				return (RET_SPECIAL);
376			if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL)
377				return (RET_ERROR);
378			idx = NEXTINDEX(h) - 1;
379		} else
380			--idx;
381		break;
382	}
383
384	ep->page = h;
385	ep->index = idx;
386	return (RET_SUCCESS);
387}
388
389/*
390 * __bt_first --
391 *	Find the first entry.
392 *
393 * Parameters:
394 *	t:	the tree
395 *    key:	the key
396 *  erval:	return EPG
397 * exactp:	pointer to exact match flag
398 *
399 * Returns:
400 *	The first entry in the tree greater than or equal to key,
401 *	or RET_SPECIAL if no such key exists.
402 */
403static int
404__bt_first(t, key, erval, exactp)
405	BTREE *t;
406	const DBT *key;
407	EPG *erval;
408	int *exactp;
409{
410	PAGE *h;
411	EPG *ep, save;
412	db_pgno_t pg;
413
414	/*
415	 * Find any matching record; __bt_search pins the page.
416	 *
417	 * If it's an exact match and duplicates are possible, walk backwards
418	 * in the tree until we find the first one.  Otherwise, make sure it's
419	 * a valid key (__bt_search may return an index just past the end of a
420	 * page) and return it.
421	 */
422	if ((ep = __bt_search(t, key, exactp)) == NULL)
423		return (RET_SPECIAL);
424	if (*exactp) {
425		if (F_ISSET(t, B_NODUPS)) {
426			*erval = *ep;
427			return (RET_SUCCESS);
428		}
429
430		/*
431		 * Walk backwards, as long as the entry matches and there are
432		 * keys left in the tree.  Save a copy of each match in case
433		 * we go too far.
434		 */
435		save = *ep;
436		h = ep->page;
437		do {
438			if (save.page->pgno != ep->page->pgno) {
439				mpool_put(t->bt_mp, save.page, 0);
440				save = *ep;
441			} else
442				save.index = ep->index;
443
444			/*
445			 * Don't unpin the page the last (or original) match
446			 * was on, but make sure it's unpinned if an error
447			 * occurs.
448			 */
449			if (ep->index == 0) {
450				if (h->prevpg == P_INVALID)
451					break;
452				if (h->pgno != save.page->pgno)
453					mpool_put(t->bt_mp, h, 0);
454				if ((h = mpool_get(t->bt_mp,
455				    h->prevpg, 0)) == NULL) {
456					if (h->pgno == save.page->pgno)
457						mpool_put(t->bt_mp,
458						    save.page, 0);
459					return (RET_ERROR);
460				}
461				ep->page = h;
462				ep->index = NEXTINDEX(h);
463			}
464			--ep->index;
465		} while (__bt_cmp(t, key, ep) == 0);
466
467		/*
468		 * Reach here with the last page that was looked at pinned,
469		 * which may or may not be the same as the last (or original)
470		 * match page.  If it's not useful, release it.
471		 */
472		if (h->pgno != save.page->pgno)
473			mpool_put(t->bt_mp, h, 0);
474
475		*erval = save;
476		return (RET_SUCCESS);
477	}
478
479	/* If at the end of a page, find the next entry. */
480	if (ep->index == NEXTINDEX(ep->page)) {
481		h = ep->page;
482		pg = h->nextpg;
483		mpool_put(t->bt_mp, h, 0);
484		if (pg == P_INVALID)
485			return (RET_SPECIAL);
486		if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL)
487			return (RET_ERROR);
488		ep->index = 0;
489		ep->page = h;
490	}
491	*erval = *ep;
492	return (RET_SUCCESS);
493}
494
495/*
496 * __bt_setcur --
497 *	Set the cursor to an entry in the tree.
498 *
499 * Parameters:
500 *	t:	the tree
501 *   pgno:	page number
502 *  index:	page index
503 */
504void
505__bt_setcur(t, pgno, idx)
506	BTREE *t;
507	db_pgno_t pgno;
508	u_int idx;
509{
510	/* Lose any already deleted key. */
511	if (t->bt_cursor.key.data != NULL) {
512		free(t->bt_cursor.key.data);
513		t->bt_cursor.key.size = 0;
514		t->bt_cursor.key.data = NULL;
515	}
516	F_CLR(&t->bt_cursor, CURS_ACQUIRE | CURS_AFTER | CURS_BEFORE);
517
518	/* Update the cursor. */
519	t->bt_cursor.pg.pgno = pgno;
520	t->bt_cursor.pg.index = idx;
521	F_SET(&t->bt_cursor, CURS_INIT);
522}
523
524/* Recursive descent cursor. */
525typedef struct rcursor_ {
526	CURSOR	cursor;
527	size_t	ssize;
528	EPGNO	*stack;
529	EPGNO	*sp;
530} RCURSOR;
531#define RCURSOR_MINSS	64
532
533static int	 bt_rcinit(void **);
534static void	 bt_rcdestroy(void **);
535static int	 bt_rcpush(RCURSOR *, db_pgno_t, u_int);
536static EPGNO	*bt_rcpop(RCURSOR *);
537static void	 bt_rcclr(RCURSOR *);
538static int	 bt_rcgrowstk(RCURSOR *);
539static int	 bt_rseqset(BTREE *, EPG *, DBT *, RCURSOR *, int);
540static int	 bt_rseqadv(BTREE *, EPG *, RCURSOR *, int);
541
542static int
543bt_rcinit(curs)
544	void **curs;
545{
546	RCURSOR *rc;
547
548	rc = *curs = malloc(sizeof(RCURSOR));
549	if (rc == NULL) {
550		errno = ENOMEM;
551		return RET_ERROR;
552	}
553	memset(rc, 0, sizeof(*rc));
554
555	rc->ssize = RCURSOR_MINSS;
556	rc->stack = malloc(rc->ssize * sizeof(EPGNO));
557	if (rc->stack == NULL) {
558		free(rc);
559		errno = ENOMEM;
560		return RET_ERROR;
561	}
562	bt_rcclr(rc);
563	return RET_SUCCESS;
564}
565
566static void
567bt_rcdestroy(curs)
568	void **curs;
569{
570	RCURSOR *rc;
571
572	rc = *curs;
573	free(rc->stack);
574	free(rc);
575	*curs = NULL;
576}
577
578static int
579bt_rcpush(rc, p, i)
580	RCURSOR *rc;
581	db_pgno_t p;
582	u_int i;
583{
584	int status;
585
586	rc->sp->pgno = p;
587	rc->sp->index = i;
588	if (++rc->sp > rc->stack + rc->ssize) {
589		status = bt_rcgrowstk(rc);
590		if (status != RET_SUCCESS)
591			return status;
592	}
593	return RET_SUCCESS;
594}
595
596static EPGNO *
597bt_rcpop(rc)
598	RCURSOR *rc;
599{
600	return (rc->sp == rc->stack) ? NULL : --rc->sp;
601}
602
603static void
604bt_rcclr(rc)
605	RCURSOR *rc;
606{
607	rc->sp = rc->stack;
608}
609
610static int
611bt_rcgrowstk(rc)
612	RCURSOR *rc;
613{
614	size_t osize;
615	EPGNO *e;
616
617	osize = rc->ssize;
618	rc->ssize *= 2;
619	e = realloc(rc->stack, rc->ssize * sizeof(EPGNO));
620	if (e == NULL) {
621		rc->ssize = osize;
622		errno = ENOMEM;
623		return RET_ERROR;
624	}
625	rc->stack = e;
626	return RET_SUCCESS;
627}
628
629/*
630 * bt_rseq --
631 *	Like __bt_seq but does recursive descent tree traversal
632 *	instead of using the prev/next pointers.
633 */
634int
635bt_rseq(dbp, key, data, curs, flags)
636	const DB *dbp;
637	DBT *key, *data;
638	void **curs;
639	u_int flags;
640{
641	RCURSOR *rc;
642	BTREE *t;
643	EPG e;
644	int status;
645
646	t = dbp->internal;
647
648	/* Toss any page pinned across calls. */
649	if (t->bt_pinned != NULL) {
650		mpool_put(t->bt_mp, t->bt_pinned, 0);
651		t->bt_pinned = NULL;
652	}
653
654	if (curs == NULL) {
655		errno = EINVAL;
656		return RET_ERROR;
657	}
658	if (*curs == NULL) {
659		status = bt_rcinit(curs);
660		if (status != RET_SUCCESS)
661			return RET_ERROR;
662	}
663	rc = *curs;
664
665	/*
666	 * If scan unitialized as yet, or starting at a specific record, set
667	 * the scan to a specific key.  Both bt_rseqset and bt_rseqadv pin
668	 * the page the cursor references if they're successful.
669	 */
670	switch (flags) {
671	case R_NEXT:
672	case R_PREV:
673		if (F_ISSET(&rc->cursor, CURS_INIT)) {
674			status = bt_rseqadv(t, &e, rc, flags);
675			break;
676		}
677		/* FALLTHROUGH */
678	case R_FIRST:
679	case R_LAST:
680	case R_CURSOR:
681		status = bt_rseqset(t, &e, key, rc, flags);
682		break;
683	default:
684		errno = EINVAL;
685		return (RET_ERROR);
686	}
687
688	if (status == RET_SUCCESS) {
689		status =
690		    __bt_ret(t, &e, key, &t->bt_rkey, data, &t->bt_rdata, 0);
691
692		/*
693		 * If the user is doing concurrent access, we copied the
694		 * key/data, toss the page.
695		 */
696		if (F_ISSET(t, B_DB_LOCK))
697			mpool_put(t->bt_mp, e.page, 0);
698		else
699			t->bt_pinned = e.page;
700	} else if (status == RET_SPECIAL)
701		bt_rcdestroy(curs);
702	return (status);
703}
704
705/*
706 * bt_rseqset --
707 *	Set the sequential scan to a specific key.
708 *
709 * Parameters:
710 *	t:	tree
711 *	ep:	storage for returned key
712 *	key:	key for initial scan position
713 *	rc:	recursion cursor
714 *	flags:	R_CURSOR, R_FIRST, R_LAST, R_NEXT, R_PREV
715 *
716 * Side effects:
717 *	Pins the page the cursor references.
718 *	Updates rc's stack and cursor.
719 *
720 * Returns:
721 *	RET_ERROR, RET_SUCCESS or RET_SPECIAL if there's no next key.
722 */
723static int
724bt_rseqset(t, ep, key, rc, flags)
725	BTREE *t;
726	EPG *ep;
727	DBT *key;
728	RCURSOR *rc;
729	int flags;
730{
731	PAGE *h;
732	db_pgno_t pg;
733	int status;
734
735	/*
736	 * Find the first, last or specific key in the tree and point the
737	 * cursor at it.  The cursor may not be moved until a new key has
738	 * been found.
739	 */
740	switch (flags) {
741	case R_CURSOR:		/* Not implemented. */
742		errno = EINVAL;
743		return RET_ERROR;
744	case R_FIRST:				/* First record. */
745	case R_NEXT:
746		bt_rcclr(rc);
747		/* Walk down the left-hand side of the tree. */
748		for (pg = P_ROOT;;) {
749			if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL)
750				return (RET_ERROR);
751
752			/* Check for an empty tree. */
753			if (NEXTINDEX(h) == 0) {
754				mpool_put(t->bt_mp, h, 0);
755				return (RET_SPECIAL);
756			}
757
758			if (h->flags & (P_BLEAF | P_RLEAF))
759				break;
760			pg = GETBINTERNAL(h, 0)->pgno;
761			status = bt_rcpush(rc, h->pgno, 0);
762			mpool_put(t->bt_mp, h, 0);
763			if (status != RET_SUCCESS)
764				return status;
765		}
766		ep->page = h;
767		ep->index = 0;
768		break;
769	case R_LAST:				/* Last record. */
770	case R_PREV:
771		bt_rcclr(rc);
772		/* Walk down the right-hand side of the tree. */
773		for (pg = P_ROOT;;) {
774			if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL)
775				return (RET_ERROR);
776
777			/* Check for an empty tree. */
778			if (NEXTINDEX(h) == 0) {
779				mpool_put(t->bt_mp, h, 0);
780				return (RET_SPECIAL);
781			}
782
783			if (h->flags & (P_BLEAF | P_RLEAF))
784				break;
785			pg = GETBINTERNAL(h, NEXTINDEX(h) - 1)->pgno;
786			status = bt_rcpush(rc, h->pgno, NEXTINDEX(h) - 1);
787			mpool_put(t->bt_mp, h, 0);
788			if (status != RET_SUCCESS)
789				return status;
790		}
791		ep->page = h;
792		ep->index = NEXTINDEX(h) - 1;
793		break;
794	}
795	rc->cursor.pg.pgno = ep->page->pgno;
796	rc->cursor.pg.index = ep->index;
797	F_CLR(&rc->cursor, CURS_ACQUIRE | CURS_AFTER | CURS_BEFORE);
798	F_SET(&rc->cursor, CURS_INIT);
799	return (RET_SUCCESS);
800}
801
802/*
803 * bt_rseqadvance --
804 *	Advance the sequential scan.
805 *
806 * Parameters:
807 *	t:	tree
808 *	ep:	return page
809 *	rc:	recursion cursor
810 *	flags:	R_NEXT, R_PREV
811 *
812 * Side effects:
813 *	Pins the page the new key/data record is on.
814 *	Updates rc's stack and cursor.
815 *
816 * Returns:
817 *	RET_ERROR, RET_SUCCESS or RET_SPECIAL if there's no next key.
818 */
819static int
820bt_rseqadv(t, ep, rc, flags)
821	BTREE *t;
822	EPG *ep;
823	RCURSOR *rc;
824	int flags;
825{
826	CURSOR *c;
827	PAGE *h;
828	indx_t idx;
829	db_pgno_t pg;
830	int status;
831	EPGNO *e;
832
833	/*
834	 * There are a couple of states that we can be in.  The cursor has
835	 * been initialized by the time we get here, but that's all we know.
836	 */
837	c = &rc->cursor;
838
839	/* Get the page referenced by the cursor. */
840	if ((h = mpool_get(t->bt_mp, c->pg.pgno, 0)) == NULL)
841		return (RET_ERROR);
842
843	/*
844 	 * Find the next/previous record in the tree and point the cursor at
845	 * it.  The cursor may not be moved until a new key has been found.
846	 */
847	switch (flags) {
848	case R_NEXT:			/* Next record. */
849		idx = c->pg.index;
850		while (++idx == NEXTINDEX(h)) {
851			/* Crawl up if we hit the right edge. */
852			e = bt_rcpop(rc);
853			mpool_put(t->bt_mp, h, 0);
854			if (e == NULL) /* Hit the right edge of root. */
855				return RET_SPECIAL;
856			idx = e->index;
857			pg = e->pgno;
858			if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL)
859				return (RET_ERROR);
860		}
861		while (!(h->flags & (P_BLEAF | P_RLEAF))) {
862			/* Crawl down the left until we hit a leaf. */
863			status = bt_rcpush(rc, h->pgno, idx);
864			pg = GETBINTERNAL(h, idx)->pgno;
865			mpool_put(t->bt_mp, h, 0);
866			if (status != RET_SUCCESS)
867				return status;
868			if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL)
869				return (RET_ERROR);
870			idx = 0;
871		}
872		break;
873	case R_PREV:			/* Previous record. */
874		idx = c->pg.index;
875		while (!idx) {
876			/* Crawl up if we hit the left edge. */
877			e = bt_rcpop(rc);
878			mpool_put(t->bt_mp, h, 0);
879			if (e == NULL) /* Hit the left edge of root. */
880				return RET_SPECIAL;
881			idx = e->index;
882			pg = e->pgno;
883			if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL)
884				return (RET_ERROR);
885		}
886		idx--;
887		while (!(h->flags & (P_BLEAF | P_RLEAF))) {
888			/* Crawl down the right until we hit a leaf. */
889			status = bt_rcpush(rc, h->pgno, idx);
890			pg = GETBINTERNAL(h, idx)->pgno;
891			mpool_put(t->bt_mp, h, 0);
892			if (status != RET_SUCCESS)
893				return status;
894			if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL)
895				return (RET_ERROR);
896			idx = NEXTINDEX(h) - 1;
897		}
898		break;
899	}
900
901	ep->page = h;
902	ep->index = idx;
903	c->pg.pgno = h->pgno;
904	c->pg.index = idx;
905	F_CLR(c, CURS_ACQUIRE | CURS_AFTER | CURS_BEFORE);
906	F_SET(c, CURS_INIT);
907	return (RET_SUCCESS);
908}
909