1/*-
2 * See the file LICENSE for redistribution information.
3 *
4 * Copyright (c) 1996, 1997, 1998
5 *	Sleepycat Software.  All rights reserved.
6 */
7
8#include "config.h"
9
10#ifndef lint
11static const char sccsid[] = "@(#)bt_cursor.c	10.81 (Sleepycat) 12/16/98";
12#endif /* not lint */
13
14#ifndef NO_SYSTEM_INCLUDES
15#include <sys/types.h>
16
17#include <errno.h>
18#include <stdlib.h>
19#include <string.h>
20#endif
21
22#include "db_int.h"
23#include "db_page.h"
24#include "btree.h"
25#include "shqueue.h"
26#include "db_shash.h"
27#include "lock.h"
28#include "lock_ext.h"
29
30static int __bam_c_close __P((DBC *));
31static int __bam_c_del __P((DBC *, u_int32_t));
32static int __bam_c_destroy __P((DBC *));
33static int __bam_c_first __P((DBC *, CURSOR *));
34static int __bam_c_get __P((DBC *, DBT *, DBT *, u_int32_t));
35static int __bam_c_getstack __P((DBC *, CURSOR *));
36static int __bam_c_last __P((DBC *, CURSOR *));
37static int __bam_c_next __P((DBC *, CURSOR *, int));
38static int __bam_c_physdel __P((DBC *, CURSOR *, PAGE *));
39static int __bam_c_prev __P((DBC *, CURSOR *));
40static int __bam_c_put __P((DBC *, DBT *, DBT *, u_int32_t));
41static void __bam_c_reset __P((CURSOR *));
42static int __bam_c_rget __P((DBC *, DBT *, u_int32_t));
43static int __bam_c_search __P((DBC *, CURSOR *, const DBT *, u_int32_t, int *));
44static int __bam_dsearch __P((DBC *, CURSOR *,  DBT *, u_int32_t *));
45
46/* Discard the current page/lock held by a cursor. */
47#undef	DISCARD
48#define	DISCARD(dbc, cp) {						\
49	if ((cp)->page != NULL) {					\
50		(void)memp_fput((dbc)->dbp->mpf, (cp)->page, 0);	\
51		(cp)->page = NULL;					\
52	}								\
53	if ((cp)->lock != LOCK_INVALID) {				\
54		(void)__BT_TLPUT((dbc), (cp)->lock);			\
55		(cp)->lock = LOCK_INVALID;				\
56	}								\
57}
58
59/* If the cursor references a deleted record. */
60#undef	IS_CUR_DELETED
61#define	IS_CUR_DELETED(cp)						\
62	(((cp)->dpgno == PGNO_INVALID &&				\
63	B_DISSET(GET_BKEYDATA((cp)->page,				\
64	(cp)->indx + O_INDX)->type)) ||					\
65	((cp)->dpgno != PGNO_INVALID &&					\
66	B_DISSET(GET_BKEYDATA((cp)->page, (cp)->dindx)->type)))
67
68/* If the cursor and index combination references a deleted record. */
69#undef	IS_DELETED
70#define	IS_DELETED(cp, indx)						\
71	(((cp)->dpgno == PGNO_INVALID &&				\
72	B_DISSET(GET_BKEYDATA((cp)->page, (indx) + O_INDX)->type)) ||	\
73	((cp)->dpgno != PGNO_INVALID &&					\
74	B_DISSET(GET_BKEYDATA((cp)->page, (indx))->type)))
75
76/*
77 * Test to see if two cursors could point to duplicates of the same key,
78 * whether on-page or off-page.  The leaf page numbers must be the same
79 * in both cases.  In the case of off-page duplicates, the key indices
80 * on the leaf page will be the same.  In the case of on-page duplicates,
81 * the duplicate page number must not be set, and the key index offsets
82 * must be the same.  For the last test, as the saved copy of the cursor
83 * will not have a valid page pointer, we use the cursor's.
84 */
85#undef	POSSIBLE_DUPLICATE
86#define	POSSIBLE_DUPLICATE(cursor, saved_copy)				\
87	((cursor)->pgno == (saved_copy).pgno &&				\
88	((cursor)->indx == (saved_copy).indx ||				\
89	((cursor)->dpgno == PGNO_INVALID &&				\
90	    (saved_copy).dpgno == PGNO_INVALID &&			\
91	    (cursor)->page->inp[(cursor)->indx] ==			\
92	    (cursor)->page->inp[(saved_copy).indx])))
93
94/*
95 * __bam_c_reset --
96 *	Initialize internal cursor structure.
97 */
98static void
99__bam_c_reset(cp)
100	CURSOR *cp;
101{
102	cp->sp = cp->csp = cp->stack;
103	cp->esp = cp->stack + sizeof(cp->stack) / sizeof(cp->stack[0]);
104	cp->page = NULL;
105	cp->pgno = PGNO_INVALID;
106	cp->indx = 0;
107	cp->dpgno = PGNO_INVALID;
108	cp->dindx = 0;
109	cp->lock = LOCK_INVALID;
110	cp->mode = DB_LOCK_NG;
111	cp->recno = RECNO_OOB;
112	cp->flags = 0;
113}
114
115/*
116 * __bam_c_init --
117 *	Initialize the access private portion of a cursor
118 *
119 * PUBLIC: int __bam_c_init __P((DBC *));
120 */
121int
122__bam_c_init(dbc)
123	DBC *dbc;
124{
125	DB *dbp;
126	CURSOR *cp;
127	int ret;
128
129	if ((ret = __os_calloc(1, sizeof(CURSOR), &cp)) != 0)
130		return (ret);
131
132	dbp = dbc->dbp;
133	cp->dbc = dbc;
134
135	/*
136	 * Logical record numbers are always the same size, and we don't want
137	 * to have to check for space every time we return one.  Allocate it
138	 * in advance.
139	 */
140	if (dbp->type == DB_RECNO || F_ISSET(dbp, DB_BT_RECNUM)) {
141		if ((ret = __os_malloc(sizeof(db_recno_t),
142		    NULL, &dbc->rkey.data)) != 0) {
143			__os_free(cp, sizeof(CURSOR));
144			return (ret);
145		}
146		dbc->rkey.ulen = sizeof(db_recno_t);
147	}
148
149	/* Initialize methods. */
150	dbc->internal = cp;
151	if (dbp->type == DB_BTREE) {
152		dbc->c_am_close = __bam_c_close;
153		dbc->c_am_destroy = __bam_c_destroy;
154		dbc->c_del = __bam_c_del;
155		dbc->c_get = __bam_c_get;
156		dbc->c_put = __bam_c_put;
157	} else {
158		dbc->c_am_close = __bam_c_close;
159		dbc->c_am_destroy = __bam_c_destroy;
160		dbc->c_del = __ram_c_del;
161		dbc->c_get = __ram_c_get;
162		dbc->c_put = __ram_c_put;
163	}
164
165	/* Initialize dynamic information. */
166	__bam_c_reset(cp);
167
168	return (0);
169}
170
171/*
172 * __bam_c_close --
173 *	Close down the cursor from a single use.
174 */
175static int
176__bam_c_close(dbc)
177	DBC *dbc;
178{
179	CURSOR *cp;
180	DB *dbp;
181	int ret;
182
183	dbp = dbc->dbp;
184	cp = dbc->internal;
185	ret = 0;
186
187	/*
188	 * If a cursor deleted a btree key, perform the actual deletion.
189	 * (Recno keys are either deleted immediately or never deleted.)
190	 */
191	if (dbp->type == DB_BTREE && F_ISSET(cp, C_DELETED))
192		ret = __bam_c_physdel(dbc, cp, NULL);
193
194	/* Discard any locks not acquired inside of a transaction. */
195	if (cp->lock != LOCK_INVALID) {
196		(void)__BT_TLPUT(dbc, cp->lock);
197		cp->lock = LOCK_INVALID;
198	}
199
200	/* Sanity checks. */
201#ifdef DIAGNOSTIC
202	if (cp->csp != cp->stack)
203		__db_err(dbp->dbenv, "btree cursor close: stack not empty");
204#endif
205
206	/* Initialize dynamic information. */
207	__bam_c_reset(cp);
208
209	return (ret);
210}
211
212/*
213 * __bam_c_destroy --
214 *	Close a single cursor -- internal version.
215 */
216static int
217__bam_c_destroy(dbc)
218	DBC *dbc;
219{
220	/* Discard the structures. */
221	__os_free(dbc->internal, sizeof(CURSOR));
222
223	return (0);
224}
225
226/*
227 * __bam_c_del --
228 *	Delete using a cursor.
229 */
230static int
231__bam_c_del(dbc, flags)
232	DBC *dbc;
233	u_int32_t flags;
234{
235	CURSOR *cp;
236	DB *dbp;
237	DB_LOCK lock;
238	PAGE *h;
239	db_pgno_t pgno;
240	db_indx_t indx;
241	int ret;
242
243	dbp = dbc->dbp;
244	cp = dbc->internal;
245	h = NULL;
246
247	DB_PANIC_CHECK(dbp);
248
249	/* Check for invalid flags. */
250	if ((ret = __db_cdelchk(dbp, flags,
251	    F_ISSET(dbp, DB_AM_RDONLY), cp->pgno != PGNO_INVALID)) != 0)
252		return (ret);
253
254	/*
255	 * If we are running CDB, this had better be either a write
256	 * cursor or an immediate writer.
257	 */
258	if (F_ISSET(dbp, DB_AM_CDB))
259		if (!F_ISSET(dbc, DBC_RMW | DBC_WRITER))
260			return (EINVAL);
261
262	DEBUG_LWRITE(dbc, dbc->txn, "bam_c_del", NULL, NULL, flags);
263
264	/* If already deleted, return failure. */
265	if (F_ISSET(cp, C_DELETED))
266		return (DB_KEYEMPTY);
267
268	/*
269	 * We don't physically delete the record until the cursor moves,
270	 * so we have to have a long-lived write lock on the page instead
271	 * of a long-lived read lock.  Note, we have to have a read lock
272	 * to even get here, so we simply discard it.
273	 */
274	if (F_ISSET(dbp, DB_AM_LOCKING) && cp->mode != DB_LOCK_WRITE) {
275		if ((ret = __bam_lget(dbc,
276		    0, cp->pgno, DB_LOCK_WRITE, &lock)) != 0)
277			goto err;
278		(void)__BT_TLPUT(dbc, cp->lock);
279		cp->lock = lock;
280		cp->mode = DB_LOCK_WRITE;
281	}
282
283	/*
284	 * Acquire the underlying page (which may be different from the above
285	 * page because it may be a duplicate page), and set the on-page and
286	 * in-cursor delete flags.  We don't need to lock it as we've already
287	 * write-locked the page leading to it.
288	 */
289	if (cp->dpgno == PGNO_INVALID) {
290		pgno = cp->pgno;
291		indx = cp->indx;
292	} else {
293		pgno = cp->dpgno;
294		indx = cp->dindx;
295	}
296
297	if ((ret = memp_fget(dbp->mpf, &pgno, 0, &h)) != 0)
298		goto err;
299
300	/* Log the change. */
301	if (DB_LOGGING(dbc) &&
302	    (ret = __bam_cdel_log(dbp->dbenv->lg_info, dbc->txn, &LSN(h),
303	    0, dbp->log_fileid, PGNO(h), &LSN(h), indx)) != 0) {
304		(void)memp_fput(dbp->mpf, h, 0);
305		goto err;
306	}
307
308	/*
309	 * Set the intent-to-delete flag on the page and update all cursors. */
310	if (cp->dpgno == PGNO_INVALID)
311		B_DSET(GET_BKEYDATA(h, indx + O_INDX)->type);
312	else
313		B_DSET(GET_BKEYDATA(h, indx)->type);
314	(void)__bam_ca_delete(dbp, pgno, indx, 1);
315
316	ret = memp_fput(dbp->mpf, h, DB_MPOOL_DIRTY);
317	h = NULL;
318
319	/*
320	 * If the tree has record numbers, we have to adjust the counts.
321	 *
322	 * !!!
323	 * This test is right -- we don't yet support duplicates and record
324	 * numbers in the same tree, so ignore duplicates if DB_BT_RECNUM
325	 * set.
326	 */
327	if (F_ISSET(dbp, DB_BT_RECNUM)) {
328		if ((ret = __bam_c_getstack(dbc, cp)) != 0)
329			goto err;
330		if ((ret = __bam_adjust(dbc, -1)) != 0)
331			goto err;
332		(void)__bam_stkrel(dbc, 0);
333	}
334
335err:	if (h != NULL)
336		(void)memp_fput(dbp->mpf, h, 0);
337	return (ret);
338}
339
340/*
341 * __bam_c_get --
342 *	Get using a cursor (btree).
343 */
344static int
345__bam_c_get(dbc, key, data, flags)
346	DBC *dbc;
347	DBT *key, *data;
348	u_int32_t flags;
349{
350	CURSOR *cp, copy, start;
351	DB *dbp;
352	PAGE *h;
353	int exact, ret, tmp_rmw;
354
355	dbp = dbc->dbp;
356	cp = dbc->internal;
357
358	DB_PANIC_CHECK(dbp);
359
360	/* Check for invalid flags. */
361	if ((ret = __db_cgetchk(dbp,
362	    key, data, flags, cp->pgno != PGNO_INVALID)) != 0)
363		return (ret);
364
365	/* Clear OR'd in additional bits so we can check for flag equality. */
366	tmp_rmw = 0;
367	if (LF_ISSET(DB_RMW)) {
368		if (!F_ISSET(dbp, DB_AM_CDB)) {
369			tmp_rmw = 1;
370			F_SET(dbc, DBC_RMW);
371		}
372		LF_CLR(DB_RMW);
373	}
374
375	DEBUG_LREAD(dbc, dbc->txn, "bam_c_get",
376	    flags == DB_SET || flags == DB_SET_RANGE ? key : NULL, NULL, flags);
377
378	/*
379	 * Return a cursor's record number.  It has nothing to do with the
380	 * cursor get code except that it's been rammed into the interface.
381	 */
382	if (flags == DB_GET_RECNO) {
383		ret = __bam_c_rget(dbc, data, flags);
384		if (tmp_rmw)
385			F_CLR(dbc, DBC_RMW);
386		return (ret);
387	}
388
389	/*
390	 * Initialize the cursor for a new retrieval.  Clear the cursor's
391	 * page pointer, it was set before this operation, and no longer
392	 * has any meaning.
393	 */
394	cp->page = NULL;
395	copy = *cp;
396	cp->lock = LOCK_INVALID;
397
398	switch (flags) {
399	case DB_CURRENT:
400		/* It's not possible to return a deleted record. */
401		if (F_ISSET(cp, C_DELETED)) {
402			ret = DB_KEYEMPTY;
403			goto err;
404		}
405
406		/* Acquire the current page. */
407		if ((ret = __bam_lget(dbc,
408		    0, cp->pgno, DB_LOCK_READ, &cp->lock)) == 0)
409			ret = memp_fget(dbp->mpf,
410			    cp->dpgno == PGNO_INVALID ? &cp->pgno : &cp->dpgno,
411			    0, &cp->page);
412		if (ret != 0)
413			goto err;
414		break;
415	case DB_NEXT_DUP:
416		if (cp->pgno == PGNO_INVALID) {
417			ret = EINVAL;
418			goto err;
419		}
420		if ((ret = __bam_c_next(dbc, cp, 1)) != 0)
421			goto err;
422
423		/* Make sure we didn't go past the end of the duplicates. */
424		if (!POSSIBLE_DUPLICATE(cp, copy)) {
425			ret = DB_NOTFOUND;
426			goto err;
427		}
428		break;
429	case DB_NEXT:
430		if (cp->pgno != PGNO_INVALID) {
431			if ((ret = __bam_c_next(dbc, cp, 1)) != 0)
432				goto err;
433			break;
434		}
435		/* FALLTHROUGH */
436	case DB_FIRST:
437		if ((ret = __bam_c_first(dbc, cp)) != 0)
438			goto err;
439		break;
440	case DB_PREV:
441		if (cp->pgno != PGNO_INVALID) {
442			if ((ret = __bam_c_prev(dbc, cp)) != 0)
443				goto err;
444			break;
445		}
446		/* FALLTHROUGH */
447	case DB_LAST:
448		if ((ret = __bam_c_last(dbc, cp)) != 0)
449			goto err;
450		break;
451	case DB_SET:
452		if ((ret = __bam_c_search(dbc, cp, key, flags, &exact)) != 0)
453			goto err;
454
455		/*
456		 * We cannot currently be referencing a deleted record, but we
457		 * may be referencing off-page duplicates.
458		 *
459		 * If we're referencing off-page duplicates, move off-page.
460		 * If we moved off-page, move to the next non-deleted record.
461		 * If we moved to the next non-deleted record, check to make
462		 * sure we didn't switch records because our current record
463		 * had no non-deleted data items.
464		 */
465		start = *cp;
466		if ((ret = __bam_dup(dbc, cp, cp->indx, 0)) != 0)
467			goto err;
468		if (cp->dpgno != PGNO_INVALID && IS_CUR_DELETED(cp)) {
469			if ((ret = __bam_c_next(dbc, cp, 0)) != 0)
470				goto err;
471			if (!POSSIBLE_DUPLICATE(cp, start)) {
472				ret = DB_NOTFOUND;
473				goto err;
474			}
475		}
476		break;
477	case DB_SET_RECNO:
478		if ((ret = __bam_c_search(dbc, cp, key, flags, &exact)) != 0)
479			goto err;
480		break;
481	case DB_GET_BOTH:
482		if (F_ISSET(dbc, DBC_CONTINUE | DBC_KEYSET)) {
483			/* Acquire the current page. */
484			if ((ret = memp_fget(dbp->mpf,
485			    cp->dpgno == PGNO_INVALID ? &cp->pgno : &cp->dpgno,
486			    0, &cp->page)) != 0)
487				goto err;
488
489			/* If DBC_CONTINUE, move to the next item. */
490			if (F_ISSET(dbc, DBC_CONTINUE) &&
491			    (ret = __bam_c_next(dbc, cp, 1)) != 0)
492				goto err;
493		} else {
494			if ((ret =
495			    __bam_c_search(dbc, cp, key, flags, &exact)) != 0)
496				goto err;
497
498			/*
499			 * We may be referencing a duplicates page.  Move to
500			 * the first duplicate.
501			 */
502			if ((ret = __bam_dup(dbc, cp, cp->indx, 0)) != 0)
503				goto err;
504		}
505
506		/* Search for a matching entry. */
507		if ((ret = __bam_dsearch(dbc, cp, data, NULL)) != 0)
508			goto err;
509
510		/* Ignore deleted entries. */
511		if (IS_CUR_DELETED(cp)) {
512			ret = DB_NOTFOUND;
513			goto err;
514		}
515		break;
516	case DB_SET_RANGE:
517		if ((ret = __bam_c_search(dbc, cp, key, flags, &exact)) != 0)
518			goto err;
519
520		/*
521		 * As we didn't require an exact match, the search function
522		 * may have returned an entry past the end of the page.  If
523		 * so, move to the next entry.
524		 */
525		if (cp->indx == NUM_ENT(cp->page) &&
526		    (ret = __bam_c_next(dbc, cp, 0)) != 0)
527			goto err;
528
529		/*
530		 * We may be referencing off-page duplicates, if so, move
531		 * off-page.
532		 */
533		if ((ret = __bam_dup(dbc, cp, cp->indx, 0)) != 0)
534			goto err;
535
536		/*
537		 * We may be referencing a deleted record, if so, move to
538		 * the next non-deleted record.
539		 */
540		if (IS_CUR_DELETED(cp) && (ret = __bam_c_next(dbc, cp, 0)) != 0)
541			goto err;
542		break;
543	}
544
545	/*
546	 * Return the key if the user didn't give us one.  If we've moved to
547	 * a duplicate page, we may no longer have a pointer to the main page,
548	 * so we have to go get it.  We know that it's already read-locked,
549	 * however, so we don't have to acquire a new lock.
550	 */
551	if (flags != DB_SET) {
552		if (cp->dpgno != PGNO_INVALID) {
553			if ((ret = memp_fget(dbp->mpf, &cp->pgno, 0, &h)) != 0)
554				goto err;
555		} else
556			h = cp->page;
557		ret = __db_ret(dbp,
558		    h, cp->indx, key, &dbc->rkey.data, &dbc->rkey.ulen);
559		if (cp->dpgno != PGNO_INVALID)
560			(void)memp_fput(dbp->mpf, h, 0);
561		if (ret)
562			goto err;
563	}
564
565	/* Return the data. */
566	if ((ret = __db_ret(dbp, cp->page,
567	    cp->dpgno == PGNO_INVALID ? cp->indx + O_INDX : cp->dindx,
568	    data, &dbc->rdata.data, &dbc->rdata.ulen)) != 0)
569		goto err;
570
571	/*
572	 * If the previous cursor record has been deleted, physically delete
573	 * the entry from the page.  We clear the deleted flag before we call
574	 * the underlying delete routine so that, if an error occurs, and we
575	 * restore the cursor, the deleted flag is cleared.  This is because,
576	 * if we manage to physically modify the page, and then restore the
577	 * cursor, we might try to repeat the page modification when closing
578	 * the cursor.
579	 */
580	if (F_ISSET(&copy, C_DELETED)) {
581		F_CLR(&copy, C_DELETED);
582		if ((ret = __bam_c_physdel(dbc, &copy, cp->page)) != 0)
583			goto err;
584	}
585	F_CLR(cp, C_DELETED);
586
587	/* Release the previous lock, if any; the current lock is retained. */
588	if (copy.lock != LOCK_INVALID)
589		(void)__BT_TLPUT(dbc, copy.lock);
590
591	/* Release the current page. */
592	if ((ret = memp_fput(dbp->mpf, cp->page, 0)) != 0)
593		goto err;
594
595	if (0) {
596err:		if (cp->page != NULL)
597			(void)memp_fput(dbp->mpf, cp->page, 0);
598		if (cp->lock != LOCK_INVALID)
599			(void)__BT_TLPUT(dbc, cp->lock);
600		*cp = copy;
601	}
602
603	/* Release temporary lock upgrade. */
604	if (tmp_rmw)
605		F_CLR(dbc, DBC_RMW);
606
607	return (ret);
608}
609
610/*
611 * __bam_dsearch --
612 *	Search for a matching data item (or the first data item that's
613 *	equal to or greater than the one we're searching for).
614 */
615static int
616__bam_dsearch(dbc, cp, data, iflagp)
617	DBC *dbc;
618	CURSOR *cp;
619	DBT *data;
620	u_int32_t *iflagp;
621{
622	DB *dbp;
623	CURSOR copy, last;
624	int cmp, ret;
625
626	dbp = dbc->dbp;
627
628	/*
629	 * If iflagp is non-NULL, we're doing an insert.
630	 *
631	 * If the duplicates are off-page, use the duplicate search routine.
632	 */
633	if (cp->dpgno != PGNO_INVALID) {
634		if ((ret = __db_dsearch(dbc, iflagp != NULL,
635		    data, cp->dpgno, &cp->dindx, &cp->page, &cmp)) != 0)
636			return (ret);
637		cp->dpgno = cp->page->pgno;
638
639		if (iflagp == NULL) {
640			if (cmp != 0)
641				return (DB_NOTFOUND);
642			return (0);
643		}
644		*iflagp = DB_BEFORE;
645		return (0);
646	}
647
648	/* Otherwise, do the search ourselves. */
649	copy = *cp;
650	for (;;) {
651		/* Save the last interesting cursor position. */
652		last = *cp;
653
654		/* See if the data item matches the one we're looking for. */
655		if ((cmp = __bam_cmp(dbp, data, cp->page, cp->indx + O_INDX,
656		    dbp->dup_compare == NULL ?
657		    __bam_defcmp : dbp->dup_compare)) == 0) {
658			if (iflagp != NULL)
659				*iflagp = DB_AFTER;
660			return (0);
661		}
662
663		/*
664		 * If duplicate entries are sorted, we're done if we find a
665		 * page entry that sorts greater than the application item.
666		 * If doing an insert, return success, otherwise DB_NOTFOUND.
667		 */
668		if (dbp->dup_compare != NULL && cmp < 0) {
669			if (iflagp == NULL)
670				return (DB_NOTFOUND);
671			*iflagp = DB_BEFORE;
672			return (0);
673		}
674
675		/*
676		 * Move to the next item.  If we reach the end of the page and
677		 * we're doing an insert, set the cursor to the last item and
678		 * set the referenced memory location so callers know to insert
679		 * after the item, instead of before it.  If not inserting, we
680		 * return DB_NOTFOUND.
681		 */
682		if ((cp->indx += P_INDX) >= NUM_ENT(cp->page)) {
683			if (iflagp == NULL)
684				return (DB_NOTFOUND);
685			goto use_last;
686		}
687
688		/*
689		 * Make sure we didn't go past the end of the duplicates.  The
690		 * error conditions are the same as above.
691		 */
692		if (!POSSIBLE_DUPLICATE(cp, copy)) {
693			if (iflagp == NULL)
694				 return (DB_NOTFOUND);
695use_last:		*cp = last;
696			*iflagp = DB_AFTER;
697			return (0);
698		}
699	}
700	/* NOTREACHED */
701}
702
703/*
704 * __bam_c_rget --
705 *	Return the record number for a cursor.
706 */
707static int
708__bam_c_rget(dbc, data, flags)
709	DBC *dbc;
710	DBT *data;
711	u_int32_t flags;
712{
713	CURSOR *cp;
714	DB *dbp;
715	DBT dbt;
716	db_recno_t recno;
717	int exact, ret;
718
719	COMPQUIET(flags, 0);
720	dbp = dbc->dbp;
721	cp = dbc->internal;
722
723	/* Get the page with the current item on it. */
724	if ((ret = memp_fget(dbp->mpf, &cp->pgno, 0, &cp->page)) != 0)
725		return (ret);
726
727	/* Get a copy of the key. */
728	memset(&dbt, 0, sizeof(DBT));
729	dbt.flags = DB_DBT_MALLOC | DB_DBT_INTERNAL;
730	if ((ret = __db_ret(dbp, cp->page, cp->indx, &dbt, NULL, NULL)) != 0)
731		goto err;
732
733	exact = 1;
734	if ((ret = __bam_search(dbc, &dbt,
735	    F_ISSET(dbc, DBC_RMW) ? S_FIND_WR : S_FIND,
736	    1, &recno, &exact)) != 0)
737		goto err;
738
739	ret = __db_retcopy(data, &recno, sizeof(recno),
740	    &dbc->rdata.data, &dbc->rdata.ulen, dbp->db_malloc);
741
742	/* Release the stack. */
743	__bam_stkrel(dbc, 0);
744
745err:	(void)memp_fput(dbp->mpf, cp->page, 0);
746	__os_free(dbt.data, dbt.size);
747	return (ret);
748}
749
750/*
751 * __bam_c_put --
752 *	Put using a cursor.
753 */
754static int
755__bam_c_put(dbc, key, data, flags)
756	DBC *dbc;
757	DBT *key, *data;
758	u_int32_t flags;
759{
760	CURSOR *cp, copy;
761	DB *dbp;
762	DBT dbt;
763	db_indx_t indx;
764	db_pgno_t pgno;
765	u_int32_t iiflags, iiop;
766	int exact, needkey, ret, stack;
767	void *arg;
768
769	dbp = dbc->dbp;
770	cp = dbc->internal;
771
772	DB_PANIC_CHECK(dbp);
773
774	DEBUG_LWRITE(dbc, dbc->txn, "bam_c_put",
775	    flags == DB_KEYFIRST || flags == DB_KEYLAST ? key : NULL,
776	    data, flags);
777
778	if ((ret = __db_cputchk(dbp, key, data, flags,
779	    F_ISSET(dbp, DB_AM_RDONLY), cp->pgno != PGNO_INVALID)) != 0)
780		return (ret);
781
782	/*
783	 * If we are running CDB, this had better be either a write
784	 * cursor or an immediate writer.  If it's a regular writer,
785	 * that means we have an IWRITE lock and we need to upgrade
786	 * it to a write lock.
787	 */
788	if (F_ISSET(dbp, DB_AM_CDB)) {
789		if (!F_ISSET(dbc, DBC_RMW | DBC_WRITER))
790			return (EINVAL);
791
792		if (F_ISSET(dbc, DBC_RMW) &&
793		    (ret = lock_get(dbp->dbenv->lk_info, dbc->locker,
794		    DB_LOCK_UPGRADE, &dbc->lock_dbt, DB_LOCK_WRITE,
795		    &dbc->mylock)) != 0)
796			return (EAGAIN);
797	}
798
799	if (0) {
800split:		/*
801		 * To split, we need a valid key for the page.  Since it's a
802		 * cursor, we have to build one.
803		 *
804		 * Acquire a copy of a key from the page.
805		 */
806		if (needkey) {
807			memset(&dbt, 0, sizeof(DBT));
808			if ((ret = __db_ret(dbp, cp->page, indx,
809			    &dbt, &dbc->rkey.data, &dbc->rkey.ulen)) != 0)
810				goto err;
811			arg = &dbt;
812		} else
813			arg = key;
814
815		/*
816		 * Discard any locks and pinned pages (the locks are discarded
817		 * even if we're running with transactions, as they lock pages
818		 * that we're sorry we ever acquired).  If stack is set and the
819		 * cursor entries are valid, they point to the same entries as
820		 * the stack, don't free them twice.
821		 */
822		if (stack) {
823			(void)__bam_stkrel(dbc, 1);
824			stack = 0;
825		} else
826			DISCARD(dbc, cp);
827
828		/*
829		 * Restore the cursor to its original value.  This is necessary
830		 * for two reasons.  First, we are about to copy it in case of
831		 * error, again.  Second, we adjust cursors during the split,
832		 * and we have to ensure this cursor is adjusted appropriately,
833		 * along with all the other cursors.
834		 */
835		*cp = copy;
836
837		if ((ret = __bam_split(dbc, arg)) != 0)
838			goto err;
839	}
840
841	/*
842	 * Initialize the cursor for a new retrieval.  Clear the cursor's
843	 * page pointer, it was set before this operation, and no longer
844	 * has any meaning.
845	 */
846	cp->page = NULL;
847	copy = *cp;
848	cp->lock = LOCK_INVALID;
849
850	iiflags = needkey = ret = stack = 0;
851	switch (flags) {
852	case DB_AFTER:
853	case DB_BEFORE:
854	case DB_CURRENT:
855		needkey = 1;
856		if (cp->dpgno == PGNO_INVALID) {
857			pgno = cp->pgno;
858			indx = cp->indx;
859		} else {
860			pgno = cp->dpgno;
861			indx = cp->dindx;
862		}
863
864		/*
865		 * !!!
866		 * This test is right -- we don't yet support duplicates and
867		 * record numbers in the same tree, so ignore duplicates if
868		 * DB_BT_RECNUM set.
869		 */
870		if (F_ISSET(dbp, DB_BT_RECNUM) &&
871		    (flags != DB_CURRENT || F_ISSET(cp, C_DELETED))) {
872			/* Acquire a complete stack. */
873			if ((ret = __bam_c_getstack(dbc, cp)) != 0)
874				goto err;
875			cp->page = cp->csp->page;
876
877			stack = 1;
878			iiflags = BI_DOINCR;
879		} else {
880			/* Acquire the current page. */
881			if ((ret = __bam_lget(dbc,
882			    0, cp->pgno, DB_LOCK_WRITE, &cp->lock)) == 0)
883				ret = memp_fget(dbp->mpf, &pgno, 0, &cp->page);
884			if (ret != 0)
885				goto err;
886
887			iiflags = 0;
888		}
889
890		/*
891		 * If the user has specified a duplicate comparison function,
892		 * we return an error if DB_CURRENT was specified and the
893		 * replacement data doesn't compare equal to the current data.
894		 * This stops apps from screwing up the duplicate sort order.
895		 */
896		if (flags == DB_CURRENT && dbp->dup_compare != NULL)
897			if (__bam_cmp(dbp, data,
898			    cp->page, indx, dbp->dup_compare) != 0) {
899				ret = EINVAL;
900				goto err;
901			}
902
903		iiop = flags;
904		break;
905	case DB_KEYFIRST:
906	case DB_KEYLAST:
907		/*
908		 * If we have a duplicate comparison function, we position to
909		 * the first of any on-page duplicates, and use __bam_dsearch
910		 * to search for the right slot.  Otherwise, we position to
911		 * the first/last of any on-page duplicates based on the flag
912		 * value.
913		 */
914		if ((ret = __bam_c_search(dbc, cp, key,
915		    flags == DB_KEYFIRST || dbp->dup_compare != NULL ?
916		    DB_KEYFIRST : DB_KEYLAST, &exact)) != 0)
917			goto err;
918		stack = 1;
919
920		/*
921		 * If an exact match:
922		 *	If duplicates aren't supported, replace the current
923		 *	item.  (When implementing the DB->put function, our
924		 *	caller has already checked the DB_NOOVERWRITE flag.)
925		 *
926		 *	If there's a duplicate comparison function, find the
927		 *	correct slot for this duplicate item.
928		 *
929		 *	If there's no duplicate comparison function, set the
930		 *	insert flag based on the argument flags.
931		 *
932		 * If there's no match, the search function returned the
933		 * smallest slot greater than the key, use it.
934		 */
935		if (exact) {
936			if (F_ISSET(dbp, DB_AM_DUP)) {
937				/*
938				 * If at off-page duplicate page, move to the
939				 * first or last entry -- if a comparison
940				 * function was specified, start searching at
941				 * the first entry.  Otherwise, move based on
942				 * the DB_KEYFIRST/DB_KEYLAST flags.
943				 */
944				if ((ret = __bam_dup(dbc, cp, cp->indx,
945				    dbp->dup_compare == NULL &&
946				    flags != DB_KEYFIRST)) != 0)
947					goto err;
948
949				/*
950				 * If there's a comparison function, search for
951				 * the correct slot.  Otherwise, set the insert
952				 * flag based on the argment flag.
953				 */
954				if (dbp->dup_compare == NULL)
955					iiop = flags == DB_KEYFIRST ?
956					    DB_BEFORE : DB_AFTER;
957				else
958					if ((ret = __bam_dsearch(dbc,
959					    cp, data, &iiop)) != 0)
960						goto err;
961			} else
962				iiop = DB_CURRENT;
963			iiflags = 0;
964		} else {
965			iiop = DB_BEFORE;
966			iiflags = BI_NEWKEY;
967		}
968
969		if (cp->dpgno == PGNO_INVALID) {
970			pgno = cp->pgno;
971			indx = cp->indx;
972		} else {
973			pgno = cp->dpgno;
974			indx = cp->dindx;
975		}
976		break;
977	}
978
979	ret = __bam_iitem(dbc, &cp->page, &indx, key, data, iiop, iiflags);
980
981	if (ret == DB_NEEDSPLIT)
982		goto split;
983	if (ret != 0)
984		goto err;
985
986	/*
987	 * Reset any cursors referencing this item that might have the item
988	 * marked for deletion.
989	 */
990	if (iiop == DB_CURRENT) {
991		(void)__bam_ca_delete(dbp, pgno, indx, 0);
992
993		/*
994		 * It's also possible that we are the cursor that had the
995		 * item marked for deletion, in which case we want to make
996		 * sure that we don't delete it because we had the delete
997		 * flag set already.
998		 */
999		if (cp->pgno == copy.pgno && cp->indx == copy.indx &&
1000		    cp->dpgno == copy.dpgno && cp->dindx == copy.dindx)
1001			F_CLR(&copy, C_DELETED);
1002	}
1003
1004	/*
1005	 * Update the cursor to point to the new entry.  The new entry was
1006	 * stored on the current page, because we split pages until it was
1007	 * possible.
1008	 */
1009	if (cp->dpgno == PGNO_INVALID)
1010		cp->indx = indx;
1011	else
1012		cp->dindx = indx;
1013
1014	/*
1015	 * If the previous cursor record has been deleted, physically delete
1016	 * the entry from the page.  We clear the deleted flag before we call
1017	 * the underlying delete routine so that, if an error occurs, and we
1018	 * restore the cursor, the deleted flag is cleared.  This is because,
1019	 * if we manage to physically modify the page, and then restore the
1020	 * cursor, we might try to repeat the page modification when closing
1021	 * the cursor.
1022	 */
1023	if (F_ISSET(&copy, C_DELETED)) {
1024		F_CLR(&copy, C_DELETED);
1025		if ((ret = __bam_c_physdel(dbc, &copy, cp->page)) != 0)
1026			goto err;
1027	}
1028	F_CLR(cp, C_DELETED);
1029
1030	/* Release the previous lock, if any; the current lock is retained. */
1031	if (copy.lock != LOCK_INVALID)
1032		(void)__BT_TLPUT(dbc, copy.lock);
1033
1034	/*
1035	 * Discard any pages pinned in the tree and their locks, except for
1036	 * the leaf page, for which we only discard the pin, not the lock.
1037	 *
1038	 * Note, the leaf page participated in the stack we acquired, and so
1039	 * we have to adjust the stack as necessary.  If there was only a
1040	 * single page on the stack, we don't have to free further stack pages.
1041	 */
1042	if (stack && BT_STK_POP(cp) != NULL)
1043		(void)__bam_stkrel(dbc, 0);
1044
1045	/* Release the current page. */
1046	if ((ret = memp_fput(dbp->mpf, cp->page, 0)) != 0)
1047		goto err;
1048
1049	if (0) {
1050err:		/* Discard any pinned pages. */
1051		if (stack)
1052			(void)__bam_stkrel(dbc, 0);
1053		else
1054			DISCARD(dbc, cp);
1055		*cp = copy;
1056	}
1057
1058	if (F_ISSET(dbp, DB_AM_CDB) && F_ISSET(dbc, DBC_RMW))
1059		(void)__lock_downgrade(dbp->dbenv->lk_info, dbc->mylock,
1060		    DB_LOCK_IWRITE, 0);
1061
1062	return (ret);
1063}
1064
1065/*
1066 * __bam_c_first --
1067 *	Return the first record.
1068 */
1069static int
1070__bam_c_first(dbc, cp)
1071	DBC *dbc;
1072	CURSOR *cp;
1073{
1074	DB *dbp;
1075	db_pgno_t pgno;
1076	int ret;
1077
1078	dbp = dbc->dbp;
1079
1080	/* Walk down the left-hand side of the tree. */
1081	for (pgno = PGNO_ROOT;;) {
1082		if ((ret =
1083		    __bam_lget(dbc, 0, pgno, DB_LOCK_READ, &cp->lock)) != 0)
1084			return (ret);
1085		if ((ret = memp_fget(dbp->mpf, &pgno, 0, &cp->page)) != 0)
1086			return (ret);
1087
1088		/* If we find a leaf page, we're done. */
1089		if (ISLEAF(cp->page))
1090			break;
1091
1092		pgno = GET_BINTERNAL(cp->page, 0)->pgno;
1093		DISCARD(dbc, cp);
1094	}
1095
1096	cp->pgno = cp->page->pgno;
1097	cp->indx = 0;
1098	cp->dpgno = PGNO_INVALID;
1099
1100	/* Check for duplicates. */
1101	if ((ret = __bam_dup(dbc, cp, cp->indx, 0)) != 0)
1102		return (ret);
1103
1104	/* If on an empty page or a deleted record, move to the next one. */
1105	if (NUM_ENT(cp->page) == 0 || IS_CUR_DELETED(cp))
1106		if ((ret = __bam_c_next(dbc, cp, 0)) != 0)
1107			return (ret);
1108
1109	return (0);
1110}
1111
1112/*
1113 * __bam_c_last --
1114 *	Return the last record.
1115 */
1116static int
1117__bam_c_last(dbc, cp)
1118	DBC *dbc;
1119	CURSOR *cp;
1120{
1121	DB *dbp;
1122	db_pgno_t pgno;
1123	int ret;
1124
1125	dbp = dbc->dbp;
1126
1127	/* Walk down the right-hand side of the tree. */
1128	for (pgno = PGNO_ROOT;;) {
1129		if ((ret =
1130		    __bam_lget(dbc, 0, pgno, DB_LOCK_READ, &cp->lock)) != 0)
1131			return (ret);
1132		if ((ret = memp_fget(dbp->mpf, &pgno, 0, &cp->page)) != 0)
1133			return (ret);
1134
1135		/* If we find a leaf page, we're done. */
1136		if (ISLEAF(cp->page))
1137			break;
1138
1139		pgno =
1140		    GET_BINTERNAL(cp->page, NUM_ENT(cp->page) - O_INDX)->pgno;
1141		DISCARD(dbc, cp);
1142	}
1143
1144	cp->pgno = cp->page->pgno;
1145	cp->indx = NUM_ENT(cp->page) == 0 ? 0 : NUM_ENT(cp->page) - P_INDX;
1146	cp->dpgno = PGNO_INVALID;
1147
1148	/* Check for duplicates. */
1149	if ((ret = __bam_dup(dbc, cp, cp->indx, 1)) != 0)
1150		return (ret);
1151
1152	/* If on an empty page or a deleted record, move to the next one. */
1153	if (NUM_ENT(cp->page) == 0 || IS_CUR_DELETED(cp))
1154		if ((ret = __bam_c_prev(dbc, cp)) != 0)
1155			return (ret);
1156
1157	return (0);
1158}
1159
1160/*
1161 * __bam_c_next --
1162 *	Move to the next record.
1163 */
1164static int
1165__bam_c_next(dbc, cp, initial_move)
1166	DBC *dbc;
1167	CURSOR *cp;
1168	int initial_move;
1169{
1170	DB *dbp;
1171	db_indx_t adjust, indx;
1172	db_pgno_t pgno;
1173	int ret;
1174
1175	dbp = dbc->dbp;
1176
1177	/*
1178	 * We're either moving through a page of duplicates or a btree leaf
1179	 * page.
1180	 */
1181	if (cp->dpgno == PGNO_INVALID) {
1182		adjust = dbp->type == DB_BTREE ? P_INDX : O_INDX;
1183		pgno = cp->pgno;
1184		indx = cp->indx;
1185	} else {
1186		adjust = O_INDX;
1187		pgno = cp->dpgno;
1188		indx = cp->dindx;
1189	}
1190	if (cp->page == NULL) {
1191		if ((ret =
1192		    __bam_lget(dbc, 0, pgno, DB_LOCK_READ, &cp->lock)) != 0)
1193			return (ret);
1194		if ((ret = memp_fget(dbp->mpf, &pgno, 0, &cp->page)) != 0)
1195			return (ret);
1196	}
1197
1198	/*
1199	 * If at the end of the page, move to a subsequent page.
1200	 *
1201	 * !!!
1202	 * Check for >= NUM_ENT.  If we're here as the result of a search that
1203	 * landed us on NUM_ENT, we'll increment indx before we test.
1204	 *
1205	 * !!!
1206	 * This code handles empty pages and pages with only deleted entries.
1207	 */
1208	if (initial_move)
1209		indx += adjust;
1210	for (;;) {
1211		if (indx >= NUM_ENT(cp->page)) {
1212			/*
1213			 * If we're in a btree leaf page, we've reached the end
1214			 * of the tree.  If we've reached the end of a page of
1215			 * duplicates, continue from the btree leaf page where
1216			 * we found this page of duplicates.
1217			 */
1218			pgno = cp->page->next_pgno;
1219			if (pgno == PGNO_INVALID) {
1220				/* If in a btree leaf page, it's EOF. */
1221				if (cp->dpgno == PGNO_INVALID)
1222					return (DB_NOTFOUND);
1223
1224				/* Continue from the last btree leaf page. */
1225				cp->dpgno = PGNO_INVALID;
1226
1227				adjust = P_INDX;
1228				pgno = cp->pgno;
1229				indx = cp->indx + P_INDX;
1230			} else
1231				indx = 0;
1232
1233			DISCARD(dbc, cp);
1234			if ((ret = __bam_lget(dbc,
1235			    0, pgno, DB_LOCK_READ, &cp->lock)) != 0)
1236				return (ret);
1237			if ((ret =
1238			    memp_fget(dbp->mpf, &pgno, 0, &cp->page)) != 0)
1239				return (ret);
1240			continue;
1241		}
1242
1243		/* Ignore deleted records. */
1244		if (IS_DELETED(cp, indx)) {
1245			indx += adjust;
1246			continue;
1247		}
1248
1249		/*
1250		 * If we're not in a duplicates page, check to see if we've
1251		 * found a page of duplicates, in which case we move to the
1252		 * first entry.
1253		 */
1254		if (cp->dpgno == PGNO_INVALID) {
1255			cp->pgno = cp->page->pgno;
1256			cp->indx = indx;
1257
1258			if ((ret = __bam_dup(dbc, cp, indx, 0)) != 0)
1259				return (ret);
1260			if (cp->dpgno != PGNO_INVALID) {
1261				indx = cp->dindx;
1262				adjust = O_INDX;
1263				continue;
1264			}
1265		} else {
1266			cp->dpgno = cp->page->pgno;
1267			cp->dindx = indx;
1268		}
1269		break;
1270	}
1271	return (0);
1272}
1273
1274/*
1275 * __bam_c_prev --
1276 *	Move to the previous record.
1277 */
1278static int
1279__bam_c_prev(dbc, cp)
1280	DBC *dbc;
1281	CURSOR *cp;
1282{
1283	DB *dbp;
1284	db_indx_t indx, adjust;
1285	db_pgno_t pgno;
1286	int ret, set_indx;
1287
1288	dbp = dbc->dbp;
1289
1290	/*
1291	 * We're either moving through a page of duplicates or a btree leaf
1292	 * page.
1293	 */
1294	if (cp->dpgno == PGNO_INVALID) {
1295		adjust = dbp->type == DB_BTREE ? P_INDX : O_INDX;
1296		pgno = cp->pgno;
1297		indx = cp->indx;
1298	} else {
1299		adjust = O_INDX;
1300		pgno = cp->dpgno;
1301		indx = cp->dindx;
1302	}
1303	if (cp->page == NULL) {
1304		if ((ret =
1305		    __bam_lget(dbc, 0, pgno, DB_LOCK_READ, &cp->lock)) != 0)
1306			return (ret);
1307		if ((ret = memp_fget(dbp->mpf, &pgno, 0, &cp->page)) != 0)
1308			return (ret);
1309	}
1310
1311	/*
1312	 * If at the beginning of the page, move to any previous one.
1313	 *
1314	 * !!!
1315	 * This code handles empty pages and pages with only deleted entries.
1316	 */
1317	for (;;) {
1318		if (indx == 0) {
1319			/*
1320			 * If we're in a btree leaf page, we've reached the
1321			 * beginning of the tree.  If we've reached the first
1322			 * of a page of duplicates, continue from the btree
1323			 * leaf page where we found this page of duplicates.
1324			 */
1325			pgno = cp->page->prev_pgno;
1326			if (pgno == PGNO_INVALID) {
1327				/* If in a btree leaf page, it's SOF. */
1328				if (cp->dpgno == PGNO_INVALID)
1329					return (DB_NOTFOUND);
1330
1331				/* Continue from the last btree leaf page. */
1332				cp->dpgno = PGNO_INVALID;
1333
1334				adjust = P_INDX;
1335				pgno = cp->pgno;
1336				indx = cp->indx;
1337				set_indx = 0;
1338			} else
1339				set_indx = 1;
1340
1341			DISCARD(dbc, cp);
1342			if ((ret = __bam_lget(dbc,
1343			    0, pgno, DB_LOCK_READ, &cp->lock)) != 0)
1344				return (ret);
1345			if ((ret =
1346			    memp_fget(dbp->mpf, &pgno, 0, &cp->page)) != 0)
1347				return (ret);
1348
1349			if (set_indx)
1350				indx = NUM_ENT(cp->page);
1351			if (indx == 0)
1352				continue;
1353		}
1354
1355		/* Ignore deleted records. */
1356		indx -= adjust;
1357		if (IS_DELETED(cp, indx))
1358			continue;
1359
1360		/*
1361		 * If we're not in a duplicates page, check to see if we've
1362		 * found a page of duplicates, in which case we move to the
1363		 * last entry.
1364		 */
1365		if (cp->dpgno == PGNO_INVALID) {
1366			cp->pgno = cp->page->pgno;
1367			cp->indx = indx;
1368
1369			if ((ret = __bam_dup(dbc, cp, indx, 1)) != 0)
1370				return (ret);
1371			if (cp->dpgno != PGNO_INVALID) {
1372				indx = cp->dindx + O_INDX;
1373				adjust = O_INDX;
1374				continue;
1375			}
1376		} else {
1377			cp->dpgno = cp->page->pgno;
1378			cp->dindx = indx;
1379		}
1380		break;
1381	}
1382	return (0);
1383}
1384
1385/*
1386 * __bam_c_search --
1387 *	Move to a specified record.
1388 */
1389static int
1390__bam_c_search(dbc, cp, key, flags, exactp)
1391	DBC *dbc;
1392	CURSOR *cp;
1393	const DBT *key;
1394	u_int32_t flags;
1395	int *exactp;
1396{
1397	BTREE *t;
1398	DB *dbp;
1399	DB_LOCK lock;
1400	PAGE *h;
1401	db_recno_t recno;
1402	db_indx_t indx;
1403	u_int32_t sflags;
1404	int cmp, needexact, ret;
1405
1406	dbp = dbc->dbp;
1407	t = dbp->internal;
1408
1409	/* Find an entry in the database. */
1410	switch (flags) {
1411	case DB_SET_RECNO:
1412		if ((ret = __ram_getno(dbc, key, &recno, 0)) != 0)
1413			return (ret);
1414		sflags = F_ISSET(dbc, DBC_RMW) ? S_FIND_WR : S_FIND;
1415		needexact = *exactp = 1;
1416		ret = __bam_rsearch(dbc, &recno, sflags, 1, exactp);
1417		break;
1418	case DB_SET:
1419	case DB_GET_BOTH:
1420		sflags = F_ISSET(dbc, DBC_RMW) ? S_FIND_WR : S_FIND;
1421		needexact = *exactp = 1;
1422		goto search;
1423	case DB_SET_RANGE:
1424		sflags = F_ISSET(dbc, DBC_RMW) ? S_FIND_WR : S_FIND;
1425		needexact = *exactp = 0;
1426		goto search;
1427	case DB_KEYFIRST:
1428		sflags = S_KEYFIRST;
1429		goto fast_search;
1430	case DB_KEYLAST:
1431		sflags = S_KEYLAST;
1432fast_search:	needexact = *exactp = 0;
1433		/*
1434		 * If the application has a history of inserting into the first
1435		 * or last pages of the database, we check those pages first to
1436		 * avoid doing a full search.
1437		 *
1438		 * Record numbers can't be fast-tracked, the entire tree has to
1439		 * be locked.
1440		 */
1441		h = NULL;
1442		lock = LOCK_INVALID;
1443		if (F_ISSET(dbp, DB_BT_RECNUM))
1444			goto search;
1445
1446		/* Check if the application has a history of sorted input. */
1447		if (t->bt_lpgno == PGNO_INVALID)
1448			goto search;
1449
1450		/*
1451		 * Lock and retrieve the page on which we did the last insert.
1452		 * It's okay if it doesn't exist, or if it's not the page type
1453		 * we expected, it just means that the world changed.
1454		 */
1455		if (__bam_lget(dbc, 0, t->bt_lpgno, DB_LOCK_WRITE, &lock))
1456			goto fast_miss;
1457		if (memp_fget(dbp->mpf, &t->bt_lpgno, 0, &h))
1458			goto fast_miss;
1459		if (TYPE(h) != P_LBTREE)
1460			goto fast_miss;
1461		if (NUM_ENT(h) == 0)
1462			goto fast_miss;
1463
1464		/*
1465		 * What we do here is test to see if we're at the beginning or
1466		 * end of the tree and if the new item sorts before/after the
1467		 * first/last page entry.  We don't try and catch inserts into
1468		 * the middle of the tree (although we could, as long as there
1469		 * were two keys on the page and we saved both the index and
1470		 * the page number of the last insert).
1471		 */
1472		if (h->next_pgno == PGNO_INVALID) {
1473			indx = NUM_ENT(h) - P_INDX;
1474			if ((cmp =
1475			    __bam_cmp(dbp, key, h, indx, t->bt_compare)) < 0)
1476				goto try_begin;
1477			if (cmp > 0) {
1478				indx += P_INDX;
1479				goto fast_hit;
1480			}
1481
1482			/*
1483			 * Found a duplicate.  If doing DB_KEYLAST, we're at
1484			 * the correct position, otherwise, move to the first
1485			 * of the duplicates.
1486			 */
1487			if (flags == DB_KEYLAST)
1488				goto fast_hit;
1489			for (;
1490			    indx > 0 && h->inp[indx - P_INDX] == h->inp[indx];
1491			    indx -= P_INDX)
1492				;
1493			goto fast_hit;
1494		}
1495try_begin:	if (h->prev_pgno == PGNO_INVALID) {
1496			indx = 0;
1497			if ((cmp =
1498			    __bam_cmp(dbp, key, h, indx, t->bt_compare)) > 0)
1499				goto fast_miss;
1500			if (cmp < 0)
1501				goto fast_hit;
1502			/*
1503			 * Found a duplicate.  If doing DB_KEYFIRST, we're at
1504			 * the correct position, otherwise, move to the last
1505			 * of the duplicates.
1506			 */
1507			if (flags == DB_KEYFIRST)
1508				goto fast_hit;
1509			for (;
1510			    indx < (db_indx_t)(NUM_ENT(h) - P_INDX) &&
1511			    h->inp[indx] == h->inp[indx + P_INDX];
1512			    indx += P_INDX)
1513				;
1514			goto fast_hit;
1515		}
1516		goto fast_miss;
1517
1518fast_hit:	/* Set the exact match flag, we may have found a duplicate. */
1519		*exactp = cmp == 0;
1520
1521		/* Enter the entry in the stack. */
1522		BT_STK_CLR(cp);
1523		BT_STK_ENTER(cp, h, indx, lock, ret);
1524		break;
1525
1526fast_miss:	if (h != NULL)
1527			(void)memp_fput(dbp->mpf, h, 0);
1528		if (lock != LOCK_INVALID)
1529			(void)__BT_LPUT(dbc, lock);
1530
1531search:		ret = __bam_search(dbc, key, sflags, 1, NULL, exactp);
1532		break;
1533	default:				/* XXX: Impossible. */
1534		abort();
1535		/* NOTREACHED */
1536	}
1537	if (ret != 0)
1538		return (ret);
1539
1540	/*
1541	 * Initialize the cursor to reference it.  This has to be done
1542	 * before we return (even with DB_NOTFOUND) because we have to
1543	 * free the page(s) we locked in __bam_search.
1544	 */
1545	cp->page = cp->csp->page;
1546	cp->pgno = cp->csp->page->pgno;
1547	cp->indx = cp->csp->indx;
1548	cp->lock = cp->csp->lock;
1549	cp->dpgno = PGNO_INVALID;
1550
1551	/*
1552	 * If we inserted a key into the first or last slot of the tree,
1553	 * remember where it was so we can do it more quickly next time.
1554	 */
1555	if (flags == DB_KEYFIRST || flags == DB_KEYLAST)
1556		t->bt_lpgno =
1557		    ((cp->page->next_pgno == PGNO_INVALID &&
1558		    cp->indx >= NUM_ENT(cp->page)) ||
1559		    (cp->page->prev_pgno == PGNO_INVALID && cp->indx == 0)) ?
1560		    cp->pgno : PGNO_INVALID;
1561
1562	/* If we need an exact match and didn't find one, we're done. */
1563	if (needexact && *exactp == 0)
1564		return (DB_NOTFOUND);
1565
1566	return (0);
1567}
1568
1569/*
1570 * __bam_dup --
1571 *	Check for an off-page duplicates entry, and if found, move to the
1572 *	first or last entry.
1573 *
1574 * PUBLIC: int __bam_dup __P((DBC *, CURSOR *, u_int32_t, int));
1575 */
1576int
1577__bam_dup(dbc, cp, indx, last_dup)
1578	DBC *dbc;
1579	CURSOR *cp;
1580	u_int32_t indx;
1581	int last_dup;
1582{
1583	BOVERFLOW *bo;
1584	DB *dbp;
1585	db_pgno_t pgno;
1586	int ret;
1587
1588	dbp = dbc->dbp;
1589
1590	/*
1591	 * Check for an overflow entry.  If we find one, move to the
1592	 * duplicates page, and optionally move to the last record on
1593	 * that page.
1594	 *
1595	 * !!!
1596	 * We don't lock duplicates pages, we've already got the correct
1597	 * lock on the main page.
1598	 */
1599	bo = GET_BOVERFLOW(cp->page, indx + O_INDX);
1600	if (B_TYPE(bo->type) != B_DUPLICATE)
1601		return (0);
1602
1603	pgno = bo->pgno;
1604	if ((ret = memp_fput(dbp->mpf, cp->page, 0)) != 0)
1605		return (ret);
1606	cp->page = NULL;
1607	if (last_dup) {
1608		if ((ret = __db_dend(dbc, pgno, &cp->page)) != 0)
1609			return (ret);
1610		indx = NUM_ENT(cp->page) - O_INDX;
1611	} else {
1612		if ((ret = memp_fget(dbp->mpf, &pgno, 0, &cp->page)) != 0)
1613			return (ret);
1614		indx = 0;
1615	}
1616
1617	/* Update the cursor's duplicate information. */
1618	cp->dpgno = cp->page->pgno;
1619	cp->dindx = indx;
1620
1621	return (0);
1622}
1623
1624/*
1625 * __bam_c_physdel --
1626 *	Actually do the cursor deletion.
1627 */
1628static int
1629__bam_c_physdel(dbc, cp, h)
1630	DBC *dbc;
1631	CURSOR *cp;
1632	PAGE *h;
1633{
1634	enum { DELETE_ITEM, DELETE_PAGE, NOTHING_FURTHER } cmd;
1635	BOVERFLOW bo;
1636	DB *dbp;
1637	DBT dbt;
1638	DB_LOCK lock;
1639	db_indx_t indx;
1640	db_pgno_t pgno, next_pgno, prev_pgno;
1641	int delete_page, local_page, ret;
1642
1643	dbp = dbc->dbp;
1644
1645	delete_page = ret = 0;
1646
1647	/* Figure out what we're deleting. */
1648	if (cp->dpgno == PGNO_INVALID) {
1649		pgno = cp->pgno;
1650		indx = cp->indx;
1651	} else {
1652		pgno = cp->dpgno;
1653		indx = cp->dindx;
1654	}
1655
1656	/*
1657	 * If the item is referenced by another cursor, set that cursor's
1658	 * delete flag and leave it up to it to do the delete.
1659	 *
1660	 * !!!
1661	 * This test for > 0 is a tricky.  There are two ways that we can
1662	 * be called here.  Either we are closing the cursor or we've moved
1663	 * off the page with the deleted entry.  In the first case, we've
1664	 * already removed the cursor from the active queue, so we won't see
1665	 * it in __bam_ca_delete. In the second case, it will be on a different
1666	 * item, so we won't bother with it in __bam_ca_delete.
1667	 */
1668	if (__bam_ca_delete(dbp, pgno, indx, 1) > 0)
1669		return (0);
1670
1671	/*
1672	 * If this is concurrent DB, upgrade the lock if necessary.
1673	 */
1674	if (F_ISSET(dbp, DB_AM_CDB) && F_ISSET(dbc, DBC_RMW) &&
1675	    (ret = lock_get(dbp->dbenv->lk_info,
1676	    dbc->locker, DB_LOCK_UPGRADE, &dbc->lock_dbt, DB_LOCK_WRITE,
1677	    &dbc->mylock)) != 0)
1678		return (EAGAIN);
1679
1680	/*
1681	 * If we don't already have the page locked, get it and delete the
1682	 * items.
1683	 */
1684	if ((h == NULL || h->pgno != pgno)) {
1685		if ((ret = __bam_lget(dbc, 0, pgno, DB_LOCK_WRITE, &lock)) != 0)
1686			return (ret);
1687		if ((ret = memp_fget(dbp->mpf, &pgno, 0, &h)) != 0)
1688			return (ret);
1689		local_page = 1;
1690	} else
1691		local_page = 0;
1692
1693	/*
1694	 * If we're deleting a duplicate entry and there are other duplicate
1695	 * entries remaining, call the common code to do the work and fix up
1696	 * the parent page as necessary.  Otherwise, do a normal btree delete.
1697	 *
1698	 * There are 5 possible cases:
1699	 *
1700	 * 1. It's not a duplicate item: do a normal btree delete.
1701	 * 2. It's a duplicate item:
1702	 *	2a: We delete an item from a page of duplicates, but there are
1703	 *	    more items on the page.
1704	 *      2b: We delete the last item from a page of duplicates, deleting
1705	 *	    the last duplicate.
1706	 *      2c: We delete the last item from a page of duplicates, but there
1707	 *	    is a previous page of duplicates.
1708	 *      2d: We delete the last item from a page of duplicates, but there
1709	 *	    is a following page of duplicates.
1710	 *
1711	 * In the case of:
1712	 *
1713	 *  1: There's nothing further to do.
1714	 * 2a: There's nothing further to do.
1715	 * 2b: Do the normal btree delete instead of a duplicate delete, as
1716	 *     that deletes both the duplicate chain and the parent page's
1717	 *     entry.
1718	 * 2c: There's nothing further to do.
1719	 * 2d: Delete the duplicate, and update the parent page's entry.
1720	 */
1721	if (TYPE(h) == P_DUPLICATE) {
1722		pgno = PGNO(h);
1723		prev_pgno = PREV_PGNO(h);
1724		next_pgno = NEXT_PGNO(h);
1725
1726		if (NUM_ENT(h) == 1 &&
1727		    prev_pgno == PGNO_INVALID && next_pgno == PGNO_INVALID)
1728			cmd = DELETE_PAGE;
1729		else {
1730			cmd = DELETE_ITEM;
1731
1732			/* Delete the duplicate. */
1733			if ((ret = __db_drem(dbc, &h, indx, __bam_free)) != 0)
1734				goto err;
1735
1736			/*
1737			 * 2a: h != NULL, h->pgno == pgno
1738			 * 2b: We don't reach this clause, as the above test
1739			 *     was true.
1740			 * 2c: h == NULL, prev_pgno != PGNO_INVALID
1741			 * 2d: h != NULL, next_pgno != PGNO_INVALID
1742			 *
1743			 * Test for 2a and 2c: if we didn't empty the current
1744			 * page or there was a previous page of duplicates, we
1745			 * don't need to touch the parent page.
1746			 */
1747			if ((h != NULL && pgno == h->pgno) ||
1748			    prev_pgno != PGNO_INVALID)
1749				cmd = NOTHING_FURTHER;
1750		}
1751
1752		/*
1753		 * Release any page we're holding and its lock.
1754		 *
1755		 * !!!
1756		 * If there is no subsequent page in the duplicate chain, then
1757		 * __db_drem will have put page "h" and set it to NULL.
1758		*/
1759		if (local_page) {
1760			if (h != NULL)
1761				(void)memp_fput(dbp->mpf, h, 0);
1762			(void)__BT_TLPUT(dbc, lock);
1763			local_page = 0;
1764		}
1765
1766		if (cmd == NOTHING_FURTHER)
1767			goto done;
1768
1769		/* Acquire the parent page and switch the index to its entry. */
1770		if ((ret =
1771		    __bam_lget(dbc, 0, cp->pgno, DB_LOCK_WRITE, &lock)) != 0)
1772			goto err;
1773		if ((ret = memp_fget(dbp->mpf, &cp->pgno, 0, &h)) != 0) {
1774			(void)__BT_TLPUT(dbc, lock);
1775			goto err;
1776		}
1777		local_page = 1;
1778		indx = cp->indx;
1779
1780		if (cmd == DELETE_PAGE)
1781			goto btd;
1782
1783		/*
1784		 * Copy, delete, update, add-back the parent page's data entry.
1785		 *
1786		 * XXX
1787		 * This may be a performance/logging problem.  We should add a
1788		 * log message which simply logs/updates a random set of bytes
1789		 * on a page, and use it instead of doing a delete/add pair.
1790		 */
1791		indx += O_INDX;
1792		bo = *GET_BOVERFLOW(h, indx);
1793		(void)__db_ditem(dbc, h, indx, BOVERFLOW_SIZE);
1794		bo.pgno = next_pgno;
1795		memset(&dbt, 0, sizeof(dbt));
1796		dbt.data = &bo;
1797		dbt.size = BOVERFLOW_SIZE;
1798		(void)__db_pitem(dbc, h, indx, BOVERFLOW_SIZE, &dbt, NULL);
1799		(void)memp_fset(dbp->mpf, h, DB_MPOOL_DIRTY);
1800		goto done;
1801	}
1802
1803btd:	/*
1804	 * If the page is going to be emptied, delete it.  To delete a leaf
1805	 * page we need a copy of a key from the page.  We use the 0th page
1806	 * index since it's the last key that the page held.
1807	 *
1808	 * We malloc the page information instead of using the return key/data
1809	 * memory because we've already set them -- the reason we've already
1810	 * set them is because we're (potentially) about to do a reverse split,
1811	 * which would make our saved page information useless.
1812	 *
1813	 * !!!
1814	 * The following operations to delete a page might deadlock.  I think
1815	 * that's OK.  The problem is if we're deleting an item because we're
1816	 * closing cursors because we've already deadlocked and want to call
1817	 * txn_abort().  If we fail due to deadlock, we leave a locked empty
1818	 * page in the tree, which won't be empty long because we're going to
1819	 * undo the delete.
1820	 */
1821	if (NUM_ENT(h) == 2 && h->pgno != PGNO_ROOT) {
1822		memset(&dbt, 0, sizeof(DBT));
1823		dbt.flags = DB_DBT_MALLOC | DB_DBT_INTERNAL;
1824		if ((ret = __db_ret(dbp, h, 0, &dbt, NULL, NULL)) != 0)
1825			goto err;
1826		delete_page = 1;
1827	}
1828
1829	/*
1830	 * Do a normal btree delete.
1831	 *
1832	 * !!!
1833	 * Delete the key item first, otherwise the duplicate checks in
1834	 * __bam_ditem() won't work!
1835	 */
1836	if ((ret = __bam_ditem(dbc, h, indx)) != 0)
1837		goto err;
1838	if ((ret = __bam_ditem(dbc, h, indx)) != 0)
1839		goto err;
1840
1841	/* Discard any remaining locks/pages. */
1842	if (local_page) {
1843		(void)memp_fput(dbp->mpf, h, 0);
1844		(void)__BT_TLPUT(dbc, lock);
1845		local_page = 0;
1846	}
1847
1848	/* Delete the page if it was emptied. */
1849	if (delete_page)
1850		ret = __bam_dpage(dbc, &dbt);
1851
1852err:
1853done:	if (delete_page)
1854		__os_free(dbt.data, dbt.size);
1855
1856	if (local_page) {
1857		/*
1858		 * It's possible for h to be NULL, as __db_drem may have
1859		 * been relinking pages by the time that it deadlocked.
1860		 */
1861		if (h != NULL)
1862			(void)memp_fput(dbp->mpf, h, 0);
1863		(void)__BT_TLPUT(dbc, lock);
1864	}
1865
1866	if (F_ISSET(dbp, DB_AM_CDB) && F_ISSET(dbc, DBC_RMW))
1867		(void)__lock_downgrade(dbp->dbenv->lk_info, dbc->mylock,
1868		    DB_LOCK_IWRITE, 0);
1869
1870	return (ret);
1871}
1872
1873/*
1874 * __bam_c_getstack --
1875 *	Acquire a full stack for a cursor.
1876 */
1877static int
1878__bam_c_getstack(dbc, cp)
1879	DBC *dbc;
1880	CURSOR *cp;
1881{
1882	DB *dbp;
1883	DBT dbt;
1884	PAGE *h;
1885	db_pgno_t pgno;
1886	int exact, ret;
1887
1888	dbp = dbc->dbp;
1889	h = NULL;
1890	memset(&dbt, 0, sizeof(DBT));
1891	ret = 0;
1892
1893	/* Get the page with the current item on it. */
1894	pgno = cp->pgno;
1895	if ((ret = memp_fget(dbp->mpf, &pgno, 0, &h)) != 0)
1896		return (ret);
1897
1898	/* Get a copy of a key from the page. */
1899	dbt.flags = DB_DBT_MALLOC | DB_DBT_INTERNAL;
1900	if ((ret = __db_ret(dbp, h, 0, &dbt, NULL, NULL)) != 0)
1901		goto err;
1902
1903	/* Get a write-locked stack for that page. */
1904	exact = 0;
1905	ret = __bam_search(dbc, &dbt, S_KEYFIRST, 1, NULL, &exact);
1906
1907	/* We no longer need the key or the page. */
1908err:	if (h != NULL)
1909		(void)memp_fput(dbp->mpf, h, 0);
1910	if (dbt.data != NULL)
1911		__os_free(dbt.data, dbt.size);
1912	return (ret);
1913}
1914