17c478bdstevel@tonic-gate/*-
27c478bdstevel@tonic-gate * See the file LICENSE for redistribution information.
37c478bdstevel@tonic-gate *
47c478bdstevel@tonic-gate * Copyright (c) 1996, 1997, 1998
57c478bdstevel@tonic-gate *	Sleepycat Software.  All rights reserved.
67c478bdstevel@tonic-gate */
77c478bdstevel@tonic-gate
87c478bdstevel@tonic-gate#include "config.h"
97c478bdstevel@tonic-gate
107c478bdstevel@tonic-gate#ifndef lint
117c478bdstevel@tonic-gatestatic const char sccsid[] = "@(#)bt_cursor.c	10.81 (Sleepycat) 12/16/98";
127c478bdstevel@tonic-gate#endif /* not lint */
137c478bdstevel@tonic-gate
147c478bdstevel@tonic-gate#ifndef NO_SYSTEM_INCLUDES
157c478bdstevel@tonic-gate#include <sys/types.h>
167c478bdstevel@tonic-gate
177c478bdstevel@tonic-gate#include <errno.h>
187c478bdstevel@tonic-gate#include <stdlib.h>
197c478bdstevel@tonic-gate#include <string.h>
207c478bdstevel@tonic-gate#endif
217c478bdstevel@tonic-gate
227c478bdstevel@tonic-gate#include "db_int.h"
237c478bdstevel@tonic-gate#include "db_page.h"
247c478bdstevel@tonic-gate#include "btree.h"
257c478bdstevel@tonic-gate#include "shqueue.h"
267c478bdstevel@tonic-gate#include "db_shash.h"
277c478bdstevel@tonic-gate#include "lock.h"
287c478bdstevel@tonic-gate#include "lock_ext.h"
297c478bdstevel@tonic-gate
307c478bdstevel@tonic-gatestatic int __bam_c_close __P((DBC *));
317c478bdstevel@tonic-gatestatic int __bam_c_del __P((DBC *, u_int32_t));
327c478bdstevel@tonic-gatestatic int __bam_c_destroy __P((DBC *));
337c478bdstevel@tonic-gatestatic int __bam_c_first __P((DBC *, CURSOR *));
347c478bdstevel@tonic-gatestatic int __bam_c_get __P((DBC *, DBT *, DBT *, u_int32_t));
357c478bdstevel@tonic-gatestatic int __bam_c_getstack __P((DBC *, CURSOR *));
367c478bdstevel@tonic-gatestatic int __bam_c_last __P((DBC *, CURSOR *));
377c478bdstevel@tonic-gatestatic int __bam_c_next __P((DBC *, CURSOR *, int));
387c478bdstevel@tonic-gatestatic int __bam_c_physdel __P((DBC *, CURSOR *, PAGE *));
397c478bdstevel@tonic-gatestatic int __bam_c_prev __P((DBC *, CURSOR *));
407c478bdstevel@tonic-gatestatic int __bam_c_put __P((DBC *, DBT *, DBT *, u_int32_t));
417c478bdstevel@tonic-gatestatic void __bam_c_reset __P((CURSOR *));
427c478bdstevel@tonic-gatestatic int __bam_c_rget __P((DBC *, DBT *, u_int32_t));
437c478bdstevel@tonic-gatestatic int __bam_c_search __P((DBC *, CURSOR *, const DBT *, u_int32_t, int *));
447c478bdstevel@tonic-gatestatic int __bam_dsearch __P((DBC *, CURSOR *,  DBT *, u_int32_t *));
457c478bdstevel@tonic-gate
/*
 * DISCARD --
 *	Discard the current page/lock held by a cursor.
 *
 * If the cursor's page pointer is set, the page is handed to memp_fput
 * and the pointer is cleared.  If the cursor's lock is not LOCK_INVALID,
 * it is handed to __BT_TLPUT and the field is reset to LOCK_INVALID.
 * The return values of both calls are deliberately ignored.
 *
 * The body is wrapped in do/while (0) so that "DISCARD(dbc, cp);" is a
 * single statement and can be used as the unbraced arm of an if/else.
 * Both arguments are evaluated more than once; don't pass expressions
 * with side effects.
 */
#undef	DISCARD
#define	DISCARD(dbc, cp) do {						\
	if ((cp)->page != NULL) {					\
		(void)memp_fput((dbc)->dbp->mpf, (cp)->page, 0);	\
		(cp)->page = NULL;					\
	}								\
	if ((cp)->lock != LOCK_INVALID) {				\
		(void)__BT_TLPUT((dbc), (cp)->lock);			\
		(cp)->lock = LOCK_INVALID;				\
	}								\
} while (0)
587c478bdstevel@tonic-gate
/*
 * IS_CUR_DELETED --
 *	True (1) if the record the cursor references has its deleted flag
 *	set, else false (0).  When the cursor has no duplicate page
 *	(dpgno is PGNO_INVALID) the item tested is the one at indx + O_INDX
 *	on the cursor's page; otherwise it is the item at dindx.
 */
#undef	IS_CUR_DELETED
#define	IS_CUR_DELETED(cp)						\
	(B_DISSET(GET_BKEYDATA((cp)->page,				\
	    ((cp)->dpgno == PGNO_INVALID ?				\
	    (cp)->indx + O_INDX : (cp)->dindx))->type) != 0)
677c478bdstevel@tonic-gate
/*
 * IS_DELETED --
 *	True (1) if the record selected by the cursor and the caller's
 *	index has its deleted flag set, else false (0).  When the cursor
 *	has no duplicate page (dpgno is PGNO_INVALID) the item tested is
 *	the one at indx + O_INDX; otherwise it is the item at indx.
 */
#undef	IS_DELETED
#define	IS_DELETED(cp, indx)						\
	(B_DISSET(GET_BKEYDATA((cp)->page,				\
	    ((cp)->dpgno == PGNO_INVALID ?				\
	    (indx) + O_INDX : (indx)))->type) != 0)
757c478bdstevel@tonic-gate
/*
 * POSSIBLE_DUPLICATE --
 *	Decide whether a cursor and a saved copy of a cursor could be
 *	positioned on duplicates of the same key, on-page or off-page.
 *
 * Both must reference the same leaf page.  Beyond that, either the key
 * indices are identical (which is what off-page duplicates look like),
 * or neither references a duplicate page and the two key indices resolve
 * to the same offset in the page's index array (on-page duplicates).
 * The saved copy has no valid page pointer, so the offsets are always
 * read through the live cursor's page.
 */
#undef	POSSIBLE_DUPLICATE
#define	POSSIBLE_DUPLICATE(cursor, saved_copy)				\
	((saved_copy).pgno == (cursor)->pgno &&				\
	((saved_copy).indx == (cursor)->indx ||				\
	((saved_copy).dpgno == PGNO_INVALID &&				\
	    (cursor)->dpgno == PGNO_INVALID &&				\
	    (cursor)->page->inp[(saved_copy).indx] ==			\
	    (cursor)->page->inp[(cursor)->indx])))
937c478bdstevel@tonic-gate
947c478bdstevel@tonic-gate/*
957c478bdstevel@tonic-gate * __bam_c_reset --
967c478bdstevel@tonic-gate *	Initialize internal cursor structure.
977c478bdstevel@tonic-gate */
987c478bdstevel@tonic-gatestatic void
997c478bdstevel@tonic-gate__bam_c_reset(cp)
1007c478bdstevel@tonic-gate	CURSOR *cp;
1017c478bdstevel@tonic-gate{
1027c478bdstevel@tonic-gate	cp->sp = cp->csp = cp->stack;
1037c478bdstevel@tonic-gate	cp->esp = cp->stack + sizeof(cp->stack) / sizeof(cp->stack[0]);
1047c478bdstevel@tonic-gate	cp->page = NULL;
1057c478bdstevel@tonic-gate	cp->pgno = PGNO_INVALID;
1067c478bdstevel@tonic-gate	cp->indx = 0;
1077c478bdstevel@tonic-gate	cp->dpgno = PGNO_INVALID;
1087c478bdstevel@tonic-gate	cp->dindx = 0;
1097c478bdstevel@tonic-gate	cp->lock = LOCK_INVALID;
1107c478bdstevel@tonic-gate	cp->mode = DB_LOCK_NG;
1117c478bdstevel@tonic-gate	cp->recno = RECNO_OOB;
1127c478bdstevel@tonic-gate	cp->flags = 0;
1137c478bdstevel@tonic-gate}
1147c478bdstevel@tonic-gate
1157c478bdstevel@tonic-gate/*
1167c478bdstevel@tonic-gate * __bam_c_init --
1177c478bdstevel@tonic-gate *	Initialize the access private portion of a cursor
1187c478bdstevel@tonic-gate *
1197c478bdstevel@tonic-gate * PUBLIC: int __bam_c_init __P((DBC *));
1207c478bdstevel@tonic-gate */
1217c478bdstevel@tonic-gateint
1227c478bdstevel@tonic-gate__bam_c_init(dbc)
1237c478bdstevel@tonic-gate	DBC *dbc;
1247c478bdstevel@tonic-gate{
1257c478bdstevel@tonic-gate	DB *dbp;
1267c478bdstevel@tonic-gate	CURSOR *cp;
1277c478bdstevel@tonic-gate	int ret;
1287c478bdstevel@tonic-gate
1297c478bdstevel@tonic-gate	if ((ret = __os_calloc(1, sizeof(CURSOR), &cp)) != 0)
1307c478bdstevel@tonic-gate		return (ret);
1317c478bdstevel@tonic-gate
1327c478bdstevel@tonic-gate	dbp = dbc->dbp;
1337c478bdstevel@tonic-gate	cp->dbc = dbc;
1347c478bdstevel@tonic-gate
1357c478bdstevel@tonic-gate	/*
1367c478bdstevel@tonic-gate	 * Logical record numbers are always the same size, and we don't want
1377c478bdstevel@tonic-gate	 * to have to check for space every time we return one.  Allocate it
1387c478bdstevel@tonic-gate	 * in advance.
1397c478bdstevel@tonic-gate	 */
1407c478bdstevel@tonic-gate	if (dbp->type == DB_RECNO || F_ISSET(dbp, DB_BT_RECNUM)) {
1417c478bdstevel@tonic-gate		if ((ret = __os_malloc(sizeof(db_recno_t),
1427c478bdstevel@tonic-gate		    NULL, &dbc->rkey.data)) != 0) {
1437c478bdstevel@tonic-gate			__os_free(cp, sizeof(CURSOR));
1447c478bdstevel@tonic-gate			return (ret);
1457c478bdstevel@tonic-gate		}
1467c478bdstevel@tonic-gate		dbc->rkey.ulen = sizeof(db_recno_t);
1477c478bdstevel@tonic-gate	}
1487c478bdstevel@tonic-gate
1497c478bdstevel@tonic-gate	/* Initialize methods. */
1507c478bdstevel@tonic-gate	dbc->internal = cp;
1517c478bdstevel@tonic-gate	if (dbp->type == DB_BTREE) {
1527c478bdstevel@tonic-gate		dbc->c_am_close = __bam_c_close;
1537c478bdstevel@tonic-gate		dbc->c_am_destroy = __bam_c_destroy;
1547c478bdstevel@tonic-gate		dbc->c_del = __bam_c_del;
1557c478bdstevel@tonic-gate		dbc->c_get = __bam_c_get;
1567c478bdstevel@tonic-gate		dbc->c_put = __bam_c_put;
1577c478bdstevel@tonic-gate	} else {
1587c478bdstevel@tonic-gate		dbc->c_am_close = __bam_c_close;
1597c478bdstevel@tonic-gate		dbc->c_am_destroy = __bam_c_destroy;
1607c478bdstevel@tonic-gate		dbc->c_del = __ram_c_del;
1617c478bdstevel@tonic-gate		dbc->c_get = __ram_c_get;
1627c478bdstevel@tonic-gate		dbc->c_put = __ram_c_put;
1637c478bdstevel@tonic-gate	}
1647c478bdstevel@tonic-gate
1657c478bdstevel@tonic-gate	/* Initialize dynamic information. */
1667c478bdstevel@tonic-gate	__bam_c_reset(cp);
1677c478bdstevel@tonic-gate
1687c478bdstevel@tonic-gate	return (0);
1697c478bdstevel@tonic-gate}
1707c478bdstevel@tonic-gate
1717c478bdstevel@tonic-gate/*
1727c478bdstevel@tonic-gate * __bam_c_close --
1737c478bdstevel@tonic-gate *	Close down the cursor from a single use.
1747c478bdstevel@tonic-gate */
1757c478bdstevel@tonic-gatestatic int
1767c478bdstevel@tonic-gate__bam_c_close(dbc)
1777c478bdstevel@tonic-gate	DBC *dbc;
1787c478bdstevel@tonic-gate{
1797c478bdstevel@tonic-gate	CURSOR *cp;
1807c478bdstevel@tonic-gate	DB *dbp;
1817c478bdstevel@tonic-gate	int ret;
1827c478bdstevel@tonic-gate
1837c478bdstevel@tonic-gate	dbp = dbc->dbp;
1847c478bdstevel@tonic-gate	cp = dbc->internal;
1857c478bdstevel@tonic-gate	ret = 0;
1867c478bdstevel@tonic-gate
1877c478bdstevel@tonic-gate	/*
1887c478bdstevel@tonic-gate	 * If a cursor deleted a btree key, perform the actual deletion.
1897c478bdstevel@tonic-gate	 * (Recno keys are either deleted immediately or never deleted.)
1907c478bdstevel@tonic-gate	 */
1917c478bdstevel@tonic-gate	if (dbp->type == DB_BTREE && F_ISSET(cp, C_DELETED))
1927c478bdstevel@tonic-gate		ret = __bam_c_physdel(dbc, cp, NULL);
1937c478bdstevel@tonic-gate
1947c478bdstevel@tonic-gate	/* Discard any locks not acquired inside of a transaction. */
1957c478bdstevel@tonic-gate	if (cp->lock != LOCK_INVALID) {
1967c478bdstevel@tonic-gate		(void)__BT_TLPUT(dbc, cp->lock);
1977c478bdstevel@tonic-gate		cp->lock = LOCK_INVALID;
1987c478bdstevel@tonic-gate	}
1997c478bdstevel@tonic-gate
2007c478bdstevel@tonic-gate	/* Sanity checks. */
2017c478bdstevel@tonic-gate#ifdef DIAGNOSTIC
2027c478bdstevel@tonic-gate	if (cp->csp != cp->stack)
2037c478bdstevel@tonic-gate		__db_err(dbp->dbenv, "btree cursor close: stack not empty");
2047c478bdstevel@tonic-gate#endif
2057c478bdstevel@tonic-gate
2067c478bdstevel@tonic-gate	/* Initialize dynamic information. */
2077c478bdstevel@tonic-gate	__bam_c_reset(cp);
2087c478bdstevel@tonic-gate
2097c478bdstevel@tonic-gate	return (ret);
2107c478bdstevel@tonic-gate}
2117c478bdstevel@tonic-gate
2127c478bdstevel@tonic-gate/*
2137c478bdstevel@tonic-gate * __bam_c_destroy --
2147c478bdstevel@tonic-gate *	Close a single cursor -- internal version.
2157c478bdstevel@tonic-gate */
2167c478bdstevel@tonic-gatestatic int
2177c478bdstevel@tonic-gate__bam_c_destroy(dbc)
2187c478bdstevel@tonic-gate	DBC *dbc;
2197c478bdstevel@tonic-gate{
2207c478bdstevel@tonic-gate	/* Discard the structures. */
2217c478bdstevel@tonic-gate	__os_free(dbc->internal, sizeof(CURSOR));
2227c478bdstevel@tonic-gate
2237c478bdstevel@tonic-gate	return (0);
2247c478bdstevel@tonic-gate}
2257c478bdstevel@tonic-gate
2267c478bdstevel@tonic-gate/*
2277c478bdstevel@tonic-gate * __bam_c_del --
2287c478bdstevel@tonic-gate *	Delete using a cursor.
2297c478bdstevel@tonic-gate */
2307c478bdstevel@tonic-gatestatic int
2317c478bdstevel@tonic-gate__bam_c_del(dbc, flags)
2327c478bdstevel@tonic-gate	DBC *dbc;
2337c478bdstevel@tonic-gate	u_int32_t flags;
2347c478bdstevel@tonic-gate{
2357c478bdstevel@tonic-gate	CURSOR *cp;
2367c478bdstevel@tonic-gate	DB *dbp;
2377c478bdstevel@tonic-gate	DB_LOCK lock;
2387c478bdstevel@tonic-gate	PAGE *h;
2397c478bdstevel@tonic-gate	db_pgno_t pgno;
2407c478bdstevel@tonic-gate	db_indx_t indx;
2417c478bdstevel@tonic-gate	int ret;
2427c478bdstevel@tonic-gate
2437c478bdstevel@tonic-gate	dbp = dbc->dbp;
2447c478bdstevel@tonic-gate	cp = dbc->internal;
2457c478bdstevel@tonic-gate	h = NULL;
2467c478bdstevel@tonic-gate
2477c478bdstevel@tonic-gate	DB_PANIC_CHECK(dbp);
2487c478bdstevel@tonic-gate
2497c478bdstevel@tonic-gate	/* Check for invalid flags. */
2507c478bdstevel@tonic-gate	if ((ret = __db_cdelchk(dbp, flags,
2517c478bdstevel@tonic-gate	    F_ISSET(dbp, DB_AM_RDONLY), cp->pgno != PGNO_INVALID)) != 0)
2527c478bdstevel@tonic-gate		return (ret);
2537c478bdstevel@tonic-gate
2547c478bdstevel@tonic-gate	/*
2557c478bdstevel@tonic-gate	 * If we are running CDB, this had better be either a write
2567c478bdstevel@tonic-gate	 * cursor or an immediate writer.
2577c478bdstevel@tonic-gate	 */
2587c478bdstevel@tonic-gate	if (F_ISSET(dbp, DB_AM_CDB))
2597c478bdstevel@tonic-gate		if (!F_ISSET(dbc, DBC_RMW | DBC_WRITER))
2607c478bdstevel@tonic-gate			return (EINVAL);
2617c478bdstevel@tonic-gate
2627c478bdstevel@tonic-gate	DEBUG_LWRITE(dbc, dbc->txn, "bam_c_del", NULL, NULL, flags);
2637c478bdstevel@tonic-gate
2647c478bdstevel@tonic-gate	/* If already deleted, return failure. */
2657c478bdstevel@tonic-gate	if (F_ISSET(cp, C_DELETED))
2667c478bdstevel@tonic-gate		return (DB_KEYEMPTY);
2677c478bdstevel@tonic-gate
2687c478bdstevel@tonic-gate	/*
2697c478bdstevel@tonic-gate	 * We don't physically delete the record until the cursor moves,
2707c478bdstevel@tonic-gate	 * so we have to have a long-lived write lock on the page instead
2717c478bdstevel@tonic-gate	 * of a long-lived read lock.  Note, we have to have a read lock
2727c478bdstevel@tonic-gate	 * to even get here, so we simply discard it.
2737c478bdstevel@tonic-gate	 */
2747c478bdstevel@tonic-gate	if (F_ISSET(dbp, DB_AM_LOCKING) && cp->mode != DB_LOCK_WRITE) {
2757c478bdstevel@tonic-gate		if ((ret = __bam_lget(dbc,
2767c478bdstevel@tonic-gate		    0, cp->pgno, DB_LOCK_WRITE, &lock)) != 0)
2777c478bdstevel@tonic-gate			goto err;
2787c478bdstevel@tonic-gate		(void)__BT_TLPUT(dbc, cp->lock);
2797c478bdstevel@tonic-gate		cp->lock = lock;
2807c478bdstevel@tonic-gate		cp->mode = DB_LOCK_WRITE;
2817c478bdstevel@tonic-gate	}
2827c478bdstevel@tonic-gate
2837c478bdstevel@tonic-gate	/*
2847c478bdstevel@tonic-gate	 * Acquire the underlying page (which may be different from the above
2857c478bdstevel@tonic-gate	 * page because it may be a duplicate page), and set the on-page and
2867c478bdstevel@tonic-gate	 * in-cursor delete flags.  We don't need to lock it as we've already
2877c478bdstevel@tonic-gate	 * write-locked the page leading to it.
2887c478bdstevel@tonic-gate	 */
2897c478bdstevel@tonic-gate	if (cp->dpgno == PGNO_INVALID) {
2907c478bdstevel@tonic-gate		pgno = cp->pgno;
2917c478bdstevel@tonic-gate		indx = cp->indx;
2927c478bdstevel@tonic-gate	} else {
2937c478bdstevel@tonic-gate		pgno = cp->dpgno;
2947c478bdstevel@tonic-gate		indx = cp->dindx;
2957c478bdstevel@tonic-gate	}
2967c478bdstevel@tonic-gate
2977c478bdstevel@tonic-gate	if ((ret = memp_fget(dbp->mpf, &pgno, 0, &h)) != 0)
2987c478bdstevel@tonic-gate		goto err;
2997c478bdstevel@tonic-gate
3007c478bdstevel@tonic-gate	/* Log the change. */
3017c478bdstevel@tonic-gate	if (DB_LOGGING(dbc) &&
3027c478bdstevel@tonic-gate	    (ret = __bam_cdel_log(dbp->dbenv->lg_info, dbc->txn, &LSN(h),
3037c478bdstevel@tonic-gate	    0, dbp->log_fileid, PGNO(h), &LSN(h), indx)) != 0) {
3047c478bdstevel@tonic-gate		(void)memp_fput(dbp->mpf, h, 0);
3057c478bdstevel@tonic-gate		goto err;
3067c478bdstevel@tonic-gate	}
3077c478bdstevel@tonic-gate
3087c478bdstevel@tonic-gate	/*
3097c478bdstevel@tonic-gate	 * Set the intent-to-delete flag on the page and update all cursors. */
3107c478bdstevel@tonic-gate	if (cp->dpgno == PGNO_INVALID)
3117c478bdstevel@tonic-gate		B_DSET(GET_BKEYDATA(h, indx + O_INDX)->type);
3127c478bdstevel@tonic-gate	else
3137c478bdstevel@tonic-gate		B_DSET(GET_BKEYDATA(h, indx)->type);
3147c478bdstevel@tonic-gate	(void)__bam_ca_delete(dbp, pgno, indx, 1);
3157c478bdstevel@tonic-gate
3167c478bdstevel@tonic-gate	ret = memp_fput(dbp->mpf, h, DB_MPOOL_DIRTY);
3177c478bdstevel@tonic-gate	h = NULL;
3187c478bdstevel@tonic-gate
3197c478bdstevel@tonic-gate	/*
3207c478bdstevel@tonic-gate	 * If the tree has record numbers, we have to adjust the counts.
3217c478bdstevel@tonic-gate	 *
3227c478bdstevel@tonic-gate	 * !!!
3237c478bdstevel@tonic-gate	 * This test is right -- we don't yet support duplicates and record
3247c478bdstevel@tonic-gate	 * numbers in the same tree, so ignore duplicates if DB_BT_RECNUM
3257c478bdstevel@tonic-gate	 * set.
3267c478bdstevel@tonic-gate	 */
3277c478bdstevel@tonic-gate	if (F_ISSET(dbp, DB_BT_RECNUM)) {
3287c478bdstevel@tonic-gate		if ((ret = __bam_c_getstack(dbc, cp)) != 0)
3297c478bdstevel@tonic-gate			goto err;
3307c478bdstevel@tonic-gate		if ((ret = __bam_adjust(dbc, -1)) != 0)
3317c478bdstevel@tonic-gate			goto err;
3327c478bdstevel@tonic-gate		(void)__bam_stkrel(dbc, 0);
3337c478bdstevel@tonic-gate	}
3347c478bdstevel@tonic-gate
3357c478bdstevel@tonic-gateerr:	if (h != NULL)
3367c478bdstevel@tonic-gate		(void)memp_fput(dbp->mpf, h, 0);
3377c478bdstevel@tonic-gate	return (ret);
3387c478bdstevel@tonic-gate}
3397c478bdstevel@tonic-gate
3407c478bdstevel@tonic-gate/*
3417c478bdstevel@tonic-gate * __bam_c_get --
3427c478bdstevel@tonic-gate *	Get using a cursor (btree).
3437c478bdstevel@tonic-gate */
3447c478bdstevel@tonic-gatestatic int
3457c478bdstevel@tonic-gate__bam_c_get(dbc, key, data, flags)
3467c478bdstevel@tonic-gate	DBC *dbc;
3477c478bdstevel@tonic-gate	DBT *key, *data;
3487c478bdstevel@tonic-gate	u_int32_t flags;
3497c478bdstevel@tonic-gate{
3507c478bdstevel@tonic-gate	CURSOR *cp, copy, start;
3517c478bdstevel@tonic-gate	DB *dbp;
3527c478bdstevel@tonic-gate	PAGE *h;
3537c478bdstevel@tonic-gate	int exact, ret, tmp_rmw;
3547c478bdstevel@tonic-gate
3557c478bdstevel@tonic-gate	dbp = dbc->dbp;
3567c478bdstevel@tonic-gate	cp = dbc->internal;
3577c478bdstevel@tonic-gate
3587c478bdstevel@tonic-gate	DB_PANIC_CHECK(dbp);
3597c478bdstevel@tonic-gate
3607c478bdstevel@tonic-gate	/* Check for invalid flags. */
3617c478bdstevel@tonic-gate	if ((ret = __db_cgetchk(dbp,
3627c478bdstevel@tonic-gate	    key, data, flags, cp->pgno != PGNO_INVALID)) != 0)
3637c478bdstevel@tonic-gate		return (ret);
3647c478bdstevel@tonic-gate
3657c478bdstevel@tonic-gate	/* Clear OR'd in additional bits so we can check for flag equality. */
3667c478bdstevel@tonic-gate	tmp_rmw = 0;
3677c478bdstevel@tonic-gate	if (LF_ISSET(DB_RMW)) {
3687c478bdstevel@tonic-gate		if (!F_ISSET(dbp, DB_AM_CDB)) {
3697c478bdstevel@tonic-gate			tmp_rmw = 1;
3707c478bdstevel@tonic-gate			F_SET(dbc, DBC_RMW);
3717c478bdstevel@tonic-gate		}
3727c478bdstevel@tonic-gate		LF_CLR(DB_RMW);
3737c478bdstevel@tonic-gate	}
3747c478bdstevel@tonic-gate
3757c478bdstevel@tonic-gate	DEBUG_LREAD(dbc, dbc->txn, "bam_c_get",
3767c478bdstevel@tonic-gate	    flags == DB_SET || flags == DB_SET_RANGE ? key : NULL, NULL, flags);
3777c478bdstevel@tonic-gate
3787c478bdstevel@tonic-gate	/*
3797c478bdstevel@tonic-gate	 * Return a cursor's record number.  It has nothing to do with the
3807c478bdstevel@tonic-gate	 * cursor get code except that it's been rammed into the interface.
3817c478bdstevel@tonic-gate	 */
3827c478bdstevel@tonic-gate	if (flags == DB_GET_RECNO) {
3837c478bdstevel@tonic-gate		ret = __bam_c_rget(dbc, data, flags);
3847c478bdstevel@tonic-gate		if (tmp_rmw)
3857c478bdstevel@tonic-gate			F_CLR(dbc, DBC_RMW);
3867c478bdstevel@tonic-gate		return (ret);
3877c478bdstevel@tonic-gate	}
3887c478bdstevel@tonic-gate
3897c478bdstevel@tonic-gate	/*
3907c478bdstevel@tonic-gate	 * Initialize the cursor for a new retrieval.  Clear the cursor's
3917c478bdstevel@tonic-gate	 * page pointer, it was set before this operation, and no longer
3927c478bdstevel@tonic-gate	 * has any meaning.
3937c478bdstevel@tonic-gate	 */
3947c478bdstevel@tonic-gate	cp->page = NULL;
3957c478bdstevel@tonic-gate	copy = *cp;
3967c478bdstevel@tonic-gate	cp->lock = LOCK_INVALID;
3977c478bdstevel@tonic-gate
3987c478bdstevel@tonic-gate	switch (flags) {
3997c478bdstevel@tonic-gate	case DB_CURRENT:
4007c478bdstevel@tonic-gate		/* It's not possible to return a deleted record. */
4017c478bdstevel@tonic-gate		if (F_ISSET(cp, C_DELETED)) {
4027c478bdstevel@tonic-gate			ret = DB_KEYEMPTY;
4037c478bdstevel@tonic-gate			goto err;
4047c478bdstevel@tonic-gate		}
4057c478bdstevel@tonic-gate
4067c478bdstevel@tonic-gate		/* Acquire the current page. */
4077c478bdstevel@tonic-gate		if ((ret = __bam_lget(dbc,
4087c478bdstevel@tonic-gate		    0, cp->pgno, DB_LOCK_READ, &cp->lock)) == 0)
4097c478bdstevel@tonic-gate			ret = memp_fget(dbp->mpf,
4107c478bdstevel@tonic-gate			    cp->dpgno == PGNO_INVALID ? &cp->pgno : &cp->dpgno,
4117c478bdstevel@tonic-gate			    0, &cp->page);
4127c478bdstevel@tonic-gate		if (ret != 0)
4137c478bdstevel@tonic-gate			goto err;
4147c478bdstevel@tonic-gate		break;
4157c478bdstevel@tonic-gate	case DB_NEXT_DUP:
4167c478bdstevel@tonic-gate		if (cp->pgno == PGNO_INVALID) {
4177c478bdstevel@tonic-gate			ret = EINVAL;
4187c478bdstevel@tonic-gate			goto err;
4197c478bdstevel@tonic-gate		}
4207c478bdstevel@tonic-gate		if ((ret = __bam_c_next(dbc, cp, 1)) != 0)
4217c478bdstevel@tonic-gate			goto err;
4227c478bdstevel@tonic-gate
4237c478bdstevel@tonic-gate		/* Make sure we didn't go past the end of the duplicates. */
4247c478bdstevel@tonic-gate		if (!POSSIBLE_DUPLICATE(cp, copy)) {
4257c478bdstevel@tonic-gate			ret = DB_NOTFOUND;
4267c478bdstevel@tonic-gate			goto err;
4277c478bdstevel@tonic-gate		}
4287c478bdstevel@tonic-gate		break;
4297c478bdstevel@tonic-gate	case DB_NEXT:
4307c478bdstevel@tonic-gate		if (cp->pgno != PGNO_INVALID) {
4317c478bdstevel@tonic-gate			if ((ret = __bam_c_next(dbc, cp, 1)) != 0)
4327c478bdstevel@tonic-gate				goto err;
4337c478bdstevel@tonic-gate			break;
4347c478bdstevel@tonic-gate		}
4357c478bdstevel@tonic-gate		/* FALLTHROUGH */
4367c478bdstevel@tonic-gate	case DB_FIRST:
4377c478bdstevel@tonic-gate		if ((ret = __bam_c_first(dbc, cp)) != 0)
4387c478bdstevel@tonic-gate			goto err;
4397c478bdstevel@tonic-gate		break;
4407c478bdstevel@tonic-gate	case DB_PREV:
4417c478bdstevel@tonic-gate		if (cp->pgno != PGNO_INVALID) {
4427c478bdstevel@tonic-gate			if ((ret = __bam_c_prev(dbc, cp)) != 0)
4437c478bdstevel@tonic-gate				goto err;
4447c478bdstevel@tonic-gate			break;
4457c478bdstevel@tonic-gate		}
4467c478bdstevel@tonic-gate		/* FALLTHROUGH */
4477c478bdstevel@tonic-gate	case DB_LAST:
4487c478bdstevel@tonic-gate		if ((ret = __bam_c_last(dbc, cp)) != 0)
4497c478bdstevel@tonic-gate			goto err;
4507c478bdstevel@tonic-gate		break;
4517c478bdstevel@tonic-gate	case DB_SET:
4527c478bdstevel@tonic-gate		if ((ret = __bam_c_search(dbc, cp, key, flags, &exact)) != 0)
4537c478bdstevel@tonic-gate			goto err;
4547c478bdstevel@tonic-gate
4557c478bdstevel@tonic-gate		/*
4567c478bdstevel@tonic-gate		 * We cannot currently be referencing a deleted record, but we
4577c478bdstevel@tonic-gate		 * may be referencing off-page duplicates.
4587c478bdstevel@tonic-gate		 *
4597c478bdstevel@tonic-gate		 * If we're referencing off-page duplicates, move off-page.
4607c478bdstevel@tonic-gate		 * If we moved off-page, move to the next non-deleted record.
4617c478bdstevel@tonic-gate		 * If we moved to the next non-deleted record, check to make
4627c478bdstevel@tonic-gate		 * sure we didn't switch records because our current record
4637c478bdstevel@tonic-gate		 * had no non-deleted data items.
4647c478bdstevel@tonic-gate		 */
4657c478bdstevel@tonic-gate		start = *cp;
4667c478bdstevel@tonic-gate		if ((ret = __bam_dup(dbc, cp, cp->indx, 0)) != 0)
4677c478bdstevel@tonic-gate			goto err;
4687c478bdstevel@tonic-gate		if (cp->dpgno != PGNO_INVALID && IS_CUR_DELETED(cp)) {
4697c478bdstevel@tonic-gate			if ((ret = __bam_c_next(dbc, cp, 0)) != 0)
4707c478bdstevel@tonic-gate				goto err;
4717c478bdstevel@tonic-gate			if (!POSSIBLE_DUPLICATE(cp, start)) {
4727c478bdstevel@tonic-gate				ret = DB_NOTFOUND;
4737c478bdstevel@tonic-gate				goto err;
4747c478bdstevel@tonic-gate			}
4757c478bdstevel@tonic-gate		}
4767c478bdstevel@tonic-gate		break;
4777c478bdstevel@tonic-gate	case DB_SET_RECNO:
4787c478bdstevel@tonic-gate		if ((ret = __bam_c_search(dbc, cp, key, flags, &exact)) != 0)
4797c478bdstevel@tonic-gate			goto err;
4807c478bdstevel@tonic-gate		break;
4817c478bdstevel@tonic-gate	case DB_GET_BOTH:
4827c478bdstevel@tonic-gate		if (F_ISSET(dbc, DBC_CONTINUE | DBC_KEYSET)) {
4837c478bdstevel@tonic-gate			/* Acquire the current page. */
4847c478bdstevel@tonic-gate			if ((ret = memp_fget(dbp->mpf,
4857c478bdstevel@tonic-gate			    cp->dpgno == PGNO_INVALID ? &cp->pgno : &cp->dpgno,
4867c478bdstevel@tonic-gate			    0, &cp->page)) != 0)
4877c478bdstevel@tonic-gate				goto err;
4887c478bdstevel@tonic-gate
4897c478bdstevel@tonic-gate			/* If DBC_CONTINUE, move to the next item. */
4907c478bdstevel@tonic-gate			if (F_ISSET(dbc, DBC_CONTINUE) &&
4917c478bdstevel@tonic-gate			    (ret = __bam_c_next(dbc, cp, 1)) != 0)
4927c478bdstevel@tonic-gate				goto err;
4937c478bdstevel@tonic-gate		} else {
4947c478bdstevel@tonic-gate			if ((ret =
4957c478bdstevel@tonic-gate			    __bam_c_search(dbc, cp, key, flags, &exact)) != 0)
4967c478bdstevel@tonic-gate				goto err;
4977c478bdstevel@tonic-gate
4987c478bdstevel@tonic-gate			/*
4997c478bdstevel@tonic-gate			 * We may be referencing a duplicates page.  Move to
5007c478bdstevel@tonic-gate			 * the first duplicate.
5017c478bdstevel@tonic-gate			 */
5027c478bdstevel@tonic-gate			if ((ret = __bam_dup(dbc, cp, cp->indx, 0)) != 0)
5037c478bdstevel@tonic-gate				goto err;
5047c478bdstevel@tonic-gate		}
5057c478bdstevel@tonic-gate
5067c478bdstevel@tonic-gate		/* Search for a matching entry. */
5077c478bdstevel@tonic-gate		if ((ret = __bam_dsearch(dbc, cp, data, NULL)) != 0)
5087c478bdstevel@tonic-gate			goto err;
5097c478bdstevel@tonic-gate
5107c478bdstevel@tonic-gate		/* Ignore deleted entries. */
5117c478bdstevel@tonic-gate		if (IS_CUR_DELETED(cp)) {
5127c478bdstevel@tonic-gate			ret = DB_NOTFOUND;
5137c478bdstevel@tonic-gate			goto err;
5147c478bdstevel@tonic-gate		}
5157c478bdstevel@tonic-gate		break;
5167c478bdstevel@tonic-gate	case DB_SET_RANGE:
5177c478bdstevel@tonic-gate		if ((ret = __bam_c_search(dbc, cp, key, flags, &exact)) != 0)
5187c478bdstevel@tonic-gate			goto err;
5197c478bdstevel@tonic-gate
5207c478bdstevel@tonic-gate		/*
5217c478bdstevel@tonic-gate		 * As we didn't require an exact match, the search function
5227c478bdstevel@tonic-gate		 * may have returned an entry past the end of the page.  If
5237c478bdstevel@tonic-gate		 * so, move to the next entry.
5247c478bdstevel@tonic-gate		 */
5257c478bdstevel@tonic-gate		if (cp->indx == NUM_ENT(cp->page) &&
5267c478bdstevel@tonic-gate		    (ret = __bam_c_next(dbc, cp, 0)) != 0)
5277c478bdstevel@tonic-gate			goto err;
5287c478bdstevel@tonic-gate
5297c478bdstevel@tonic-gate		/*
5307c478bdstevel@tonic-gate		 * We may be referencing off-page duplicates, if so, move
5317c478bdstevel@tonic-gate		 * off-page.
5327c478bdstevel@tonic-gate		 */
5337c478bdstevel@tonic-gate		if ((ret = __bam_dup(dbc, cp, cp->indx, 0)) != 0)
5347c478bdstevel@tonic-gate			goto err;
5357c478bdstevel@tonic-gate
5367c478bdstevel@tonic-gate		/*
5377c478bdstevel@tonic-gate		 * We may be referencing a deleted record, if so, move to
5387c478bdstevel@tonic-gate		 * the next non-deleted record.
5397c478bdstevel@tonic-gate		 */
5407c478bdstevel@tonic-gate		if (IS_CUR_DELETED(cp) && (ret = __bam_c_next(dbc, cp, 0)) != 0)
5417c478bdstevel@tonic-gate			goto err;
5427c478bdstevel@tonic-gate		break;
5437c478bdstevel@tonic-gate	}
5447c478bdstevel@tonic-gate
5457c478bdstevel@tonic-gate	/*
5467c478bdstevel@tonic-gate	 * Return the key if the user didn't give us one.  If we've moved to
5477c478bdstevel@tonic-gate	 * a duplicate page, we may no longer have a pointer to the main page,
5487c478bdstevel@tonic-gate	 * so we have to go get it.  We know that it's already read-locked,
5497c478bdstevel@tonic-gate	 * however, so we don't have to acquire a new lock.
5507c478bdstevel@tonic-gate	 */
5517c478bdstevel@tonic-gate	if (flags != DB_SET) {
5527c478bdstevel@tonic-gate		if (cp->dpgno != PGNO_INVALID) {
5537c478bdstevel@tonic-gate			if ((ret = memp_fget(dbp->mpf, &cp->pgno, 0, &h)) != 0)
5547c478bdstevel@tonic-gate				goto err;
5557c478bdstevel@tonic-gate		} else
5567c478bdstevel@tonic-gate			h = cp->page;
5577c478bdstevel@tonic-gate		ret = __db_ret(dbp,
5587c478bdstevel@tonic-gate		    h, cp->indx, key, &dbc->rkey.data, &dbc->rkey.ulen);
5597c478bdstevel@tonic-gate		if (cp->dpgno != PGNO_INVALID)
5607c478bdstevel@tonic-gate			(void)memp_fput(dbp->mpf, h, 0);
5617c478bdstevel@tonic-gate		if (ret)
5627c478bdstevel@tonic-gate			goto err;
5637c478bdstevel@tonic-gate	}
5647c478bdstevel@tonic-gate
5657c478bdstevel@tonic-gate	/* Return the data. */
5667c478bdstevel@tonic-gate	if ((ret = __db_ret(dbp, cp->page,
5677c478bdstevel@tonic-gate	    cp->dpgno == PGNO_INVALID ? cp->indx + O_INDX : cp->dindx,
5687c478bdstevel@tonic-gate	    data, &dbc->rdata.data, &dbc->rdata.ulen)) != 0)
5697c478bdstevel@tonic-gate		goto err;
5707c478bdstevel@tonic-gate
5717c478bdstevel@tonic-gate	/*
5727c478bdstevel@tonic-gate	 * If the previous cursor record has been deleted, physically delete
5737c478bdstevel@tonic-gate	 * the entry from the page.  We clear the deleted flag before we call
5747c478bdstevel@tonic-gate	 * the underlying delete routine so that, if an error occurs, and we
5757c478bdstevel@tonic-gate	 * restore the cursor, the deleted flag is cleared.  This is because,
5767c478bdstevel@tonic-gate	 * if we manage to physically modify the page, and then restore the
5777c478bdstevel@tonic-gate	 * cursor, we might try to repeat the page modification when closing
5787c478bdstevel@tonic-gate	 * the cursor.
5797c478bdstevel@tonic-gate	 */
5807c478bdstevel@tonic-gate	if (F_ISSET(&copy, C_DELETED)) {
5817c478bdstevel@tonic-gate		F_CLR(&copy, C_DELETED);
5827c478bdstevel@tonic-gate		if ((ret = __bam_c_physdel(dbc, &copy, cp->page)) != 0)
5837c478bdstevel@tonic-gate			goto err;
5847c478bdstevel@tonic-gate	}
5857c478bdstevel@tonic-gate	F_CLR(cp, C_DELETED);
5867c478bdstevel@tonic-gate
5877c478bdstevel@tonic-gate	/* Release the previous lock, if any; the current lock is retained. */
5887c478bdstevel@tonic-gate	if (copy.lock != LOCK_INVALID)
5897c478bdstevel@tonic-gate		(void)__BT_TLPUT(dbc, copy.lock);
5907c478bdstevel@tonic-gate
5917c478bdstevel@tonic-gate	/* Release the current page. */
5927c478bdstevel@tonic-gate	if ((ret = memp_fput(dbp->mpf, cp->page, 0)) != 0)
5937c478bdstevel@tonic-gate		goto err;
5947c478bdstevel@tonic-gate
5957c478bdstevel@tonic-gate	if (0) {
5967c478bdstevel@tonic-gateerr:		if (cp->page != NULL)
5977c478bdstevel@tonic-gate			(void)memp_fput(dbp->mpf, cp->page, 0);
5987c478bdstevel@tonic-gate		if (cp->lock != LOCK_INVALID)
5997c478bdstevel@tonic-gate			(void)__BT_TLPUT(dbc, cp->lock);
6007c478bdstevel@tonic-gate		*cp = copy;
6017c478bdstevel@tonic-gate	}
6027c478bdstevel@tonic-gate
6037c478bdstevel@tonic-gate	/* Release temporary lock upgrade. */
6047c478bdstevel@tonic-gate	if (tmp_rmw)
6057c478bdstevel@tonic-gate		F_CLR(dbc, DBC_RMW);
6067c478bdstevel@tonic-gate
6077c478bdstevel@tonic-gate	return (ret);
6087c478bdstevel@tonic-gate}
6097c478bdstevel@tonic-gate
6107c478bdstevel@tonic-gate/*
6117c478bdstevel@tonic-gate * __bam_dsearch --
6127c478bdstevel@tonic-gate *	Search for a matching data item (or the first data item that's
6137c478bdstevel@tonic-gate *	equal to or greater than the one we're searching for).
6147c478bdstevel@tonic-gate */
6157c478bdstevel@tonic-gatestatic int
6167c478bdstevel@tonic-gate__bam_dsearch(dbc, cp, data, iflagp)
6177c478bdstevel@tonic-gate	DBC *dbc;
6187c478bdstevel@tonic-gate	CURSOR *cp;
6197c478bdstevel@tonic-gate	DBT *data;
6207c478bdstevel@tonic-gate	u_int32_t *iflagp;
6217c478bdstevel@tonic-gate{
6227c478bdstevel@tonic-gate	DB *dbp;
6237c478bdstevel@tonic-gate	CURSOR copy, last;
6247c478bdstevel@tonic-gate	int cmp, ret;
6257c478bdstevel@tonic-gate
6267c478bdstevel@tonic-gate	dbp = dbc->dbp;
6277c478bdstevel@tonic-gate
6287c478bdstevel@tonic-gate	/*
6297c478bdstevel@tonic-gate	 * If iflagp is non-NULL, we're doing an insert.
6307c478bdstevel@tonic-gate	 *
6317c478bdstevel@tonic-gate	 * If the duplicates are off-page, use the duplicate search routine.
6327c478bdstevel@tonic-gate	 */
6337c478bdstevel@tonic-gate	if (cp->dpgno != PGNO_INVALID) {
6347c478bdstevel@tonic-gate		if ((ret = __db_dsearch(dbc, iflagp != NULL,
6357c478bdstevel@tonic-gate		    data, cp->dpgno, &cp->dindx, &cp->page, &cmp)) != 0)
6367c478bdstevel@tonic-gate			return (ret);
6377c478bdstevel@tonic-gate		cp->dpgno = cp->page->pgno;
6387c478bdstevel@tonic-gate
6397c478bdstevel@tonic-gate		if (iflagp == NULL) {
6407c478bdstevel@tonic-gate			if (cmp != 0)
6417c478bdstevel@tonic-gate				return (DB_NOTFOUND);
6427c478bdstevel@tonic-gate			return (0);
6437c478bdstevel@tonic-gate		}
6447c478bdstevel@tonic-gate		*iflagp = DB_BEFORE;
6457c478bdstevel@tonic-gate		return (0);
6467c478bdstevel@tonic-gate	}
6477c478bdstevel@tonic-gate
6487c478bdstevel@tonic-gate	/* Otherwise, do the search ourselves. */
6497c478bdstevel@tonic-gate	copy = *cp;
6507c478bdstevel@tonic-gate	for (;;) {
6517c478bdstevel@tonic-gate		/* Save the last interesting cursor position. */
6527c478bdstevel@tonic-gate		last = *cp;
6537c478bdstevel@tonic-gate
6547c478bdstevel@tonic-gate		/* See if the data item matches the one we're looking for. */
6557c478bdstevel@tonic-gate		if ((cmp = __bam_cmp(dbp, data, cp->page, cp->indx + O_INDX,
6567c478bdstevel@tonic-gate		    dbp->dup_compare == NULL ?
6577c478bdstevel@tonic-gate		    __bam_defcmp : dbp->dup_compare)) == 0) {
6587c478bdstevel@tonic-gate			if (iflagp != NULL)
6597c478bdstevel@tonic-gate				*iflagp = DB_AFTER;
6607c478bdstevel@tonic-gate			return (0);
6617c478bdstevel@tonic-gate		}
6627c478bdstevel@tonic-gate
6637c478bdstevel@tonic-gate		/*
6647c478bdstevel@tonic-gate		 * If duplicate entries are sorted, we're done if we find a
6657c478bdstevel@tonic-gate		 * page entry that sorts greater than the application item.
6667c478bdstevel@tonic-gate		 * If doing an insert, return success, otherwise DB_NOTFOUND.
6677c478bdstevel@tonic-gate		 */
6687c478bdstevel@tonic-gate		if (dbp->dup_compare != NULL && cmp < 0) {
6697c478bdstevel@tonic-gate			if (iflagp == NULL)
6707c478bdstevel@tonic-gate				return (DB_NOTFOUND);
6717c478bdstevel@tonic-gate			*iflagp = DB_BEFORE;
6727c478bdstevel@tonic-gate			return (0);
6737c478bdstevel@tonic-gate		}
6747c478bdstevel@tonic-gate
6757c478bdstevel@tonic-gate		/*
6767c478bdstevel@tonic-gate		 * Move to the next item.  If we reach the end of the page and
6777c478bdstevel@tonic-gate		 * we're doing an insert, set the cursor to the last item and
6787c478bdstevel@tonic-gate		 * set the referenced memory location so callers know to insert
6797c478bdstevel@tonic-gate		 * after the item, instead of before it.  If not inserting, we
6807c478bdstevel@tonic-gate		 * return DB_NOTFOUND.
6817c478bdstevel@tonic-gate		 */
6827c478bdstevel@tonic-gate		if ((cp->indx += P_INDX) >= NUM_ENT(cp->page)) {
6837c478bdstevel@tonic-gate			if (iflagp == NULL)
6847c478bdstevel@tonic-gate				return (DB_NOTFOUND);
6857c478bdstevel@tonic-gate			goto use_last;
6867c478bdstevel@tonic-gate		}
6877c478bdstevel@tonic-gate
6887c478bdstevel@tonic-gate		/*
6897c478bdstevel@tonic-gate		 * Make sure we didn't go past the end of the duplicates.  The
6907c478bdstevel@tonic-gate		 * error conditions are the same as above.
6917c478bdstevel@tonic-gate		 */
6927c478bdstevel@tonic-gate		if (!POSSIBLE_DUPLICATE(cp, copy)) {
6937c478bdstevel@tonic-gate			if (iflagp == NULL)
6947c478bdstevel@tonic-gate				 return (DB_NOTFOUND);
6957c478bdstevel@tonic-gateuse_last:		*cp = last;
6967c478bdstevel@tonic-gate			*iflagp = DB_AFTER;
6977c478bdstevel@tonic-gate			return (0);
6987c478bdstevel@tonic-gate		}
6997c478bdstevel@tonic-gate	}
7007c478bdstevel@tonic-gate	/* NOTREACHED */
7017c478bdstevel@tonic-gate}
7027c478bdstevel@tonic-gate
7037c478bdstevel@tonic-gate/*
7047c478bdstevel@tonic-gate * __bam_c_rget --
7057c478bdstevel@tonic-gate *	Return the record number for a cursor.
7067c478bdstevel@tonic-gate */
7077c478bdstevel@tonic-gatestatic int
7087c478bdstevel@tonic-gate__bam_c_rget(dbc, data, flags)
7097c478bdstevel@tonic-gate	DBC *dbc;
7107c478bdstevel@tonic-gate	DBT *data;
7117c478bdstevel@tonic-gate	u_int32_t flags;
7127c478bdstevel@tonic-gate{
7137c478bdstevel@tonic-gate	CURSOR *cp;
7147c478bdstevel@tonic-gate	DB *dbp;
7157c478bdstevel@tonic-gate	DBT dbt;
7167c478bdstevel@tonic-gate	db_recno_t recno;
7177c478bdstevel@tonic-gate	int exact, ret;
7187c478bdstevel@tonic-gate
7197c478bdstevel@tonic-gate	COMPQUIET(flags, 0);
7207c478bdstevel@tonic-gate	dbp = dbc->dbp;
7217c478bdstevel@tonic-gate	cp = dbc->internal;
7227c478bdstevel@tonic-gate
7237c478bdstevel@tonic-gate	/* Get the page with the current item on it. */
7247c478bdstevel@tonic-gate	if ((ret = memp_fget(dbp->mpf, &cp->pgno, 0, &cp->page)) != 0)
7257c478bdstevel@tonic-gate		return (ret);
7267c478bdstevel@tonic-gate
7277c478bdstevel@tonic-gate	/* Get a copy of the key. */
7287c478bdstevel@tonic-gate	memset(&dbt, 0, sizeof(DBT));
7297c478bdstevel@tonic-gate	dbt.flags = DB_DBT_MALLOC | DB_DBT_INTERNAL;
7307c478bdstevel@tonic-gate	if ((ret = __db_ret(dbp, cp->page, cp->indx, &dbt, NULL, NULL)) != 0)
7317c478bdstevel@tonic-gate		goto err;
7327c478bdstevel@tonic-gate
7337c478bdstevel@tonic-gate	exact = 1;
7347c478bdstevel@tonic-gate	if ((ret = __bam_search(dbc, &dbt,
7357c478bdstevel@tonic-gate	    F_ISSET(dbc, DBC_RMW) ? S_FIND_WR : S_FIND,
7367c478bdstevel@tonic-gate	    1, &recno, &exact)) != 0)
7377c478bdstevel@tonic-gate		goto err;
7387c478bdstevel@tonic-gate
7397c478bdstevel@tonic-gate	ret = __db_retcopy(data, &recno, sizeof(recno),
7407c478bdstevel@tonic-gate	    &dbc->rdata.data, &dbc->rdata.ulen, dbp->db_malloc);
7417c478bdstevel@tonic-gate
7427c478bdstevel@tonic-gate	/* Release the stack. */
7437c478bdstevel@tonic-gate	__bam_stkrel(dbc, 0);
7447c478bdstevel@tonic-gate
7457c478bdstevel@tonic-gateerr:	(void)memp_fput(dbp->mpf, cp->page, 0);
7467c478bdstevel@tonic-gate	__os_free(dbt.data, dbt.size);
7477c478bdstevel@tonic-gate	return (ret);
7487c478bdstevel@tonic-gate}
7497c478bdstevel@tonic-gate
7507c478bdstevel@tonic-gate/*
7517c478bdstevel@tonic-gate * __bam_c_put --
7527c478bdstevel@tonic-gate *	Put using a cursor.
7537c478bdstevel@tonic-gate */
7547c478bdstevel@tonic-gatestatic int
7557c478bdstevel@tonic-gate__bam_c_put(dbc, key, data, flags)
7567c478bdstevel@tonic-gate	DBC *dbc;
7577c478bdstevel@tonic-gate	DBT *key, *data;
7587c478bdstevel@tonic-gate	u_int32_t flags;
7597c478bdstevel@tonic-gate{
7607c478bdstevel@tonic-gate	CURSOR *cp, copy;
7617c478bdstevel@tonic-gate	DB *dbp;
7627c478bdstevel@tonic-gate	DBT dbt;
7637c478bdstevel@tonic-gate	db_indx_t indx;
7647c478bdstevel@tonic-gate	db_pgno_t pgno;
7657c478bdstevel@tonic-gate	u_int32_t iiflags, iiop;
7667c478bdstevel@tonic-gate	int exact, needkey, ret, stack;
7677c478bdstevel@tonic-gate	void *arg;
7687c478bdstevel@tonic-gate
7697c478bdstevel@tonic-gate	dbp = dbc->dbp;
7707c478bdstevel@tonic-gate	cp = dbc->internal;
7717c478bdstevel@tonic-gate
7727c478bdstevel@tonic-gate	DB_PANIC_CHECK(dbp);
7737c478bdstevel@tonic-gate
7747c478bdstevel@tonic-gate	DEBUG_LWRITE(dbc, dbc->txn, "bam_c_put",
7757c478bdstevel@tonic-gate	    flags == DB_KEYFIRST || flags == DB_KEYLAST ? key : NULL,
7767c478bdstevel@tonic-gate	    data, flags);
7777c478bdstevel@tonic-gate
7787c478bdstevel@tonic-gate	if ((ret = __db_cputchk(dbp, key, data, flags,
7797c478bdstevel@tonic-gate	    F_ISSET(dbp, DB_AM_RDONLY), cp->pgno != PGNO_INVALID)) != 0)
7807c478bdstevel@tonic-gate		return (ret);
7817c478bdstevel@tonic-gate
7827c478bdstevel@tonic-gate	/*
7837c478bdstevel@tonic-gate	 * If we are running CDB, this had better be either a write
7847c478bdstevel@tonic-gate	 * cursor or an immediate writer.  If it's a regular writer,
7857c478bdstevel@tonic-gate	 * that means we have an IWRITE lock and we need to upgrade
7867c478bdstevel@tonic-gate	 * it to a write lock.
7877c478bdstevel@tonic-gate	 */
7887c478bdstevel@tonic-gate	if (F_ISSET(dbp, DB_AM_CDB)) {
7897c478bdstevel@tonic-gate		if (!F_ISSET(dbc, DBC_RMW | DBC_WRITER))
7907c478bdstevel@tonic-gate			return (EINVAL);
7917c478bdstevel@tonic-gate
7927c478bdstevel@tonic-gate		if (F_ISSET(dbc, DBC_RMW) &&
7937c478bdstevel@tonic-gate		    (ret = lock_get(dbp->dbenv->lk_info, dbc->locker,
7947c478bdstevel@tonic-gate		    DB_LOCK_UPGRADE, &dbc->lock_dbt, DB_LOCK_WRITE,
7957c478bdstevel@tonic-gate		    &dbc->mylock)) != 0)
7967c478bdstevel@tonic-gate			return (EAGAIN);
7977c478bdstevel@tonic-gate	}
7987c478bdstevel@tonic-gate
7997c478bdstevel@tonic-gate	if (0) {
8007c478bdstevel@tonic-gatesplit:		/*
8017c478bdstevel@tonic-gate		 * To split, we need a valid key for the page.  Since it's a
8027c478bdstevel@tonic-gate		 * cursor, we have to build one.
8037c478bdstevel@tonic-gate		 *
8047c478bdstevel@tonic-gate		 * Acquire a copy of a key from the page.
8057c478bdstevel@tonic-gate		 */
8067c478bdstevel@tonic-gate		if (needkey) {
8077c478bdstevel@tonic-gate			memset(&dbt, 0, sizeof(DBT));
8087c478bdstevel@tonic-gate			if ((ret = __db_ret(dbp, cp->page, indx,
8097c478bdstevel@tonic-gate			    &dbt, &dbc->rkey.data, &dbc->rkey.ulen)) != 0)
8107c478bdstevel@tonic-gate				goto err;
8117c478bdstevel@tonic-gate			arg = &dbt;
8127c478bdstevel@tonic-gate		} else
8137c478bdstevel@tonic-gate			arg = key;
8147c478bdstevel@tonic-gate
8157c478bdstevel@tonic-gate		/*
8167c478bdstevel@tonic-gate		 * Discard any locks and pinned pages (the locks are discarded
8177c478bdstevel@tonic-gate		 * even if we're running with transactions, as they lock pages
8187c478bdstevel@tonic-gate		 * that we're sorry we ever acquired).  If stack is set and the
8197c478bdstevel@tonic-gate		 * cursor entries are valid, they point to the same entries as
8207c478bdstevel@tonic-gate		 * the stack, don't free them twice.
8217c478bdstevel@tonic-gate		 */
8227c478bdstevel@tonic-gate		if (stack) {
8237c478bdstevel@tonic-gate			(void)__bam_stkrel(dbc, 1);
8247c478bdstevel@tonic-gate			stack = 0;
8257c478bdstevel@tonic-gate		} else
8267c478bdstevel@tonic-gate			DISCARD(dbc, cp);
8277c478bdstevel@tonic-gate
8287c478bdstevel@tonic-gate		/*
8297c478bdstevel@tonic-gate		 * Restore the cursor to its original value.  This is necessary
8307c478bdstevel@tonic-gate		 * for two reasons.  First, we are about to copy it in case of
8317c478bdstevel@tonic-gate		 * error, again.  Second, we adjust cursors during the split,
8327c478bdstevel@tonic-gate		 * and we have to ensure this cursor is adjusted appropriately,
8337c478bdstevel@tonic-gate		 * along with all the other cursors.
8347c478bdstevel@tonic-gate		 */
8357c478bdstevel@tonic-gate		*cp = copy;
8367c478bdstevel@tonic-gate
8377c478bdstevel@tonic-gate		if ((ret = __bam_split(dbc, arg)) != 0)
8387c478bdstevel@tonic-gate			goto err;
8397c478bdstevel@tonic-gate	}
8407c478bdstevel@tonic-gate
8417c478bdstevel@tonic-gate	/*
8427c478bdstevel@tonic-gate	 * Initialize the cursor for a new retrieval.  Clear the cursor's
8437c478bdstevel@tonic-gate	 * page pointer, it was set before this operation, and no longer
8447c478bdstevel@tonic-gate	 * has any meaning.
8457c478bdstevel@tonic-gate	 */
8467c478bdstevel@tonic-gate	cp->page = NULL;
8477c478bdstevel@tonic-gate	copy = *cp;
8487c478bdstevel@tonic-gate	cp->lock = LOCK_INVALID;
8497c478bdstevel@tonic-gate
8507c478bdstevel@tonic-gate	iiflags = needkey = ret = stack = 0;
8517c478bdstevel@tonic-gate	switch (flags) {
8527c478bdstevel@tonic-gate	case DB_AFTER:
8537c478bdstevel@tonic-gate	case DB_BEFORE:
8547c478bdstevel@tonic-gate	case DB_CURRENT:
8557c478bdstevel@tonic-gate		needkey = 1;
8567c478bdstevel@tonic-gate		if (cp->dpgno == PGNO_INVALID) {
8577c478bdstevel@tonic-gate			pgno = cp->pgno;
8587c478bdstevel@tonic-gate			indx = cp->indx;
8597c478bdstevel@tonic-gate		} else {
8607c478bdstevel@tonic-gate			pgno = cp->dpgno;
8617c478bdstevel@tonic-gate			indx = cp->dindx;
8627c478bdstevel@tonic-gate		}
8637c478bdstevel@tonic-gate
8647c478bdstevel@tonic-gate		/*
8657c478bdstevel@tonic-gate		 * !!!
8667c478bdstevel@tonic-gate		 * This test is right -- we don't yet support duplicates and
8677c478bdstevel@tonic-gate		 * record numbers in the same tree, so ignore duplicates if
8687c478bdstevel@tonic-gate		 * DB_BT_RECNUM set.
8697c478bdstevel@tonic-gate		 */
8707c478bdstevel@tonic-gate		if (F_ISSET(dbp, DB_BT_RECNUM) &&
8717c478bdstevel@tonic-gate		    (flags != DB_CURRENT || F_ISSET(cp, C_DELETED))) {
8727c478bdstevel@tonic-gate			/* Acquire a complete stack. */
8737c478bdstevel@tonic-gate			if ((ret = __bam_c_getstack(dbc, cp)) != 0)
8747c478bdstevel@tonic-gate				goto err;
8757c478bdstevel@tonic-gate			cp->page = cp->csp->page;
8767c478bdstevel@tonic-gate
8777c478bdstevel@tonic-gate			stack = 1;
8787c478bdstevel@tonic-gate			iiflags = BI_DOINCR;
8797c478bdstevel@tonic-gate		} else {
8807c478bdstevel@tonic-gate			/* Acquire the current page. */
8817c478bdstevel@tonic-gate			if ((ret = __bam_lget(dbc,
8827c478bdstevel@tonic-gate			    0, cp->pgno, DB_LOCK_WRITE, &cp->lock)) == 0)
8837c478bdstevel@tonic-gate				ret = memp_fget(dbp->mpf, &pgno, 0, &cp->page);
8847c478bdstevel@tonic-gate			if (ret != 0)
8857c478bdstevel@tonic-gate				goto err;
8867c478bdstevel@tonic-gate
8877c478bdstevel@tonic-gate			iiflags = 0;
8887c478bdstevel@tonic-gate		}
8897c478bdstevel@tonic-gate
8907c478bdstevel@tonic-gate		/*
8917c478bdstevel@tonic-gate		 * If the user has specified a duplicate comparison function,
8927c478bdstevel@tonic-gate		 * we return an error if DB_CURRENT was specified and the
8937c478bdstevel@tonic-gate		 * replacement data doesn't compare equal to the current data.
8947c478bdstevel@tonic-gate		 * This stops apps from screwing up the duplicate sort order.
8957c478bdstevel@tonic-gate		 */
8967c478bdstevel@tonic-gate		if (flags == DB_CURRENT && dbp->dup_compare != NULL)
8977c478bdstevel@tonic-gate			if (__bam_cmp(dbp, data,
8987c478bdstevel@tonic-gate			    cp->page, indx, dbp->dup_compare) != 0) {
8997c478bdstevel@tonic-gate				ret = EINVAL;
9007c478bdstevel@tonic-gate				goto err;
9017c478bdstevel@tonic-gate			}
9027c478bdstevel@tonic-gate
9037c478bdstevel@tonic-gate		iiop = flags;
9047c478bdstevel@tonic-gate		break;
9057c478bdstevel@tonic-gate	case DB_KEYFIRST:
9067c478bdstevel@tonic-gate	case DB_KEYLAST:
9077c478bdstevel@tonic-gate		/*
9087c478bdstevel@tonic-gate		 * If we have a duplicate comparison function, we position to
9097c478bdstevel@tonic-gate		 * the first of any on-page duplicates, and use __bam_dsearch
9107c478bdstevel@tonic-gate		 * to search for the right slot.  Otherwise, we position to
9117c478bdstevel@tonic-gate		 * the first/last of any on-page duplicates based on the flag
9127c478bdstevel@tonic-gate		 * value.
9137c478bdstevel@tonic-gate		 */
9147c478bdstevel@tonic-gate		if ((ret = __bam_c_search(dbc, cp, key,
9157c478bdstevel@tonic-gate		    flags == DB_KEYFIRST || dbp->dup_compare != NULL ?
9167c478bdstevel@tonic-gate		    DB_KEYFIRST : DB_KEYLAST, &exact)) != 0)
9177c478bdstevel@tonic-gate			goto err;
9187c478bdstevel@tonic-gate		stack = 1;
9197c478bdstevel@tonic-gate
9207c478bdstevel@tonic-gate		/*
9217c478bdstevel@tonic-gate		 * If an exact match:
9227c478bdstevel@tonic-gate		 *	If duplicates aren't supported, replace the current
9237c478bdstevel@tonic-gate		 *	item.  (When implementing the DB->put function, our
9247c478bdstevel@tonic-gate		 *	caller has already checked the DB_NOOVERWRITE flag.)
9257c478bdstevel@tonic-gate		 *
9267c478bdstevel@tonic-gate		 *	If there's a duplicate comparison function, find the
9277c478bdstevel@tonic-gate		 *	correct slot for this duplicate item.
9287c478bdstevel@tonic-gate		 *
9297c478bdstevel@tonic-gate		 *	If there's no duplicate comparison function, set the
9307c478bdstevel@tonic-gate		 *	insert flag based on the argument flags.
9317c478bdstevel@tonic-gate		 *
9327c478bdstevel@tonic-gate		 * If there's no match, the search function returned the
9337c478bdstevel@tonic-gate		 * smallest slot greater than the key, use it.
9347c478bdstevel@tonic-gate		 */
9357c478bdstevel@tonic-gate		if (exact) {
9367c478bdstevel@tonic-gate			if (F_ISSET(dbp, DB_AM_DUP)) {
9377c478bdstevel@tonic-gate				/*
9387c478bdstevel@tonic-gate				 * If at off-page duplicate page, move to the
9397c478bdstevel@tonic-gate				 * first or last entry -- if a comparison
9407c478bdstevel@tonic-gate				 * function was specified, start searching at
9417c478bdstevel@tonic-gate				 * the first entry.  Otherwise, move based on
9427c478bdstevel@tonic-gate				 * the DB_KEYFIRST/DB_KEYLAST flags.
9437c478bdstevel@tonic-gate				 */
9447c478bdstevel@tonic-gate				if ((ret = __bam_dup(dbc, cp, cp->indx,
9457c478bdstevel@tonic-gate				    dbp->dup_compare == NULL &&
9467c478bdstevel@tonic-gate				    flags != DB_KEYFIRST)) != 0)
9477c478bdstevel@tonic-gate					goto err;
9487c478bdstevel@tonic-gate
9497c478bdstevel@tonic-gate				/*
9507c478bdstevel@tonic-gate				 * If there's a comparison function, search for
9517c478bdstevel@tonic-gate				 * the correct slot.  Otherwise, set the insert
9527c478bdstevel@tonic-gate				 * flag based on the argment flag.
9537c478bdstevel@tonic-gate				 */
9547c478bdstevel@tonic-gate				if (dbp->dup_compare == NULL)
9557c478bdstevel@tonic-gate					iiop = flags == DB_KEYFIRST ?
9567c478bdstevel@tonic-gate					    DB_BEFORE : DB_AFTER;
9577c478bdstevel@tonic-gate				else
9587c478bdstevel@tonic-gate					if ((ret = __bam_dsearch(dbc,
9597c478bdstevel@tonic-gate					    cp, data, &iiop)) != 0)
9607c478bdstevel@tonic-gate						goto err;
9617c478bdstevel@tonic-gate			} else
9627c478bdstevel@tonic-gate				iiop = DB_CURRENT;
9637c478bdstevel@tonic-gate			iiflags = 0;
9647c478bdstevel@tonic-gate		} else {
9657c478bdstevel@tonic-gate			iiop = DB_BEFORE;
9667c478bdstevel@tonic-gate			iiflags = BI_NEWKEY;
9677c478bdstevel@tonic-gate		}
9687c478bdstevel@tonic-gate
9697c478bdstevel@tonic-gate		if (cp->dpgno == PGNO_INVALID) {
9707c478bdstevel@tonic-gate			pgno = cp->pgno;
9717c478bdstevel@tonic-gate			indx = cp->indx;
9727c478bdstevel@tonic-gate		} else {
9737c478bdstevel@tonic-gate			pgno = cp->dpgno;
9747c478bdstevel@tonic-gate			indx = cp->dindx;
9757c478bdstevel@tonic-gate		}
9767c478bdstevel@tonic-gate		break;
9777c478bdstevel@tonic-gate	}
9787c478bdstevel@tonic-gate
9797c478bdstevel@tonic-gate	ret = __bam_iitem(dbc, &cp->page, &indx, key, data, iiop, iiflags);
9807c478bdstevel@tonic-gate
9817c478bdstevel@tonic-gate	if (ret == DB_NEEDSPLIT)
9827c478bdstevel@tonic-gate		goto split;
9837c478bdstevel@tonic-gate	if (ret != 0)
9847c478bdstevel@tonic-gate		goto err;
9857c478bdstevel@tonic-gate
9867c478bdstevel@tonic-gate	/*
9877c478bdstevel@tonic-gate	 * Reset any cursors referencing this item that might have the item
9887c478bdstevel@tonic-gate	 * marked for deletion.
9897c478bdstevel@tonic-gate	 */
9907c478bdstevel@tonic-gate	if (iiop == DB_CURRENT) {
9917c478bdstevel@tonic-gate		(void)__bam_ca_delete(dbp, pgno, indx, 0);
9927c478bdstevel@tonic-gate
9937c478bdstevel@tonic-gate		/*
9947c478bdstevel@tonic-gate		 * It's also possible that we are the cursor that had the
9957c478bdstevel@tonic-gate		 * item marked for deletion, in which case we want to make
9967c478bdstevel@tonic-gate		 * sure that we don't delete it because we had the delete
9977c478bdstevel@tonic-gate		 * flag set already.
9987c478bdstevel@tonic-gate		 */
9997c478bdstevel@tonic-gate		if (cp->pgno == copy.pgno && cp->indx == copy.indx &&
10007c478bdstevel@tonic-gate		    cp->dpgno == copy.dpgno && cp->dindx == copy.dindx)
10017c478bdstevel@tonic-gate			F_CLR(&copy, C_DELETED);
10027c478bdstevel@tonic-gate	}
10037c478bdstevel@tonic-gate
10047c478bdstevel@tonic-gate	/*
10057c478bdstevel@tonic-gate	 * Update the cursor to point to the new entry.  The new entry was
10067c478bdstevel@tonic-gate	 * stored on the current page, because we split pages until it was
10077c478bdstevel@tonic-gate	 * possible.
10087c478bdstevel@tonic-gate	 */
10097c478bdstevel@tonic-gate	if (cp->dpgno == PGNO_INVALID)
10107c478bdstevel@tonic-gate		cp->indx = indx;
10117c478bdstevel@tonic-gate	else
10127c478bdstevel@tonic-gate		cp->dindx = indx;
10137c478bdstevel@tonic-gate
10147c478bdstevel@tonic-gate	/*
10157c478bdstevel@tonic-gate	 * If the previous cursor record has been deleted, physically delete
10167c478bdstevel@tonic-gate	 * the entry from the page.  We clear the deleted flag before we call
10177c478bdstevel@tonic-gate	 * the underlying delete routine so that, if an error occurs, and we
10187c478bdstevel@tonic-gate	 * restore the cursor, the deleted flag is cleared.  This is because,
10197c478bdstevel@tonic-gate	 * if we manage to physically modify the page, and then restore the
10207c478bdstevel@tonic-gate	 * cursor, we might try to repeat the page modification when closing
10217c478bdstevel@tonic-gate	 * the cursor.
10227c478bdstevel@tonic-gate	 */
10237c478bdstevel@tonic-gate	if (F_ISSET(&copy, C_DELETED)) {
10247c478bdstevel@tonic-gate		F_CLR(&copy, C_DELETED);
10257c478bdstevel@tonic-gate		if ((ret = __bam_c_physdel(dbc, &copy, cp->page)) != 0)
10267c478bdstevel@tonic-gate			goto err;
10277c478bdstevel@tonic-gate	}
10287c478bdstevel@tonic-gate	F_CLR(cp, C_DELETED);
10297c478bdstevel@tonic-gate
10307c478bdstevel@tonic-gate	/* Release the previous lock, if any; the current lock is retained. */
10317c478bdstevel@tonic-gate	if (copy.lock != LOCK_INVALID)
10327c478bdstevel@tonic-gate		(void)__BT_TLPUT(dbc, copy.lock);
10337c478bdstevel@tonic-gate
10347c478bdstevel@tonic-gate	/*
10357c478bdstevel@tonic-gate	 * Discard any pages pinned in the tree and their locks, except for
10367c478bdstevel@tonic-gate	 * the leaf page, for which we only discard the pin, not the lock.
10377c478bdstevel@tonic-gate	 *
10387c478bdstevel@tonic-gate	 * Note, the leaf page participated in the stack we acquired, and so
10397c478bdstevel@tonic-gate	 * we have to adjust the stack as necessary.  If there was only a
10407c478bdstevel@tonic-gate	 * single page on the stack, we don't have to free further stack pages.
10417c478bdstevel@tonic-gate	 */
10427c478bdstevel@tonic-gate	if (stack && BT_STK_POP(cp) != NULL)
10437c478bdstevel@tonic-gate		(void)__bam_stkrel(dbc, 0);
10447c478bdstevel@tonic-gate
10457c478bdstevel@tonic-gate	/* Release the current page. */
10467c478bdstevel@tonic-gate	if ((ret = memp_fput(dbp->mpf, cp->page, 0)) != 0)
10477c478bdstevel@tonic-gate		goto err;
10487c478bdstevel@tonic-gate
10497c478bdstevel@tonic-gate	if (0) {
10507c478bdstevel@tonic-gateerr:		/* Discard any pinned pages. */
10517c478bdstevel@tonic-gate		if (stack)
10527c478bdstevel@tonic-gate			(void)__bam_stkrel(dbc, 0);
10537c478bdstevel@tonic-gate		else
10547c478bdstevel@tonic-gate			DISCARD(dbc, cp);
10557c478bdstevel@tonic-gate		*cp = copy;
10567c478bdstevel@tonic-gate	}
10577c478bdstevel@tonic-gate
10587c478bdstevel@tonic-gate	if (F_ISSET(dbp, DB_AM_CDB) && F_ISSET(dbc, DBC_RMW))
10597c478bdstevel@tonic-gate		(void)__lock_downgrade(dbp->dbenv->lk_info, dbc->mylock,
10607c478bdstevel@tonic-gate		    DB_LOCK_IWRITE, 0);
10617c478bdstevel@tonic-gate
10627c478bdstevel@tonic-gate	return (ret);
10637c478bdstevel@tonic-gate}
10647c478bdstevel@tonic-gate
10657c478bdstevel@tonic-gate/*
10667c478bdstevel@tonic-gate * __bam_c_first --
10677c478bdstevel@tonic-gate *	Return the first record.
10687c478bdstevel@tonic-gate */
10697c478bdstevel@tonic-gatestatic int
10707c478bdstevel@tonic-gate__bam_c_first(dbc, cp)
10717c478bdstevel@tonic-gate	DBC *dbc;
10727c478bdstevel@tonic-gate	CURSOR *cp;
10737c478bdstevel@tonic-gate{
10747c478bdstevel@tonic-gate	DB *dbp;
10757c478bdstevel@tonic-gate	db_pgno_t pgno;
10767c478bdstevel@tonic-gate	int ret;
10777c478bdstevel@tonic-gate
10787c478bdstevel@tonic-gate	dbp = dbc->dbp;
10797c478bdstevel@tonic-gate
10807c478bdstevel@tonic-gate	/* Walk down the left-hand side of the tree. */
10817c478bdstevel@tonic-gate	for (pgno = PGNO_ROOT;;) {
10827c478bdstevel@tonic-gate		if ((ret =
10837c478bdstevel@tonic-gate		    __bam_lget(dbc, 0, pgno, DB_LOCK_READ, &cp->lock)) != 0)
10847c478bdstevel@tonic-gate			return (ret);
10857c478bdstevel@tonic-gate		if ((ret = memp_fget(dbp->mpf, &pgno, 0, &cp->page)) != 0)
10867c478bdstevel@tonic-gate			return (ret);
10877c478bdstevel@tonic-gate
10887c478bdstevel@tonic-gate		/* If we find a leaf page, we're done. */
10897c478bdstevel@tonic-gate		if (ISLEAF(cp->page))
10907c478bdstevel@tonic-gate			break;
10917c478bdstevel@tonic-gate
10927c478bdstevel@tonic-gate		pgno = GET_BINTERNAL(cp->page, 0)->pgno;
10937c478bdstevel@tonic-gate		DISCARD(dbc, cp);
10947c478bdstevel@tonic-gate	}
10957c478bdstevel@tonic-gate
10967c478bdstevel@tonic-gate	cp->pgno = cp->page->pgno;
10977c478bdstevel@tonic-gate	cp->indx = 0;
10987c478bdstevel@tonic-gate	cp->dpgno = PGNO_INVALID;
10997c478bdstevel@tonic-gate
11007c478bdstevel@tonic-gate	/* Check for duplicates. */
11017c478bdstevel@tonic-gate	if ((ret = __bam_dup(dbc, cp, cp->indx, 0)) != 0)
11027c478bdstevel@tonic-gate		return (ret);
11037c478bdstevel@tonic-gate
11047c478bdstevel@tonic-gate	/* If on an empty page or a deleted record, move to the next one. */
11057c478bdstevel@tonic-gate	if (NUM_ENT(cp->page) == 0 || IS_CUR_DELETED(cp))
11067c478bdstevel@tonic-gate		if ((ret = __bam_c_next(dbc, cp, 0)) != 0)
11077c478bdstevel@tonic-gate			return (ret);
11087c478bdstevel@tonic-gate
11097c478bdstevel@tonic-gate	return (0);
11107c478bdstevel@tonic-gate}
11117c478bdstevel@tonic-gate
11127c478bdstevel@tonic-gate/*
11137c478bdstevel@tonic-gate * __bam_c_last --
11147c478bdstevel@tonic-gate *	Return the last record.
11157c478bdstevel@tonic-gate */
11167c478bdstevel@tonic-gatestatic int
11177c478bdstevel@tonic-gate__bam_c_last(dbc, cp)
11187c478bdstevel@tonic-gate	DBC *dbc;
11197c478bdstevel@tonic-gate	CURSOR *cp;
11207c478bdstevel@tonic-gate{
11217c478bdstevel@tonic-gate	DB *dbp;
11227c478bdstevel@tonic-gate	db_pgno_t pgno;
11237c478bdstevel@tonic-gate	int ret;
11247c478bdstevel@tonic-gate
11257c478bdstevel@tonic-gate	dbp = dbc->dbp;
11267c478bdstevel@tonic-gate
11277c478bdstevel@tonic-gate	/* Walk down the right-hand side of the tree. */
11287c478bdstevel@tonic-gate	for (pgno = PGNO_ROOT;;) {
11297c478bdstevel@tonic-gate		if ((ret =
11307c478bdstevel@tonic-gate		    __bam_lget(dbc, 0, pgno, DB_LOCK_READ, &cp->lock)) != 0)
11317c478bdstevel@tonic-gate			return (ret);
11327c478bdstevel@tonic-gate		if ((ret = memp_fget(dbp->mpf, &pgno, 0, &cp->page)) != 0)
11337c478bdstevel@tonic-gate			return (ret);
11347c478bdstevel@tonic-gate
11357c478bdstevel@tonic-gate		/* If we find a leaf page, we're done. */
11367c478bdstevel@tonic-gate		if (ISLEAF(cp->page))
11377c478bdstevel@tonic-gate			break;
11387c478bdstevel@tonic-gate
11397c478bdstevel@tonic-gate		pgno =
11407c478bdstevel@tonic-gate		    GET_BINTERNAL(cp->page, NUM_ENT(cp->page) - O_INDX)->pgno;
11417c478bdstevel@tonic-gate		DISCARD(dbc, cp);
11427c478bdstevel@tonic-gate	}
11437c478bdstevel@tonic-gate
11447c478bdstevel@tonic-gate	cp->pgno = cp->page->pgno;
11457c478bdstevel@tonic-gate	cp->indx = NUM_ENT(cp->page) == 0 ? 0 : NUM_ENT(cp->page) - P_INDX;
11467c478bdstevel@tonic-gate	cp->dpgno = PGNO_INVALID;
11477c478bdstevel@tonic-gate
11487c478bdstevel@tonic-gate	/* Check for duplicates. */
11497c478bdstevel@tonic-gate	if ((ret = __bam_dup(dbc, cp, cp->indx, 1)) != 0)
11507c478bdstevel@tonic-gate		return (ret);
11517c478bdstevel@tonic-gate
11527c478bdstevel@tonic-gate	/* If on an empty page or a deleted record, move to the next one. */
11537c478bdstevel@tonic-gate	if (NUM_ENT(cp->page) == 0 || IS_CUR_DELETED(cp))
11547c478bdstevel@tonic-gate		if ((ret = __bam_c_prev(dbc, cp)) != 0)
11557c478bdstevel@tonic-gate			return (ret);
11567c478bdstevel@tonic-gate
11577c478bdstevel@tonic-gate	return (0);
11587c478bdstevel@tonic-gate}
11597c478bdstevel@tonic-gate
11607c478bdstevel@tonic-gate/*
11617c478bdstevel@tonic-gate * __bam_c_next --
11627c478bdstevel@tonic-gate *	Move to the next record.
11637c478bdstevel@tonic-gate */
11647c478bdstevel@tonic-gatestatic int
11657c478bdstevel@tonic-gate__bam_c_next(dbc, cp, initial_move)
11667c478bdstevel@tonic-gate	DBC *dbc;
11677c478bdstevel@tonic-gate	CURSOR *cp;
11687c478bdstevel@tonic-gate	int initial_move;
11697c478bdstevel@tonic-gate{
11707c478bdstevel@tonic-gate	DB *dbp;
11717c478bdstevel@tonic-gate	db_indx_t adjust, indx;
11727c478bdstevel@tonic-gate	db_pgno_t pgno;
11737c478bdstevel@tonic-gate	int ret;
11747c478bdstevel@tonic-gate
11757c478bdstevel@tonic-gate	dbp = dbc->dbp;
11767c478bdstevel@tonic-gate
11777c478bdstevel@tonic-gate	/*
11787c478bdstevel@tonic-gate	 * We're either moving through a page of duplicates or a btree leaf
11797c478bdstevel@tonic-gate	 * page.
11807c478bdstevel@tonic-gate	 */
11817c478bdstevel@tonic-gate	if (cp->dpgno == PGNO_INVALID) {
11827c478bdstevel@tonic-gate		adjust = dbp->type == DB_BTREE ? P_INDX : O_INDX;
11837c478bdstevel@tonic-gate		pgno = cp->pgno;
11847c478bdstevel@tonic-gate		indx = cp->indx;
11857c478bdstevel@tonic-gate	} else {
11867c478bdstevel@tonic-gate		adjust = O_INDX;
11877c478bdstevel@tonic-gate		pgno = cp->dpgno;
11887c478bdstevel@tonic-gate		indx = cp->dindx;
11897c478bdstevel@tonic-gate	}
11907c478bdstevel@tonic-gate	if (cp->page == NULL) {
11917c478bdstevel@tonic-gate		if ((ret =
11927c478bdstevel@tonic-gate		    __bam_lget(dbc, 0, pgno, DB_LOCK_READ, &cp->lock)) != 0)
11937c478bdstevel@tonic-gate			return (ret);
11947c478bdstevel@tonic-gate		if ((ret = memp_fget(dbp->mpf, &pgno, 0, &cp->page)) != 0)
11957c478bdstevel@tonic-gate			return (ret);
11967c478bdstevel@tonic-gate	}
11977c478bdstevel@tonic-gate
11987c478bdstevel@tonic-gate	/*
11997c478bdstevel@tonic-gate	 * If at the end of the page, move to a subsequent page.
12007c478bdstevel@tonic-gate	 *
12017c478bdstevel@tonic-gate	 * !!!
12027c478bdstevel@tonic-gate	 * Check for >= NUM_ENT.  If we're here as the result of a search that
12037c478bdstevel@tonic-gate	 * landed us on NUM_ENT, we'll increment indx before we test.
12047c478bdstevel@tonic-gate	 *
12057c478bdstevel@tonic-gate	 * !!!
12067c478bdstevel@tonic-gate	 * This code handles empty pages and pages with only deleted entries.
12077c478bdstevel@tonic-gate	 */
12087c478bdstevel@tonic-gate	if (initial_move)
12097c478bdstevel@tonic-gate		indx += adjust;
12107c478bdstevel@tonic-gate	for (;;) {
12117c478bdstevel@tonic-gate		if (indx >= NUM_ENT(cp->page)) {
12127c478bdstevel@tonic-gate			/*
12137c478bdstevel@tonic-gate			 * If we're in a btree leaf page, we've reached the end
12147c478bdstevel@tonic-gate			 * of the tree.  If we've reached the end of a page of
12157c478bdstevel@tonic-gate			 * duplicates, continue from the btree leaf page where
12167c478bdstevel@tonic-gate			 * we found this page of duplicates.
12177c478bdstevel@tonic-gate			 */
12187c478bdstevel@tonic-gate			pgno = cp->page->next_pgno;
12197c478bdstevel@tonic-gate			if (pgno == PGNO_INVALID) {
12207c478bdstevel@tonic-gate				/* If in a btree leaf page, it's EOF. */
12217c478bdstevel@tonic-gate				if (cp->dpgno == PGNO_INVALID)
12227c478bdstevel@tonic-gate					return (DB_NOTFOUND);
12237c478bdstevel@tonic-gate
12247c478bdstevel@tonic-gate				/* Continue from the last btree leaf page. */
12257c478bdstevel@tonic-gate				cp->dpgno = PGNO_INVALID;
12267c478bdstevel@tonic-gate
12277c478bdstevel@tonic-gate				adjust = P_INDX;
12287c478bdstevel@tonic-gate				pgno = cp->pgno;
12297c478bdstevel@tonic-gate				indx = cp->indx + P_INDX;
12307c478bdstevel@tonic-gate			} else
12317c478bdstevel@tonic-gate				indx = 0;
12327c478bdstevel@tonic-gate
12337c478bdstevel@tonic-gate			DISCARD(dbc, cp);
12347c478bdstevel@tonic-gate			if ((ret = __bam_lget(dbc,
12357c478bdstevel@tonic-gate			    0, pgno, DB_LOCK_READ, &cp->lock)) != 0)
12367c478bdstevel@tonic-gate				return (ret);
12377c478bdstevel@tonic-gate			if ((ret =
12387c478bdstevel@tonic-gate			    memp_fget(dbp->mpf, &pgno, 0, &cp->page)) != 0)
12397c478bdstevel@tonic-gate				return (ret);
12407c478bdstevel@tonic-gate			continue;
12417c478bdstevel@tonic-gate		}
12427c478bdstevel@tonic-gate
12437c478bdstevel@tonic-gate		/* Ignore deleted records. */
12447c478bdstevel@tonic-gate		if (IS_DELETED(cp, indx)) {
12457c478bdstevel@tonic-gate			indx += adjust;
12467c478bdstevel@tonic-gate			continue;
12477c478bdstevel@tonic-gate		}
12487c478bdstevel@tonic-gate
12497c478bdstevel@tonic-gate		/*
12507c478bdstevel@tonic-gate		 * If we're not in a duplicates page, check to see if we've
12517c478bdstevel@tonic-gate		 * found a page of duplicates, in which case we move to the
12527c478bdstevel@tonic-gate		 * first entry.
12537c478bdstevel@tonic-gate		 */
12547c478bdstevel@tonic-gate		if (cp->dpgno == PGNO_INVALID) {
12557c478bdstevel@tonic-gate			cp->pgno = cp->page->pgno;
12567c478bdstevel@tonic-gate			cp->indx = indx;
12577c478bdstevel@tonic-gate
12587c478bdstevel@tonic-gate			if ((ret = __bam_dup(dbc, cp, indx, 0)) != 0)
12597c478bdstevel@tonic-gate				return (ret);
12607c478bdstevel@tonic-gate			if (cp->dpgno != PGNO_INVALID) {
12617c478bdstevel@tonic-gate				indx = cp->dindx;
12627c478bdstevel@tonic-gate				adjust = O_INDX;
12637c478bdstevel@tonic-gate				continue;
12647c478bdstevel@tonic-gate			}
12657c478bdstevel@tonic-gate		} else {
12667c478bdstevel@tonic-gate			cp->dpgno = cp->page->pgno;
12677c478bdstevel@tonic-gate			cp->dindx = indx;
12687c478bdstevel@tonic-gate		}
12697c478bdstevel@tonic-gate		break;
12707c478bdstevel@tonic-gate	}
12717c478bdstevel@tonic-gate	return (0);
12727c478bdstevel@tonic-gate}
12737c478bdstevel@tonic-gate
12747c478bdstevel@tonic-gate/*
12757c478bdstevel@tonic-gate * __bam_c_prev --
12767c478bdstevel@tonic-gate *	Move to the previous record.
12777c478bdstevel@tonic-gate */
12787c478bdstevel@tonic-gatestatic int
12797c478bdstevel@tonic-gate__bam_c_prev(dbc, cp)
12807c478bdstevel@tonic-gate	DBC *dbc;
12817c478bdstevel@tonic-gate	CURSOR *cp;
12827c478bdstevel@tonic-gate{
12837c478bdstevel@tonic-gate	DB *dbp;
12847c478bdstevel@tonic-gate	db_indx_t indx, adjust;
12857c478bdstevel@tonic-gate	db_pgno_t pgno;
12867c478bdstevel@tonic-gate	int ret, set_indx;
12877c478bdstevel@tonic-gate
12887c478bdstevel@tonic-gate	dbp = dbc->dbp;
12897c478bdstevel@tonic-gate
12907c478bdstevel@tonic-gate	/*
12917c478bdstevel@tonic-gate	 * We're either moving through a page of duplicates or a btree leaf
12927c478bdstevel@tonic-gate	 * page.
12937c478bdstevel@tonic-gate	 */
12947c478bdstevel@tonic-gate	if (cp->dpgno == PGNO_INVALID) {
12957c478bdstevel@tonic-gate		adjust = dbp->type == DB_BTREE ? P_INDX : O_INDX;
12967c478bdstevel@tonic-gate		pgno = cp->pgno;
12977c478bdstevel@tonic-gate		indx = cp->indx;
12987c478bdstevel@tonic-gate	} else {
12997c478bdstevel@tonic-gate		adjust = O_INDX;
13007c478bdstevel@tonic-gate		pgno = cp->dpgno;
13017c478bdstevel@tonic-gate		indx = cp->dindx;
13027c478bdstevel@tonic-gate	}
13037c478bdstevel@tonic-gate	if (cp->page == NULL) {
13047c478bdstevel@tonic-gate		if ((ret =
13057c478bdstevel@tonic-gate		    __bam_lget(dbc, 0, pgno, DB_LOCK_READ, &cp->lock)) != 0)
13067c478bdstevel@tonic-gate			return (ret);
13077c478bdstevel@tonic-gate		if ((ret = memp_fget(dbp->mpf, &pgno, 0, &cp->page)) != 0)
13087c478bdstevel@tonic-gate			return (ret);
13097c478bdstevel@tonic-gate	}
13107c478bdstevel@tonic-gate
13117c478bdstevel@tonic-gate	/*
13127c478bdstevel@tonic-gate	 * If at the beginning of the page, move to any previous one.
13137c478bdstevel@tonic-gate	 *
13147c478bdstevel@tonic-gate	 * !!!
13157c478bdstevel@tonic-gate	 * This code handles empty pages and pages with only deleted entries.
13167c478bdstevel@tonic-gate	 */
13177c478bdstevel@tonic-gate	for (;;) {
13187c478bdstevel@tonic-gate		if (indx == 0) {
13197c478bdstevel@tonic-gate			/*
13207c478bdstevel@tonic-gate			 * If we're in a btree leaf page, we've reached the
13217c478bdstevel@tonic-gate			 * beginning of the tree.  If we've reached the first
13227c478bdstevel@tonic-gate			 * of a page of duplicates, continue from the btree
13237c478bdstevel@tonic-gate			 * leaf page where we found this page of duplicates.
13247c478bdstevel@tonic-gate			 */
13257c478bdstevel@tonic-gate			pgno = cp->page->prev_pgno;
13267c478bdstevel@tonic-gate			if (pgno == PGNO_INVALID) {
13277c478bdstevel@tonic-gate				/* If in a btree leaf page, it's SOF. */
13287c478bdstevel@tonic-gate				if (cp->dpgno == PGNO_INVALID)
13297c478bdstevel@tonic-gate					return (DB_NOTFOUND);
13307c478bdstevel@tonic-gate
13317c478bdstevel@tonic-gate				/* Continue from the last btree leaf page. */
13327c478bdstevel@tonic-gate				cp->dpgno = PGNO_INVALID;
13337c478bdstevel@tonic-gate
13347c478bdstevel@tonic-gate				adjust = P_INDX;
13357c478bdstevel@tonic-gate				pgno = cp->pgno;
13367c478bdstevel@tonic-gate				indx = cp->indx;
13377c478bdstevel@tonic-gate				set_indx = 0;
13387c478bdstevel@tonic-gate			} else
13397c478bdstevel@tonic-gate				set_indx = 1;
13407c478bdstevel@tonic-gate
13417c478bdstevel@tonic-gate			DISCARD(dbc, cp);
13427c478bdstevel@tonic-gate			if ((ret = __bam_lget(dbc,
13437c478bdstevel@tonic-gate			    0, pgno, DB_LOCK_READ, &cp->lock)) != 0)
13447c478bdstevel@tonic-gate				return (ret);
13457c478bdstevel@tonic-gate			if ((ret =
13467c478bdstevel@tonic-gate			    memp_fget(dbp->mpf, &pgno, 0, &cp->page)) != 0)
13477c478bdstevel@tonic-gate				return (ret);
13487c478bdstevel@tonic-gate
13497c478bdstevel@tonic-gate			if (set_indx)
13507c478bdstevel@tonic-gate				indx = NUM_ENT(cp->page);
13517c478bdstevel@tonic-gate			if (indx == 0)
13527c478bdstevel@tonic-gate				continue;
13537c478bdstevel@tonic-gate		}
13547c478bdstevel@tonic-gate
13557c478bdstevel@tonic-gate		/* Ignore deleted records. */
13567c478bdstevel@tonic-gate		indx -= adjust;
13577c478bdstevel@tonic-gate		if (IS_DELETED(cp, indx))
13587c478bdstevel@tonic-gate			continue;
13597c478bdstevel@tonic-gate
13607c478bdstevel@tonic-gate		/*
13617c478bdstevel@tonic-gate		 * If we're not in a duplicates page, check to see if we've
13627c478bdstevel@tonic-gate		 * found a page of duplicates, in which case we move to the
13637c478bdstevel@tonic-gate		 * last entry.
13647c478bdstevel@tonic-gate		 */
13657c478bdstevel@tonic-gate		if (cp->dpgno == PGNO_INVALID) {
13667c478bdstevel@tonic-gate			cp->pgno = cp->page->pgno;
13677c478bdstevel@tonic-gate			cp->indx = indx;
13687c478bdstevel@tonic-gate
13697c478bdstevel@tonic-gate			if ((ret = __bam_dup(dbc, cp, indx, 1)) != 0)
13707c478bdstevel@tonic-gate				return (ret);
13717c478bdstevel@tonic-gate			if (cp->dpgno != PGNO_INVALID) {
13727c478bdstevel@tonic-gate				indx = cp->dindx + O_INDX;
13737c478bdstevel@tonic-gate				adjust = O_INDX;
13747c478bdstevel@tonic-gate				continue;
13757c478bdstevel@tonic-gate			}
13767c478bdstevel@tonic-gate		} else {
13777c478bdstevel@tonic-gate			cp->dpgno = cp->page->pgno;
13787c478bdstevel@tonic-gate			cp->dindx = indx;
13797c478bdstevel@tonic-gate		}
13807c478bdstevel@tonic-gate		break;
13817c478bdstevel@tonic-gate	}
13827c478bdstevel@tonic-gate	return (0);
13837c478bdstevel@tonic-gate}
13847c478bdstevel@tonic-gate
13857c478bdstevel@tonic-gate/*
13867c478bdstevel@tonic-gate * __bam_c_search --
13877c478bdstevel@tonic-gate *	Move to a specified record.
13887c478bdstevel@tonic-gate */
13897c478bdstevel@tonic-gatestatic int
13907c478bdstevel@tonic-gate__bam_c_search(dbc, cp, key, flags, exactp)
13917c478bdstevel@tonic-gate	DBC *dbc;
13927c478bdstevel@tonic-gate	CURSOR *cp;
13937c478bdstevel@tonic-gate	const DBT *key;
13947c478bdstevel@tonic-gate	u_int32_t flags;
13957c478bdstevel@tonic-gate	int *exactp;
13967c478bdstevel@tonic-gate{
13977c478bdstevel@tonic-gate	BTREE *t;
13987c478bdstevel@tonic-gate	DB *dbp;
13997c478bdstevel@tonic-gate	DB_LOCK lock;
14007c478bdstevel@tonic-gate	PAGE *h;
14017c478bdstevel@tonic-gate	db_recno_t recno;
14027c478bdstevel@tonic-gate	db_indx_t indx;
14037c478bdstevel@tonic-gate	u_int32_t sflags;
14047c478bdstevel@tonic-gate	int cmp, needexact, ret;
14057c478bdstevel@tonic-gate
14067c478bdstevel@tonic-gate	dbp = dbc->dbp;
14077c478bdstevel@tonic-gate	t = dbp->internal;
14087c478bdstevel@tonic-gate
14097c478bdstevel@tonic-gate	/* Find an entry in the database. */
14107c478bdstevel@tonic-gate	switch (flags) {
14117c478bdstevel@tonic-gate	case DB_SET_RECNO:
14127c478bdstevel@tonic-gate		if ((ret = __ram_getno(dbc, key, &recno, 0)) != 0)
14137c478bdstevel@tonic-gate			return (ret);
14147c478bdstevel@tonic-gate		sflags = F_ISSET(dbc, DBC_RMW) ? S_FIND_WR : S_FIND;
14157c478bdstevel@tonic-gate		needexact = *exactp = 1;
14167c478bdstevel@tonic-gate		ret = __bam_rsearch(dbc, &recno, sflags, 1, exactp);
14177c478bdstevel@tonic-gate		break;
14187c478bdstevel@tonic-gate	case DB_SET:
14197c478bdstevel@tonic-gate	case DB_GET_BOTH:
14207c478bdstevel@tonic-gate		sflags = F_ISSET(dbc, DBC_RMW) ? S_FIND_WR : S_FIND;
14217c478bdstevel@tonic-gate		needexact = *exactp = 1;
14227c478bdstevel@tonic-gate		goto search;
14237c478bdstevel@tonic-gate	case DB_SET_RANGE:
14247c478bdstevel@tonic-gate		sflags = F_ISSET(dbc, DBC_RMW) ? S_FIND_WR : S_FIND;
14257c478bdstevel@tonic-gate		needexact = *exactp = 0;
14267c478bdstevel@tonic-gate		goto search;
14277c478bdstevel@tonic-gate	case DB_KEYFIRST:
14287c478bdstevel@tonic-gate		sflags = S_KEYFIRST;
14297c478bdstevel@tonic-gate		goto fast_search;
14307c478bdstevel@tonic-gate	case DB_KEYLAST:
14317c478bdstevel@tonic-gate		sflags = S_KEYLAST;
14327c478bdstevel@tonic-gatefast_search:	needexact = *exactp = 0;
14337c478bdstevel@tonic-gate		/*
14347c478bdstevel@tonic-gate		 * If the application has a history of inserting into the first
14357c478bdstevel@tonic-gate		 * or last pages of the database, we check those pages first to
14367c478bdstevel@tonic-gate		 * avoid doing a full search.
14377c478bdstevel@tonic-gate		 *
14387c478bdstevel@tonic-gate		 * Record numbers can't be fast-tracked, the entire tree has to
14397c478bdstevel@tonic-gate		 * be locked.
14407c478bdstevel@tonic-gate		 */
14417c478bdstevel@tonic-gate		h = NULL;
14427c478bdstevel@tonic-gate		lock = LOCK_INVALID;
14437c478bdstevel@tonic-gate		if (F_ISSET(dbp, DB_BT_RECNUM))
14447c478bdstevel@tonic-gate			goto search;
14457c478bdstevel@tonic-gate
14467c478bdstevel@tonic-gate		/* Check if the application has a history of sorted input. */
14477c478bdstevel@tonic-gate		if (t->bt_lpgno == PGNO_INVALID)
14487c478bdstevel@tonic-gate			goto search;
14497c478bdstevel@tonic-gate
14507c478bdstevel@tonic-gate		/*
14517c478bdstevel@tonic-gate		 * Lock and retrieve the page on which we did the last insert.
14527c478bdstevel@tonic-gate		 * It's okay if it doesn't exist, or if it's not the page type
14537c478bdstevel@tonic-gate		 * we expected, it just means that the world changed.
14547c478bdstevel@tonic-gate		 */
14557c478bdstevel@tonic-gate		if (__bam_lget(dbc, 0, t->bt_lpgno, DB_LOCK_WRITE, &lock))
14567c478bdstevel@tonic-gate			goto fast_miss;
14577c478bdstevel@tonic-gate		if (memp_fget(dbp->mpf, &t->bt_lpgno, 0, &h))
14587c478bdstevel@tonic-gate			goto fast_miss;
14597c478bdstevel@tonic-gate		if (TYPE(h) != P_LBTREE)
14607c478bdstevel@tonic-gate			goto fast_miss;
14617c478bdstevel@tonic-gate		if (NUM_ENT(h) == 0)
14627c478bdstevel@tonic-gate			goto fast_miss;
14637c478bdstevel@tonic-gate
14647c478bdstevel@tonic-gate		/*
14657c478bdstevel@tonic-gate		 * What we do here is test to see if we're at the beginning or
14667c478bdstevel@tonic-gate		 * end of the tree and if the new item sorts before/after the
14677c478bdstevel@tonic-gate		 * first/last page entry.  We don't try and catch inserts into
14687c478bdstevel@tonic-gate		 * the middle of the tree (although we could, as long as there
14697c478bdstevel@tonic-gate		 * were two keys on the page and we saved both the index and
14707c478bdstevel@tonic-gate		 * the page number of the last insert).
14717c478bdstevel@tonic-gate		 */
14727c478bdstevel@tonic-gate		if (h->next_pgno == PGNO_INVALID) {
14737c478bdstevel@tonic-gate			indx = NUM_ENT(h) - P_INDX;
14747c478bdstevel@tonic-gate			if ((cmp =
14757c478bdstevel@tonic-gate			    __bam_cmp(dbp, key, h, indx, t->bt_compare)) < 0)
14767c478bdstevel@tonic-gate				goto try_begin;
14777c478bdstevel@tonic-gate			if (cmp > 0) {
14787c478bdstevel@tonic-gate				indx += P_INDX;
14797c478bdstevel@tonic-gate				goto fast_hit;
14807c478bdstevel@tonic-gate			}
14817c478bdstevel@tonic-gate
14827c478bdstevel@tonic-gate			/*
14837c478bdstevel@tonic-gate			 * Found a duplicate.  If doing DB_KEYLAST, we're at
14847c478bdstevel@tonic-gate			 * the correct position, otherwise, move to the first
14857c478bdstevel@tonic-gate			 * of the duplicates.
14867c478bdstevel@tonic-gate			 */
14877c478bdstevel@tonic-gate			if (flags == DB_KEYLAST)
14887c478bdstevel@tonic-gate				goto fast_hit;
14897c478bdstevel@tonic-gate			for (;
14907c478bdstevel@tonic-gate			    indx > 0 && h->inp[indx - P_INDX] == h->inp[indx];
14917c478bdstevel@tonic-gate			    indx -= P_INDX)
14927c478bdstevel@tonic-gate				;
14937c478bdstevel@tonic-gate			goto fast_hit;
14947c478bdstevel@tonic-gate		}
14957c478bdstevel@tonic-gatetry_begin:	if (h->prev_pgno == PGNO_INVALID) {
14967c478bdstevel@tonic-gate			indx = 0;
14977c478bdstevel@tonic-gate			if ((cmp =
14987c478bdstevel@tonic-gate			    __bam_cmp(dbp, key, h, indx, t->bt_compare)) > 0)
14997c478bdstevel@tonic-gate				goto fast_miss;
15007c478bdstevel@tonic-gate			if (cmp < 0)
15017c478bdstevel@tonic-gate				goto fast_hit;
15027c478bdstevel@tonic-gate			/*
15037c478bdstevel@tonic-gate			 * Found a duplicate.  If doing DB_KEYFIRST, we're at
15047c478bdstevel@tonic-gate			 * the correct position, otherwise, move to the last
15057c478bdstevel@tonic-gate			 * of the duplicates.
15067c478bdstevel@tonic-gate			 */
15077c478bdstevel@tonic-gate			if (flags == DB_KEYFIRST)
15087c478bdstevel@tonic-gate				goto fast_hit;
15097c478bdstevel@tonic-gate			for (;
15107c478bdstevel@tonic-gate			    indx < (db_indx_t)(NUM_ENT(h) - P_INDX) &&
15117c478bdstevel@tonic-gate			    h->inp[indx] == h->inp[indx + P_INDX];
15127c478bdstevel@tonic-gate			    indx += P_INDX)
15137c478bdstevel@tonic-gate				;
15147c478bdstevel@tonic-gate			goto fast_hit;
15157c478bdstevel@tonic-gate		}
15167c478bdstevel@tonic-gate		goto fast_miss;
15177c478bdstevel@tonic-gate
15187c478bdstevel@tonic-gatefast_hit:	/* Set the exact match flag, we may have found a duplicate. */
15197c478bdstevel@tonic-gate		*exactp = cmp == 0;
15207c478bdstevel@tonic-gate
15217c478bdstevel@tonic-gate		/* Enter the entry in the stack. */
15227c478bdstevel@tonic-gate		BT_STK_CLR(cp);
15237c478bdstevel@tonic-gate		BT_STK_ENTER(cp, h, indx, lock, ret);
15247c478bdstevel@tonic-gate		break;
15257c478bdstevel@tonic-gate
15267c478bdstevel@tonic-gatefast_miss:	if (h != NULL)
15277c478bdstevel@tonic-gate			(void)memp_fput(dbp->mpf, h, 0);
15287c478bdstevel@tonic-gate		if (lock != LOCK_INVALID)
15297c478bdstevel@tonic-gate			(void)__BT_LPUT(dbc, lock);
15307c478bdstevel@tonic-gate
15317c478bdstevel@tonic-gatesearch:		ret = __bam_search(dbc, key, sflags, 1, NULL, exactp);
15327c478bdstevel@tonic-gate		break;
15337c478bdstevel@tonic-gate	default:				/* XXX: Impossible. */
15347c478bdstevel@tonic-gate		abort();
15357c478bdstevel@tonic-gate		/* NOTREACHED */
15367c478bdstevel@tonic-gate	}
15377c478bdstevel@tonic-gate	if (ret != 0)
15387c478bdstevel@tonic-gate		return (ret);
15397c478bdstevel@tonic-gate
15407c478bdstevel@tonic-gate	/*
15417c478bdstevel@tonic-gate	 * Initialize the cursor to reference it.  This has to be done
15427c478bdstevel@tonic-gate	 * before we return (even with DB_NOTFOUND) because we have to
15437c478bdstevel@tonic-gate	 * free the page(s) we locked in __bam_search.
15447c478bdstevel@tonic-gate	 */
15457c478bdstevel@tonic-gate	cp->page = cp->csp->page;
15467c478bdstevel@tonic-gate	cp->pgno = cp->csp->page->pgno;
15477c478bdstevel@tonic-gate	cp->indx = cp->csp->indx;
15487c478bdstevel@tonic-gate	cp->lock = cp->csp->lock;
15497c478bdstevel@tonic-gate	cp->dpgno = PGNO_INVALID;
15507c478bdstevel@tonic-gate
15517c478bdstevel@tonic-gate	/*
15527c478bdstevel@tonic-gate	 * If we inserted a key into the first or last slot of the tree,
15537c478bdstevel@tonic-gate	 * remember where it was so we can do it more quickly next time.
15547c478bdstevel@tonic-gate	 */
15557c478bdstevel@tonic-gate	if (flags == DB_KEYFIRST || flags == DB_KEYLAST)
15567c478bdstevel@tonic-gate		t->bt_lpgno =
15577c478bdstevel@tonic-gate		    ((cp->page->next_pgno == PGNO_INVALID &&
15587c478bdstevel@tonic-gate		    cp->indx >= NUM_ENT(cp->page)) ||
15597c478bdstevel@tonic-gate		    (cp->page->prev_pgno == PGNO_INVALID && cp->indx == 0)) ?
15607c478bdstevel@tonic-gate		    cp->pgno : PGNO_INVALID;
15617c478bdstevel@tonic-gate
15627c478bdstevel@tonic-gate	/* If we need an exact match and didn't find one, we're done. */
15637c478bdstevel@tonic-gate	if (needexact && *exactp == 0)
15647c478bdstevel@tonic-gate		return (DB_NOTFOUND);
15657c478bdstevel@tonic-gate
15667c478bdstevel@tonic-gate	return (0);
15677c478bdstevel@tonic-gate}
15687c478bdstevel@tonic-gate
15697c478bdstevel@tonic-gate/*
15707c478bdstevel@tonic-gate * __bam_dup --
15717c478bdstevel@tonic-gate *	Check for an off-page duplicates entry, and if found, move to the
15727c478bdstevel@tonic-gate *	first or last entry.
15737c478bdstevel@tonic-gate *
15747c478bdstevel@tonic-gate * PUBLIC: int __bam_dup __P((DBC *, CURSOR *, u_int32_t, int));
15757c478bdstevel@tonic-gate */
15767c478bdstevel@tonic-gateint
15777c478bdstevel@tonic-gate__bam_dup(dbc, cp, indx, last_dup)
15787c478bdstevel@tonic-gate	DBC *dbc;
15797c478bdstevel@tonic-gate	CURSOR *cp;
15807c478bdstevel@tonic-gate	u_int32_t indx;
15817c478bdstevel@tonic-gate	int last_dup;
15827c478bdstevel@tonic-gate{
15837c478bdstevel@tonic-gate	BOVERFLOW *bo;
15847c478bdstevel@tonic-gate	DB *dbp;
15857c478bdstevel@tonic-gate	db_pgno_t pgno;
15867c478bdstevel@tonic-gate	int ret;
15877c478bdstevel@tonic-gate
15887c478bdstevel@tonic-gate	dbp = dbc->dbp;
15897c478bdstevel@tonic-gate
15907c478bdstevel@tonic-gate	/*
15917c478bdstevel@tonic-gate	 * Check for an overflow entry.  If we find one, move to the
15927c478bdstevel@tonic-gate	 * duplicates page, and optionally move to the last record on
15937c478bdstevel@tonic-gate	 * that page.
15947c478bdstevel@tonic-gate	 *
15957c478bdstevel@tonic-gate	 * !!!
15967c478bdstevel@tonic-gate	 * We don't lock duplicates pages, we've already got the correct
15977c478bdstevel@tonic-gate	 * lock on the main page.
15987c478bdstevel@tonic-gate	 */
15997c478bdstevel@tonic-gate	bo = GET_BOVERFLOW(cp->page, indx + O_INDX);
16007c478bdstevel@tonic-gate	if (B_TYPE(bo->type) != B_DUPLICATE)
16017c478bdstevel@tonic-gate		return (0);
16027c478bdstevel@tonic-gate
16037c478bdstevel@tonic-gate	pgno = bo->pgno;
16047c478bdstevel@tonic-gate	if ((ret = memp_fput(dbp->mpf, cp->page, 0)) != 0)
16057c478bdstevel@tonic-gate		return (ret);
16067c478bdstevel@tonic-gate	cp->page = NULL;
16077c478bdstevel@tonic-gate	if (last_dup) {
16087c478bdstevel@tonic-gate		if ((ret = __db_dend(dbc, pgno, &cp->page)) != 0)
16097c478bdstevel@tonic-gate			return (ret);
16107c478bdstevel@tonic-gate		indx = NUM_ENT(cp->page) - O_INDX;
16117c478bdstevel@tonic-gate	} else {
16127c478bdstevel@tonic-gate		if ((ret = memp_fget(dbp->mpf, &pgno, 0, &cp->page)) != 0)
16137c478bdstevel@tonic-gate			return (ret);
16147c478bdstevel@tonic-gate		indx = 0;
16157c478bdstevel@tonic-gate	}
16167c478bdstevel@tonic-gate
16177c478bdstevel@tonic-gate	/* Update the cursor's duplicate information. */
16187c478bdstevel@tonic-gate	cp->dpgno = cp->page->pgno;
16197c478bdstevel@tonic-gate	cp->dindx = indx;
16207c478bdstevel@tonic-gate
16217c478bdstevel@tonic-gate	return (0);
16227c478bdstevel@tonic-gate}
16237c478bdstevel@tonic-gate
16247c478bdstevel@tonic-gate/*
16257c478bdstevel@tonic-gate * __bam_c_physdel --
16267c478bdstevel@tonic-gate *	Actually do the cursor deletion.
16277c478bdstevel@tonic-gate */
16287c478bdstevel@tonic-gatestatic int
16297c478bdstevel@tonic-gate__bam_c_physdel(dbc, cp, h)
16307c478bdstevel@tonic-gate	DBC *dbc;
16317c478bdstevel@tonic-gate	CURSOR *cp;
16327c478bdstevel@tonic-gate	PAGE *h;
16337c478bdstevel@tonic-gate{
16347c478bdstevel@tonic-gate	enum { DELETE_ITEM, DELETE_PAGE, NOTHING_FURTHER } cmd;
16357c478bdstevel@tonic-gate	BOVERFLOW bo;
16367c478bdstevel@tonic-gate	DB *dbp;
16377c478bdstevel@tonic-gate	DBT dbt;
16387c478bdstevel@tonic-gate	DB_LOCK lock;
16397c478bdstevel@tonic-gate	db_indx_t indx;
16407c478bdstevel@tonic-gate	db_pgno_t pgno, next_pgno, prev_pgno;
16417c478bdstevel@tonic-gate	int delete_page, local_page, ret;
16427c478bdstevel@tonic-gate
16437c478bdstevel@tonic-gate	dbp = dbc->dbp;
16447c478bdstevel@tonic-gate
16457c478bdstevel@tonic-gate	delete_page = ret = 0;
16467c478bdstevel@tonic-gate
16477c478bdstevel@tonic-gate	/* Figure out what we're deleting. */
16487c478bdstevel@tonic-gate	if (cp->dpgno == PGNO_INVALID) {
16497c478bdstevel@tonic-gate		pgno = cp->pgno;
16507c478bdstevel@tonic-gate		indx = cp->indx;
16517c478bdstevel@tonic-gate	} else {
16527c478bdstevel@tonic-gate		pgno = cp->dpgno;
16537c478bdstevel@tonic-gate		indx = cp->dindx;
16547c478bdstevel@tonic-gate	}
16557c478bdstevel@tonic-gate
16567c478bdstevel@tonic-gate	/*
16577c478bdstevel@tonic-gate	 * If the item is referenced by another cursor, set that cursor's
16587c478bdstevel@tonic-gate	 * delete flag and leave it up to it to do the delete.
16597c478bdstevel@tonic-gate	 *
16607c478bdstevel@tonic-gate	 * !!!
16617c478bdstevel@tonic-gate	 * This test for > 0 is a tricky.  There are two ways that we can
16627c478bdstevel@tonic-gate	 * be called here.  Either we are closing the cursor or we've moved
16637c478bdstevel@tonic-gate	 * off the page with the deleted entry.  In the first case, we've
16647c478bdstevel@tonic-gate	 * already removed the cursor from the active queue, so we won't see
16657c478bdstevel@tonic-gate	 * it in __bam_ca_delete. In the second case, it will be on a different
16667c478bdstevel@tonic-gate	 * item, so we won't bother with it in __bam_ca_delete.
16677c478bdstevel@tonic-gate	 */
16687c478bdstevel@tonic-gate	if (__bam_ca_delete(dbp, pgno, indx, 1) > 0)
16697c478bdstevel@tonic-gate		return (0);
16707c478bdstevel@tonic-gate
16717c478bdstevel@tonic-gate	/*
16727c478bdstevel@tonic-gate	 * If this is concurrent DB, upgrade the lock if necessary.
16737c478bdstevel@tonic-gate	 */
16747c478bdstevel@tonic-gate	if (F_ISSET(dbp, DB_AM_CDB) && F_ISSET(dbc, DBC_RMW) &&
16757c478bdstevel@tonic-gate	    (ret = lock_get(dbp->dbenv->lk_info,
16767c478bdstevel@tonic-gate	    dbc->locker, DB_LOCK_UPGRADE, &dbc->lock_dbt, DB_LOCK_WRITE,
16777c478bdstevel@tonic-gate	    &dbc->mylock)) != 0)
16787c478bdstevel@tonic-gate		return (EAGAIN);
16797c478bdstevel@tonic-gate
16807c478bdstevel@tonic-gate	/*
16817c478bdstevel@tonic-gate	 * If we don't already have the page locked, get it and delete the
16827c478bdstevel@tonic-gate	 * items.
16837c478bdstevel@tonic-gate	 */
16847c478bdstevel@tonic-gate	if ((h == NULL || h->pgno != pgno)) {
16857c478bdstevel@tonic-gate		if ((ret = __bam_lget(dbc, 0, pgno, DB_LOCK_WRITE, &lock)) != 0)
16867c478bdstevel@tonic-gate			return (ret);
16877c478bdstevel@tonic-gate		if ((ret = memp_fget(dbp->mpf, &pgno, 0, &h)) != 0)
16887c478bdstevel@tonic-gate			return (ret);
16897c478bdstevel@tonic-gate		local_page = 1;
16907c478bdstevel@tonic-gate	} else
16917c478bdstevel@tonic-gate		local_page = 0;
16927c478bdstevel@tonic-gate
16937c478bdstevel@tonic-gate	/*
16947c478bdstevel@tonic-gate	 * If we're deleting a duplicate entry and there are other duplicate
16957c478bdstevel@tonic-gate	 * entries remaining, call the common code to do the work and fix up
16967c478bdstevel@tonic-gate	 * the parent page as necessary.  Otherwise, do a normal btree delete.
16977c478bdstevel@tonic-gate	 *
16987c478bdstevel@tonic-gate	 * There are 5 possible cases:
16997c478bdstevel@tonic-gate	 *
17007c478bdstevel@tonic-gate	 * 1. It's not a duplicate item: do a normal btree delete.
17017c478bdstevel@tonic-gate	 * 2. It's a duplicate item:
17027c478bdstevel@tonic-gate	 *	2a: We delete an item from a page of duplicates, but there are
17037c478bdstevel@tonic-gate	 *	    more items on the page.
17047c478bdstevel@tonic-gate	 *      2b: We delete the last item from a page of duplicates, deleting
17057c478bdstevel@tonic-gate	 *	    the last duplicate.
17067c478bdstevel@tonic-gate	 *      2c: We delete the last item from a page of duplicates, but there
17077c478bdstevel@tonic-gate	 *	    is a previous page of duplicates.
17087c478bdstevel@tonic-gate	 *      2d: We delete the last item from a page of duplicates, but there
17097c478bdstevel@tonic-gate	 *	    is a following page of duplicates.
17107c478bdstevel@tonic-gate	 *
17117c478bdstevel@tonic-gate	 * In the case of:
17127c478bdstevel@tonic-gate	 *
17137c478bdstevel@tonic-gate	 *  1: There's nothing further to do.
17147c478bdstevel@tonic-gate	 * 2a: There's nothing further to do.
17157c478bdstevel@tonic-gate	 * 2b: Do the normal btree delete instead of a duplicate delete, as
17167c478bdstevel@tonic-gate	 *     that deletes both the duplicate chain and the parent page's
17177c478bdstevel@tonic-gate	 *     entry.
17187c478bdstevel@tonic-gate	 * 2c: There's nothing further to do.
17197c478bdstevel@tonic-gate	 * 2d: Delete the duplicate, and update the parent page's entry.
17207c478bdstevel@tonic-gate	 */
17217c478bdstevel@tonic-gate	if (TYPE(h) == P_DUPLICATE) {
17227c478bdstevel@tonic-gate		pgno = PGNO(h);
17237c478bdstevel@tonic-gate		prev_pgno = PREV_PGNO(h);
17247c478bdstevel@tonic-gate		next_pgno = NEXT_PGNO(h);
17257c478bdstevel@tonic-gate
17267c478bdstevel@tonic-gate		if (NUM_ENT(h) == 1 &&
17277c478bdstevel@tonic-gate		    prev_pgno == PGNO_INVALID && next_pgno == PGNO_INVALID)
17287c478bdstevel@tonic-gate			cmd = DELETE_PAGE;
17297c478bdstevel@tonic-gate		else {
17307c478bdstevel@tonic-gate			cmd = DELETE_ITEM;
17317c478bdstevel@tonic-gate
17327c478bdstevel@tonic-gate			/* Delete the duplicate. */
17337c478bdstevel@tonic-gate			if ((ret = __db_drem(dbc, &h, indx, __bam_free)) != 0)
17347c478bdstevel@tonic-gate				goto err;
17357c478bdstevel@tonic-gate
17367c478bdstevel@tonic-gate			/*
17377c478bdstevel@tonic-gate			 * 2a: h != NULL, h->pgno == pgno
17387c478bdstevel@tonic-gate			 * 2b: We don't reach this clause, as the above test
17397c478bdstevel@tonic-gate			 *     was true.
17407c478bdstevel@tonic-gate			 * 2c: h == NULL, prev_pgno != PGNO_INVALID
17417c478bdstevel@tonic-gate			 * 2d: h != NULL, next_pgno != PGNO_INVALID
17427c478bdstevel@tonic-gate			 *
17437c478bdstevel@tonic-gate			 * Test for 2a and 2c: if we didn't empty the current
17447c478bdstevel@tonic-gate			 * page or there was a previous page of duplicates, we
17457c478bdstevel@tonic-gate			 * don't need to touch the parent page.
17467c478bdstevel@tonic-gate			 */
17477c478bdstevel@tonic-gate			if ((h != NULL && pgno == h->pgno) ||
17487c478bdstevel@tonic-gate			    prev_pgno != PGNO_INVALID)
17497c478bdstevel@tonic-gate				cmd = NOTHING_FURTHER;
17507c478bdstevel@tonic-gate		}
17517c478bdstevel@tonic-gate
17527c478bdstevel@tonic-gate		/*
17537c478bdstevel@tonic-gate		 * Release any page we're holding and its lock.
17547c478bdstevel@tonic-gate		 *
17557c478bdstevel@tonic-gate		 * !!!
17567c478bdstevel@tonic-gate		 * If there is no subsequent page in the duplicate chain, then
17577c478bdstevel@tonic-gate		 * __db_drem will have put page "h" and set it to NULL.
17587c478bdstevel@tonic-gate		*/
17597c478bdstevel@tonic-gate		if (local_page) {
17607c478bdstevel@tonic-gate			if (h != NULL)
17617c478bdstevel@tonic-gate				(void)memp_fput(dbp->mpf, h, 0);
17627c478bdstevel@tonic-gate			(void)__BT_TLPUT(dbc, lock);
17637c478bdstevel@tonic-gate			local_page = 0;
17647c478bdstevel@tonic-gate		}
17657c478bdstevel@tonic-gate
17667c478bdstevel@tonic-gate		if (cmd == NOTHING_FURTHER)
17677c478bdstevel@tonic-gate			goto done;
17687c478bdstevel@tonic-gate
17697c478bdstevel@tonic-gate		/* Acquire the parent page and switch the index to its entry. */
17707c478bdstevel@tonic-gate		if ((ret =
17717c478bdstevel@tonic-gate		    __bam_lget(dbc, 0, cp->pgno, DB_LOCK_WRITE, &lock)) != 0)
17727c478bdstevel@tonic-gate			goto err;
17737c478bdstevel@tonic-gate		if ((ret = memp_fget(dbp->mpf, &cp->pgno, 0, &h)) != 0) {
17747c478bdstevel@tonic-gate			(void)__BT_TLPUT(dbc, lock);
17757c478bdstevel@tonic-gate			goto err;
17767c478bdstevel@tonic-gate		}
17777c478bdstevel@tonic-gate		local_page = 1;
17787c478bdstevel@tonic-gate		indx = cp->indx;
17797c478bdstevel@tonic-gate
17807c478bdstevel@tonic-gate		if (cmd == DELETE_PAGE)
17817c478bdstevel@tonic-gate			goto btd;
17827c478bdstevel@tonic-gate
17837c478bdstevel@tonic-gate		/*
17847c478bdstevel@tonic-gate		 * Copy, delete, update, add-back the parent page's data entry.
17857c478bdstevel@tonic-gate		 *
17867c478bdstevel@tonic-gate		 * XXX
17877c478bdstevel@tonic-gate		 * This may be a performance/logging problem.  We should add a
17887c478bdstevel@tonic-gate		 * log message which simply logs/updates a random set of bytes
17897c478bdstevel@tonic-gate		 * on a page, and use it instead of doing a delete/add pair.
17907c478bdstevel@tonic-gate		 */
17917c478bdstevel@tonic-gate		indx += O_INDX;
17927c478bdstevel@tonic-gate		bo = *GET_BOVERFLOW(h, indx);
17937c478bdstevel@tonic-gate		(void)__db_ditem(dbc, h, indx, BOVERFLOW_SIZE);
17947c478bdstevel@tonic-gate		bo.pgno = next_pgno;
17957c478bdstevel@tonic-gate		memset(&dbt, 0, sizeof(dbt));
17967c478bdstevel@tonic-gate		dbt.data = &bo;
17977c478bdstevel@tonic-gate		dbt.size = BOVERFLOW_SIZE;
17987c478bdstevel@tonic-gate		(void)__db_pitem(dbc, h, indx, BOVERFLOW_SIZE, &dbt, NULL);
17997c478bdstevel@tonic-gate		(void)memp_fset(dbp->mpf, h, DB_MPOOL_DIRTY);
18007c478bdstevel@tonic-gate		goto done;
18017c478bdstevel@tonic-gate	}
18027c478bdstevel@tonic-gate
18037c478bdstevel@tonic-gatebtd:	/*
18047c478bdstevel@tonic-gate	 * If the page is going to be emptied, delete it.  To delete a leaf
18057c478bdstevel@tonic-gate	 * page we need a copy of a key from the page.  We use the 0th page
18067c478bdstevel@tonic-gate	 * index since it's the last key that the page held.
18077c478bdstevel@tonic-gate	 *
18087c478bdstevel@tonic-gate	 * We malloc the page information instead of using the return key/data
18097c478bdstevel@tonic-gate	 * memory because we've already set them -- the reason we've already
18107c478bdstevel@tonic-gate	 * set them is because we're (potentially) about to do a reverse split,
18117c478bdstevel@tonic-gate	 * which would make our saved page information useless.
18127c478bdstevel@tonic-gate	 *
18137c478bdstevel@tonic-gate	 * !!!
18147c478bdstevel@tonic-gate	 * The following operations to delete a page might deadlock.  I think
18157c478bdstevel@tonic-gate	 * that's OK.  The problem is if we're deleting an item because we're
18167c478bdstevel@tonic-gate	 * closing cursors because we've already deadlocked and want to call
18177c478bdstevel@tonic-gate	 * txn_abort().  If we fail due to deadlock, we leave a locked empty
18187c478bdstevel@tonic-gate	 * page in the tree, which won't be empty long because we're going to
18197c478bdstevel@tonic-gate	 * undo the delete.
18207c478bdstevel@tonic-gate	 */
18217c478bdstevel@tonic-gate	if (NUM_ENT(h) == 2 && h->pgno != PGNO_ROOT) {
18227c478bdstevel@tonic-gate		memset(&dbt, 0, sizeof(DBT));
18237c478bdstevel@tonic-gate		dbt.flags = DB_DBT_MALLOC | DB_DBT_INTERNAL;
18247c478bdstevel@tonic-gate		if ((ret = __db_ret(dbp, h, 0, &dbt, NULL, NULL)) != 0)
18257c478bdstevel@tonic-gate			goto err;
18267c478bdstevel@tonic-gate		delete_page = 1;
18277c478bdstevel@tonic-gate	}
18287c478bdstevel@tonic-gate
18297c478bdstevel@tonic-gate	/*
18307c478bdstevel@tonic-gate	 * Do a normal btree delete.
18317c478bdstevel@tonic-gate	 *
18327c478bdstevel@tonic-gate	 * !!!
18337c478bdstevel@tonic-gate	 * Delete the key item first, otherwise the duplicate checks in
18347c478bdstevel@tonic-gate	 * __bam_ditem() won't work!
18357c478bdstevel@tonic-gate	 */
18367c478bdstevel@tonic-gate	if ((ret = __bam_ditem(dbc, h, indx)) != 0)
18377c478bdstevel@tonic-gate		goto err;
18387c478bdstevel@tonic-gate	if ((ret = __bam_ditem(dbc, h, indx)) != 0)
18397c478bdstevel@tonic-gate		goto err;
18407c478bdstevel@tonic-gate
18417c478bdstevel@tonic-gate	/* Discard any remaining locks/pages. */
18427c478bdstevel@tonic-gate	if (local_page) {
18437c478bdstevel@tonic-gate		(void)memp_fput(dbp->mpf, h, 0);
18447c478bdstevel@tonic-gate		(void)__BT_TLPUT(dbc, lock);
18457c478bdstevel@tonic-gate		local_page = 0;
18467c478bdstevel@tonic-gate	}
18477c478bdstevel@tonic-gate
18487c478bdstevel@tonic-gate	/* Delete the page if it was emptied. */
18497c478bdstevel@tonic-gate	if (delete_page)
18507c478bdstevel@tonic-gate		ret = __bam_dpage(dbc, &dbt);
18517c478bdstevel@tonic-gate
18527c478bdstevel@tonic-gateerr:
18537c478bdstevel@tonic-gatedone:	if (delete_page)
18547c478bdstevel@tonic-gate		__os_free(dbt.data, dbt.size);
18557c478bdstevel@tonic-gate
18567c478bdstevel@tonic-gate	if (local_page) {
18577c478bdstevel@tonic-gate		/*
18587c478bdstevel@tonic-gate		 * It's possible for h to be NULL, as __db_drem may have
18597c478bdstevel@tonic-gate		 * been relinking pages by the time that it deadlocked.
18607c478bdstevel@tonic-gate		 */
18617c478bdstevel@tonic-gate		if (h != NULL)
18627c478bdstevel@tonic-gate			(void)memp_fput(dbp->mpf, h, 0);
18637c478bdstevel@tonic-gate		(void)__BT_TLPUT(dbc, lock);
18647c478bdstevel@tonic-gate	}
18657c478bdstevel@tonic-gate
18667c478bdstevel@tonic-gate	if (F_ISSET(dbp, DB_AM_CDB) && F_ISSET(dbc, DBC_RMW))
18677c478bdstevel@tonic-gate		(void)__lock_downgrade(dbp->dbenv->lk_info, dbc->mylock,
18687c478bdstevel@tonic-gate		    DB_LOCK_IWRITE, 0);
18697c478bdstevel@tonic-gate
18707c478bdstevel@tonic-gate	return (ret);
18717c478bdstevel@tonic-gate}
18727c478bdstevel@tonic-gate
18737c478bdstevel@tonic-gate/*
18747c478bdstevel@tonic-gate * __bam_c_getstack --
18757c478bdstevel@tonic-gate *	Acquire a full stack for a cursor.
18767c478bdstevel@tonic-gate */
18777c478bdstevel@tonic-gatestatic int
18787c478bdstevel@tonic-gate__bam_c_getstack(dbc, cp)
18797c478bdstevel@tonic-gate	DBC *dbc;
18807c478bdstevel@tonic-gate	CURSOR *cp;
18817c478bdstevel@tonic-gate{
18827c478bdstevel@tonic-gate	DB *dbp;
18837c478bdstevel@tonic-gate	DBT dbt;
18847c478bdstevel@tonic-gate	PAGE *h;
18857c478bdstevel@tonic-gate	db_pgno_t pgno;
18867c478bdstevel@tonic-gate	int exact, ret;
18877c478bdstevel@tonic-gate
18887c478bdstevel@tonic-gate	dbp = dbc->dbp;
18897c478bdstevel@tonic-gate	h = NULL;
18907c478bdstevel@tonic-gate	memset(&dbt, 0, sizeof(DBT));
18917c478bdstevel@tonic-gate	ret = 0;
18927c478bdstevel@tonic-gate
18937c478bdstevel@tonic-gate	/* Get the page with the current item on it. */
18947c478bdstevel@tonic-gate	pgno = cp->pgno;
18957c478bdstevel@tonic-gate	if ((ret = memp_fget(dbp->mpf, &pgno, 0, &h)) != 0)
18967c478bdstevel@tonic-gate		return (ret);
18977c478bdstevel@tonic-gate
18987c478bdstevel@tonic-gate	/* Get a copy of a key from the page. */
18997c478bdstevel@tonic-gate	dbt.flags = DB_DBT_MALLOC | DB_DBT_INTERNAL;
19007c478bdstevel@tonic-gate	if ((ret = __db_ret(dbp, h, 0, &dbt, NULL, NULL)) != 0)
19017c478bdstevel@tonic-gate		goto err;
19027c478bdstevel@tonic-gate
19037c478bdstevel@tonic-gate	/* Get a write-locked stack for that page. */
19047c478bdstevel@tonic-gate	exact = 0;
19057c478bdstevel@tonic-gate	ret = __bam_search(dbc, &dbt, S_KEYFIRST, 1, NULL, &exact);
19067c478bdstevel@tonic-gate
19077c478bdstevel@tonic-gate	/* We no longer need the key or the page. */
19087c478bdstevel@tonic-gateerr:	if (h != NULL)
19097c478bdstevel@tonic-gate		(void)memp_fput(dbp->mpf, h, 0);
19107c478bdstevel@tonic-gate	if (dbt.data != NULL)
19117c478bdstevel@tonic-gate		__os_free(dbt.data, dbt.size);
19127c478bdstevel@tonic-gate	return (ret);
19137c478bdstevel@tonic-gate}
1914