17c478bdstevel@tonic-gate/*-
27c478bdstevel@tonic-gate * See the file LICENSE for redistribution information.
37c478bdstevel@tonic-gate *
47c478bdstevel@tonic-gate * Copyright (c) 1997, 1998
57c478bdstevel@tonic-gate *	Sleepycat Software.  All rights reserved.
67c478bdstevel@tonic-gate */
77c478bdstevel@tonic-gate
87c478bdstevel@tonic-gate#include "config.h"
97c478bdstevel@tonic-gate
107c478bdstevel@tonic-gate#ifndef lint
117c478bdstevel@tonic-gatestatic const char sccsid[] = "@(#)bt_recno.c	10.53 (Sleepycat) 12/11/98";
127c478bdstevel@tonic-gate#endif /* not lint */
137c478bdstevel@tonic-gate
147c478bdstevel@tonic-gate#ifndef NO_SYSTEM_INCLUDES
157c478bdstevel@tonic-gate#include <sys/types.h>
167c478bdstevel@tonic-gate
177c478bdstevel@tonic-gate#include <errno.h>
187c478bdstevel@tonic-gate#include <limits.h>
197c478bdstevel@tonic-gate#include <string.h>
207c478bdstevel@tonic-gate#endif
217c478bdstevel@tonic-gate
227c478bdstevel@tonic-gate#include "db_int.h"
237c478bdstevel@tonic-gate#include "db_page.h"
247c478bdstevel@tonic-gate#include "btree.h"
257c478bdstevel@tonic-gate#include "db_ext.h"
267c478bdstevel@tonic-gate#include "shqueue.h"
277c478bdstevel@tonic-gate#include "db_shash.h"
287c478bdstevel@tonic-gate#include "lock.h"
297c478bdstevel@tonic-gate#include "lock_ext.h"
307c478bdstevel@tonic-gate
/*
 * Forward declarations for the file-local recno routines.  __P() wraps the
 * parameter lists, matching the prototype convention used for the PUBLIC
 * declarations in this file.
 */
static int __ram_add __P((DBC *, db_recno_t *, DBT *, u_int32_t, u_int32_t));
static int __ram_delete __P((DB *, DB_TXN *, DBT *, u_int32_t));
static int __ram_fmap __P((DBC *, db_recno_t));
static int __ram_i_delete __P((DBC *));
static int __ram_put __P((DB *, DB_TXN *, DBT *, DBT *, u_int32_t));
static int __ram_source __P((DB *, RECNO *, const char *));
static int __ram_sync __P((DB *, u_int32_t));
static int __ram_update __P((DBC *, db_recno_t, int));
static int __ram_vmap __P((DBC *, db_recno_t));
static int __ram_writeback __P((DBC *));
417c478bdstevel@tonic-gate
/*
 * In recno, there are two meanings to the on-page "deleted" flag.  If we're
 * re-numbering records, it means the record was implicitly created.  We skip
 * over implicitly created records if doing a cursor "next" or "prev", and
 * return DB_KEYEMPTY if they're explicitly requested.  If not re-numbering
 * records, it means that the record was implicitly created, or was deleted.
 * We skip over implicitly created or deleted records if doing a cursor "next"
 * or "prev", and return DB_KEYEMPTY if they're explicitly requested.
 *
 * If we're re-numbering records, then we have to detect in the cursor that
 * a record was deleted, and adjust the cursor as necessary on the next get.
 * If we're not re-numbering records, then we can detect that a record has
 * been deleted by looking at the actual on-page record, so we completely
 * ignore the cursor's delete flag.  This is different from the B+tree code,
 * which also maintains in the cursor whether the cursor references a deleted
 * record, and which doesn't always check the on-page value.
 */
/*
 * CD_SET, CD_CLR --
 *	Set/clear the cursor's C_DELETED flag, but only when the database is
 *	re-numbering records (DB_RE_RENUMBER); otherwise they do nothing.
 *	Wrapped in do/while (0) so each expands to exactly one statement and
 *	can be used safely as the body of an unbraced if/else.
 */
#define	CD_SET(dbp, cp) do {						\
	if (F_ISSET(dbp, DB_RE_RENUMBER))				\
		F_SET(cp, C_DELETED);					\
} while (0)
#define	CD_CLR(dbp, cp) do {						\
	if (F_ISSET(dbp, DB_RE_RENUMBER))				\
		F_CLR(cp, C_DELETED);					\
} while (0)
/*
 * CD_ISSET --
 *	True only if the database is re-numbering records and the cursor's
 *	C_DELETED flag is set.
 */
#define	CD_ISSET(dbp, cp)						\
	(F_ISSET(dbp, DB_RE_RENUMBER) && F_ISSET(cp, C_DELETED))
707c478bdstevel@tonic-gate/*
717c478bdstevel@tonic-gate * __ram_open --
727c478bdstevel@tonic-gate *	Recno open function.
737c478bdstevel@tonic-gate *
747c478bdstevel@tonic-gate * PUBLIC: int __ram_open __P((DB *, DB_INFO *));
757c478bdstevel@tonic-gate */
767c478bdstevel@tonic-gateint
777c478bdstevel@tonic-gate__ram_open(dbp, dbinfo)
787c478bdstevel@tonic-gate	DB *dbp;
797c478bdstevel@tonic-gate	DB_INFO *dbinfo;
807c478bdstevel@tonic-gate{
817c478bdstevel@tonic-gate	BTREE *t;
827c478bdstevel@tonic-gate	DBC *dbc;
837c478bdstevel@tonic-gate	RECNO *rp;
847c478bdstevel@tonic-gate	int ret, t_ret;
857c478bdstevel@tonic-gate
867c478bdstevel@tonic-gate	/* Allocate and initialize the private btree structure. */
877c478bdstevel@tonic-gate	if ((ret = __os_calloc(1, sizeof(BTREE), &t)) != 0)
887c478bdstevel@tonic-gate		return (ret);
897c478bdstevel@tonic-gate	dbp->internal = t;
907c478bdstevel@tonic-gate	__bam_setovflsize(dbp);
917c478bdstevel@tonic-gate
927c478bdstevel@tonic-gate	/* Allocate and initialize the private recno structure. */
937c478bdstevel@tonic-gate	if ((ret = __os_calloc(1, sizeof(*rp), &rp)) != 0)
947c478bdstevel@tonic-gate		return (ret);
957c478bdstevel@tonic-gate	/* Link in the private recno structure. */
967c478bdstevel@tonic-gate	t->recno = rp;
977c478bdstevel@tonic-gate
987c478bdstevel@tonic-gate	/*
997c478bdstevel@tonic-gate	 * Intention is to make sure all of the user's selections are okay
1007c478bdstevel@tonic-gate	 * here and then use them without checking.
1017c478bdstevel@tonic-gate	 */
1027c478bdstevel@tonic-gate	if (dbinfo == NULL) {
1037c478bdstevel@tonic-gate		rp->re_delim = '\n';
1047c478bdstevel@tonic-gate		rp->re_pad = ' ';
1057c478bdstevel@tonic-gate		rp->re_fd = -1;
1067c478bdstevel@tonic-gate		F_SET(rp, RECNO_EOF);
1077c478bdstevel@tonic-gate	} else {
1087c478bdstevel@tonic-gate		/*
1097c478bdstevel@tonic-gate		 * If the user specified a source tree, open it and map it in.
1107c478bdstevel@tonic-gate		 *
1117c478bdstevel@tonic-gate		 * !!!
1127c478bdstevel@tonic-gate		 * We don't complain if the user specified transactions or
1137c478bdstevel@tonic-gate		 * threads.  It's possible to make it work, but you'd better
1147c478bdstevel@tonic-gate		 * know what you're doing!
1157c478bdstevel@tonic-gate		 */
1167c478bdstevel@tonic-gate		if (dbinfo->re_source == NULL) {
1177c478bdstevel@tonic-gate			rp->re_fd = -1;
1187c478bdstevel@tonic-gate			F_SET(rp, RECNO_EOF);
1197c478bdstevel@tonic-gate		} else {
1207c478bdstevel@tonic-gate			if ((ret =
1217c478bdstevel@tonic-gate			    __ram_source(dbp, rp, dbinfo->re_source)) != 0)
1227c478bdstevel@tonic-gate			goto err;
1237c478bdstevel@tonic-gate		}
1247c478bdstevel@tonic-gate
1257c478bdstevel@tonic-gate		/* Copy delimiter, length and padding values. */
1267c478bdstevel@tonic-gate		rp->re_delim =
1277c478bdstevel@tonic-gate		    F_ISSET(dbp, DB_RE_DELIMITER) ? dbinfo->re_delim : '\n';
1287c478bdstevel@tonic-gate		rp->re_pad = F_ISSET(dbp, DB_RE_PAD) ? dbinfo->re_pad : ' ';
1297c478bdstevel@tonic-gate
1307c478bdstevel@tonic-gate		if (F_ISSET(dbp, DB_RE_FIXEDLEN)) {
1317c478bdstevel@tonic-gate			if ((rp->re_len = dbinfo->re_len) == 0) {
1327c478bdstevel@tonic-gate				__db_err(dbp->dbenv,
1337c478bdstevel@tonic-gate				    "record length must be greater than 0");
1347c478bdstevel@tonic-gate				ret = EINVAL;
1357c478bdstevel@tonic-gate				goto err;
1367c478bdstevel@tonic-gate			}
1377c478bdstevel@tonic-gate		} else
1387c478bdstevel@tonic-gate			rp->re_len = 0;
1397c478bdstevel@tonic-gate	}
1407c478bdstevel@tonic-gate
1417c478bdstevel@tonic-gate	/* Initialize the remaining fields/methods of the DB. */
1427c478bdstevel@tonic-gate	dbp->am_close = __ram_close;
1437c478bdstevel@tonic-gate	dbp->del = __ram_delete;
1447c478bdstevel@tonic-gate	dbp->put = __ram_put;
1457c478bdstevel@tonic-gate	dbp->stat = __bam_stat;
1467c478bdstevel@tonic-gate	dbp->sync = __ram_sync;
1477c478bdstevel@tonic-gate
1487c478bdstevel@tonic-gate	/* Start up the tree. */
1497c478bdstevel@tonic-gate	if ((ret = __bam_read_root(dbp)) != 0)
1507c478bdstevel@tonic-gate		goto err;
1517c478bdstevel@tonic-gate
1527c478bdstevel@tonic-gate	/* Set the overflow page size. */
1537c478bdstevel@tonic-gate	__bam_setovflsize(dbp);
1547c478bdstevel@tonic-gate
1557c478bdstevel@tonic-gate	/* If we're snapshotting an underlying source file, do it now. */
1567c478bdstevel@tonic-gate	if (dbinfo != NULL && F_ISSET(dbinfo, DB_SNAPSHOT)) {
1577c478bdstevel@tonic-gate		/* Allocate a cursor. */
1587c478bdstevel@tonic-gate		if ((ret = dbp->cursor(dbp, NULL, &dbc, 0)) != 0)
1597c478bdstevel@tonic-gate			goto err;
1607c478bdstevel@tonic-gate
1617c478bdstevel@tonic-gate		/* Do the snapshot. */
1627c478bdstevel@tonic-gate		if ((ret = __ram_update(dbc,
1637c478bdstevel@tonic-gate		    DB_MAX_RECORDS, 0)) != 0 && ret == DB_NOTFOUND)
1647c478bdstevel@tonic-gate			ret = 0;
1657c478bdstevel@tonic-gate
1667c478bdstevel@tonic-gate		/* Discard the cursor. */
1677c478bdstevel@tonic-gate		if ((t_ret = dbc->c_close(dbc)) != 0 && ret == 0)
1687c478bdstevel@tonic-gate			ret = t_ret;
1697c478bdstevel@tonic-gate
1707c478bdstevel@tonic-gate		if (ret != 0)
1717c478bdstevel@tonic-gate			goto err;
1727c478bdstevel@tonic-gate	}
1737c478bdstevel@tonic-gate
1747c478bdstevel@tonic-gate	return (0);
1757c478bdstevel@tonic-gate
1767c478bdstevel@tonic-gateerr:	/* If we mmap'd a source file, discard it. */
1777c478bdstevel@tonic-gate	if (rp->re_smap != NULL)
1787c478bdstevel@tonic-gate		(void)__db_unmapfile(rp->re_smap, rp->re_msize);
1797c478bdstevel@tonic-gate
1807c478bdstevel@tonic-gate	/* If we opened a source file, discard it. */
1817c478bdstevel@tonic-gate	if (rp->re_fd != -1)
1827c478bdstevel@tonic-gate		(void)__os_close(rp->re_fd);
1837c478bdstevel@tonic-gate	if (rp->re_source != NULL)
1847c478bdstevel@tonic-gate		__os_freestr(rp->re_source);
1857c478bdstevel@tonic-gate
1867c478bdstevel@tonic-gate	__os_free(rp, sizeof(*rp));
1877c478bdstevel@tonic-gate
1887c478bdstevel@tonic-gate	return (ret);
1897c478bdstevel@tonic-gate}
1907c478bdstevel@tonic-gate
1917c478bdstevel@tonic-gate/*
1927c478bdstevel@tonic-gate * __ram_delete --
1937c478bdstevel@tonic-gate *	Recno db->del function.
1947c478bdstevel@tonic-gate */
1957c478bdstevel@tonic-gatestatic int
1967c478bdstevel@tonic-gate__ram_delete(dbp, txn, key, flags)
1977c478bdstevel@tonic-gate	DB *dbp;
1987c478bdstevel@tonic-gate	DB_TXN *txn;
1997c478bdstevel@tonic-gate	DBT *key;
2007c478bdstevel@tonic-gate	u_int32_t flags;
2017c478bdstevel@tonic-gate{
2027c478bdstevel@tonic-gate	CURSOR *cp;
2037c478bdstevel@tonic-gate	DBC *dbc;
2047c478bdstevel@tonic-gate	db_recno_t recno;
2057c478bdstevel@tonic-gate	int ret, t_ret;
2067c478bdstevel@tonic-gate
2077c478bdstevel@tonic-gate	DB_PANIC_CHECK(dbp);
2087c478bdstevel@tonic-gate
2097c478bdstevel@tonic-gate	/* Check for invalid flags. */
2107c478bdstevel@tonic-gate	if ((ret = __db_delchk(dbp,
2117c478bdstevel@tonic-gate	    key, flags, F_ISSET(dbp, DB_AM_RDONLY))) != 0)
2127c478bdstevel@tonic-gate		return (ret);
2137c478bdstevel@tonic-gate
2147c478bdstevel@tonic-gate	/* Acquire a cursor. */
2157c478bdstevel@tonic-gate	if ((ret = dbp->cursor(dbp, txn, &dbc, DB_WRITELOCK)) != 0)
2167c478bdstevel@tonic-gate		return (ret);
2177c478bdstevel@tonic-gate
2187c478bdstevel@tonic-gate	DEBUG_LWRITE(dbc, txn, "ram_delete", key, NULL, flags);
2197c478bdstevel@tonic-gate
2207c478bdstevel@tonic-gate	/* Check the user's record number and fill in as necessary. */
2217c478bdstevel@tonic-gate	if ((ret = __ram_getno(dbc, key, &recno, 0)) != 0)
2227c478bdstevel@tonic-gate		goto err;
2237c478bdstevel@tonic-gate
2247c478bdstevel@tonic-gate	/* Do the delete. */
2257c478bdstevel@tonic-gate	cp = dbc->internal;
2267c478bdstevel@tonic-gate	cp->recno = recno;
2277c478bdstevel@tonic-gate	ret = __ram_i_delete(dbc);
2287c478bdstevel@tonic-gate
2297c478bdstevel@tonic-gate	/* Release the cursor. */
2307c478bdstevel@tonic-gateerr:	if ((t_ret = dbc->c_close(dbc)) != 0 && ret == 0)
2317c478bdstevel@tonic-gate		ret = t_ret;
2327c478bdstevel@tonic-gate
2337c478bdstevel@tonic-gate	return (ret);
2347c478bdstevel@tonic-gate}
2357c478bdstevel@tonic-gate
2367c478bdstevel@tonic-gate/*
2377c478bdstevel@tonic-gate * __ram_i_delete --
2387c478bdstevel@tonic-gate *	Internal version of recno delete, called by __ram_delete and
2397c478bdstevel@tonic-gate *	__ram_c_del.
2407c478bdstevel@tonic-gate */
2417c478bdstevel@tonic-gatestatic int
2427c478bdstevel@tonic-gate__ram_i_delete(dbc)
2437c478bdstevel@tonic-gate	DBC *dbc;
2447c478bdstevel@tonic-gate{
2457c478bdstevel@tonic-gate	BKEYDATA bk;
2467c478bdstevel@tonic-gate	BTREE *t;
2477c478bdstevel@tonic-gate	CURSOR *cp;
2487c478bdstevel@tonic-gate	DB *dbp;
2497c478bdstevel@tonic-gate	DBT hdr, data;
2507c478bdstevel@tonic-gate	PAGE *h;
2517c478bdstevel@tonic-gate	db_indx_t indx;
2527c478bdstevel@tonic-gate	int exact, ret, stack;
2537c478bdstevel@tonic-gate
2547c478bdstevel@tonic-gate	dbp = dbc->dbp;
2557c478bdstevel@tonic-gate	cp = dbc->internal;
2567c478bdstevel@tonic-gate	t = dbp->internal;
2577c478bdstevel@tonic-gate	stack = 0;
2587c478bdstevel@tonic-gate
2597c478bdstevel@tonic-gate	/*
2607c478bdstevel@tonic-gate	 * If this is CDB and this isn't a write cursor, then it's an error.
2617c478bdstevel@tonic-gate	 * If it is a write cursor, but we don't yet hold the write lock, then
2627c478bdstevel@tonic-gate	 * we need to upgrade to the write lock.
2637c478bdstevel@tonic-gate	 */
2647c478bdstevel@tonic-gate	if (F_ISSET(dbp, DB_AM_CDB)) {
2657c478bdstevel@tonic-gate		/* Make sure it's a valid update cursor. */
2667c478bdstevel@tonic-gate		if (!F_ISSET(dbc, DBC_RMW | DBC_WRITER))
2677c478bdstevel@tonic-gate			return (EINVAL);
2687c478bdstevel@tonic-gate
2697c478bdstevel@tonic-gate		if (F_ISSET(dbc, DBC_RMW) &&
2707c478bdstevel@tonic-gate		    (ret = lock_get(dbp->dbenv->lk_info, dbc->locker,
2717c478bdstevel@tonic-gate		    DB_LOCK_UPGRADE, &dbc->lock_dbt, DB_LOCK_WRITE,
2727c478bdstevel@tonic-gate		    &dbc->mylock)) != 0)
2737c478bdstevel@tonic-gate			return (EAGAIN);
2747c478bdstevel@tonic-gate	}
2757c478bdstevel@tonic-gate
2767c478bdstevel@tonic-gate	/* Search the tree for the key; delete only deletes exact matches. */
2777c478bdstevel@tonic-gate	if ((ret = __bam_rsearch(dbc, &cp->recno, S_DELETE, 1, &exact)) != 0)
2787c478bdstevel@tonic-gate		goto err;
2797c478bdstevel@tonic-gate	if (!exact) {
2807c478bdstevel@tonic-gate		ret = DB_NOTFOUND;
2817c478bdstevel@tonic-gate		goto err;
2827c478bdstevel@tonic-gate	}
2837c478bdstevel@tonic-gate	stack = 1;
2847c478bdstevel@tonic-gate
2857c478bdstevel@tonic-gate	h = cp->csp->page;
2867c478bdstevel@tonic-gate	indx = cp->csp->indx;
2877c478bdstevel@tonic-gate
2887c478bdstevel@tonic-gate	/*
2897c478bdstevel@tonic-gate	 * If re-numbering records, the on-page deleted flag can only mean
2907c478bdstevel@tonic-gate	 * that this record was implicitly created.  Applications aren't
2917c478bdstevel@tonic-gate	 * permitted to delete records they never created, return an error.
2927c478bdstevel@tonic-gate	 *
2937c478bdstevel@tonic-gate	 * If not re-numbering records, the on-page deleted flag means that
2947c478bdstevel@tonic-gate	 * this record was implicitly created, or, was deleted at some time.
2957c478bdstevel@tonic-gate	 * The former is an error because applications aren't permitted to
2967c478bdstevel@tonic-gate	 * delete records they never created, the latter is an error because
2977c478bdstevel@tonic-gate	 * if the record was "deleted", we could never have found it.
2987c478bdstevel@tonic-gate	 */
2997c478bdstevel@tonic-gate	if (B_DISSET(GET_BKEYDATA(h, indx)->type)) {
3007c478bdstevel@tonic-gate		ret = DB_KEYEMPTY;
3017c478bdstevel@tonic-gate		goto err;
3027c478bdstevel@tonic-gate	}
3037c478bdstevel@tonic-gate
3047c478bdstevel@tonic-gate	if (F_ISSET(dbp, DB_RE_RENUMBER)) {
3057c478bdstevel@tonic-gate		/* Delete the item, adjust the counts, adjust the cursors. */
3067c478bdstevel@tonic-gate		if ((ret = __bam_ditem(dbc, h, indx)) != 0)
3077c478bdstevel@tonic-gate			goto err;
3087c478bdstevel@tonic-gate		__bam_adjust(dbc, -1);
3097c478bdstevel@tonic-gate		__ram_ca(dbp, cp->recno, CA_DELETE);
3107c478bdstevel@tonic-gate
3117c478bdstevel@tonic-gate		/*
3127c478bdstevel@tonic-gate		 * If the page is empty, delete it.   The whole tree is locked
3137c478bdstevel@tonic-gate		 * so there are no preparations to make.
3147c478bdstevel@tonic-gate		 */
3157c478bdstevel@tonic-gate		if (NUM_ENT(h) == 0 && h->pgno != PGNO_ROOT) {
3167c478bdstevel@tonic-gate			stack = 0;
3177c478bdstevel@tonic-gate			ret = __bam_dpages(dbc);
3187c478bdstevel@tonic-gate		}
3197c478bdstevel@tonic-gate	} else {
3207c478bdstevel@tonic-gate		/* Use a delete/put pair to replace the record with a marker. */
3217c478bdstevel@tonic-gate		if ((ret = __bam_ditem(dbc, h, indx)) != 0)
3227c478bdstevel@tonic-gate			goto err;
3237c478bdstevel@tonic-gate
3247c478bdstevel@tonic-gate		B_TSET(bk.type, B_KEYDATA, 1);
3257c478bdstevel@tonic-gate		bk.len = 0;
3267c478bdstevel@tonic-gate		memset(&hdr, 0, sizeof(hdr));
3277c478bdstevel@tonic-gate		hdr.data = &bk;
3287c478bdstevel@tonic-gate		hdr.size = SSZA(BKEYDATA, data);
3297c478bdstevel@tonic-gate		memset(&data, 0, sizeof(data));
3307c478bdstevel@tonic-gate		data.data = (char *)"";
3317c478bdstevel@tonic-gate		data.size = 0;
3327c478bdstevel@tonic-gate		if ((ret = __db_pitem(dbc,
3337c478bdstevel@tonic-gate		    h, indx, BKEYDATA_SIZE(0), &hdr, &data)) != 0)
3347c478bdstevel@tonic-gate			goto err;
3357c478bdstevel@tonic-gate	}
3367c478bdstevel@tonic-gate	F_SET(t->recno, RECNO_MODIFIED);
3377c478bdstevel@tonic-gate
3387c478bdstevel@tonic-gateerr:	if (stack)
3397c478bdstevel@tonic-gate		__bam_stkrel(dbc, 0);
3407c478bdstevel@tonic-gate
3417c478bdstevel@tonic-gate	/* If we upgraded the CDB lock upon entry; downgrade it now. */
3427c478bdstevel@tonic-gate	if (F_ISSET(dbp, DB_AM_CDB) && F_ISSET(dbc, DBC_RMW))
3437c478bdstevel@tonic-gate		(void)__lock_downgrade(dbp->dbenv->lk_info, dbc->mylock,
3447c478bdstevel@tonic-gate		    DB_LOCK_IWRITE, 0);
3457c478bdstevel@tonic-gate	return (ret);
3467c478bdstevel@tonic-gate}
3477c478bdstevel@tonic-gate
3487c478bdstevel@tonic-gate/*
3497c478bdstevel@tonic-gate * __ram_put --
3507c478bdstevel@tonic-gate *	Recno db->put function.
3517c478bdstevel@tonic-gate */
3527c478bdstevel@tonic-gatestatic int
3537c478bdstevel@tonic-gate__ram_put(dbp, txn, key, data, flags)
3547c478bdstevel@tonic-gate	DB *dbp;
3557c478bdstevel@tonic-gate	DB_TXN *txn;
3567c478bdstevel@tonic-gate	DBT *key, *data;
3577c478bdstevel@tonic-gate	u_int32_t flags;
3587c478bdstevel@tonic-gate{
3597c478bdstevel@tonic-gate	DBC *dbc;
3607c478bdstevel@tonic-gate	db_recno_t recno;
3617c478bdstevel@tonic-gate	int ret, t_ret;
3627c478bdstevel@tonic-gate
3637c478bdstevel@tonic-gate	DB_PANIC_CHECK(dbp);
3647c478bdstevel@tonic-gate
3657c478bdstevel@tonic-gate	/* Check for invalid flags. */
3667c478bdstevel@tonic-gate	if ((ret = __db_putchk(dbp,
3677c478bdstevel@tonic-gate	    key, data, flags, F_ISSET(dbp, DB_AM_RDONLY), 0)) != 0)
3687c478bdstevel@tonic-gate		return (ret);
3697c478bdstevel@tonic-gate
3707c478bdstevel@tonic-gate	/* Allocate a cursor. */
3717c478bdstevel@tonic-gate	if ((ret = dbp->cursor(dbp, txn, &dbc, DB_WRITELOCK)) != 0)
3727c478bdstevel@tonic-gate		return (ret);
3737c478bdstevel@tonic-gate
3747c478bdstevel@tonic-gate	DEBUG_LWRITE(dbc, txn, "ram_put", key, data, flags);
3757c478bdstevel@tonic-gate
3767c478bdstevel@tonic-gate	/*
3777c478bdstevel@tonic-gate	 * If we're appending to the tree, make sure we've read in all of
3787c478bdstevel@tonic-gate	 * the backing source file.  Otherwise, check the user's record
3797c478bdstevel@tonic-gate	 * number and fill in as necessary.
3807c478bdstevel@tonic-gate	 */
3817c478bdstevel@tonic-gate	ret = flags == DB_APPEND ?
3827c478bdstevel@tonic-gate	    __ram_update(dbc, DB_MAX_RECORDS, 0) :
3837c478bdstevel@tonic-gate	    __ram_getno(dbc, key, &recno, 1);
3847c478bdstevel@tonic-gate
3857c478bdstevel@tonic-gate	/* Add the record. */
3867c478bdstevel@tonic-gate	if (ret == 0)
3877c478bdstevel@tonic-gate		ret = __ram_add(dbc, &recno, data, flags, 0);
3887c478bdstevel@tonic-gate
3897c478bdstevel@tonic-gate	/* Discard the cursor. */
3907c478bdstevel@tonic-gate	if ((t_ret = dbc->c_close(dbc)) != 0 && ret == 0)
3917c478bdstevel@tonic-gate		ret = t_ret;
3927c478bdstevel@tonic-gate
3937c478bdstevel@tonic-gate	/* Return the record number if we're appending to the tree. */
3947c478bdstevel@tonic-gate	if (ret == 0 && flags == DB_APPEND)
3957c478bdstevel@tonic-gate		*(db_recno_t *)key->data = recno;
3967c478bdstevel@tonic-gate
3977c478bdstevel@tonic-gate	return (ret);
3987c478bdstevel@tonic-gate}
3997c478bdstevel@tonic-gate
4007c478bdstevel@tonic-gate/*
4017c478bdstevel@tonic-gate * __ram_sync --
4027c478bdstevel@tonic-gate *	Recno db->sync function.
4037c478bdstevel@tonic-gate */
4047c478bdstevel@tonic-gatestatic int
4057c478bdstevel@tonic-gate__ram_sync(dbp, flags)
4067c478bdstevel@tonic-gate	DB *dbp;
4077c478bdstevel@tonic-gate	u_int32_t flags;
4087c478bdstevel@tonic-gate{
4097c478bdstevel@tonic-gate	DBC *dbc;
4107c478bdstevel@tonic-gate	int ret, t_ret;
4117c478bdstevel@tonic-gate
4127c478bdstevel@tonic-gate	/*
4137c478bdstevel@tonic-gate	 * Sync the underlying btree.
4147c478bdstevel@tonic-gate	 *
4157c478bdstevel@tonic-gate	 * !!!
4167c478bdstevel@tonic-gate	 * We don't need to do a panic check or flags check, the "real"
4177c478bdstevel@tonic-gate	 * sync function does all that for us.
4187c478bdstevel@tonic-gate	 */
4197c478bdstevel@tonic-gate	if ((ret = __db_sync(dbp, flags)) != 0)
4207c478bdstevel@tonic-gate		return (ret);
4217c478bdstevel@tonic-gate
4227c478bdstevel@tonic-gate	/* Allocate a cursor. */
4237c478bdstevel@tonic-gate	if ((ret = dbp->cursor(dbp, NULL, &dbc, 0)) != 0)
4247c478bdstevel@tonic-gate		return (ret);
4257c478bdstevel@tonic-gate
4267c478bdstevel@tonic-gate	DEBUG_LWRITE(dbc, NULL, "ram_sync", NULL, NULL, flags);
4277c478bdstevel@tonic-gate
4287c478bdstevel@tonic-gate	/* Copy back the backing source file. */
4297c478bdstevel@tonic-gate	ret = __ram_writeback(dbc);
4307c478bdstevel@tonic-gate
4317c478bdstevel@tonic-gate	/* Discard the cursor. */
4327c478bdstevel@tonic-gate	if ((t_ret = dbc->c_close(dbc)) != 0 && ret == 0)
4337c478bdstevel@tonic-gate		ret = t_ret;
4347c478bdstevel@tonic-gate
4357c478bdstevel@tonic-gate	return (ret);
4367c478bdstevel@tonic-gate}
4377c478bdstevel@tonic-gate
4387c478bdstevel@tonic-gate/*
4397c478bdstevel@tonic-gate * __ram_close --
4407c478bdstevel@tonic-gate *	Recno db->close function.
4417c478bdstevel@tonic-gate *
4427c478bdstevel@tonic-gate * PUBLIC: int __ram_close __P((DB *));
4437c478bdstevel@tonic-gate */
4447c478bdstevel@tonic-gateint
4457c478bdstevel@tonic-gate__ram_close(dbp)
4467c478bdstevel@tonic-gate	DB *dbp;
4477c478bdstevel@tonic-gate{
4487c478bdstevel@tonic-gate	RECNO *rp;
4497c478bdstevel@tonic-gate
4507c478bdstevel@tonic-gate	rp = ((BTREE *)dbp->internal)->recno;
4517c478bdstevel@tonic-gate
4527c478bdstevel@tonic-gate	/* Close any underlying mmap region. */
4537c478bdstevel@tonic-gate	if (rp->re_smap != NULL)
4547c478bdstevel@tonic-gate		(void)__db_unmapfile(rp->re_smap, rp->re_msize);
4557c478bdstevel@tonic-gate
4567c478bdstevel@tonic-gate	/* Close any backing source file descriptor. */
4577c478bdstevel@tonic-gate	if (rp->re_fd != -1)
4587c478bdstevel@tonic-gate		(void)__os_close(rp->re_fd);
4597c478bdstevel@tonic-gate
4607c478bdstevel@tonic-gate	/* Free any backing source file name. */
4617c478bdstevel@tonic-gate	if (rp->re_source != NULL)
4627c478bdstevel@tonic-gate		__os_freestr(rp->re_source);
4637c478bdstevel@tonic-gate
4647c478bdstevel@tonic-gate	/* Free allocated memory. */
4657c478bdstevel@tonic-gate	__os_free(rp, sizeof(RECNO));
4667c478bdstevel@tonic-gate	((BTREE *)dbp->internal)->recno = NULL;
4677c478bdstevel@tonic-gate
4687c478bdstevel@tonic-gate	/* Close the underlying btree. */
4697c478bdstevel@tonic-gate	return (__bam_close(dbp));
4707c478bdstevel@tonic-gate}
4717c478bdstevel@tonic-gate
4727c478bdstevel@tonic-gate/*
4737c478bdstevel@tonic-gate * __ram_c_del --
4747c478bdstevel@tonic-gate *	Recno cursor->c_del function.
4757c478bdstevel@tonic-gate *
4767c478bdstevel@tonic-gate * PUBLIC: int __ram_c_del __P((DBC *, u_int32_t));
4777c478bdstevel@tonic-gate */
4787c478bdstevel@tonic-gateint
4797c478bdstevel@tonic-gate__ram_c_del(dbc, flags)
4807c478bdstevel@tonic-gate	DBC *dbc;
4817c478bdstevel@tonic-gate	u_int32_t flags;
4827c478bdstevel@tonic-gate{
4837c478bdstevel@tonic-gate	CURSOR *cp;
4847c478bdstevel@tonic-gate	DB *dbp;
4857c478bdstevel@tonic-gate	int ret;
4867c478bdstevel@tonic-gate
4877c478bdstevel@tonic-gate	dbp = dbc->dbp;
4887c478bdstevel@tonic-gate	cp = dbc->internal;
4897c478bdstevel@tonic-gate
4907c478bdstevel@tonic-gate	DB_PANIC_CHECK(dbp);
4917c478bdstevel@tonic-gate
4927c478bdstevel@tonic-gate	/* Check for invalid flags. */
4937c478bdstevel@tonic-gate	if ((ret = __db_cdelchk(dbp, flags,
4947c478bdstevel@tonic-gate	    F_ISSET(dbp, DB_AM_RDONLY), cp->recno != RECNO_OOB)) != 0)
4957c478bdstevel@tonic-gate		return (ret);
4967c478bdstevel@tonic-gate
4977c478bdstevel@tonic-gate	DEBUG_LWRITE(dbc, dbc->txn, "ram_c_del", NULL, NULL, flags);
4987c478bdstevel@tonic-gate
4997c478bdstevel@tonic-gate	/*
5007c478bdstevel@tonic-gate	 * If we are running CDB, this had better be either a write
5017c478bdstevel@tonic-gate	 * cursor or an immediate writer.
5027c478bdstevel@tonic-gate	 */
5037c478bdstevel@tonic-gate	if (F_ISSET(dbp, DB_AM_CDB))
5047c478bdstevel@tonic-gate		if (!F_ISSET(dbc, DBC_RMW | DBC_WRITER))
5057c478bdstevel@tonic-gate			return (EINVAL);
5067c478bdstevel@tonic-gate
5077c478bdstevel@tonic-gate	/*
5087c478bdstevel@tonic-gate	 * The semantics of cursors during delete are as follows: if record
5097c478bdstevel@tonic-gate	 * numbers are mutable (DB_RE_RENUMBER is set), deleting a record
5107c478bdstevel@tonic-gate	 * causes the cursor to automatically point to the record immediately
5117c478bdstevel@tonic-gate	 * following.  In this case it is possible to use a single cursor for
5127c478bdstevel@tonic-gate	 * repeated delete operations, without intervening operations.
5137c478bdstevel@tonic-gate	 *
5147c478bdstevel@tonic-gate	 * If record numbers are not mutable, then records are replaced with
5157c478bdstevel@tonic-gate	 * a marker containing a delete flag.  If the record referenced by
5167c478bdstevel@tonic-gate	 * this cursor has already been deleted, we will detect that as part
5177c478bdstevel@tonic-gate	 * of the delete operation, and fail.
5187c478bdstevel@tonic-gate	 */
5197c478bdstevel@tonic-gate	return (__ram_i_delete(dbc));
5207c478bdstevel@tonic-gate}
5217c478bdstevel@tonic-gate
5227c478bdstevel@tonic-gate/*
5237c478bdstevel@tonic-gate * __ram_c_get --
5247c478bdstevel@tonic-gate *	Recno cursor->c_get function.
5257c478bdstevel@tonic-gate *
5267c478bdstevel@tonic-gate * PUBLIC: int __ram_c_get __P((DBC *, DBT *, DBT *, u_int32_t));
5277c478bdstevel@tonic-gate */
5287c478bdstevel@tonic-gateint
5297c478bdstevel@tonic-gate__ram_c_get(dbc, key, data, flags)
5307c478bdstevel@tonic-gate	DBC *dbc;
5317c478bdstevel@tonic-gate	DBT *key, *data;
5327c478bdstevel@tonic-gate	u_int32_t flags;
5337c478bdstevel@tonic-gate{
5347c478bdstevel@tonic-gate	CURSOR *cp, copy;
5357c478bdstevel@tonic-gate	DB *dbp;
5367c478bdstevel@tonic-gate	PAGE *h;
5377c478bdstevel@tonic-gate	db_indx_t indx;
5387c478bdstevel@tonic-gate	int exact, ret, stack, tmp_rmw;
5397c478bdstevel@tonic-gate
5407c478bdstevel@tonic-gate	dbp = dbc->dbp;
5417c478bdstevel@tonic-gate	cp = dbc->internal;
5427c478bdstevel@tonic-gate
5437c478bdstevel@tonic-gate	DB_PANIC_CHECK(dbp);
5447c478bdstevel@tonic-gate
5457c478bdstevel@tonic-gate	/* Check for invalid flags. */
5467c478bdstevel@tonic-gate	if ((ret = __db_cgetchk(dbc->dbp,
5477c478bdstevel@tonic-gate	    key, data, flags, cp->recno != RECNO_OOB)) != 0)
5487c478bdstevel@tonic-gate		return (ret);
5497c478bdstevel@tonic-gate
5507c478bdstevel@tonic-gate	/* Clear OR'd in additional bits so we can check for flag equality. */
5517c478bdstevel@tonic-gate	tmp_rmw = 0;
5527c478bdstevel@tonic-gate	if (LF_ISSET(DB_RMW)) {
5537c478bdstevel@tonic-gate		if (!F_ISSET(dbp, DB_AM_CDB)) {
5547c478bdstevel@tonic-gate			tmp_rmw = 1;
5557c478bdstevel@tonic-gate			F_SET(dbc, DBC_RMW);
5567c478bdstevel@tonic-gate		}
5577c478bdstevel@tonic-gate		LF_CLR(DB_RMW);
5587c478bdstevel@tonic-gate	}
5597c478bdstevel@tonic-gate
5607c478bdstevel@tonic-gate	DEBUG_LREAD(dbc, dbc->txn, "ram_c_get",
5617c478bdstevel@tonic-gate	    flags == DB_SET || flags == DB_SET_RANGE ? key : NULL, NULL, flags);
5627c478bdstevel@tonic-gate
5637c478bdstevel@tonic-gate	/* Initialize the cursor for a new retrieval. */
5647c478bdstevel@tonic-gate	copy = *cp;
5657c478bdstevel@tonic-gate
5667c478bdstevel@tonic-gateretry:	/* Update the record number. */
5677c478bdstevel@tonic-gate	stack = 0;
5687c478bdstevel@tonic-gate	switch (flags) {
5697c478bdstevel@tonic-gate	case DB_CURRENT:
5707c478bdstevel@tonic-gate		/*
5717c478bdstevel@tonic-gate		 * If record numbers are mutable: if we just deleted a record,
5727c478bdstevel@tonic-gate		 * there is no action necessary, we return the record following
5737c478bdstevel@tonic-gate		 * the deleted item by virtue of renumbering the tree.
5747c478bdstevel@tonic-gate		 */
5757c478bdstevel@tonic-gate		break;
5767c478bdstevel@tonic-gate	case DB_NEXT:
5777c478bdstevel@tonic-gate		/*
5787c478bdstevel@tonic-gate		 * If record numbers are mutable: if we just deleted a record,
5797c478bdstevel@tonic-gate		 * we have to avoid incrementing the record number so that we
5807c478bdstevel@tonic-gate		 * return the right record by virtue of renumbering the tree.
5817c478bdstevel@tonic-gate		 */
5827c478bdstevel@tonic-gate		if (CD_ISSET(dbp, cp))
5837c478bdstevel@tonic-gate			break;
5847c478bdstevel@tonic-gate
5857c478bdstevel@tonic-gate		if (cp->recno != RECNO_OOB) {
5867c478bdstevel@tonic-gate			++cp->recno;
5877c478bdstevel@tonic-gate			break;
5887c478bdstevel@tonic-gate		}
5897c478bdstevel@tonic-gate		/* FALLTHROUGH */
5907c478bdstevel@tonic-gate	case DB_FIRST:
5917c478bdstevel@tonic-gate		flags = DB_NEXT;
5927c478bdstevel@tonic-gate		cp->recno = 1;
5937c478bdstevel@tonic-gate		break;
5947c478bdstevel@tonic-gate	case DB_PREV:
5957c478bdstevel@tonic-gate		if (cp->recno != RECNO_OOB) {
5967c478bdstevel@tonic-gate			if (cp->recno == 1) {
5977c478bdstevel@tonic-gate				ret = DB_NOTFOUND;
5987c478bdstevel@tonic-gate				goto err;
5997c478bdstevel@tonic-gate			}
6007c478bdstevel@tonic-gate			--cp->recno;
6017c478bdstevel@tonic-gate			break;
6027c478bdstevel@tonic-gate		}
6037c478bdstevel@tonic-gate		/* FALLTHROUGH */
6047c478bdstevel@tonic-gate	case DB_LAST:
6057c478bdstevel@tonic-gate		flags = DB_PREV;
6067c478bdstevel@tonic-gate		if (((ret = __ram_update(dbc,
6077c478bdstevel@tonic-gate		    DB_MAX_RECORDS, 0)) != 0) && ret != DB_NOTFOUND)
6087c478bdstevel@tonic-gate			goto err;
6097c478bdstevel@tonic-gate		if ((ret = __bam_nrecs(dbc, &cp->recno)) != 0)
6107c478bdstevel@tonic-gate			goto err;
6117c478bdstevel@tonic-gate		if (cp->recno == 0) {
6127c478bdstevel@tonic-gate			ret = DB_NOTFOUND;
6137c478bdstevel@tonic-gate			goto err;
6147c478bdstevel@tonic-gate		}
6157c478bdstevel@tonic-gate		break;
6167c478bdstevel@tonic-gate	case DB_SET:
6177c478bdstevel@tonic-gate	case DB_SET_RANGE:
6187c478bdstevel@tonic-gate		if ((ret = __ram_getno(dbc, key, &cp->recno, 0)) != 0)
6197c478bdstevel@tonic-gate			goto err;
6207c478bdstevel@tonic-gate		break;
6217c478bdstevel@tonic-gate	}
6227c478bdstevel@tonic-gate
6237c478bdstevel@tonic-gate	/* Return the key if the user didn't give us one. */
6247c478bdstevel@tonic-gate	if (flags != DB_SET && flags != DB_SET_RANGE &&
6257c478bdstevel@tonic-gate	    (ret = __db_retcopy(key, &cp->recno, sizeof(cp->recno),
6267c478bdstevel@tonic-gate	    &dbc->rkey.data, &dbc->rkey.ulen, dbp->db_malloc)) != 0)
6277c478bdstevel@tonic-gate		goto err;
6287c478bdstevel@tonic-gate
6297c478bdstevel@tonic-gate	/* Search the tree for the record. */
6307c478bdstevel@tonic-gate	if ((ret = __bam_rsearch(dbc, &cp->recno,
6317c478bdstevel@tonic-gate	    F_ISSET(dbc, DBC_RMW) ? S_FIND_WR : S_FIND, 1, &exact)) != 0)
6327c478bdstevel@tonic-gate		goto err;
6337c478bdstevel@tonic-gate	stack = 1;
6347c478bdstevel@tonic-gate	if (!exact) {
6357c478bdstevel@tonic-gate		ret = DB_NOTFOUND;
6367c478bdstevel@tonic-gate		goto err;
6377c478bdstevel@tonic-gate	}
6387c478bdstevel@tonic-gate	h = cp->csp->page;
6397c478bdstevel@tonic-gate	indx = cp->csp->indx;
6407c478bdstevel@tonic-gate
6417c478bdstevel@tonic-gate	/*
6427c478bdstevel@tonic-gate	 * If re-numbering records, the on-page deleted flag means this record
6437c478bdstevel@tonic-gate	 * was implicitly created.  If not re-numbering records, the on-page
6447c478bdstevel@tonic-gate	 * deleted flag means this record was implicitly created, or, it was
6457c478bdstevel@tonic-gate	 * deleted at some time.  Regardless, we skip such records if doing
6467c478bdstevel@tonic-gate	 * cursor next/prev operations, and fail if the application requested
6477c478bdstevel@tonic-gate	 * them explicitly.
6487c478bdstevel@tonic-gate	 */
6497c478bdstevel@tonic-gate	if (B_DISSET(GET_BKEYDATA(h, indx)->type)) {
6507c478bdstevel@tonic-gate		if (flags == DB_NEXT || flags == DB_PREV) {
6517c478bdstevel@tonic-gate			(void)__bam_stkrel(dbc, 0);
6527c478bdstevel@tonic-gate			goto retry;
6537c478bdstevel@tonic-gate		}
6547c478bdstevel@tonic-gate		ret = DB_KEYEMPTY;
6557c478bdstevel@tonic-gate		goto err;
6567c478bdstevel@tonic-gate	}
6577c478bdstevel@tonic-gate
6587c478bdstevel@tonic-gate	/* Return the data item. */
6597c478bdstevel@tonic-gate	if ((ret = __db_ret(dbp,
6607c478bdstevel@tonic-gate	    h, indx, data, &dbc->rdata.data, &dbc->rdata.ulen)) != 0)
6617c478bdstevel@tonic-gate		goto err;
6627c478bdstevel@tonic-gate
6637c478bdstevel@tonic-gate	/* The cursor was reset, no further delete adjustment is necessary. */
6647c478bdstevel@tonic-gate	CD_CLR(dbp, cp);
6657c478bdstevel@tonic-gate
6667c478bdstevel@tonic-gateerr:	if (stack)
6677c478bdstevel@tonic-gate		(void)__bam_stkrel(dbc, 0);
6687c478bdstevel@tonic-gate
6697c478bdstevel@tonic-gate	/* Release temporary lock upgrade. */
6707c478bdstevel@tonic-gate	if (tmp_rmw)
6717c478bdstevel@tonic-gate		F_CLR(dbc, DBC_RMW);
6727c478bdstevel@tonic-gate
6737c478bdstevel@tonic-gate	if (ret != 0)
6747c478bdstevel@tonic-gate		*cp = copy;
6757c478bdstevel@tonic-gate
6767c478bdstevel@tonic-gate	return (ret);
6777c478bdstevel@tonic-gate}
6787c478bdstevel@tonic-gate
6797c478bdstevel@tonic-gate/*
6807c478bdstevel@tonic-gate * __ram_c_put --
6817c478bdstevel@tonic-gate *	Recno cursor->c_put function.
6827c478bdstevel@tonic-gate *
6837c478bdstevel@tonic-gate * PUBLIC: int __ram_c_put __P((DBC *, DBT *, DBT *, u_int32_t));
6847c478bdstevel@tonic-gate */
6857c478bdstevel@tonic-gateint
6867c478bdstevel@tonic-gate__ram_c_put(dbc, key, data, flags)
6877c478bdstevel@tonic-gate	DBC *dbc;
6887c478bdstevel@tonic-gate	DBT *key, *data;
6897c478bdstevel@tonic-gate	u_int32_t flags;
6907c478bdstevel@tonic-gate{
6917c478bdstevel@tonic-gate	CURSOR *cp, copy;
6927c478bdstevel@tonic-gate	DB *dbp;
6937c478bdstevel@tonic-gate	int exact, ret;
6947c478bdstevel@tonic-gate	void *arg;
6957c478bdstevel@tonic-gate
6967c478bdstevel@tonic-gate	dbp = dbc->dbp;
6977c478bdstevel@tonic-gate	cp = dbc->internal;
6987c478bdstevel@tonic-gate
6997c478bdstevel@tonic-gate	DB_PANIC_CHECK(dbp);
7007c478bdstevel@tonic-gate
7017c478bdstevel@tonic-gate	if ((ret = __db_cputchk(dbc->dbp, key, data, flags,
7027c478bdstevel@tonic-gate	    F_ISSET(dbc->dbp, DB_AM_RDONLY), cp->recno != RECNO_OOB)) != 0)
7037c478bdstevel@tonic-gate		return (ret);
7047c478bdstevel@tonic-gate
7057c478bdstevel@tonic-gate	DEBUG_LWRITE(dbc, dbc->txn, "ram_c_put", NULL, data, flags);
7067c478bdstevel@tonic-gate
7077c478bdstevel@tonic-gate	/*
7087c478bdstevel@tonic-gate	 * If we are running CDB, this had better be either a write
7097c478bdstevel@tonic-gate	 * cursor or an immediate writer.  If it's a regular writer,
7107c478bdstevel@tonic-gate	 * that means we have an IWRITE lock and we need to upgrade
7117c478bdstevel@tonic-gate	 * it to a write lock.
7127c478bdstevel@tonic-gate	 */
7137c478bdstevel@tonic-gate	if (F_ISSET(dbp, DB_AM_CDB)) {
7147c478bdstevel@tonic-gate		if (!F_ISSET(dbc, DBC_RMW | DBC_WRITER))
7157c478bdstevel@tonic-gate			return (EINVAL);
7167c478bdstevel@tonic-gate
7177c478bdstevel@tonic-gate		if (F_ISSET(dbc, DBC_RMW) &&
7187c478bdstevel@tonic-gate		    (ret = lock_get(dbp->dbenv->lk_info, dbc->locker,
7197c478bdstevel@tonic-gate		    DB_LOCK_UPGRADE, &dbc->lock_dbt, DB_LOCK_WRITE,
7207c478bdstevel@tonic-gate		    &dbc->mylock)) != 0)
7217c478bdstevel@tonic-gate			return (EAGAIN);
7227c478bdstevel@tonic-gate	}
7237c478bdstevel@tonic-gate
7247c478bdstevel@tonic-gate	/* Initialize the cursor for a new retrieval. */
7257c478bdstevel@tonic-gate	copy = *cp;
7267c478bdstevel@tonic-gate
7277c478bdstevel@tonic-gate	/*
7287c478bdstevel@tonic-gate	 * To split, we need a valid key for the page.  Since it's a cursor,
7297c478bdstevel@tonic-gate	 * we have to build one.
7307c478bdstevel@tonic-gate	 *
7317c478bdstevel@tonic-gate	 * The split code discards all short-term locks and stack pages.
7327c478bdstevel@tonic-gate	 */
7337c478bdstevel@tonic-gate	if (0) {
7347c478bdstevel@tonic-gatesplit:		arg = &cp->recno;
7357c478bdstevel@tonic-gate		if ((ret = __bam_split(dbc, arg)) != 0)
7367c478bdstevel@tonic-gate			goto err;
7377c478bdstevel@tonic-gate	}
7387c478bdstevel@tonic-gate
7397c478bdstevel@tonic-gate	if ((ret = __bam_rsearch(dbc, &cp->recno, S_INSERT, 1, &exact)) != 0)
7407c478bdstevel@tonic-gate		goto err;
7417c478bdstevel@tonic-gate	if (!exact) {
7427c478bdstevel@tonic-gate		ret = DB_NOTFOUND;
7437c478bdstevel@tonic-gate		goto err;
7447c478bdstevel@tonic-gate	}
7457c478bdstevel@tonic-gate	if ((ret = __bam_iitem(dbc, &cp->csp->page,
7467c478bdstevel@tonic-gate	    &cp->csp->indx, key, data, flags, 0)) == DB_NEEDSPLIT) {
7477c478bdstevel@tonic-gate		if ((ret = __bam_stkrel(dbc, 0)) != 0)
7487c478bdstevel@tonic-gate			goto err;
7497c478bdstevel@tonic-gate		goto split;
7507c478bdstevel@tonic-gate	}
7517c478bdstevel@tonic-gate	if ((ret = __bam_stkrel(dbc, 0)) != 0)
7527c478bdstevel@tonic-gate		goto err;
7537c478bdstevel@tonic-gate
7547c478bdstevel@tonic-gate	switch (flags) {
7557c478bdstevel@tonic-gate	case DB_AFTER:
7567c478bdstevel@tonic-gate		/* Adjust the cursors. */
7577c478bdstevel@tonic-gate		__ram_ca(dbp, cp->recno, CA_IAFTER);
7587c478bdstevel@tonic-gate
7597c478bdstevel@tonic-gate		/* Set this cursor to reference the new record. */
7607c478bdstevel@tonic-gate		cp->recno = copy.recno + 1;
7617c478bdstevel@tonic-gate		break;
7627c478bdstevel@tonic-gate	case DB_BEFORE:
7637c478bdstevel@tonic-gate		/* Adjust the cursors. */
7647c478bdstevel@tonic-gate		__ram_ca(dbp, cp->recno, CA_IBEFORE);
7657c478bdstevel@tonic-gate
7667c478bdstevel@tonic-gate		/* Set this cursor to reference the new record. */
7677c478bdstevel@tonic-gate		cp->recno = copy.recno;
7687c478bdstevel@tonic-gate		break;
7697c478bdstevel@tonic-gate	}
7707c478bdstevel@tonic-gate
7717c478bdstevel@tonic-gate	/* The cursor was reset, no further delete adjustment is necessary. */
7727c478bdstevel@tonic-gate	CD_CLR(dbp, cp);
7737c478bdstevel@tonic-gate
7747c478bdstevel@tonic-gateerr:	if (F_ISSET(dbp, DB_AM_CDB) && F_ISSET(dbc, DBC_RMW))
7757c478bdstevel@tonic-gate		(void)__lock_downgrade(dbp->dbenv->lk_info, dbc->mylock,
7767c478bdstevel@tonic-gate		    DB_LOCK_IWRITE, 0);
7777c478bdstevel@tonic-gate
7787c478bdstevel@tonic-gate	if (ret != 0)
7797c478bdstevel@tonic-gate		*cp = copy;
7807c478bdstevel@tonic-gate
7817c478bdstevel@tonic-gate	return (ret);
7827c478bdstevel@tonic-gate}
7837c478bdstevel@tonic-gate
7847c478bdstevel@tonic-gate/*
7857c478bdstevel@tonic-gate * __ram_ca --
7867c478bdstevel@tonic-gate *	Adjust cursors.
7877c478bdstevel@tonic-gate *
7887c478bdstevel@tonic-gate * PUBLIC: void __ram_ca __P((DB *, db_recno_t, ca_recno_arg));
7897c478bdstevel@tonic-gate */
7907c478bdstevel@tonic-gatevoid
7917c478bdstevel@tonic-gate__ram_ca(dbp, recno, op)
7927c478bdstevel@tonic-gate	DB *dbp;
7937c478bdstevel@tonic-gate	db_recno_t recno;
7947c478bdstevel@tonic-gate	ca_recno_arg op;
7957c478bdstevel@tonic-gate{
7967c478bdstevel@tonic-gate	CURSOR *cp;
7977c478bdstevel@tonic-gate	DBC *dbc;
7987c478bdstevel@tonic-gate
7997c478bdstevel@tonic-gate	/*
8007c478bdstevel@tonic-gate	 * Adjust the cursors.  See the comment in __bam_ca_delete().
8017c478bdstevel@tonic-gate	 */
8027c478bdstevel@tonic-gate	DB_THREAD_LOCK(dbp);
8037c478bdstevel@tonic-gate	for (dbc = TAILQ_FIRST(&dbp->active_queue);
8047c478bdstevel@tonic-gate	    dbc != NULL; dbc = TAILQ_NEXT(dbc, links)) {
8057c478bdstevel@tonic-gate		cp = dbc->internal;
8067c478bdstevel@tonic-gate		switch (op) {
8077c478bdstevel@tonic-gate		case CA_DELETE:
8087c478bdstevel@tonic-gate			if (recno > cp->recno)
8097c478bdstevel@tonic-gate				--cp->recno;
8107c478bdstevel@tonic-gate			if (recno == cp->recno)
8117c478bdstevel@tonic-gate				CD_SET(dbp, cp);
8127c478bdstevel@tonic-gate			break;
8137c478bdstevel@tonic-gate		case CA_IAFTER:
8147c478bdstevel@tonic-gate			if (recno > cp->recno)
8157c478bdstevel@tonic-gate				++cp->recno;
8167c478bdstevel@tonic-gate			break;
8177c478bdstevel@tonic-gate		case CA_IBEFORE:
8187c478bdstevel@tonic-gate			if (recno >= cp->recno)
8197c478bdstevel@tonic-gate				++cp->recno;
8207c478bdstevel@tonic-gate			break;
8217c478bdstevel@tonic-gate		}
8227c478bdstevel@tonic-gate	}
8237c478bdstevel@tonic-gate	DB_THREAD_UNLOCK(dbp);
8247c478bdstevel@tonic-gate}
8257c478bdstevel@tonic-gate
8267c478bdstevel@tonic-gate/*
8277c478bdstevel@tonic-gate * __ram_getno --
8287c478bdstevel@tonic-gate *	Check the user's record number, and make sure we've seen it.
8297c478bdstevel@tonic-gate *
8307c478bdstevel@tonic-gate * PUBLIC: int __ram_getno __P((DBC *, const DBT *, db_recno_t *, int));
8317c478bdstevel@tonic-gate */
8327c478bdstevel@tonic-gateint
8337c478bdstevel@tonic-gate__ram_getno(dbc, key, rep, can_create)
8347c478bdstevel@tonic-gate	DBC *dbc;
8357c478bdstevel@tonic-gate	const DBT *key;
8367c478bdstevel@tonic-gate	db_recno_t *rep;
8377c478bdstevel@tonic-gate	int can_create;
8387c478bdstevel@tonic-gate{
8397c478bdstevel@tonic-gate	DB *dbp;
8407c478bdstevel@tonic-gate	db_recno_t recno;
8417c478bdstevel@tonic-gate
8427c478bdstevel@tonic-gate	dbp = dbc->dbp;
8437c478bdstevel@tonic-gate
8447c478bdstevel@tonic-gate	/* Check the user's record number. */
8457c478bdstevel@tonic-gate	if ((recno = *(db_recno_t *)key->data) == 0) {
8467c478bdstevel@tonic-gate		__db_err(dbp->dbenv, "illegal record number of 0");
8477c478bdstevel@tonic-gate		return (EINVAL);
8487c478bdstevel@tonic-gate	}
8497c478bdstevel@tonic-gate	if (rep != NULL)
8507c478bdstevel@tonic-gate		*rep = recno;
8517c478bdstevel@tonic-gate
8527c478bdstevel@tonic-gate	/*
8537c478bdstevel@tonic-gate	 * Btree can neither create records nor read them in.  Recno can
8547c478bdstevel@tonic-gate	 * do both, see if we can find the record.
8557c478bdstevel@tonic-gate	 */
8567c478bdstevel@tonic-gate	return (dbp->type == DB_RECNO ?
8577c478bdstevel@tonic-gate	    __ram_update(dbc, recno, can_create) : 0);
8587c478bdstevel@tonic-gate}
8597c478bdstevel@tonic-gate
8607c478bdstevel@tonic-gate/*
8617c478bdstevel@tonic-gate * __ram_update --
8627c478bdstevel@tonic-gate *	Ensure the tree has records up to and including the specified one.
8637c478bdstevel@tonic-gate */
8647c478bdstevel@tonic-gatestatic int
8657c478bdstevel@tonic-gate__ram_update(dbc, recno, can_create)
8667c478bdstevel@tonic-gate	DBC *dbc;
8677c478bdstevel@tonic-gate	db_recno_t recno;
8687c478bdstevel@tonic-gate	int can_create;
8697c478bdstevel@tonic-gate{
8707c478bdstevel@tonic-gate	BTREE *t;
8717c478bdstevel@tonic-gate	DB *dbp;
8727c478bdstevel@tonic-gate	RECNO *rp;
8737c478bdstevel@tonic-gate	db_recno_t nrecs;
8747c478bdstevel@tonic-gate	int ret;
8757c478bdstevel@tonic-gate
8767c478bdstevel@tonic-gate	dbp = dbc->dbp;
8777c478bdstevel@tonic-gate	t = dbp->internal;
8787c478bdstevel@tonic-gate	rp = t->recno;
8797c478bdstevel@tonic-gate
8807c478bdstevel@tonic-gate	/*
8817c478bdstevel@tonic-gate	 * If we can't create records and we've read the entire backing input
8827c478bdstevel@tonic-gate	 * file, we're done.
8837c478bdstevel@tonic-gate	 */
8847c478bdstevel@tonic-gate	if (!can_create && F_ISSET(rp, RECNO_EOF))
8857c478bdstevel@tonic-gate		return (0);
8867c478bdstevel@tonic-gate
8877c478bdstevel@tonic-gate	/*
8887c478bdstevel@tonic-gate	 * If we haven't seen this record yet, try to get it from the original
8897c478bdstevel@tonic-gate	 * file.
8907c478bdstevel@tonic-gate	 */
8917c478bdstevel@tonic-gate	if ((ret = __bam_nrecs(dbc, &nrecs)) != 0)
8927c478bdstevel@tonic-gate		return (ret);
8937c478bdstevel@tonic-gate	if (!F_ISSET(rp, RECNO_EOF) && recno > nrecs) {
8947c478bdstevel@tonic-gate		if ((ret = rp->re_irec(dbc, recno)) != 0)
8957c478bdstevel@tonic-gate			return (ret);
8967c478bdstevel@tonic-gate		if ((ret = __bam_nrecs(dbc, &nrecs)) != 0)
8977c478bdstevel@tonic-gate			return (ret);
8987c478bdstevel@tonic-gate	}
8997c478bdstevel@tonic-gate
9007c478bdstevel@tonic-gate	/*
9017c478bdstevel@tonic-gate	 * If we can create records, create empty ones up to the requested
9027c478bdstevel@tonic-gate	 * record.
9037c478bdstevel@tonic-gate	 */
9047c478bdstevel@tonic-gate	if (!can_create || recno <= nrecs + 1)
9057c478bdstevel@tonic-gate		return (0);
9067c478bdstevel@tonic-gate
9077c478bdstevel@tonic-gate	dbc->rdata.dlen = 0;
9087c478bdstevel@tonic-gate	dbc->rdata.doff = 0;
9097c478bdstevel@tonic-gate	dbc->rdata.flags = 0;
9107c478bdstevel@tonic-gate	if (F_ISSET(dbp, DB_RE_FIXEDLEN)) {
9117c478bdstevel@tonic-gate		if (dbc->rdata.ulen < rp->re_len) {
9127c478bdstevel@tonic-gate			if ((ret =
9137c478bdstevel@tonic-gate			    __os_realloc(&dbc->rdata.data, rp->re_len)) != 0) {
9147c478bdstevel@tonic-gate				dbc->rdata.ulen = 0;
9157c478bdstevel@tonic-gate				dbc->rdata.data = NULL;
9167c478bdstevel@tonic-gate				return (ret);
9177c478bdstevel@tonic-gate			}
9187c478bdstevel@tonic-gate			dbc->rdata.ulen = rp->re_len;
9197c478bdstevel@tonic-gate		}
9207c478bdstevel@tonic-gate		dbc->rdata.size = rp->re_len;
9217c478bdstevel@tonic-gate		memset(dbc->rdata.data, rp->re_pad, rp->re_len);
9227c478bdstevel@tonic-gate	} else
9237c478bdstevel@tonic-gate		dbc->rdata.size = 0;
9247c478bdstevel@tonic-gate
9257c478bdstevel@tonic-gate	while (recno > ++nrecs)
9267c478bdstevel@tonic-gate		if ((ret = __ram_add(dbc,
9277c478bdstevel@tonic-gate		    &nrecs, &dbc->rdata, 0, BI_DELETED)) != 0)
9287c478bdstevel@tonic-gate			return (ret);
9297c478bdstevel@tonic-gate	return (0);
9307c478bdstevel@tonic-gate}
9317c478bdstevel@tonic-gate
9327c478bdstevel@tonic-gate/*
9337c478bdstevel@tonic-gate * __ram_source --
9347c478bdstevel@tonic-gate *	Load information about the backing file.
9357c478bdstevel@tonic-gate */
9367c478bdstevel@tonic-gatestatic int
9377c478bdstevel@tonic-gate__ram_source(dbp, rp, fname)
9387c478bdstevel@tonic-gate	DB *dbp;
9397c478bdstevel@tonic-gate	RECNO *rp;
9407c478bdstevel@tonic-gate	const char *fname;
9417c478bdstevel@tonic-gate{
9427c478bdstevel@tonic-gate	size_t size;
9437c478bdstevel@tonic-gate	u_int32_t bytes, mbytes, oflags;
9447c478bdstevel@tonic-gate	int ret;
9457c478bdstevel@tonic-gate
9467c478bdstevel@tonic-gate	/*
9477c478bdstevel@tonic-gate	 * !!!
9487c478bdstevel@tonic-gate	 * The caller has full responsibility for cleaning up on error --
9497c478bdstevel@tonic-gate	 * (it has to anyway, in case it fails after this routine succeeds).
9507c478bdstevel@tonic-gate	 */
9517c478bdstevel@tonic-gate	if ((ret = __db_appname(dbp->dbenv,
9527c478bdstevel@tonic-gate	    DB_APP_DATA, NULL, fname, 0, NULL, &rp->re_source)) != 0)
9537c478bdstevel@tonic-gate		return (ret);
9547c478bdstevel@tonic-gate
9557c478bdstevel@tonic-gate	oflags = F_ISSET(dbp, DB_AM_RDONLY) ? DB_RDONLY : 0;
9567c478bdstevel@tonic-gate	if ((ret =
9577c478bdstevel@tonic-gate	    __db_open(rp->re_source, oflags, oflags, 0, &rp->re_fd)) != 0) {
9587c478bdstevel@tonic-gate		__db_err(dbp->dbenv, "%s: %s", rp->re_source, strerror(ret));
9597c478bdstevel@tonic-gate		return (ret);
9607c478bdstevel@tonic-gate	}
9617c478bdstevel@tonic-gate
9627c478bdstevel@tonic-gate	/*
9637c478bdstevel@tonic-gate	 * XXX
9647c478bdstevel@tonic-gate	 * We'd like to test to see if the file is too big to mmap.  Since we
9657c478bdstevel@tonic-gate	 * don't know what size or type off_t's or size_t's are, or the largest
9667c478bdstevel@tonic-gate	 * unsigned integral type is, or what random insanity the local C
9677c478bdstevel@tonic-gate	 * compiler will perpetrate, doing the comparison in a portable way is
9687c478bdstevel@tonic-gate	 * flatly impossible.  Hope that mmap fails if the file is too large.
9697c478bdstevel@tonic-gate	 */
9707c478bdstevel@tonic-gate	if ((ret = __os_ioinfo(rp->re_source,
9717c478bdstevel@tonic-gate	    rp->re_fd, &mbytes, &bytes, NULL)) != 0) {
9727c478bdstevel@tonic-gate		__db_err(dbp->dbenv, "%s: %s", rp->re_source, strerror(ret));
9737c478bdstevel@tonic-gate		return (ret);
9747c478bdstevel@tonic-gate	}
9757c478bdstevel@tonic-gate	if (mbytes == 0 && bytes == 0) {
9767c478bdstevel@tonic-gate		F_SET(rp, RECNO_EOF);
9777c478bdstevel@tonic-gate		return (0);
9787c478bdstevel@tonic-gate	}
9797c478bdstevel@tonic-gate
9807c478bdstevel@tonic-gate	size = mbytes * MEGABYTE + bytes;
9817c478bdstevel@tonic-gate	if ((ret = __db_mapfile(rp->re_source,
9827c478bdstevel@tonic-gate	    rp->re_fd, (size_t)size, 1, &rp->re_smap)) != 0)
9837c478bdstevel@tonic-gate		return (ret);
9847c478bdstevel@tonic-gate	rp->re_cmap = rp->re_smap;
9857c478bdstevel@tonic-gate	rp->re_emap = (u_int8_t *)rp->re_smap + (rp->re_msize = size);
9867c478bdstevel@tonic-gate	rp->re_irec = F_ISSET(dbp, DB_RE_FIXEDLEN) ?  __ram_fmap : __ram_vmap;
9877c478bdstevel@tonic-gate	return (0);
9887c478bdstevel@tonic-gate}
9897c478bdstevel@tonic-gate
9907c478bdstevel@tonic-gate/*
9917c478bdstevel@tonic-gate * __ram_writeback --
9927c478bdstevel@tonic-gate *	Rewrite the backing file.
9937c478bdstevel@tonic-gate */
9947c478bdstevel@tonic-gatestatic int
9957c478bdstevel@tonic-gate__ram_writeback(dbc)
9967c478bdstevel@tonic-gate	DBC *dbc;
9977c478bdstevel@tonic-gate{
9987c478bdstevel@tonic-gate	DB *dbp;
9997c478bdstevel@tonic-gate	DBT key, data;
10007c478bdstevel@tonic-gate	RECNO *rp;
10017c478bdstevel@tonic-gate	db_recno_t keyno;
10027c478bdstevel@tonic-gate	ssize_t nw;
10037c478bdstevel@tonic-gate	int fd, ret, t_ret;
10047c478bdstevel@tonic-gate	u_int8_t delim, *pad;
10057c478bdstevel@tonic-gate
10067c478bdstevel@tonic-gate	dbp = dbc->dbp;
10077c478bdstevel@tonic-gate	rp = ((BTREE *)dbp->internal)->recno;
10087c478bdstevel@tonic-gate
10097c478bdstevel@tonic-gate	/* If the file wasn't modified, we're done. */
10107c478bdstevel@tonic-gate	if (!F_ISSET(rp, RECNO_MODIFIED))
10117c478bdstevel@tonic-gate		return (0);
10127c478bdstevel@tonic-gate
10137c478bdstevel@tonic-gate	/* If there's no backing source file, we're done. */
10147c478bdstevel@tonic-gate	if (rp->re_source == NULL) {
10157c478bdstevel@tonic-gate		F_CLR(rp, RECNO_MODIFIED);
10167c478bdstevel@tonic-gate		return (0);
10177c478bdstevel@tonic-gate	}
10187c478bdstevel@tonic-gate
10197c478bdstevel@tonic-gate	/*
10207c478bdstevel@tonic-gate	 * Read any remaining records into the tree.
10217c478bdstevel@tonic-gate	 *
10227c478bdstevel@tonic-gate	 * !!!
10237c478bdstevel@tonic-gate	 * This is why we can't support transactions when applications specify
10247c478bdstevel@tonic-gate	 * backing (re_source) files.  At this point we have to read in the
10257c478bdstevel@tonic-gate	 * rest of the records from the file so that we can write all of the
10267c478bdstevel@tonic-gate	 * records back out again, which could modify a page for which we'd
10277c478bdstevel@tonic-gate	 * have to log changes and which we don't have locked.  This could be
10287c478bdstevel@tonic-gate	 * partially fixed by taking a snapshot of the entire file during the
10297c478bdstevel@tonic-gate	 * db_open(), or, since db_open() isn't transaction protected, as part
10307c478bdstevel@tonic-gate	 * of the first DB operation.  But, if a checkpoint occurs then, the
10317c478bdstevel@tonic-gate	 * part of the log holding the copy of the file could be discarded, and
10327c478bdstevel@tonic-gate	 * that would make it impossible to recover in the face of disaster.
10337c478bdstevel@tonic-gate	 * This could all probably be fixed, but it would require transaction
10347c478bdstevel@tonic-gate	 * protecting the backing source file, i.e. mpool would have to know
10357c478bdstevel@tonic-gate	 * about it, and we don't want to go there.
10367c478bdstevel@tonic-gate	 */
10377c478bdstevel@tonic-gate	if ((ret =
10387c478bdstevel@tonic-gate	    __ram_update(dbc, DB_MAX_RECORDS, 0)) != 0 && ret != DB_NOTFOUND)
10397c478bdstevel@tonic-gate		return (ret);
10407c478bdstevel@tonic-gate
10417c478bdstevel@tonic-gate	/*
10427c478bdstevel@tonic-gate	 * !!!
10437c478bdstevel@tonic-gate	 * Close any underlying mmap region.  This is required for Windows NT
10447c478bdstevel@tonic-gate	 * (4.0, Service Pack 2) -- if the file is still mapped, the following
10457c478bdstevel@tonic-gate	 * open will fail.
10467c478bdstevel@tonic-gate	 */
10477c478bdstevel@tonic-gate	if (rp->re_smap != NULL) {
10487c478bdstevel@tonic-gate		(void)__db_unmapfile(rp->re_smap, rp->re_msize);
10497c478bdstevel@tonic-gate		rp->re_smap = NULL;
10507c478bdstevel@tonic-gate	}
10517c478bdstevel@tonic-gate
10527c478bdstevel@tonic-gate	/* Get rid of any backing file descriptor, just on GP's. */
10537c478bdstevel@tonic-gate	if (rp->re_fd != -1) {
10547c478bdstevel@tonic-gate		(void)__os_close(rp->re_fd);
10557c478bdstevel@tonic-gate		rp->re_fd = -1;
10567c478bdstevel@tonic-gate	}
10577c478bdstevel@tonic-gate
10587c478bdstevel@tonic-gate	/* Open the file, truncating it. */
10597c478bdstevel@tonic-gate	if ((ret = __db_open(rp->re_source,
10607c478bdstevel@tonic-gate	    DB_SEQUENTIAL | DB_TRUNCATE,
10617c478bdstevel@tonic-gate	    DB_SEQUENTIAL | DB_TRUNCATE, 0, &fd)) != 0) {
10627c478bdstevel@tonic-gate		__db_err(dbp->dbenv, "%s: %s", rp->re_source, strerror(ret));
10637c478bdstevel@tonic-gate		return (ret);
10647c478bdstevel@tonic-gate	}
10657c478bdstevel@tonic-gate
10667c478bdstevel@tonic-gate	/*
10677c478bdstevel@tonic-gate	 * We step through the records, writing each one out.  Use the record
10687c478bdstevel@tonic-gate	 * number and the dbp->get() function, instead of a cursor, so we find
10697c478bdstevel@tonic-gate	 * and write out "deleted" or non-existent records.
10707c478bdstevel@tonic-gate	 */
10717c478bdstevel@tonic-gate	memset(&key, 0, sizeof(key));
10727c478bdstevel@tonic-gate	memset(&data, 0, sizeof(data));
10737c478bdstevel@tonic-gate	key.size = sizeof(db_recno_t);
10747c478bdstevel@tonic-gate	key.data = &keyno;
10757c478bdstevel@tonic-gate
10767c478bdstevel@tonic-gate	/*
10777c478bdstevel@tonic-gate	 * We'll need the delimiter if we're doing variable-length records,
10787c478bdstevel@tonic-gate	 * and the pad character if we're doing fixed-length records.
10797c478bdstevel@tonic-gate	 */
10807c478bdstevel@tonic-gate	delim = rp->re_delim;
10817c478bdstevel@tonic-gate	if (F_ISSET(dbp, DB_RE_FIXEDLEN)) {
10827c478bdstevel@tonic-gate		if ((ret = __os_malloc(rp->re_len, NULL, &pad)) != 0)
10837c478bdstevel@tonic-gate			goto err;
10847c478bdstevel@tonic-gate		memset(pad, rp->re_pad, rp->re_len);
10857c478bdstevel@tonic-gate	} else
10867c478bdstevel@tonic-gate		COMPQUIET(pad, NULL);
10877c478bdstevel@tonic-gate	for (keyno = 1;; ++keyno) {
10887c478bdstevel@tonic-gate		switch (ret = dbp->get(dbp, NULL, &key, &data, 0)) {
10897c478bdstevel@tonic-gate		case 0:
10907c478bdstevel@tonic-gate			if ((ret =
10917c478bdstevel@tonic-gate			    __os_write(fd, data.data, data.size, &nw)) != 0)
10927c478bdstevel@tonic-gate				goto err;
10937c478bdstevel@tonic-gate			if (nw != (ssize_t)data.size) {
10947c478bdstevel@tonic-gate				ret = EIO;
10957c478bdstevel@tonic-gate				goto err;
10967c478bdstevel@tonic-gate			}
10977c478bdstevel@tonic-gate			break;
10987c478bdstevel@tonic-gate		case DB_KEYEMPTY:
10997c478bdstevel@tonic-gate			if (F_ISSET(dbp, DB_RE_FIXEDLEN)) {
11007c478bdstevel@tonic-gate				if ((ret =
11017c478bdstevel@tonic-gate				    __os_write(fd, pad, rp->re_len, &nw)) != 0)
11027c478bdstevel@tonic-gate					goto err;
11037c478bdstevel@tonic-gate				if (nw != (ssize_t)rp->re_len) {
11047c478bdstevel@tonic-gate					ret = EIO;
11057c478bdstevel@tonic-gate					goto err;
11067c478bdstevel@tonic-gate				}
11077c478bdstevel@tonic-gate			}
11087c478bdstevel@tonic-gate			break;
11097c478bdstevel@tonic-gate		case DB_NOTFOUND:
11107c478bdstevel@tonic-gate			ret = 0;
11117c478bdstevel@tonic-gate			goto done;
11127c478bdstevel@tonic-gate		}
11137c478bdstevel@tonic-gate		if (!F_ISSET(dbp, DB_RE_FIXEDLEN)) {
11147c478bdstevel@tonic-gate			if ((ret = __os_write(fd, &delim, 1, &nw)) != 0)
11157c478bdstevel@tonic-gate				goto err;
11167c478bdstevel@tonic-gate			if (nw != 1) {
11177c478bdstevel@tonic-gate				ret = EIO;
11187c478bdstevel@tonic-gate				goto err;
11197c478bdstevel@tonic-gate			}
11207c478bdstevel@tonic-gate		}
11217c478bdstevel@tonic-gate	}
11227c478bdstevel@tonic-gate
11237c478bdstevel@tonic-gateerr:
11247c478bdstevel@tonic-gatedone:	/* Close the file descriptor. */
11257c478bdstevel@tonic-gate	if ((t_ret = __os_close(fd)) != 0 || ret == 0)
11267c478bdstevel@tonic-gate		ret = t_ret;
11277c478bdstevel@tonic-gate
11287c478bdstevel@tonic-gate	if (ret == 0)
11297c478bdstevel@tonic-gate		F_CLR(rp, RECNO_MODIFIED);
11307c478bdstevel@tonic-gate	return (ret);
11317c478bdstevel@tonic-gate}
11327c478bdstevel@tonic-gate
11337c478bdstevel@tonic-gate/*
11347c478bdstevel@tonic-gate * __ram_fmap --
11357c478bdstevel@tonic-gate *	Get fixed length records from a file.
11367c478bdstevel@tonic-gate */
11377c478bdstevel@tonic-gatestatic int
11387c478bdstevel@tonic-gate__ram_fmap(dbc, top)
11397c478bdstevel@tonic-gate	DBC *dbc;
11407c478bdstevel@tonic-gate	db_recno_t top;
11417c478bdstevel@tonic-gate{
11427c478bdstevel@tonic-gate	DB *dbp;
11437c478bdstevel@tonic-gate	DBT data;
11447c478bdstevel@tonic-gate	RECNO *rp;
11457c478bdstevel@tonic-gate	db_recno_t recno;
11467c478bdstevel@tonic-gate	u_int32_t len;
11477c478bdstevel@tonic-gate	u_int8_t *sp, *ep, *p;
11487c478bdstevel@tonic-gate	int ret;
11497c478bdstevel@tonic-gate
11507c478bdstevel@tonic-gate	if ((ret = __bam_nrecs(dbc, &recno)) != 0)
11517c478bdstevel@tonic-gate		return (ret);
11527c478bdstevel@tonic-gate
11537c478bdstevel@tonic-gate	dbp = dbc->dbp;
11547c478bdstevel@tonic-gate	rp = ((BTREE *)(dbp->internal))->recno;
11557c478bdstevel@tonic-gate
11567c478bdstevel@tonic-gate	if (dbc->rdata.ulen < rp->re_len) {
11577c478bdstevel@tonic-gate		if ((ret = __os_realloc(&dbc->rdata.data, rp->re_len)) != 0) {
11587c478bdstevel@tonic-gate			dbc->rdata.ulen = 0;
11597c478bdstevel@tonic-gate			dbc->rdata.data = NULL;
11607c478bdstevel@tonic-gate			return (ret);
11617c478bdstevel@tonic-gate		}
11627c478bdstevel@tonic-gate		dbc->rdata.ulen = rp->re_len;
11637c478bdstevel@tonic-gate	}
11647c478bdstevel@tonic-gate
11657c478bdstevel@tonic-gate	memset(&data, 0, sizeof(data));
11667c478bdstevel@tonic-gate	data.data = dbc->rdata.data;
11677c478bdstevel@tonic-gate	data.size = rp->re_len;
11687c478bdstevel@tonic-gate
11697c478bdstevel@tonic-gate	sp = (u_int8_t *)rp->re_cmap;
11707c478bdstevel@tonic-gate	ep = (u_int8_t *)rp->re_emap;
11717c478bdstevel@tonic-gate	while (recno < top) {
11727c478bdstevel@tonic-gate		if (sp >= ep) {
11737c478bdstevel@tonic-gate			F_SET(rp, RECNO_EOF);
11747c478bdstevel@tonic-gate			return (DB_NOTFOUND);
11757c478bdstevel@tonic-gate		}
11767c478bdstevel@tonic-gate		len = rp->re_len;
11777c478bdstevel@tonic-gate		for (p = dbc->rdata.data;
11787c478bdstevel@tonic-gate		    sp < ep && len > 0; *p++ = *sp++, --len)
11797c478bdstevel@tonic-gate			;
11807c478bdstevel@tonic-gate
11817c478bdstevel@tonic-gate		/*
11827c478bdstevel@tonic-gate		 * Another process may have read this record from the input
11837c478bdstevel@tonic-gate		 * file and stored it into the database already, in which
11847c478bdstevel@tonic-gate		 * case we don't need to repeat that operation.  We detect
11857c478bdstevel@tonic-gate		 * this by checking if the last record we've read is greater
11867c478bdstevel@tonic-gate		 * or equal to the number of records in the database.
11877c478bdstevel@tonic-gate		 *
11887c478bdstevel@tonic-gate		 * XXX
11897c478bdstevel@tonic-gate		 * We should just do a seek, since the records are fixed
11907c478bdstevel@tonic-gate		 * length.
11917c478bdstevel@tonic-gate		 */
11927c478bdstevel@tonic-gate		if (rp->re_last >= recno) {
11937c478bdstevel@tonic-gate			if (len != 0)
11947c478bdstevel@tonic-gate				memset(p, rp->re_pad, len);
11957c478bdstevel@tonic-gate
11967c478bdstevel@tonic-gate			++recno;
11977c478bdstevel@tonic-gate			if ((ret = __ram_add(dbc, &recno, &data, 0, 0)) != 0)
11987c478bdstevel@tonic-gate				return (ret);
11997c478bdstevel@tonic-gate		}
12007c478bdstevel@tonic-gate		++rp->re_last;
12017c478bdstevel@tonic-gate	}
12027c478bdstevel@tonic-gate	rp->re_cmap = sp;
12037c478bdstevel@tonic-gate	return (0);
12047c478bdstevel@tonic-gate}
12057c478bdstevel@tonic-gate
12067c478bdstevel@tonic-gate/*
12077c478bdstevel@tonic-gate * __ram_vmap --
12087c478bdstevel@tonic-gate *	Get variable length records from a file.
12097c478bdstevel@tonic-gate */
12107c478bdstevel@tonic-gatestatic int
12117c478bdstevel@tonic-gate__ram_vmap(dbc, top)
12127c478bdstevel@tonic-gate	DBC *dbc;
12137c478bdstevel@tonic-gate	db_recno_t top;
12147c478bdstevel@tonic-gate{
12157c478bdstevel@tonic-gate	DBT data;
12167c478bdstevel@tonic-gate	RECNO *rp;
12177c478bdstevel@tonic-gate	db_recno_t recno;
12187c478bdstevel@tonic-gate	u_int8_t *sp, *ep;
12197c478bdstevel@tonic-gate	int delim, ret;
12207c478bdstevel@tonic-gate
12217c478bdstevel@tonic-gate	rp = ((BTREE *)(dbc->dbp->internal))->recno;
12227c478bdstevel@tonic-gate
12237c478bdstevel@tonic-gate	if ((ret = __bam_nrecs(dbc, &recno)) != 0)
12247c478bdstevel@tonic-gate		return (ret);
12257c478bdstevel@tonic-gate
12267c478bdstevel@tonic-gate	memset(&data, 0, sizeof(data));
12277c478bdstevel@tonic-gate
12287c478bdstevel@tonic-gate	delim = rp->re_delim;
12297c478bdstevel@tonic-gate
12307c478bdstevel@tonic-gate	sp = (u_int8_t *)rp->re_cmap;
12317c478bdstevel@tonic-gate	ep = (u_int8_t *)rp->re_emap;
12327c478bdstevel@tonic-gate	while (recno < top) {
12337c478bdstevel@tonic-gate		if (sp >= ep) {
12347c478bdstevel@tonic-gate			F_SET(rp, RECNO_EOF);
12357c478bdstevel@tonic-gate			return (DB_NOTFOUND);
12367c478bdstevel@tonic-gate		}
12377c478bdstevel@tonic-gate		for (data.data = sp; sp < ep && *sp != delim; ++sp)
12387c478bdstevel@tonic-gate			;
12397c478bdstevel@tonic-gate
12407c478bdstevel@tonic-gate		/*
12417c478bdstevel@tonic-gate		 * Another process may have read this record from the input
12427c478bdstevel@tonic-gate		 * file and stored it into the database already, in which
12437c478bdstevel@tonic-gate		 * case we don't need to repeat that operation.  We detect
12447c478bdstevel@tonic-gate		 * this by checking if the last record we've read is greater
12457c478bdstevel@tonic-gate		 * or equal to the number of records in the database.
12467c478bdstevel@tonic-gate		 */
12477c478bdstevel@tonic-gate		if (rp->re_last >= recno) {
12487c478bdstevel@tonic-gate			data.size = sp - (u_int8_t *)data.data;
12497c478bdstevel@tonic-gate			++recno;
12507c478bdstevel@tonic-gate			if ((ret = __ram_add(dbc, &recno, &data, 0, 0)) != 0)
12517c478bdstevel@tonic-gate				return (ret);
12527c478bdstevel@tonic-gate		}
12537c478bdstevel@tonic-gate		++rp->re_last;
12547c478bdstevel@tonic-gate		++sp;
12557c478bdstevel@tonic-gate	}
12567c478bdstevel@tonic-gate	rp->re_cmap = sp;
12577c478bdstevel@tonic-gate	return (0);
12587c478bdstevel@tonic-gate}
12597c478bdstevel@tonic-gate
12607c478bdstevel@tonic-gate/*
12617c478bdstevel@tonic-gate * __ram_add --
12627c478bdstevel@tonic-gate *	Add records into the tree.
12637c478bdstevel@tonic-gate */
12647c478bdstevel@tonic-gatestatic int
12657c478bdstevel@tonic-gate__ram_add(dbc, recnop, data, flags, bi_flags)
12667c478bdstevel@tonic-gate	DBC *dbc;
12677c478bdstevel@tonic-gate	db_recno_t *recnop;
12687c478bdstevel@tonic-gate	DBT *data;
12697c478bdstevel@tonic-gate	u_int32_t flags, bi_flags;
12707c478bdstevel@tonic-gate{
12717c478bdstevel@tonic-gate	BKEYDATA *bk;
12727c478bdstevel@tonic-gate	CURSOR *cp;
12737c478bdstevel@tonic-gate	DB *dbp;
12747c478bdstevel@tonic-gate	PAGE *h;
12757c478bdstevel@tonic-gate	db_indx_t indx;
12767c478bdstevel@tonic-gate	int exact, isdeleted, ret, stack;
12777c478bdstevel@tonic-gate
12787c478bdstevel@tonic-gate	dbp = dbc->dbp;
12797c478bdstevel@tonic-gate	cp = dbc->internal;
12807c478bdstevel@tonic-gate
12817c478bdstevel@tonic-gateretry:	/* Find the slot for insertion. */
12827c478bdstevel@tonic-gate	if ((ret = __bam_rsearch(dbc, recnop,
12837c478bdstevel@tonic-gate	    S_INSERT | (flags == DB_APPEND ? S_APPEND : 0), 1, &exact)) != 0)
12847c478bdstevel@tonic-gate		return (ret);
12857c478bdstevel@tonic-gate	h = cp->csp->page;
12867c478bdstevel@tonic-gate	indx = cp->csp->indx;
12877c478bdstevel@tonic-gate	stack = 1;
12887c478bdstevel@tonic-gate
12897c478bdstevel@tonic-gate	/*
12907c478bdstevel@tonic-gate	 * If re-numbering records, the on-page deleted flag means this record
12917c478bdstevel@tonic-gate	 * was implicitly created.  If not re-numbering records, the on-page
12927c478bdstevel@tonic-gate	 * deleted flag means this record was implicitly created, or, it was
12937c478bdstevel@tonic-gate	 * deleted at some time.
12947c478bdstevel@tonic-gate	 *
12957c478bdstevel@tonic-gate	 * If DB_NOOVERWRITE is set and the item already exists in the tree,
12967c478bdstevel@tonic-gate	 * return an error unless the item was either marked for deletion or
12977c478bdstevel@tonic-gate	 * only implicitly created.
12987c478bdstevel@tonic-gate	 */
12997c478bdstevel@tonic-gate	isdeleted = 0;
13007c478bdstevel@tonic-gate	if (exact) {
13017c478bdstevel@tonic-gate		bk = GET_BKEYDATA(h, indx);
13027c478bdstevel@tonic-gate		if (B_DISSET(bk->type))
13037c478bdstevel@tonic-gate			isdeleted = 1;
13047c478bdstevel@tonic-gate		else
13057c478bdstevel@tonic-gate			if (flags == DB_NOOVERWRITE) {
13067c478bdstevel@tonic-gate				ret = DB_KEYEXIST;
13077c478bdstevel@tonic-gate				goto err;
13087c478bdstevel@tonic-gate			}
13097c478bdstevel@tonic-gate	}
13107c478bdstevel@tonic-gate
13117c478bdstevel@tonic-gate	/*
13127c478bdstevel@tonic-gate	 * Select the arguments for __bam_iitem() and do the insert.  If the
13137c478bdstevel@tonic-gate	 * key is an exact match, or we're replacing the data item with a
13147c478bdstevel@tonic-gate	 * new data item, replace the current item.  If the key isn't an exact
13157c478bdstevel@tonic-gate	 * match, we're inserting a new key/data pair, before the search
13167c478bdstevel@tonic-gate	 * location.
13177c478bdstevel@tonic-gate	 */
13187c478bdstevel@tonic-gate	switch (ret = __bam_iitem(dbc,
13197c478bdstevel@tonic-gate	    &h, &indx, NULL, data, exact ? DB_CURRENT : DB_BEFORE, bi_flags)) {
13207c478bdstevel@tonic-gate	case 0:
13217c478bdstevel@tonic-gate		/*
13227c478bdstevel@tonic-gate		 * Don't adjust anything.
13237c478bdstevel@tonic-gate		 *
13247c478bdstevel@tonic-gate		 * If we inserted a record, no cursors need adjusting because
13257c478bdstevel@tonic-gate		 * the only new record it's possible to insert is at the very
13267c478bdstevel@tonic-gate		 * end of the tree.  The necessary adjustments to the internal
13277c478bdstevel@tonic-gate		 * page counts were made by __bam_iitem().
13287c478bdstevel@tonic-gate		 *
13297c478bdstevel@tonic-gate		 * If we overwrote a record, no cursors need adjusting because
13307c478bdstevel@tonic-gate		 * future DBcursor->get calls will simply return the underlying
13317c478bdstevel@tonic-gate		 * record (there's no adjustment made for the DB_CURRENT flag
13327c478bdstevel@tonic-gate		 * when a cursor get operation immediately follows a cursor
13337c478bdstevel@tonic-gate		 * delete operation, and the normal adjustment for the DB_NEXT
13347c478bdstevel@tonic-gate		 * flag is still correct).
13357c478bdstevel@tonic-gate		 */
13367c478bdstevel@tonic-gate		break;
13377c478bdstevel@tonic-gate	case DB_NEEDSPLIT:
13387c478bdstevel@tonic-gate		/* Discard the stack of pages and split the page. */
13397c478bdstevel@tonic-gate		(void)__bam_stkrel(dbc, 0);
13407c478bdstevel@tonic-gate		stack = 0;
13417c478bdstevel@tonic-gate
13427c478bdstevel@tonic-gate		if ((ret = __bam_split(dbc, recnop)) != 0)
13437c478bdstevel@tonic-gate			goto err;
13447c478bdstevel@tonic-gate
13457c478bdstevel@tonic-gate		goto retry;
13467c478bdstevel@tonic-gate		/* NOTREACHED */
13477c478bdstevel@tonic-gate	default:
13487c478bdstevel@tonic-gate		goto err;
13497c478bdstevel@tonic-gate	}
13507c478bdstevel@tonic-gate
13517c478bdstevel@tonic-gate
13527c478bdstevel@tonic-gateerr:	if (stack)
13537c478bdstevel@tonic-gate		__bam_stkrel(dbc, 0);
13547c478bdstevel@tonic-gate
13557c478bdstevel@tonic-gate	return (ret);
13567c478bdstevel@tonic-gate}
1357