1/*-
2 * See the file LICENSE for redistribution information.
3 *
4 * Copyright (c) 1997, 1998
5 *	Sleepycat Software.  All rights reserved.
6 */
7
8#include "config.h"
9
10#ifndef lint
11static const char sccsid[] = "@(#)bt_recno.c	10.53 (Sleepycat) 12/11/98";
12#endif /* not lint */
13
14#ifndef NO_SYSTEM_INCLUDES
15#include <sys/types.h>
16
17#include <errno.h>
18#include <limits.h>
19#include <string.h>
20#endif
21
22#include "db_int.h"
23#include "db_page.h"
24#include "btree.h"
25#include "db_ext.h"
26#include "shqueue.h"
27#include "db_shash.h"
28#include "lock.h"
29#include "lock_ext.h"
30
31static int __ram_add __P((DBC *, db_recno_t *, DBT *, u_int32_t, u_int32_t));
32static int __ram_delete __P((DB *, DB_TXN *, DBT *, u_int32_t));
33static int __ram_fmap __P((DBC *, db_recno_t));
34static int __ram_i_delete __P((DBC *));
35static int __ram_put __P((DB *, DB_TXN *, DBT *, DBT *, u_int32_t));
36static int __ram_source __P((DB *, RECNO *, const char *));
37static int __ram_sync __P((DB *, u_int32_t));
38static int __ram_update __P((DBC *, db_recno_t, int));
39static int __ram_vmap __P((DBC *, db_recno_t));
40static int __ram_writeback __P((DBC *));
41
42/*
43 * In recno, there are two meanings to the on-page "deleted" flag.  If we're
44 * re-numbering records, it means the record was implicitly created.  We skip
45 * over implicitly created records if doing a cursor "next" or "prev", and
 * return DB_KEYEMPTY if they're explicitly requested.  If not re-numbering
47 * records, it means that the record was implicitly created, or was deleted.
48 * We skip over implicitly created or deleted records if doing a cursor "next"
49 * or "prev", and return DB_KEYEMPTY if they're explicitly requested.
50 *
51 * If we're re-numbering records, then we have to detect in the cursor that
52 * a record was deleted, and adjust the cursor as necessary on the next get.
53 * If we're not re-numbering records, then we can detect that a record has
54 * been deleted by looking at the actual on-page record, so we completely
 * ignore the cursor's delete flag.  This is different from the B+tree code,
 * which also maintains whether the cursor references a deleted record in the
 * cursor, but doesn't always check the on-page value.
58 */
/*
 * Cursor "deleted" flag helpers.
 *
 * The C_DELETED cursor flag is only consulted and modified when the database
 * has DB_RE_RENUMBER set; otherwise CD_SET and CD_CLR do nothing and
 * CD_ISSET evaluates to false.
 *
 * CD_SET and CD_CLR are wrapped in do/while(0) so that each expands to
 * exactly one statement and can be used as the unbraced body of an if/else.
 */
#define	CD_SET(dbp, cp) do {						\
	if (F_ISSET(dbp, DB_RE_RENUMBER))				\
		F_SET(cp, C_DELETED);					\
} while (0)
#define	CD_CLR(dbp, cp) do {						\
	if (F_ISSET(dbp, DB_RE_RENUMBER))				\
		F_CLR(cp, C_DELETED);					\
} while (0)
#define	CD_ISSET(dbp, cp)						\
	(F_ISSET(dbp, DB_RE_RENUMBER) && F_ISSET(cp, C_DELETED))
69
70/*
71 * __ram_open --
72 *	Recno open function.
73 *
74 * PUBLIC: int __ram_open __P((DB *, DB_INFO *));
75 */
76int
77__ram_open(dbp, dbinfo)
78	DB *dbp;
79	DB_INFO *dbinfo;
80{
81	BTREE *t;
82	DBC *dbc;
83	RECNO *rp;
84	int ret, t_ret;
85
86	/* Allocate and initialize the private btree structure. */
87	if ((ret = __os_calloc(1, sizeof(BTREE), &t)) != 0)
88		return (ret);
89	dbp->internal = t;
90	__bam_setovflsize(dbp);
91
92	/* Allocate and initialize the private recno structure. */
93	if ((ret = __os_calloc(1, sizeof(*rp), &rp)) != 0)
94		return (ret);
95	/* Link in the private recno structure. */
96	t->recno = rp;
97
98	/*
99	 * Intention is to make sure all of the user's selections are okay
100	 * here and then use them without checking.
101	 */
102	if (dbinfo == NULL) {
103		rp->re_delim = '\n';
104		rp->re_pad = ' ';
105		rp->re_fd = -1;
106		F_SET(rp, RECNO_EOF);
107	} else {
108		/*
109		 * If the user specified a source tree, open it and map it in.
110		 *
111		 * !!!
112		 * We don't complain if the user specified transactions or
113		 * threads.  It's possible to make it work, but you'd better
114		 * know what you're doing!
115		 */
116		if (dbinfo->re_source == NULL) {
117			rp->re_fd = -1;
118			F_SET(rp, RECNO_EOF);
119		} else {
120			if ((ret =
121			    __ram_source(dbp, rp, dbinfo->re_source)) != 0)
122			goto err;
123		}
124
125		/* Copy delimiter, length and padding values. */
126		rp->re_delim =
127		    F_ISSET(dbp, DB_RE_DELIMITER) ? dbinfo->re_delim : '\n';
128		rp->re_pad = F_ISSET(dbp, DB_RE_PAD) ? dbinfo->re_pad : ' ';
129
130		if (F_ISSET(dbp, DB_RE_FIXEDLEN)) {
131			if ((rp->re_len = dbinfo->re_len) == 0) {
132				__db_err(dbp->dbenv,
133				    "record length must be greater than 0");
134				ret = EINVAL;
135				goto err;
136			}
137		} else
138			rp->re_len = 0;
139	}
140
141	/* Initialize the remaining fields/methods of the DB. */
142	dbp->am_close = __ram_close;
143	dbp->del = __ram_delete;
144	dbp->put = __ram_put;
145	dbp->stat = __bam_stat;
146	dbp->sync = __ram_sync;
147
148	/* Start up the tree. */
149	if ((ret = __bam_read_root(dbp)) != 0)
150		goto err;
151
152	/* Set the overflow page size. */
153	__bam_setovflsize(dbp);
154
155	/* If we're snapshotting an underlying source file, do it now. */
156	if (dbinfo != NULL && F_ISSET(dbinfo, DB_SNAPSHOT)) {
157		/* Allocate a cursor. */
158		if ((ret = dbp->cursor(dbp, NULL, &dbc, 0)) != 0)
159			goto err;
160
161		/* Do the snapshot. */
162		if ((ret = __ram_update(dbc,
163		    DB_MAX_RECORDS, 0)) != 0 && ret == DB_NOTFOUND)
164			ret = 0;
165
166		/* Discard the cursor. */
167		if ((t_ret = dbc->c_close(dbc)) != 0 && ret == 0)
168			ret = t_ret;
169
170		if (ret != 0)
171			goto err;
172	}
173
174	return (0);
175
176err:	/* If we mmap'd a source file, discard it. */
177	if (rp->re_smap != NULL)
178		(void)__db_unmapfile(rp->re_smap, rp->re_msize);
179
180	/* If we opened a source file, discard it. */
181	if (rp->re_fd != -1)
182		(void)__os_close(rp->re_fd);
183	if (rp->re_source != NULL)
184		__os_freestr(rp->re_source);
185
186	__os_free(rp, sizeof(*rp));
187
188	return (ret);
189}
190
191/*
192 * __ram_delete --
193 *	Recno db->del function.
194 */
195static int
196__ram_delete(dbp, txn, key, flags)
197	DB *dbp;
198	DB_TXN *txn;
199	DBT *key;
200	u_int32_t flags;
201{
202	CURSOR *cp;
203	DBC *dbc;
204	db_recno_t recno;
205	int ret, t_ret;
206
207	DB_PANIC_CHECK(dbp);
208
209	/* Check for invalid flags. */
210	if ((ret = __db_delchk(dbp,
211	    key, flags, F_ISSET(dbp, DB_AM_RDONLY))) != 0)
212		return (ret);
213
214	/* Acquire a cursor. */
215	if ((ret = dbp->cursor(dbp, txn, &dbc, DB_WRITELOCK)) != 0)
216		return (ret);
217
218	DEBUG_LWRITE(dbc, txn, "ram_delete", key, NULL, flags);
219
220	/* Check the user's record number and fill in as necessary. */
221	if ((ret = __ram_getno(dbc, key, &recno, 0)) != 0)
222		goto err;
223
224	/* Do the delete. */
225	cp = dbc->internal;
226	cp->recno = recno;
227	ret = __ram_i_delete(dbc);
228
229	/* Release the cursor. */
230err:	if ((t_ret = dbc->c_close(dbc)) != 0 && ret == 0)
231		ret = t_ret;
232
233	return (ret);
234}
235
236/*
237 * __ram_i_delete --
238 *	Internal version of recno delete, called by __ram_delete and
239 *	__ram_c_del.
240 */
241static int
242__ram_i_delete(dbc)
243	DBC *dbc;
244{
245	BKEYDATA bk;
246	BTREE *t;
247	CURSOR *cp;
248	DB *dbp;
249	DBT hdr, data;
250	PAGE *h;
251	db_indx_t indx;
252	int exact, ret, stack;
253
254	dbp = dbc->dbp;
255	cp = dbc->internal;
256	t = dbp->internal;
257	stack = 0;
258
259	/*
260	 * If this is CDB and this isn't a write cursor, then it's an error.
261	 * If it is a write cursor, but we don't yet hold the write lock, then
262	 * we need to upgrade to the write lock.
263	 */
264	if (F_ISSET(dbp, DB_AM_CDB)) {
265		/* Make sure it's a valid update cursor. */
266		if (!F_ISSET(dbc, DBC_RMW | DBC_WRITER))
267			return (EINVAL);
268
269		if (F_ISSET(dbc, DBC_RMW) &&
270		    (ret = lock_get(dbp->dbenv->lk_info, dbc->locker,
271		    DB_LOCK_UPGRADE, &dbc->lock_dbt, DB_LOCK_WRITE,
272		    &dbc->mylock)) != 0)
273			return (EAGAIN);
274	}
275
276	/* Search the tree for the key; delete only deletes exact matches. */
277	if ((ret = __bam_rsearch(dbc, &cp->recno, S_DELETE, 1, &exact)) != 0)
278		goto err;
279	if (!exact) {
280		ret = DB_NOTFOUND;
281		goto err;
282	}
283	stack = 1;
284
285	h = cp->csp->page;
286	indx = cp->csp->indx;
287
288	/*
289	 * If re-numbering records, the on-page deleted flag can only mean
290	 * that this record was implicitly created.  Applications aren't
291	 * permitted to delete records they never created, return an error.
292	 *
293	 * If not re-numbering records, the on-page deleted flag means that
294	 * this record was implicitly created, or, was deleted at some time.
295	 * The former is an error because applications aren't permitted to
296	 * delete records they never created, the latter is an error because
297	 * if the record was "deleted", we could never have found it.
298	 */
299	if (B_DISSET(GET_BKEYDATA(h, indx)->type)) {
300		ret = DB_KEYEMPTY;
301		goto err;
302	}
303
304	if (F_ISSET(dbp, DB_RE_RENUMBER)) {
305		/* Delete the item, adjust the counts, adjust the cursors. */
306		if ((ret = __bam_ditem(dbc, h, indx)) != 0)
307			goto err;
308		__bam_adjust(dbc, -1);
309		__ram_ca(dbp, cp->recno, CA_DELETE);
310
311		/*
312		 * If the page is empty, delete it.   The whole tree is locked
313		 * so there are no preparations to make.
314		 */
315		if (NUM_ENT(h) == 0 && h->pgno != PGNO_ROOT) {
316			stack = 0;
317			ret = __bam_dpages(dbc);
318		}
319	} else {
320		/* Use a delete/put pair to replace the record with a marker. */
321		if ((ret = __bam_ditem(dbc, h, indx)) != 0)
322			goto err;
323
324		B_TSET(bk.type, B_KEYDATA, 1);
325		bk.len = 0;
326		memset(&hdr, 0, sizeof(hdr));
327		hdr.data = &bk;
328		hdr.size = SSZA(BKEYDATA, data);
329		memset(&data, 0, sizeof(data));
330		data.data = (char *)"";
331		data.size = 0;
332		if ((ret = __db_pitem(dbc,
333		    h, indx, BKEYDATA_SIZE(0), &hdr, &data)) != 0)
334			goto err;
335	}
336	F_SET(t->recno, RECNO_MODIFIED);
337
338err:	if (stack)
339		__bam_stkrel(dbc, 0);
340
341	/* If we upgraded the CDB lock upon entry; downgrade it now. */
342	if (F_ISSET(dbp, DB_AM_CDB) && F_ISSET(dbc, DBC_RMW))
343		(void)__lock_downgrade(dbp->dbenv->lk_info, dbc->mylock,
344		    DB_LOCK_IWRITE, 0);
345	return (ret);
346}
347
348/*
349 * __ram_put --
350 *	Recno db->put function.
351 */
352static int
353__ram_put(dbp, txn, key, data, flags)
354	DB *dbp;
355	DB_TXN *txn;
356	DBT *key, *data;
357	u_int32_t flags;
358{
359	DBC *dbc;
360	db_recno_t recno;
361	int ret, t_ret;
362
363	DB_PANIC_CHECK(dbp);
364
365	/* Check for invalid flags. */
366	if ((ret = __db_putchk(dbp,
367	    key, data, flags, F_ISSET(dbp, DB_AM_RDONLY), 0)) != 0)
368		return (ret);
369
370	/* Allocate a cursor. */
371	if ((ret = dbp->cursor(dbp, txn, &dbc, DB_WRITELOCK)) != 0)
372		return (ret);
373
374	DEBUG_LWRITE(dbc, txn, "ram_put", key, data, flags);
375
376	/*
377	 * If we're appending to the tree, make sure we've read in all of
378	 * the backing source file.  Otherwise, check the user's record
379	 * number and fill in as necessary.
380	 */
381	ret = flags == DB_APPEND ?
382	    __ram_update(dbc, DB_MAX_RECORDS, 0) :
383	    __ram_getno(dbc, key, &recno, 1);
384
385	/* Add the record. */
386	if (ret == 0)
387		ret = __ram_add(dbc, &recno, data, flags, 0);
388
389	/* Discard the cursor. */
390	if ((t_ret = dbc->c_close(dbc)) != 0 && ret == 0)
391		ret = t_ret;
392
393	/* Return the record number if we're appending to the tree. */
394	if (ret == 0 && flags == DB_APPEND)
395		*(db_recno_t *)key->data = recno;
396
397	return (ret);
398}
399
400/*
401 * __ram_sync --
402 *	Recno db->sync function.
403 */
404static int
405__ram_sync(dbp, flags)
406	DB *dbp;
407	u_int32_t flags;
408{
409	DBC *dbc;
410	int ret, t_ret;
411
412	/*
413	 * Sync the underlying btree.
414	 *
415	 * !!!
416	 * We don't need to do a panic check or flags check, the "real"
417	 * sync function does all that for us.
418	 */
419	if ((ret = __db_sync(dbp, flags)) != 0)
420		return (ret);
421
422	/* Allocate a cursor. */
423	if ((ret = dbp->cursor(dbp, NULL, &dbc, 0)) != 0)
424		return (ret);
425
426	DEBUG_LWRITE(dbc, NULL, "ram_sync", NULL, NULL, flags);
427
428	/* Copy back the backing source file. */
429	ret = __ram_writeback(dbc);
430
431	/* Discard the cursor. */
432	if ((t_ret = dbc->c_close(dbc)) != 0 && ret == 0)
433		ret = t_ret;
434
435	return (ret);
436}
437
438/*
439 * __ram_close --
440 *	Recno db->close function.
441 *
442 * PUBLIC: int __ram_close __P((DB *));
443 */
444int
445__ram_close(dbp)
446	DB *dbp;
447{
448	RECNO *rp;
449
450	rp = ((BTREE *)dbp->internal)->recno;
451
452	/* Close any underlying mmap region. */
453	if (rp->re_smap != NULL)
454		(void)__db_unmapfile(rp->re_smap, rp->re_msize);
455
456	/* Close any backing source file descriptor. */
457	if (rp->re_fd != -1)
458		(void)__os_close(rp->re_fd);
459
460	/* Free any backing source file name. */
461	if (rp->re_source != NULL)
462		__os_freestr(rp->re_source);
463
464	/* Free allocated memory. */
465	__os_free(rp, sizeof(RECNO));
466	((BTREE *)dbp->internal)->recno = NULL;
467
468	/* Close the underlying btree. */
469	return (__bam_close(dbp));
470}
471
472/*
473 * __ram_c_del --
474 *	Recno cursor->c_del function.
475 *
476 * PUBLIC: int __ram_c_del __P((DBC *, u_int32_t));
477 */
478int
479__ram_c_del(dbc, flags)
480	DBC *dbc;
481	u_int32_t flags;
482{
483	CURSOR *cp;
484	DB *dbp;
485	int ret;
486
487	dbp = dbc->dbp;
488	cp = dbc->internal;
489
490	DB_PANIC_CHECK(dbp);
491
492	/* Check for invalid flags. */
493	if ((ret = __db_cdelchk(dbp, flags,
494	    F_ISSET(dbp, DB_AM_RDONLY), cp->recno != RECNO_OOB)) != 0)
495		return (ret);
496
497	DEBUG_LWRITE(dbc, dbc->txn, "ram_c_del", NULL, NULL, flags);
498
499	/*
500	 * If we are running CDB, this had better be either a write
501	 * cursor or an immediate writer.
502	 */
503	if (F_ISSET(dbp, DB_AM_CDB))
504		if (!F_ISSET(dbc, DBC_RMW | DBC_WRITER))
505			return (EINVAL);
506
507	/*
508	 * The semantics of cursors during delete are as follows: if record
509	 * numbers are mutable (DB_RE_RENUMBER is set), deleting a record
510	 * causes the cursor to automatically point to the record immediately
511	 * following.  In this case it is possible to use a single cursor for
512	 * repeated delete operations, without intervening operations.
513	 *
514	 * If record numbers are not mutable, then records are replaced with
515	 * a marker containing a delete flag.  If the record referenced by
516	 * this cursor has already been deleted, we will detect that as part
517	 * of the delete operation, and fail.
518	 */
519	return (__ram_i_delete(dbc));
520}
521
522/*
523 * __ram_c_get --
524 *	Recno cursor->c_get function.
525 *
526 * PUBLIC: int __ram_c_get __P((DBC *, DBT *, DBT *, u_int32_t));
527 */
528int
529__ram_c_get(dbc, key, data, flags)
530	DBC *dbc;
531	DBT *key, *data;
532	u_int32_t flags;
533{
534	CURSOR *cp, copy;
535	DB *dbp;
536	PAGE *h;
537	db_indx_t indx;
538	int exact, ret, stack, tmp_rmw;
539
540	dbp = dbc->dbp;
541	cp = dbc->internal;
542
543	DB_PANIC_CHECK(dbp);
544
545	/* Check for invalid flags. */
546	if ((ret = __db_cgetchk(dbc->dbp,
547	    key, data, flags, cp->recno != RECNO_OOB)) != 0)
548		return (ret);
549
550	/* Clear OR'd in additional bits so we can check for flag equality. */
551	tmp_rmw = 0;
552	if (LF_ISSET(DB_RMW)) {
553		if (!F_ISSET(dbp, DB_AM_CDB)) {
554			tmp_rmw = 1;
555			F_SET(dbc, DBC_RMW);
556		}
557		LF_CLR(DB_RMW);
558	}
559
560	DEBUG_LREAD(dbc, dbc->txn, "ram_c_get",
561	    flags == DB_SET || flags == DB_SET_RANGE ? key : NULL, NULL, flags);
562
563	/* Initialize the cursor for a new retrieval. */
564	copy = *cp;
565
566retry:	/* Update the record number. */
567	stack = 0;
568	switch (flags) {
569	case DB_CURRENT:
570		/*
571		 * If record numbers are mutable: if we just deleted a record,
572		 * there is no action necessary, we return the record following
573		 * the deleted item by virtue of renumbering the tree.
574		 */
575		break;
576	case DB_NEXT:
577		/*
578		 * If record numbers are mutable: if we just deleted a record,
579		 * we have to avoid incrementing the record number so that we
580		 * return the right record by virtue of renumbering the tree.
581		 */
582		if (CD_ISSET(dbp, cp))
583			break;
584
585		if (cp->recno != RECNO_OOB) {
586			++cp->recno;
587			break;
588		}
589		/* FALLTHROUGH */
590	case DB_FIRST:
591		flags = DB_NEXT;
592		cp->recno = 1;
593		break;
594	case DB_PREV:
595		if (cp->recno != RECNO_OOB) {
596			if (cp->recno == 1) {
597				ret = DB_NOTFOUND;
598				goto err;
599			}
600			--cp->recno;
601			break;
602		}
603		/* FALLTHROUGH */
604	case DB_LAST:
605		flags = DB_PREV;
606		if (((ret = __ram_update(dbc,
607		    DB_MAX_RECORDS, 0)) != 0) && ret != DB_NOTFOUND)
608			goto err;
609		if ((ret = __bam_nrecs(dbc, &cp->recno)) != 0)
610			goto err;
611		if (cp->recno == 0) {
612			ret = DB_NOTFOUND;
613			goto err;
614		}
615		break;
616	case DB_SET:
617	case DB_SET_RANGE:
618		if ((ret = __ram_getno(dbc, key, &cp->recno, 0)) != 0)
619			goto err;
620		break;
621	}
622
623	/* Return the key if the user didn't give us one. */
624	if (flags != DB_SET && flags != DB_SET_RANGE &&
625	    (ret = __db_retcopy(key, &cp->recno, sizeof(cp->recno),
626	    &dbc->rkey.data, &dbc->rkey.ulen, dbp->db_malloc)) != 0)
627		goto err;
628
629	/* Search the tree for the record. */
630	if ((ret = __bam_rsearch(dbc, &cp->recno,
631	    F_ISSET(dbc, DBC_RMW) ? S_FIND_WR : S_FIND, 1, &exact)) != 0)
632		goto err;
633	stack = 1;
634	if (!exact) {
635		ret = DB_NOTFOUND;
636		goto err;
637	}
638	h = cp->csp->page;
639	indx = cp->csp->indx;
640
641	/*
642	 * If re-numbering records, the on-page deleted flag means this record
643	 * was implicitly created.  If not re-numbering records, the on-page
644	 * deleted flag means this record was implicitly created, or, it was
645	 * deleted at some time.  Regardless, we skip such records if doing
646	 * cursor next/prev operations, and fail if the application requested
647	 * them explicitly.
648	 */
649	if (B_DISSET(GET_BKEYDATA(h, indx)->type)) {
650		if (flags == DB_NEXT || flags == DB_PREV) {
651			(void)__bam_stkrel(dbc, 0);
652			goto retry;
653		}
654		ret = DB_KEYEMPTY;
655		goto err;
656	}
657
658	/* Return the data item. */
659	if ((ret = __db_ret(dbp,
660	    h, indx, data, &dbc->rdata.data, &dbc->rdata.ulen)) != 0)
661		goto err;
662
663	/* The cursor was reset, no further delete adjustment is necessary. */
664	CD_CLR(dbp, cp);
665
666err:	if (stack)
667		(void)__bam_stkrel(dbc, 0);
668
669	/* Release temporary lock upgrade. */
670	if (tmp_rmw)
671		F_CLR(dbc, DBC_RMW);
672
673	if (ret != 0)
674		*cp = copy;
675
676	return (ret);
677}
678
679/*
680 * __ram_c_put --
681 *	Recno cursor->c_put function.
682 *
683 * PUBLIC: int __ram_c_put __P((DBC *, DBT *, DBT *, u_int32_t));
684 */
685int
686__ram_c_put(dbc, key, data, flags)
687	DBC *dbc;
688	DBT *key, *data;
689	u_int32_t flags;
690{
691	CURSOR *cp, copy;
692	DB *dbp;
693	int exact, ret;
694	void *arg;
695
696	dbp = dbc->dbp;
697	cp = dbc->internal;
698
699	DB_PANIC_CHECK(dbp);
700
701	if ((ret = __db_cputchk(dbc->dbp, key, data, flags,
702	    F_ISSET(dbc->dbp, DB_AM_RDONLY), cp->recno != RECNO_OOB)) != 0)
703		return (ret);
704
705	DEBUG_LWRITE(dbc, dbc->txn, "ram_c_put", NULL, data, flags);
706
707	/*
708	 * If we are running CDB, this had better be either a write
709	 * cursor or an immediate writer.  If it's a regular writer,
710	 * that means we have an IWRITE lock and we need to upgrade
711	 * it to a write lock.
712	 */
713	if (F_ISSET(dbp, DB_AM_CDB)) {
714		if (!F_ISSET(dbc, DBC_RMW | DBC_WRITER))
715			return (EINVAL);
716
717		if (F_ISSET(dbc, DBC_RMW) &&
718		    (ret = lock_get(dbp->dbenv->lk_info, dbc->locker,
719		    DB_LOCK_UPGRADE, &dbc->lock_dbt, DB_LOCK_WRITE,
720		    &dbc->mylock)) != 0)
721			return (EAGAIN);
722	}
723
724	/* Initialize the cursor for a new retrieval. */
725	copy = *cp;
726
727	/*
728	 * To split, we need a valid key for the page.  Since it's a cursor,
729	 * we have to build one.
730	 *
731	 * The split code discards all short-term locks and stack pages.
732	 */
733	if (0) {
734split:		arg = &cp->recno;
735		if ((ret = __bam_split(dbc, arg)) != 0)
736			goto err;
737	}
738
739	if ((ret = __bam_rsearch(dbc, &cp->recno, S_INSERT, 1, &exact)) != 0)
740		goto err;
741	if (!exact) {
742		ret = DB_NOTFOUND;
743		goto err;
744	}
745	if ((ret = __bam_iitem(dbc, &cp->csp->page,
746	    &cp->csp->indx, key, data, flags, 0)) == DB_NEEDSPLIT) {
747		if ((ret = __bam_stkrel(dbc, 0)) != 0)
748			goto err;
749		goto split;
750	}
751	if ((ret = __bam_stkrel(dbc, 0)) != 0)
752		goto err;
753
754	switch (flags) {
755	case DB_AFTER:
756		/* Adjust the cursors. */
757		__ram_ca(dbp, cp->recno, CA_IAFTER);
758
759		/* Set this cursor to reference the new record. */
760		cp->recno = copy.recno + 1;
761		break;
762	case DB_BEFORE:
763		/* Adjust the cursors. */
764		__ram_ca(dbp, cp->recno, CA_IBEFORE);
765
766		/* Set this cursor to reference the new record. */
767		cp->recno = copy.recno;
768		break;
769	}
770
771	/* The cursor was reset, no further delete adjustment is necessary. */
772	CD_CLR(dbp, cp);
773
774err:	if (F_ISSET(dbp, DB_AM_CDB) && F_ISSET(dbc, DBC_RMW))
775		(void)__lock_downgrade(dbp->dbenv->lk_info, dbc->mylock,
776		    DB_LOCK_IWRITE, 0);
777
778	if (ret != 0)
779		*cp = copy;
780
781	return (ret);
782}
783
784/*
785 * __ram_ca --
786 *	Adjust cursors.
787 *
788 * PUBLIC: void __ram_ca __P((DB *, db_recno_t, ca_recno_arg));
789 */
790void
791__ram_ca(dbp, recno, op)
792	DB *dbp;
793	db_recno_t recno;
794	ca_recno_arg op;
795{
796	CURSOR *cp;
797	DBC *dbc;
798
799	/*
800	 * Adjust the cursors.  See the comment in __bam_ca_delete().
801	 */
802	DB_THREAD_LOCK(dbp);
803	for (dbc = TAILQ_FIRST(&dbp->active_queue);
804	    dbc != NULL; dbc = TAILQ_NEXT(dbc, links)) {
805		cp = dbc->internal;
806		switch (op) {
807		case CA_DELETE:
808			if (recno > cp->recno)
809				--cp->recno;
810			if (recno == cp->recno)
811				CD_SET(dbp, cp);
812			break;
813		case CA_IAFTER:
814			if (recno > cp->recno)
815				++cp->recno;
816			break;
817		case CA_IBEFORE:
818			if (recno >= cp->recno)
819				++cp->recno;
820			break;
821		}
822	}
823	DB_THREAD_UNLOCK(dbp);
824}
825
826/*
827 * __ram_getno --
828 *	Check the user's record number, and make sure we've seen it.
829 *
830 * PUBLIC: int __ram_getno __P((DBC *, const DBT *, db_recno_t *, int));
831 */
832int
833__ram_getno(dbc, key, rep, can_create)
834	DBC *dbc;
835	const DBT *key;
836	db_recno_t *rep;
837	int can_create;
838{
839	DB *dbp;
840	db_recno_t recno;
841
842	dbp = dbc->dbp;
843
844	/* Check the user's record number. */
845	if ((recno = *(db_recno_t *)key->data) == 0) {
846		__db_err(dbp->dbenv, "illegal record number of 0");
847		return (EINVAL);
848	}
849	if (rep != NULL)
850		*rep = recno;
851
852	/*
853	 * Btree can neither create records nor read them in.  Recno can
854	 * do both, see if we can find the record.
855	 */
856	return (dbp->type == DB_RECNO ?
857	    __ram_update(dbc, recno, can_create) : 0);
858}
859
860/*
861 * __ram_update --
862 *	Ensure the tree has records up to and including the specified one.
863 */
864static int
865__ram_update(dbc, recno, can_create)
866	DBC *dbc;
867	db_recno_t recno;
868	int can_create;
869{
870	BTREE *t;
871	DB *dbp;
872	RECNO *rp;
873	db_recno_t nrecs;
874	int ret;
875
876	dbp = dbc->dbp;
877	t = dbp->internal;
878	rp = t->recno;
879
880	/*
881	 * If we can't create records and we've read the entire backing input
882	 * file, we're done.
883	 */
884	if (!can_create && F_ISSET(rp, RECNO_EOF))
885		return (0);
886
887	/*
888	 * If we haven't seen this record yet, try to get it from the original
889	 * file.
890	 */
891	if ((ret = __bam_nrecs(dbc, &nrecs)) != 0)
892		return (ret);
893	if (!F_ISSET(rp, RECNO_EOF) && recno > nrecs) {
894		if ((ret = rp->re_irec(dbc, recno)) != 0)
895			return (ret);
896		if ((ret = __bam_nrecs(dbc, &nrecs)) != 0)
897			return (ret);
898	}
899
900	/*
901	 * If we can create records, create empty ones up to the requested
902	 * record.
903	 */
904	if (!can_create || recno <= nrecs + 1)
905		return (0);
906
907	dbc->rdata.dlen = 0;
908	dbc->rdata.doff = 0;
909	dbc->rdata.flags = 0;
910	if (F_ISSET(dbp, DB_RE_FIXEDLEN)) {
911		if (dbc->rdata.ulen < rp->re_len) {
912			if ((ret =
913			    __os_realloc(&dbc->rdata.data, rp->re_len)) != 0) {
914				dbc->rdata.ulen = 0;
915				dbc->rdata.data = NULL;
916				return (ret);
917			}
918			dbc->rdata.ulen = rp->re_len;
919		}
920		dbc->rdata.size = rp->re_len;
921		memset(dbc->rdata.data, rp->re_pad, rp->re_len);
922	} else
923		dbc->rdata.size = 0;
924
925	while (recno > ++nrecs)
926		if ((ret = __ram_add(dbc,
927		    &nrecs, &dbc->rdata, 0, BI_DELETED)) != 0)
928			return (ret);
929	return (0);
930}
931
932/*
933 * __ram_source --
934 *	Load information about the backing file.
935 */
936static int
937__ram_source(dbp, rp, fname)
938	DB *dbp;
939	RECNO *rp;
940	const char *fname;
941{
942	size_t size;
943	u_int32_t bytes, mbytes, oflags;
944	int ret;
945
946	/*
947	 * !!!
948	 * The caller has full responsibility for cleaning up on error --
949	 * (it has to anyway, in case it fails after this routine succeeds).
950	 */
951	if ((ret = __db_appname(dbp->dbenv,
952	    DB_APP_DATA, NULL, fname, 0, NULL, &rp->re_source)) != 0)
953		return (ret);
954
955	oflags = F_ISSET(dbp, DB_AM_RDONLY) ? DB_RDONLY : 0;
956	if ((ret =
957	    __db_open(rp->re_source, oflags, oflags, 0, &rp->re_fd)) != 0) {
958		__db_err(dbp->dbenv, "%s: %s", rp->re_source, strerror(ret));
959		return (ret);
960	}
961
962	/*
963	 * XXX
964	 * We'd like to test to see if the file is too big to mmap.  Since we
965	 * don't know what size or type off_t's or size_t's are, or the largest
966	 * unsigned integral type is, or what random insanity the local C
967	 * compiler will perpetrate, doing the comparison in a portable way is
968	 * flatly impossible.  Hope that mmap fails if the file is too large.
969	 */
970	if ((ret = __os_ioinfo(rp->re_source,
971	    rp->re_fd, &mbytes, &bytes, NULL)) != 0) {
972		__db_err(dbp->dbenv, "%s: %s", rp->re_source, strerror(ret));
973		return (ret);
974	}
975	if (mbytes == 0 && bytes == 0) {
976		F_SET(rp, RECNO_EOF);
977		return (0);
978	}
979
980	size = mbytes * MEGABYTE + bytes;
981	if ((ret = __db_mapfile(rp->re_source,
982	    rp->re_fd, (size_t)size, 1, &rp->re_smap)) != 0)
983		return (ret);
984	rp->re_cmap = rp->re_smap;
985	rp->re_emap = (u_int8_t *)rp->re_smap + (rp->re_msize = size);
986	rp->re_irec = F_ISSET(dbp, DB_RE_FIXEDLEN) ?  __ram_fmap : __ram_vmap;
987	return (0);
988}
989
990/*
991 * __ram_writeback --
992 *	Rewrite the backing file.
993 */
994static int
995__ram_writeback(dbc)
996	DBC *dbc;
997{
998	DB *dbp;
999	DBT key, data;
1000	RECNO *rp;
1001	db_recno_t keyno;
1002	ssize_t nw;
1003	int fd, ret, t_ret;
1004	u_int8_t delim, *pad;
1005
1006	dbp = dbc->dbp;
1007	rp = ((BTREE *)dbp->internal)->recno;
1008
1009	/* If the file wasn't modified, we're done. */
1010	if (!F_ISSET(rp, RECNO_MODIFIED))
1011		return (0);
1012
1013	/* If there's no backing source file, we're done. */
1014	if (rp->re_source == NULL) {
1015		F_CLR(rp, RECNO_MODIFIED);
1016		return (0);
1017	}
1018
1019	/*
1020	 * Read any remaining records into the tree.
1021	 *
1022	 * !!!
1023	 * This is why we can't support transactions when applications specify
1024	 * backing (re_source) files.  At this point we have to read in the
1025	 * rest of the records from the file so that we can write all of the
1026	 * records back out again, which could modify a page for which we'd
1027	 * have to log changes and which we don't have locked.  This could be
1028	 * partially fixed by taking a snapshot of the entire file during the
1029	 * db_open(), or, since db_open() isn't transaction protected, as part
1030	 * of the first DB operation.  But, if a checkpoint occurs then, the
1031	 * part of the log holding the copy of the file could be discarded, and
1032	 * that would make it impossible to recover in the face of disaster.
1033	 * This could all probably be fixed, but it would require transaction
1034	 * protecting the backing source file, i.e. mpool would have to know
1035	 * about it, and we don't want to go there.
1036	 */
1037	if ((ret =
1038	    __ram_update(dbc, DB_MAX_RECORDS, 0)) != 0 && ret != DB_NOTFOUND)
1039		return (ret);
1040
1041	/*
1042	 * !!!
1043	 * Close any underlying mmap region.  This is required for Windows NT
1044	 * (4.0, Service Pack 2) -- if the file is still mapped, the following
1045	 * open will fail.
1046	 */
1047	if (rp->re_smap != NULL) {
1048		(void)__db_unmapfile(rp->re_smap, rp->re_msize);
1049		rp->re_smap = NULL;
1050	}
1051
1052	/* Get rid of any backing file descriptor, just on GP's. */
1053	if (rp->re_fd != -1) {
1054		(void)__os_close(rp->re_fd);
1055		rp->re_fd = -1;
1056	}
1057
1058	/* Open the file, truncating it. */
1059	if ((ret = __db_open(rp->re_source,
1060	    DB_SEQUENTIAL | DB_TRUNCATE,
1061	    DB_SEQUENTIAL | DB_TRUNCATE, 0, &fd)) != 0) {
1062		__db_err(dbp->dbenv, "%s: %s", rp->re_source, strerror(ret));
1063		return (ret);
1064	}
1065
1066	/*
1067	 * We step through the records, writing each one out.  Use the record
1068	 * number and the dbp->get() function, instead of a cursor, so we find
1069	 * and write out "deleted" or non-existent records.
1070	 */
1071	memset(&key, 0, sizeof(key));
1072	memset(&data, 0, sizeof(data));
1073	key.size = sizeof(db_recno_t);
1074	key.data = &keyno;
1075
1076	/*
1077	 * We'll need the delimiter if we're doing variable-length records,
1078	 * and the pad character if we're doing fixed-length records.
1079	 */
1080	delim = rp->re_delim;
1081	if (F_ISSET(dbp, DB_RE_FIXEDLEN)) {
1082		if ((ret = __os_malloc(rp->re_len, NULL, &pad)) != 0)
1083			goto err;
1084		memset(pad, rp->re_pad, rp->re_len);
1085	} else
1086		COMPQUIET(pad, NULL);
1087	for (keyno = 1;; ++keyno) {
1088		switch (ret = dbp->get(dbp, NULL, &key, &data, 0)) {
1089		case 0:
1090			if ((ret =
1091			    __os_write(fd, data.data, data.size, &nw)) != 0)
1092				goto err;
1093			if (nw != (ssize_t)data.size) {
1094				ret = EIO;
1095				goto err;
1096			}
1097			break;
1098		case DB_KEYEMPTY:
1099			if (F_ISSET(dbp, DB_RE_FIXEDLEN)) {
1100				if ((ret =
1101				    __os_write(fd, pad, rp->re_len, &nw)) != 0)
1102					goto err;
1103				if (nw != (ssize_t)rp->re_len) {
1104					ret = EIO;
1105					goto err;
1106				}
1107			}
1108			break;
1109		case DB_NOTFOUND:
1110			ret = 0;
1111			goto done;
1112		}
1113		if (!F_ISSET(dbp, DB_RE_FIXEDLEN)) {
1114			if ((ret = __os_write(fd, &delim, 1, &nw)) != 0)
1115				goto err;
1116			if (nw != 1) {
1117				ret = EIO;
1118				goto err;
1119			}
1120		}
1121	}
1122
1123err:
1124done:	/* Close the file descriptor. */
1125	if ((t_ret = __os_close(fd)) != 0 || ret == 0)
1126		ret = t_ret;
1127
1128	if (ret == 0)
1129		F_CLR(rp, RECNO_MODIFIED);
1130	return (ret);
1131}
1132
1133/*
1134 * __ram_fmap --
1135 *	Get fixed length records from a file.
1136 */
1137static int
1138__ram_fmap(dbc, top)
1139	DBC *dbc;
1140	db_recno_t top;
1141{
1142	DB *dbp;
1143	DBT data;
1144	RECNO *rp;
1145	db_recno_t recno;
1146	u_int32_t len;
1147	u_int8_t *sp, *ep, *p;
1148	int ret;
1149
1150	if ((ret = __bam_nrecs(dbc, &recno)) != 0)
1151		return (ret);
1152
1153	dbp = dbc->dbp;
1154	rp = ((BTREE *)(dbp->internal))->recno;
1155
1156	if (dbc->rdata.ulen < rp->re_len) {
1157		if ((ret = __os_realloc(&dbc->rdata.data, rp->re_len)) != 0) {
1158			dbc->rdata.ulen = 0;
1159			dbc->rdata.data = NULL;
1160			return (ret);
1161		}
1162		dbc->rdata.ulen = rp->re_len;
1163	}
1164
1165	memset(&data, 0, sizeof(data));
1166	data.data = dbc->rdata.data;
1167	data.size = rp->re_len;
1168
1169	sp = (u_int8_t *)rp->re_cmap;
1170	ep = (u_int8_t *)rp->re_emap;
1171	while (recno < top) {
1172		if (sp >= ep) {
1173			F_SET(rp, RECNO_EOF);
1174			return (DB_NOTFOUND);
1175		}
1176		len = rp->re_len;
1177		for (p = dbc->rdata.data;
1178		    sp < ep && len > 0; *p++ = *sp++, --len)
1179			;
1180
1181		/*
1182		 * Another process may have read this record from the input
1183		 * file and stored it into the database already, in which
1184		 * case we don't need to repeat that operation.  We detect
1185		 * this by checking if the last record we've read is greater
1186		 * or equal to the number of records in the database.
1187		 *
1188		 * XXX
1189		 * We should just do a seek, since the records are fixed
1190		 * length.
1191		 */
1192		if (rp->re_last >= recno) {
1193			if (len != 0)
1194				memset(p, rp->re_pad, len);
1195
1196			++recno;
1197			if ((ret = __ram_add(dbc, &recno, &data, 0, 0)) != 0)
1198				return (ret);
1199		}
1200		++rp->re_last;
1201	}
1202	rp->re_cmap = sp;
1203	return (0);
1204}
1205
1206/*
1207 * __ram_vmap --
1208 *	Get variable length records from a file.
1209 */
1210static int
1211__ram_vmap(dbc, top)
1212	DBC *dbc;
1213	db_recno_t top;
1214{
1215	DBT data;
1216	RECNO *rp;
1217	db_recno_t recno;
1218	u_int8_t *sp, *ep;
1219	int delim, ret;
1220
1221	rp = ((BTREE *)(dbc->dbp->internal))->recno;
1222
1223	if ((ret = __bam_nrecs(dbc, &recno)) != 0)
1224		return (ret);
1225
1226	memset(&data, 0, sizeof(data));
1227
1228	delim = rp->re_delim;
1229
1230	sp = (u_int8_t *)rp->re_cmap;
1231	ep = (u_int8_t *)rp->re_emap;
1232	while (recno < top) {
1233		if (sp >= ep) {
1234			F_SET(rp, RECNO_EOF);
1235			return (DB_NOTFOUND);
1236		}
1237		for (data.data = sp; sp < ep && *sp != delim; ++sp)
1238			;
1239
1240		/*
1241		 * Another process may have read this record from the input
1242		 * file and stored it into the database already, in which
1243		 * case we don't need to repeat that operation.  We detect
1244		 * this by checking if the last record we've read is greater
1245		 * or equal to the number of records in the database.
1246		 */
1247		if (rp->re_last >= recno) {
1248			data.size = sp - (u_int8_t *)data.data;
1249			++recno;
1250			if ((ret = __ram_add(dbc, &recno, &data, 0, 0)) != 0)
1251				return (ret);
1252		}
1253		++rp->re_last;
1254		++sp;
1255	}
1256	rp->re_cmap = sp;
1257	return (0);
1258}
1259
1260/*
1261 * __ram_add --
1262 *	Add records into the tree.
1263 */
1264static int
1265__ram_add(dbc, recnop, data, flags, bi_flags)
1266	DBC *dbc;
1267	db_recno_t *recnop;
1268	DBT *data;
1269	u_int32_t flags, bi_flags;
1270{
1271	BKEYDATA *bk;
1272	CURSOR *cp;
1273	DB *dbp;
1274	PAGE *h;
1275	db_indx_t indx;
1276	int exact, isdeleted, ret, stack;
1277
1278	dbp = dbc->dbp;
1279	cp = dbc->internal;
1280
1281retry:	/* Find the slot for insertion. */
1282	if ((ret = __bam_rsearch(dbc, recnop,
1283	    S_INSERT | (flags == DB_APPEND ? S_APPEND : 0), 1, &exact)) != 0)
1284		return (ret);
1285	h = cp->csp->page;
1286	indx = cp->csp->indx;
1287	stack = 1;
1288
1289	/*
1290	 * If re-numbering records, the on-page deleted flag means this record
1291	 * was implicitly created.  If not re-numbering records, the on-page
1292	 * deleted flag means this record was implicitly created, or, it was
1293	 * deleted at some time.
1294	 *
1295	 * If DB_NOOVERWRITE is set and the item already exists in the tree,
1296	 * return an error unless the item was either marked for deletion or
1297	 * only implicitly created.
1298	 */
1299	isdeleted = 0;
1300	if (exact) {
1301		bk = GET_BKEYDATA(h, indx);
1302		if (B_DISSET(bk->type))
1303			isdeleted = 1;
1304		else
1305			if (flags == DB_NOOVERWRITE) {
1306				ret = DB_KEYEXIST;
1307				goto err;
1308			}
1309	}
1310
1311	/*
1312	 * Select the arguments for __bam_iitem() and do the insert.  If the
1313	 * key is an exact match, or we're replacing the data item with a
1314	 * new data item, replace the current item.  If the key isn't an exact
1315	 * match, we're inserting a new key/data pair, before the search
1316	 * location.
1317	 */
1318	switch (ret = __bam_iitem(dbc,
1319	    &h, &indx, NULL, data, exact ? DB_CURRENT : DB_BEFORE, bi_flags)) {
1320	case 0:
1321		/*
1322		 * Don't adjust anything.
1323		 *
1324		 * If we inserted a record, no cursors need adjusting because
1325		 * the only new record it's possible to insert is at the very
1326		 * end of the tree.  The necessary adjustments to the internal
1327		 * page counts were made by __bam_iitem().
1328		 *
1329		 * If we overwrote a record, no cursors need adjusting because
1330		 * future DBcursor->get calls will simply return the underlying
1331		 * record (there's no adjustment made for the DB_CURRENT flag
1332		 * when a cursor get operation immediately follows a cursor
1333		 * delete operation, and the normal adjustment for the DB_NEXT
1334		 * flag is still correct).
1335		 */
1336		break;
1337	case DB_NEEDSPLIT:
1338		/* Discard the stack of pages and split the page. */
1339		(void)__bam_stkrel(dbc, 0);
1340		stack = 0;
1341
1342		if ((ret = __bam_split(dbc, recnop)) != 0)
1343			goto err;
1344
1345		goto retry;
1346		/* NOTREACHED */
1347	default:
1348		goto err;
1349	}
1350
1351
1352err:	if (stack)
1353		__bam_stkrel(dbc, 0);
1354
1355	return (ret);
1356}
1357