1*7c478bd9Sstevel@tonic-gate /*-
2*7c478bd9Sstevel@tonic-gate  * See the file LICENSE for redistribution information.
3*7c478bd9Sstevel@tonic-gate  *
4*7c478bd9Sstevel@tonic-gate  * Copyright (c) 1996, 1997, 1998
5*7c478bd9Sstevel@tonic-gate  *	Sleepycat Software.  All rights reserved.
6*7c478bd9Sstevel@tonic-gate  */
7*7c478bd9Sstevel@tonic-gate /*
8*7c478bd9Sstevel@tonic-gate  * Copyright (c) 1990, 1993, 1994, 1995, 1996
9*7c478bd9Sstevel@tonic-gate  *	Keith Bostic.  All rights reserved.
10*7c478bd9Sstevel@tonic-gate  */
11*7c478bd9Sstevel@tonic-gate /*
12*7c478bd9Sstevel@tonic-gate  * Copyright (c) 1990, 1993, 1994, 1995
13*7c478bd9Sstevel@tonic-gate  *	The Regents of the University of California.  All rights reserved.
14*7c478bd9Sstevel@tonic-gate  *
15*7c478bd9Sstevel@tonic-gate  * Redistribution and use in source and binary forms, with or without
16*7c478bd9Sstevel@tonic-gate  * modification, are permitted provided that the following conditions
17*7c478bd9Sstevel@tonic-gate  * are met:
18*7c478bd9Sstevel@tonic-gate  * 1. Redistributions of source code must retain the above copyright
19*7c478bd9Sstevel@tonic-gate  *    notice, this list of conditions and the following disclaimer.
20*7c478bd9Sstevel@tonic-gate  * 2. Redistributions in binary form must reproduce the above copyright
21*7c478bd9Sstevel@tonic-gate  *    notice, this list of conditions and the following disclaimer in the
22*7c478bd9Sstevel@tonic-gate  *    documentation and/or other materials provided with the distribution.
23*7c478bd9Sstevel@tonic-gate  * 3. All advertising materials mentioning features or use of this software
24*7c478bd9Sstevel@tonic-gate  *    must display the following acknowledgement:
25*7c478bd9Sstevel@tonic-gate  *	This product includes software developed by the University of
26*7c478bd9Sstevel@tonic-gate  *	California, Berkeley and its contributors.
27*7c478bd9Sstevel@tonic-gate  * 4. Neither the name of the University nor the names of its contributors
28*7c478bd9Sstevel@tonic-gate  *    may be used to endorse or promote products derived from this software
29*7c478bd9Sstevel@tonic-gate  *    without specific prior written permission.
30*7c478bd9Sstevel@tonic-gate  *
31*7c478bd9Sstevel@tonic-gate  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
32*7c478bd9Sstevel@tonic-gate  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
33*7c478bd9Sstevel@tonic-gate  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
34*7c478bd9Sstevel@tonic-gate  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
35*7c478bd9Sstevel@tonic-gate  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
36*7c478bd9Sstevel@tonic-gate  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
37*7c478bd9Sstevel@tonic-gate  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
38*7c478bd9Sstevel@tonic-gate  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
39*7c478bd9Sstevel@tonic-gate  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
40*7c478bd9Sstevel@tonic-gate  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
41*7c478bd9Sstevel@tonic-gate  * SUCH DAMAGE.
42*7c478bd9Sstevel@tonic-gate  */
43*7c478bd9Sstevel@tonic-gate 
44*7c478bd9Sstevel@tonic-gate #include "config.h"
45*7c478bd9Sstevel@tonic-gate 
46*7c478bd9Sstevel@tonic-gate #ifndef lint
47*7c478bd9Sstevel@tonic-gate static const char sccsid[] = "@(#)bt_split.c	10.33 (Sleepycat) 10/13/98";
48*7c478bd9Sstevel@tonic-gate #endif /* not lint */
49*7c478bd9Sstevel@tonic-gate 
50*7c478bd9Sstevel@tonic-gate #ifndef NO_SYSTEM_INCLUDES
51*7c478bd9Sstevel@tonic-gate #include <sys/types.h>
52*7c478bd9Sstevel@tonic-gate 
53*7c478bd9Sstevel@tonic-gate #include <errno.h>
54*7c478bd9Sstevel@tonic-gate #include <limits.h>
55*7c478bd9Sstevel@tonic-gate #include <string.h>
56*7c478bd9Sstevel@tonic-gate #endif
57*7c478bd9Sstevel@tonic-gate 
58*7c478bd9Sstevel@tonic-gate #include "db_int.h"
59*7c478bd9Sstevel@tonic-gate #include "db_page.h"
60*7c478bd9Sstevel@tonic-gate #include "btree.h"
61*7c478bd9Sstevel@tonic-gate 
62*7c478bd9Sstevel@tonic-gate static int __bam_broot __P((DBC *, PAGE *, PAGE *, PAGE *));
63*7c478bd9Sstevel@tonic-gate static int __bam_page __P((DBC *, EPG *, EPG *));
64*7c478bd9Sstevel@tonic-gate static int __bam_pinsert __P((DBC *, EPG *, PAGE *, PAGE *));
65*7c478bd9Sstevel@tonic-gate static int __bam_psplit __P((DBC *, EPG *, PAGE *, PAGE *, db_indx_t *));
66*7c478bd9Sstevel@tonic-gate static int __bam_root __P((DBC *, EPG *));
67*7c478bd9Sstevel@tonic-gate static int __ram_root __P((DBC *, PAGE *, PAGE *, PAGE *));
68*7c478bd9Sstevel@tonic-gate 
69*7c478bd9Sstevel@tonic-gate /*
70*7c478bd9Sstevel@tonic-gate  * __bam_split --
71*7c478bd9Sstevel@tonic-gate  *	Split a page.
72*7c478bd9Sstevel@tonic-gate  *
73*7c478bd9Sstevel@tonic-gate  * PUBLIC: int __bam_split __P((DBC *, void *));
74*7c478bd9Sstevel@tonic-gate  */
75*7c478bd9Sstevel@tonic-gate int
__bam_split(dbc,arg)76*7c478bd9Sstevel@tonic-gate __bam_split(dbc, arg)
77*7c478bd9Sstevel@tonic-gate 	DBC *dbc;
78*7c478bd9Sstevel@tonic-gate 	void *arg;
79*7c478bd9Sstevel@tonic-gate {
80*7c478bd9Sstevel@tonic-gate 	BTREE *t;
81*7c478bd9Sstevel@tonic-gate 	CURSOR *cp;
82*7c478bd9Sstevel@tonic-gate 	DB *dbp;
83*7c478bd9Sstevel@tonic-gate 	enum { UP, DOWN } dir;
84*7c478bd9Sstevel@tonic-gate 	int exact, level, ret;
85*7c478bd9Sstevel@tonic-gate 
86*7c478bd9Sstevel@tonic-gate 	dbp = dbc->dbp;
87*7c478bd9Sstevel@tonic-gate 	cp = dbc->internal;
88*7c478bd9Sstevel@tonic-gate 
89*7c478bd9Sstevel@tonic-gate 	/*
90*7c478bd9Sstevel@tonic-gate 	 * The locking protocol we use to avoid deadlock to acquire locks by
91*7c478bd9Sstevel@tonic-gate 	 * walking down the tree, but we do it as lazily as possible, locking
92*7c478bd9Sstevel@tonic-gate 	 * the root only as a last resort.  We expect all stack pages to have
93*7c478bd9Sstevel@tonic-gate 	 * been discarded before we're called; we discard all short-term locks.
94*7c478bd9Sstevel@tonic-gate 	 *
95*7c478bd9Sstevel@tonic-gate 	 * When __bam_split is first called, we know that a leaf page was too
96*7c478bd9Sstevel@tonic-gate 	 * full for an insert.  We don't know what leaf page it was, but we
97*7c478bd9Sstevel@tonic-gate 	 * have the key/recno that caused the problem.  We call XX_search to
98*7c478bd9Sstevel@tonic-gate 	 * reacquire the leaf page, but this time get both the leaf page and
99*7c478bd9Sstevel@tonic-gate 	 * its parent, locked.  We then split the leaf page and see if the new
100*7c478bd9Sstevel@tonic-gate 	 * internal key will fit into the parent page.  If it will, we're done.
101*7c478bd9Sstevel@tonic-gate 	 *
102*7c478bd9Sstevel@tonic-gate 	 * If it won't, we discard our current locks and repeat the process,
103*7c478bd9Sstevel@tonic-gate 	 * only this time acquiring the parent page and its parent, locked.
104*7c478bd9Sstevel@tonic-gate 	 * This process repeats until we succeed in the split, splitting the
105*7c478bd9Sstevel@tonic-gate 	 * root page as the final resort.  The entire process then repeats,
106*7c478bd9Sstevel@tonic-gate 	 * as necessary, until we split a leaf page.
107*7c478bd9Sstevel@tonic-gate 	 *
108*7c478bd9Sstevel@tonic-gate 	 * XXX
109*7c478bd9Sstevel@tonic-gate 	 * A traditional method of speeding this up is to maintain a stack of
110*7c478bd9Sstevel@tonic-gate 	 * the pages traversed in the original search.  You can detect if the
111*7c478bd9Sstevel@tonic-gate 	 * stack is correct by storing the page's LSN when it was searched and
112*7c478bd9Sstevel@tonic-gate 	 * comparing that LSN with the current one when it's locked during the
113*7c478bd9Sstevel@tonic-gate 	 * split.  This would be an easy change for this code, but I have no
114*7c478bd9Sstevel@tonic-gate 	 * numbers that indicate it's worthwhile.
115*7c478bd9Sstevel@tonic-gate 	 */
116*7c478bd9Sstevel@tonic-gate 	t = dbp->internal;
117*7c478bd9Sstevel@tonic-gate 	for (dir = UP, level = LEAFLEVEL;; dir == UP ? ++level : --level) {
118*7c478bd9Sstevel@tonic-gate 		/*
119*7c478bd9Sstevel@tonic-gate 		 * Acquire a page and its parent, locked.
120*7c478bd9Sstevel@tonic-gate 		 */
121*7c478bd9Sstevel@tonic-gate 		if ((ret = (dbp->type == DB_BTREE ?
122*7c478bd9Sstevel@tonic-gate 		    __bam_search(dbc, arg, S_WRPAIR, level, NULL, &exact) :
123*7c478bd9Sstevel@tonic-gate 		    __bam_rsearch(dbc,
124*7c478bd9Sstevel@tonic-gate 		        (db_recno_t *)arg, S_WRPAIR, level, &exact))) != 0)
125*7c478bd9Sstevel@tonic-gate 			return (ret);
126*7c478bd9Sstevel@tonic-gate 
127*7c478bd9Sstevel@tonic-gate 		/*
128*7c478bd9Sstevel@tonic-gate 		 * Split the page if it still needs it (it's possible another
129*7c478bd9Sstevel@tonic-gate 		 * thread of control has already split the page).  If we are
130*7c478bd9Sstevel@tonic-gate 		 * guaranteed that two items will fit on the page, the split
131*7c478bd9Sstevel@tonic-gate 		 * is no longer necessary.
132*7c478bd9Sstevel@tonic-gate 		 */
133*7c478bd9Sstevel@tonic-gate 		if (t->bt_ovflsize * 2 <=
134*7c478bd9Sstevel@tonic-gate 		    (db_indx_t)P_FREESPACE(cp->csp[0].page)) {
135*7c478bd9Sstevel@tonic-gate 			__bam_stkrel(dbc, 1);
136*7c478bd9Sstevel@tonic-gate 			return (0);
137*7c478bd9Sstevel@tonic-gate 		}
138*7c478bd9Sstevel@tonic-gate 		ret = cp->csp[0].page->pgno == PGNO_ROOT ?
139*7c478bd9Sstevel@tonic-gate 		    __bam_root(dbc, &cp->csp[0]) :
140*7c478bd9Sstevel@tonic-gate 		    __bam_page(dbc, &cp->csp[-1], &cp->csp[0]);
141*7c478bd9Sstevel@tonic-gate 		BT_STK_CLR(cp);
142*7c478bd9Sstevel@tonic-gate 
143*7c478bd9Sstevel@tonic-gate 		switch (ret) {
144*7c478bd9Sstevel@tonic-gate 		case 0:
145*7c478bd9Sstevel@tonic-gate 			/* Once we've split the leaf page, we're done. */
146*7c478bd9Sstevel@tonic-gate 			if (level == LEAFLEVEL)
147*7c478bd9Sstevel@tonic-gate 				return (0);
148*7c478bd9Sstevel@tonic-gate 
149*7c478bd9Sstevel@tonic-gate 			/* Switch directions. */
150*7c478bd9Sstevel@tonic-gate 			if (dir == UP)
151*7c478bd9Sstevel@tonic-gate 				dir = DOWN;
152*7c478bd9Sstevel@tonic-gate 			break;
153*7c478bd9Sstevel@tonic-gate 		case DB_NEEDSPLIT:
154*7c478bd9Sstevel@tonic-gate 			/*
155*7c478bd9Sstevel@tonic-gate 			 * It's possible to fail to split repeatedly, as other
156*7c478bd9Sstevel@tonic-gate 			 * threads may be modifying the tree, or the page usage
157*7c478bd9Sstevel@tonic-gate 			 * is sufficiently bad that we don't get enough space
158*7c478bd9Sstevel@tonic-gate 			 * the first time.
159*7c478bd9Sstevel@tonic-gate 			 */
160*7c478bd9Sstevel@tonic-gate 			if (dir == DOWN)
161*7c478bd9Sstevel@tonic-gate 				dir = UP;
162*7c478bd9Sstevel@tonic-gate 			break;
163*7c478bd9Sstevel@tonic-gate 		default:
164*7c478bd9Sstevel@tonic-gate 			return (ret);
165*7c478bd9Sstevel@tonic-gate 		}
166*7c478bd9Sstevel@tonic-gate 	}
167*7c478bd9Sstevel@tonic-gate 	/* NOTREACHED */
168*7c478bd9Sstevel@tonic-gate }
169*7c478bd9Sstevel@tonic-gate 
170*7c478bd9Sstevel@tonic-gate /*
171*7c478bd9Sstevel@tonic-gate  * __bam_root --
172*7c478bd9Sstevel@tonic-gate  *	Split the root page of a btree.
173*7c478bd9Sstevel@tonic-gate  */
174*7c478bd9Sstevel@tonic-gate static int
__bam_root(dbc,cp)175*7c478bd9Sstevel@tonic-gate __bam_root(dbc, cp)
176*7c478bd9Sstevel@tonic-gate 	DBC *dbc;
177*7c478bd9Sstevel@tonic-gate 	EPG *cp;
178*7c478bd9Sstevel@tonic-gate {
179*7c478bd9Sstevel@tonic-gate 	DB *dbp;
180*7c478bd9Sstevel@tonic-gate 	PAGE *lp, *rp;
181*7c478bd9Sstevel@tonic-gate 	db_indx_t split;
182*7c478bd9Sstevel@tonic-gate 	int ret;
183*7c478bd9Sstevel@tonic-gate 
184*7c478bd9Sstevel@tonic-gate 	dbp = dbc->dbp;
185*7c478bd9Sstevel@tonic-gate 
186*7c478bd9Sstevel@tonic-gate 	/* Yeah, right. */
187*7c478bd9Sstevel@tonic-gate 	if (cp->page->level >= MAXBTREELEVEL) {
188*7c478bd9Sstevel@tonic-gate 		ret = ENOSPC;
189*7c478bd9Sstevel@tonic-gate 		goto err;
190*7c478bd9Sstevel@tonic-gate 	}
191*7c478bd9Sstevel@tonic-gate 
192*7c478bd9Sstevel@tonic-gate 	/* Create new left and right pages for the split. */
193*7c478bd9Sstevel@tonic-gate 	lp = rp = NULL;
194*7c478bd9Sstevel@tonic-gate 	if ((ret = __bam_new(dbc, TYPE(cp->page), &lp)) != 0 ||
195*7c478bd9Sstevel@tonic-gate 	    (ret = __bam_new(dbc, TYPE(cp->page), &rp)) != 0)
196*7c478bd9Sstevel@tonic-gate 		goto err;
197*7c478bd9Sstevel@tonic-gate 	P_INIT(lp, dbp->pgsize, lp->pgno,
198*7c478bd9Sstevel@tonic-gate 	    PGNO_INVALID, ISINTERNAL(cp->page) ? PGNO_INVALID : rp->pgno,
199*7c478bd9Sstevel@tonic-gate 	    cp->page->level, TYPE(cp->page));
200*7c478bd9Sstevel@tonic-gate 	P_INIT(rp, dbp->pgsize, rp->pgno,
201*7c478bd9Sstevel@tonic-gate 	    ISINTERNAL(cp->page) ?  PGNO_INVALID : lp->pgno, PGNO_INVALID,
202*7c478bd9Sstevel@tonic-gate 	    cp->page->level, TYPE(cp->page));
203*7c478bd9Sstevel@tonic-gate 
204*7c478bd9Sstevel@tonic-gate 	/* Split the page. */
205*7c478bd9Sstevel@tonic-gate 	if ((ret = __bam_psplit(dbc, cp, lp, rp, &split)) != 0)
206*7c478bd9Sstevel@tonic-gate 		goto err;
207*7c478bd9Sstevel@tonic-gate 
208*7c478bd9Sstevel@tonic-gate 	/* Log the change. */
209*7c478bd9Sstevel@tonic-gate 	if (DB_LOGGING(dbc)) {
210*7c478bd9Sstevel@tonic-gate 		DBT __a;
211*7c478bd9Sstevel@tonic-gate 		DB_LSN __lsn;
212*7c478bd9Sstevel@tonic-gate 		memset(&__a, 0, sizeof(__a));
213*7c478bd9Sstevel@tonic-gate 		__a.data = cp->page;
214*7c478bd9Sstevel@tonic-gate 		__a.size = dbp->pgsize;
215*7c478bd9Sstevel@tonic-gate 		ZERO_LSN(__lsn);
216*7c478bd9Sstevel@tonic-gate 		if ((ret = __bam_split_log(dbp->dbenv->lg_info, dbc->txn,
217*7c478bd9Sstevel@tonic-gate 		    &LSN(cp->page), 0, dbp->log_fileid, PGNO(lp), &LSN(lp),
218*7c478bd9Sstevel@tonic-gate 		    PGNO(rp), &LSN(rp), (u_int32_t)NUM_ENT(lp), 0, &__lsn,
219*7c478bd9Sstevel@tonic-gate 		    &__a)) != 0)
220*7c478bd9Sstevel@tonic-gate 			goto err;
221*7c478bd9Sstevel@tonic-gate 		LSN(lp) = LSN(rp) = LSN(cp->page);
222*7c478bd9Sstevel@tonic-gate 	}
223*7c478bd9Sstevel@tonic-gate 
224*7c478bd9Sstevel@tonic-gate 	/* Clean up the new root page. */
225*7c478bd9Sstevel@tonic-gate 	if ((ret = (dbp->type == DB_RECNO ?
226*7c478bd9Sstevel@tonic-gate 	    __ram_root(dbc, cp->page, lp, rp) :
227*7c478bd9Sstevel@tonic-gate 	    __bam_broot(dbc, cp->page, lp, rp))) != 0)
228*7c478bd9Sstevel@tonic-gate 		goto err;
229*7c478bd9Sstevel@tonic-gate 
230*7c478bd9Sstevel@tonic-gate 	/* Adjust any cursors.  Do it last so we don't have to undo it. */
231*7c478bd9Sstevel@tonic-gate 	__bam_ca_split(dbp, cp->page->pgno, lp->pgno, rp->pgno, split, 1);
232*7c478bd9Sstevel@tonic-gate 
233*7c478bd9Sstevel@tonic-gate 	/* Success -- write the real pages back to the store. */
234*7c478bd9Sstevel@tonic-gate 	(void)memp_fput(dbp->mpf, cp->page, DB_MPOOL_DIRTY);
235*7c478bd9Sstevel@tonic-gate 	(void)__BT_TLPUT(dbc, cp->lock);
236*7c478bd9Sstevel@tonic-gate 	(void)memp_fput(dbp->mpf, lp, DB_MPOOL_DIRTY);
237*7c478bd9Sstevel@tonic-gate 	(void)memp_fput(dbp->mpf, rp, DB_MPOOL_DIRTY);
238*7c478bd9Sstevel@tonic-gate 
239*7c478bd9Sstevel@tonic-gate 	return (0);
240*7c478bd9Sstevel@tonic-gate 
241*7c478bd9Sstevel@tonic-gate err:	if (lp != NULL)
242*7c478bd9Sstevel@tonic-gate 		(void)__bam_free(dbc, lp);
243*7c478bd9Sstevel@tonic-gate 	if (rp != NULL)
244*7c478bd9Sstevel@tonic-gate 		(void)__bam_free(dbc, rp);
245*7c478bd9Sstevel@tonic-gate 	(void)memp_fput(dbp->mpf, cp->page, 0);
246*7c478bd9Sstevel@tonic-gate 	(void)__BT_TLPUT(dbc, cp->lock);
247*7c478bd9Sstevel@tonic-gate 	return (ret);
248*7c478bd9Sstevel@tonic-gate }
249*7c478bd9Sstevel@tonic-gate 
250*7c478bd9Sstevel@tonic-gate /*
251*7c478bd9Sstevel@tonic-gate  * __bam_page --
252*7c478bd9Sstevel@tonic-gate  *	Split the non-root page of a btree.
253*7c478bd9Sstevel@tonic-gate  */
254*7c478bd9Sstevel@tonic-gate static int
__bam_page(dbc,pp,cp)255*7c478bd9Sstevel@tonic-gate __bam_page(dbc, pp, cp)
256*7c478bd9Sstevel@tonic-gate 	DBC *dbc;
257*7c478bd9Sstevel@tonic-gate 	EPG *pp, *cp;
258*7c478bd9Sstevel@tonic-gate {
259*7c478bd9Sstevel@tonic-gate 	DB *dbp;
260*7c478bd9Sstevel@tonic-gate 	DB_LOCK tplock;
261*7c478bd9Sstevel@tonic-gate 	PAGE *lp, *rp, *tp;
262*7c478bd9Sstevel@tonic-gate 	db_indx_t split;
263*7c478bd9Sstevel@tonic-gate 	int ret;
264*7c478bd9Sstevel@tonic-gate 
265*7c478bd9Sstevel@tonic-gate 	dbp = dbc->dbp;
266*7c478bd9Sstevel@tonic-gate 	lp = rp = tp = NULL;
267*7c478bd9Sstevel@tonic-gate 	ret = -1;
268*7c478bd9Sstevel@tonic-gate 
269*7c478bd9Sstevel@tonic-gate 	/* Create new right page for the split. */
270*7c478bd9Sstevel@tonic-gate 	if ((ret = __bam_new(dbc, TYPE(cp->page), &rp)) != 0)
271*7c478bd9Sstevel@tonic-gate 		goto err;
272*7c478bd9Sstevel@tonic-gate 	P_INIT(rp, dbp->pgsize, rp->pgno,
273*7c478bd9Sstevel@tonic-gate 	    ISINTERNAL(cp->page) ? PGNO_INVALID : cp->page->pgno,
274*7c478bd9Sstevel@tonic-gate 	    ISINTERNAL(cp->page) ? PGNO_INVALID : cp->page->next_pgno,
275*7c478bd9Sstevel@tonic-gate 	    cp->page->level, TYPE(cp->page));
276*7c478bd9Sstevel@tonic-gate 
277*7c478bd9Sstevel@tonic-gate 	/* Create new left page for the split. */
278*7c478bd9Sstevel@tonic-gate 	if ((ret = __os_malloc(dbp->pgsize, NULL, &lp)) != 0)
279*7c478bd9Sstevel@tonic-gate 		goto err;
280*7c478bd9Sstevel@tonic-gate 	P_INIT(lp, dbp->pgsize, cp->page->pgno,
281*7c478bd9Sstevel@tonic-gate 	    ISINTERNAL(cp->page) ?  PGNO_INVALID : cp->page->prev_pgno,
282*7c478bd9Sstevel@tonic-gate 	    ISINTERNAL(cp->page) ?  PGNO_INVALID : rp->pgno,
283*7c478bd9Sstevel@tonic-gate 	    cp->page->level, TYPE(cp->page));
284*7c478bd9Sstevel@tonic-gate 	ZERO_LSN(lp->lsn);
285*7c478bd9Sstevel@tonic-gate 
286*7c478bd9Sstevel@tonic-gate 	/*
287*7c478bd9Sstevel@tonic-gate 	 * Split right.
288*7c478bd9Sstevel@tonic-gate 	 *
289*7c478bd9Sstevel@tonic-gate 	 * Only the indices are sorted on the page, i.e., the key/data pairs
290*7c478bd9Sstevel@tonic-gate 	 * aren't, so it's simpler to copy the data from the split page onto
291*7c478bd9Sstevel@tonic-gate 	 * two new pages instead of copying half the data to the right page
292*7c478bd9Sstevel@tonic-gate 	 * and compacting the left page in place.  Since the left page can't
293*7c478bd9Sstevel@tonic-gate 	 * change, we swap the original and the allocated left page after the
294*7c478bd9Sstevel@tonic-gate 	 * split.
295*7c478bd9Sstevel@tonic-gate 	 */
296*7c478bd9Sstevel@tonic-gate 	if ((ret = __bam_psplit(dbc, cp, lp, rp, &split)) != 0)
297*7c478bd9Sstevel@tonic-gate 		goto err;
298*7c478bd9Sstevel@tonic-gate 
299*7c478bd9Sstevel@tonic-gate 	/*
300*7c478bd9Sstevel@tonic-gate 	 * Fix up the previous pointer of any leaf page following the split
301*7c478bd9Sstevel@tonic-gate 	 * page.
302*7c478bd9Sstevel@tonic-gate 	 *
303*7c478bd9Sstevel@tonic-gate 	 * !!!
304*7c478bd9Sstevel@tonic-gate 	 * There are interesting deadlock situations here as we write-lock a
305*7c478bd9Sstevel@tonic-gate 	 * page that's not in our direct ancestry.  Consider a cursor walking
306*7c478bd9Sstevel@tonic-gate 	 * through the leaf pages, that has the previous page read-locked and
307*7c478bd9Sstevel@tonic-gate 	 * is waiting on a lock for the page we just split.  It will deadlock
308*7c478bd9Sstevel@tonic-gate 	 * here.  If this is a problem, we can fail in the split; it's not a
309*7c478bd9Sstevel@tonic-gate 	 * problem as the split will succeed after the cursor passes through
310*7c478bd9Sstevel@tonic-gate 	 * the page we're splitting.
311*7c478bd9Sstevel@tonic-gate 	 */
312*7c478bd9Sstevel@tonic-gate 	if (TYPE(cp->page) == P_LBTREE && rp->next_pgno != PGNO_INVALID) {
313*7c478bd9Sstevel@tonic-gate 		if ((ret = __bam_lget(dbc,
314*7c478bd9Sstevel@tonic-gate 		    0, rp->next_pgno, DB_LOCK_WRITE, &tplock)) != 0)
315*7c478bd9Sstevel@tonic-gate 			goto err;
316*7c478bd9Sstevel@tonic-gate 		if ((ret = memp_fget(dbp->mpf, &rp->next_pgno, 0, &tp)) != 0)
317*7c478bd9Sstevel@tonic-gate 			goto err;
318*7c478bd9Sstevel@tonic-gate 	}
319*7c478bd9Sstevel@tonic-gate 
320*7c478bd9Sstevel@tonic-gate 	/* Insert the new pages into the parent page. */
321*7c478bd9Sstevel@tonic-gate 	if ((ret = __bam_pinsert(dbc, pp, lp, rp)) != 0)
322*7c478bd9Sstevel@tonic-gate 		goto err;
323*7c478bd9Sstevel@tonic-gate 
324*7c478bd9Sstevel@tonic-gate 	/* Log the change. */
325*7c478bd9Sstevel@tonic-gate 	if (DB_LOGGING(dbc)) {
326*7c478bd9Sstevel@tonic-gate 		DBT __a;
327*7c478bd9Sstevel@tonic-gate 		DB_LSN __lsn;
328*7c478bd9Sstevel@tonic-gate 		memset(&__a, 0, sizeof(__a));
329*7c478bd9Sstevel@tonic-gate 		__a.data = cp->page;
330*7c478bd9Sstevel@tonic-gate 		__a.size = dbp->pgsize;
331*7c478bd9Sstevel@tonic-gate 		if (tp == NULL)
332*7c478bd9Sstevel@tonic-gate 			ZERO_LSN(__lsn);
333*7c478bd9Sstevel@tonic-gate 		if ((ret = __bam_split_log(dbp->dbenv->lg_info, dbc->txn,
334*7c478bd9Sstevel@tonic-gate 		    &cp->page->lsn, 0, dbp->log_fileid, PGNO(cp->page),
335*7c478bd9Sstevel@tonic-gate 		    &LSN(cp->page), PGNO(rp), &LSN(rp), (u_int32_t)NUM_ENT(lp),
336*7c478bd9Sstevel@tonic-gate 		    tp == NULL ? 0 : PGNO(tp),
337*7c478bd9Sstevel@tonic-gate 		    tp == NULL ? &__lsn : &LSN(tp), &__a)) != 0)
338*7c478bd9Sstevel@tonic-gate 			goto err;
339*7c478bd9Sstevel@tonic-gate 
340*7c478bd9Sstevel@tonic-gate 		LSN(lp) = LSN(rp) = LSN(cp->page);
341*7c478bd9Sstevel@tonic-gate 		if (tp != NULL)
342*7c478bd9Sstevel@tonic-gate 			LSN(tp) = LSN(cp->page);
343*7c478bd9Sstevel@tonic-gate 	}
344*7c478bd9Sstevel@tonic-gate 
345*7c478bd9Sstevel@tonic-gate 	/* Copy the allocated page into place. */
346*7c478bd9Sstevel@tonic-gate 	memcpy(cp->page, lp, LOFFSET(lp));
347*7c478bd9Sstevel@tonic-gate 	memcpy((u_int8_t *)cp->page + HOFFSET(lp),
348*7c478bd9Sstevel@tonic-gate 	    (u_int8_t *)lp + HOFFSET(lp), dbp->pgsize - HOFFSET(lp));
349*7c478bd9Sstevel@tonic-gate 	__os_free(lp, dbp->pgsize);
350*7c478bd9Sstevel@tonic-gate 	lp = NULL;
351*7c478bd9Sstevel@tonic-gate 
352*7c478bd9Sstevel@tonic-gate 	/* Finish the next-page link. */
353*7c478bd9Sstevel@tonic-gate 	if (tp != NULL)
354*7c478bd9Sstevel@tonic-gate 		tp->prev_pgno = rp->pgno;
355*7c478bd9Sstevel@tonic-gate 
356*7c478bd9Sstevel@tonic-gate 	/* Adjust any cursors.  Do so last so we don't have to undo it. */
357*7c478bd9Sstevel@tonic-gate 	__bam_ca_split(dbp, cp->page->pgno, cp->page->pgno, rp->pgno, split, 0);
358*7c478bd9Sstevel@tonic-gate 
359*7c478bd9Sstevel@tonic-gate 	/* Success -- write the real pages back to the store. */
360*7c478bd9Sstevel@tonic-gate 	(void)memp_fput(dbp->mpf, pp->page, DB_MPOOL_DIRTY);
361*7c478bd9Sstevel@tonic-gate 	(void)__BT_TLPUT(dbc, pp->lock);
362*7c478bd9Sstevel@tonic-gate 	(void)memp_fput(dbp->mpf, cp->page, DB_MPOOL_DIRTY);
363*7c478bd9Sstevel@tonic-gate 	(void)__BT_TLPUT(dbc, cp->lock);
364*7c478bd9Sstevel@tonic-gate 	(void)memp_fput(dbp->mpf, rp, DB_MPOOL_DIRTY);
365*7c478bd9Sstevel@tonic-gate 	if (tp != NULL) {
366*7c478bd9Sstevel@tonic-gate 		(void)memp_fput(dbp->mpf, tp, DB_MPOOL_DIRTY);
367*7c478bd9Sstevel@tonic-gate 		(void)__BT_TLPUT(dbc, tplock);
368*7c478bd9Sstevel@tonic-gate 	}
369*7c478bd9Sstevel@tonic-gate 	return (0);
370*7c478bd9Sstevel@tonic-gate 
371*7c478bd9Sstevel@tonic-gate err:	if (lp != NULL)
372*7c478bd9Sstevel@tonic-gate 		__os_free(lp, dbp->pgsize);
373*7c478bd9Sstevel@tonic-gate 	if (rp != NULL)
374*7c478bd9Sstevel@tonic-gate 		(void)__bam_free(dbc, rp);
375*7c478bd9Sstevel@tonic-gate 	if (tp != NULL) {
376*7c478bd9Sstevel@tonic-gate 		(void)memp_fput(dbp->mpf, tp, 0);
377*7c478bd9Sstevel@tonic-gate 		if (ret == DB_NEEDSPLIT)
378*7c478bd9Sstevel@tonic-gate 			(void)__BT_LPUT(dbc, tplock);
379*7c478bd9Sstevel@tonic-gate 		else
380*7c478bd9Sstevel@tonic-gate 			(void)__BT_TLPUT(dbc, tplock);
381*7c478bd9Sstevel@tonic-gate 	}
382*7c478bd9Sstevel@tonic-gate 	(void)memp_fput(dbp->mpf, pp->page, 0);
383*7c478bd9Sstevel@tonic-gate 	if (ret == DB_NEEDSPLIT)
384*7c478bd9Sstevel@tonic-gate 		(void)__BT_LPUT(dbc, pp->lock);
385*7c478bd9Sstevel@tonic-gate 	else
386*7c478bd9Sstevel@tonic-gate 		(void)__BT_TLPUT(dbc, pp->lock);
387*7c478bd9Sstevel@tonic-gate 	(void)memp_fput(dbp->mpf, cp->page, 0);
388*7c478bd9Sstevel@tonic-gate 	if (ret == DB_NEEDSPLIT)
389*7c478bd9Sstevel@tonic-gate 		(void)__BT_LPUT(dbc, cp->lock);
390*7c478bd9Sstevel@tonic-gate 	else
391*7c478bd9Sstevel@tonic-gate 		(void)__BT_TLPUT(dbc, cp->lock);
392*7c478bd9Sstevel@tonic-gate 	return (ret);
393*7c478bd9Sstevel@tonic-gate }
394*7c478bd9Sstevel@tonic-gate 
395*7c478bd9Sstevel@tonic-gate /*
396*7c478bd9Sstevel@tonic-gate  * __bam_broot --
397*7c478bd9Sstevel@tonic-gate  *	Fix up the btree root page after it has been split.
398*7c478bd9Sstevel@tonic-gate  */
399*7c478bd9Sstevel@tonic-gate static int
__bam_broot(dbc,rootp,lp,rp)400*7c478bd9Sstevel@tonic-gate __bam_broot(dbc, rootp, lp, rp)
401*7c478bd9Sstevel@tonic-gate 	DBC *dbc;
402*7c478bd9Sstevel@tonic-gate 	PAGE *rootp, *lp, *rp;
403*7c478bd9Sstevel@tonic-gate {
404*7c478bd9Sstevel@tonic-gate 	BINTERNAL bi, *child_bi;
405*7c478bd9Sstevel@tonic-gate 	BKEYDATA *child_bk;
406*7c478bd9Sstevel@tonic-gate 	DB *dbp;
407*7c478bd9Sstevel@tonic-gate 	DBT hdr, data;
408*7c478bd9Sstevel@tonic-gate 	int ret;
409*7c478bd9Sstevel@tonic-gate 
410*7c478bd9Sstevel@tonic-gate 	dbp = dbc->dbp;
411*7c478bd9Sstevel@tonic-gate 
412*7c478bd9Sstevel@tonic-gate 	/*
413*7c478bd9Sstevel@tonic-gate 	 * If the root page was a leaf page, change it into an internal page.
414*7c478bd9Sstevel@tonic-gate 	 * We copy the key we split on (but not the key's data, in the case of
415*7c478bd9Sstevel@tonic-gate 	 * a leaf page) to the new root page.
416*7c478bd9Sstevel@tonic-gate 	 */
417*7c478bd9Sstevel@tonic-gate 	P_INIT(rootp, dbp->pgsize,
418*7c478bd9Sstevel@tonic-gate 	    PGNO_ROOT, PGNO_INVALID, PGNO_INVALID, lp->level + 1, P_IBTREE);
419*7c478bd9Sstevel@tonic-gate 
420*7c478bd9Sstevel@tonic-gate 	memset(&data, 0, sizeof(data));
421*7c478bd9Sstevel@tonic-gate 	memset(&hdr, 0, sizeof(hdr));
422*7c478bd9Sstevel@tonic-gate 
423*7c478bd9Sstevel@tonic-gate 	/*
424*7c478bd9Sstevel@tonic-gate 	 * The btree comparison code guarantees that the left-most key on any
425*7c478bd9Sstevel@tonic-gate 	 * level of the tree is never used, so it doesn't need to be filled in.
426*7c478bd9Sstevel@tonic-gate 	 */
427*7c478bd9Sstevel@tonic-gate 	memset(&bi, 0, sizeof(bi));
428*7c478bd9Sstevel@tonic-gate 	bi.len = 0;
429*7c478bd9Sstevel@tonic-gate 	B_TSET(bi.type, B_KEYDATA, 0);
430*7c478bd9Sstevel@tonic-gate 	bi.pgno = lp->pgno;
431*7c478bd9Sstevel@tonic-gate 	if (F_ISSET(dbp, DB_BT_RECNUM)) {
432*7c478bd9Sstevel@tonic-gate 		bi.nrecs = __bam_total(lp);
433*7c478bd9Sstevel@tonic-gate 		RE_NREC_SET(rootp, bi.nrecs);
434*7c478bd9Sstevel@tonic-gate 	}
435*7c478bd9Sstevel@tonic-gate 	hdr.data = &bi;
436*7c478bd9Sstevel@tonic-gate 	hdr.size = SSZA(BINTERNAL, data);
437*7c478bd9Sstevel@tonic-gate 	if ((ret =
438*7c478bd9Sstevel@tonic-gate 	    __db_pitem(dbc, rootp, 0, BINTERNAL_SIZE(0), &hdr, NULL)) != 0)
439*7c478bd9Sstevel@tonic-gate 		return (ret);
440*7c478bd9Sstevel@tonic-gate 
441*7c478bd9Sstevel@tonic-gate 	switch (TYPE(rp)) {
442*7c478bd9Sstevel@tonic-gate 	case P_IBTREE:
443*7c478bd9Sstevel@tonic-gate 		/* Copy the first key of the child page onto the root page. */
444*7c478bd9Sstevel@tonic-gate 		child_bi = GET_BINTERNAL(rp, 0);
445*7c478bd9Sstevel@tonic-gate 
446*7c478bd9Sstevel@tonic-gate 		bi.len = child_bi->len;
447*7c478bd9Sstevel@tonic-gate 		B_TSET(bi.type, child_bi->type, 0);
448*7c478bd9Sstevel@tonic-gate 		bi.pgno = rp->pgno;
449*7c478bd9Sstevel@tonic-gate 		if (F_ISSET(dbp, DB_BT_RECNUM)) {
450*7c478bd9Sstevel@tonic-gate 			bi.nrecs = __bam_total(rp);
451*7c478bd9Sstevel@tonic-gate 			RE_NREC_ADJ(rootp, bi.nrecs);
452*7c478bd9Sstevel@tonic-gate 		}
453*7c478bd9Sstevel@tonic-gate 		hdr.data = &bi;
454*7c478bd9Sstevel@tonic-gate 		hdr.size = SSZA(BINTERNAL, data);
455*7c478bd9Sstevel@tonic-gate 		data.data = child_bi->data;
456*7c478bd9Sstevel@tonic-gate 		data.size = child_bi->len;
457*7c478bd9Sstevel@tonic-gate 		if ((ret = __db_pitem(dbc, rootp, 1,
458*7c478bd9Sstevel@tonic-gate 		    BINTERNAL_SIZE(child_bi->len), &hdr, &data)) != 0)
459*7c478bd9Sstevel@tonic-gate 			return (ret);
460*7c478bd9Sstevel@tonic-gate 
461*7c478bd9Sstevel@tonic-gate 		/* Increment the overflow ref count. */
462*7c478bd9Sstevel@tonic-gate 		if (B_TYPE(child_bi->type) == B_OVERFLOW)
463*7c478bd9Sstevel@tonic-gate 			if ((ret = __db_ovref(dbc,
464*7c478bd9Sstevel@tonic-gate 			    ((BOVERFLOW *)(child_bi->data))->pgno, 1)) != 0)
465*7c478bd9Sstevel@tonic-gate 				return (ret);
466*7c478bd9Sstevel@tonic-gate 		break;
467*7c478bd9Sstevel@tonic-gate 	case P_LBTREE:
468*7c478bd9Sstevel@tonic-gate 		/* Copy the first key of the child page onto the root page. */
469*7c478bd9Sstevel@tonic-gate 		child_bk = GET_BKEYDATA(rp, 0);
470*7c478bd9Sstevel@tonic-gate 		switch (B_TYPE(child_bk->type)) {
471*7c478bd9Sstevel@tonic-gate 		case B_KEYDATA:
472*7c478bd9Sstevel@tonic-gate 			bi.len = child_bk->len;
473*7c478bd9Sstevel@tonic-gate 			B_TSET(bi.type, child_bk->type, 0);
474*7c478bd9Sstevel@tonic-gate 			bi.pgno = rp->pgno;
475*7c478bd9Sstevel@tonic-gate 			if (F_ISSET(dbp, DB_BT_RECNUM)) {
476*7c478bd9Sstevel@tonic-gate 				bi.nrecs = __bam_total(rp);
477*7c478bd9Sstevel@tonic-gate 				RE_NREC_ADJ(rootp, bi.nrecs);
478*7c478bd9Sstevel@tonic-gate 			}
479*7c478bd9Sstevel@tonic-gate 			hdr.data = &bi;
480*7c478bd9Sstevel@tonic-gate 			hdr.size = SSZA(BINTERNAL, data);
481*7c478bd9Sstevel@tonic-gate 			data.data = child_bk->data;
482*7c478bd9Sstevel@tonic-gate 			data.size = child_bk->len;
483*7c478bd9Sstevel@tonic-gate 			if ((ret = __db_pitem(dbc, rootp, 1,
484*7c478bd9Sstevel@tonic-gate 			    BINTERNAL_SIZE(child_bk->len), &hdr, &data)) != 0)
485*7c478bd9Sstevel@tonic-gate 				return (ret);
486*7c478bd9Sstevel@tonic-gate 			break;
487*7c478bd9Sstevel@tonic-gate 		case B_DUPLICATE:
488*7c478bd9Sstevel@tonic-gate 		case B_OVERFLOW:
489*7c478bd9Sstevel@tonic-gate 			bi.len = BOVERFLOW_SIZE;
490*7c478bd9Sstevel@tonic-gate 			B_TSET(bi.type, child_bk->type, 0);
491*7c478bd9Sstevel@tonic-gate 			bi.pgno = rp->pgno;
492*7c478bd9Sstevel@tonic-gate 			if (F_ISSET(dbp, DB_BT_RECNUM)) {
493*7c478bd9Sstevel@tonic-gate 				bi.nrecs = __bam_total(rp);
494*7c478bd9Sstevel@tonic-gate 				RE_NREC_ADJ(rootp, bi.nrecs);
495*7c478bd9Sstevel@tonic-gate 			}
496*7c478bd9Sstevel@tonic-gate 			hdr.data = &bi;
497*7c478bd9Sstevel@tonic-gate 			hdr.size = SSZA(BINTERNAL, data);
498*7c478bd9Sstevel@tonic-gate 			data.data = child_bk;
499*7c478bd9Sstevel@tonic-gate 			data.size = BOVERFLOW_SIZE;
500*7c478bd9Sstevel@tonic-gate 			if ((ret = __db_pitem(dbc, rootp, 1,
501*7c478bd9Sstevel@tonic-gate 			    BINTERNAL_SIZE(BOVERFLOW_SIZE), &hdr, &data)) != 0)
502*7c478bd9Sstevel@tonic-gate 				return (ret);
503*7c478bd9Sstevel@tonic-gate 
504*7c478bd9Sstevel@tonic-gate 			/* Increment the overflow ref count. */
505*7c478bd9Sstevel@tonic-gate 			if (B_TYPE(child_bk->type) == B_OVERFLOW)
506*7c478bd9Sstevel@tonic-gate 				if ((ret = __db_ovref(dbc,
507*7c478bd9Sstevel@tonic-gate 				    ((BOVERFLOW *)child_bk)->pgno, 1)) != 0)
508*7c478bd9Sstevel@tonic-gate 					return (ret);
509*7c478bd9Sstevel@tonic-gate 			break;
510*7c478bd9Sstevel@tonic-gate 		default:
511*7c478bd9Sstevel@tonic-gate 			return (__db_pgfmt(dbp, rp->pgno));
512*7c478bd9Sstevel@tonic-gate 		}
513*7c478bd9Sstevel@tonic-gate 		break;
514*7c478bd9Sstevel@tonic-gate 	default:
515*7c478bd9Sstevel@tonic-gate 		return (__db_pgfmt(dbp, rp->pgno));
516*7c478bd9Sstevel@tonic-gate 	}
517*7c478bd9Sstevel@tonic-gate 	return (0);
518*7c478bd9Sstevel@tonic-gate }
519*7c478bd9Sstevel@tonic-gate 
520*7c478bd9Sstevel@tonic-gate /*
521*7c478bd9Sstevel@tonic-gate  * __ram_root --
522*7c478bd9Sstevel@tonic-gate  *	Fix up the recno root page after it has been split.
523*7c478bd9Sstevel@tonic-gate  */
524*7c478bd9Sstevel@tonic-gate static int
__ram_root(dbc,rootp,lp,rp)525*7c478bd9Sstevel@tonic-gate __ram_root(dbc, rootp, lp, rp)
526*7c478bd9Sstevel@tonic-gate 	DBC *dbc;
527*7c478bd9Sstevel@tonic-gate 	PAGE *rootp, *lp, *rp;
528*7c478bd9Sstevel@tonic-gate {
529*7c478bd9Sstevel@tonic-gate 	DB *dbp;
530*7c478bd9Sstevel@tonic-gate 	DBT hdr;
531*7c478bd9Sstevel@tonic-gate 	RINTERNAL ri;
532*7c478bd9Sstevel@tonic-gate 	int ret;
533*7c478bd9Sstevel@tonic-gate 
534*7c478bd9Sstevel@tonic-gate 	dbp = dbc->dbp;
535*7c478bd9Sstevel@tonic-gate 
536*7c478bd9Sstevel@tonic-gate 	/* Initialize the page. */
537*7c478bd9Sstevel@tonic-gate 	P_INIT(rootp, dbp->pgsize,
538*7c478bd9Sstevel@tonic-gate 	    PGNO_ROOT, PGNO_INVALID, PGNO_INVALID, lp->level + 1, P_IRECNO);
539*7c478bd9Sstevel@tonic-gate 
540*7c478bd9Sstevel@tonic-gate 	/* Initialize the header. */
541*7c478bd9Sstevel@tonic-gate 	memset(&hdr, 0, sizeof(hdr));
542*7c478bd9Sstevel@tonic-gate 	hdr.data = &ri;
543*7c478bd9Sstevel@tonic-gate 	hdr.size = RINTERNAL_SIZE;
544*7c478bd9Sstevel@tonic-gate 
545*7c478bd9Sstevel@tonic-gate 	/* Insert the left and right keys, set the header information. */
546*7c478bd9Sstevel@tonic-gate 	ri.pgno = lp->pgno;
547*7c478bd9Sstevel@tonic-gate 	ri.nrecs = __bam_total(lp);
548*7c478bd9Sstevel@tonic-gate 	if ((ret = __db_pitem(dbc, rootp, 0, RINTERNAL_SIZE, &hdr, NULL)) != 0)
549*7c478bd9Sstevel@tonic-gate 		return (ret);
550*7c478bd9Sstevel@tonic-gate 	RE_NREC_SET(rootp, ri.nrecs);
551*7c478bd9Sstevel@tonic-gate 	ri.pgno = rp->pgno;
552*7c478bd9Sstevel@tonic-gate 	ri.nrecs = __bam_total(rp);
553*7c478bd9Sstevel@tonic-gate 	if ((ret = __db_pitem(dbc, rootp, 1, RINTERNAL_SIZE, &hdr, NULL)) != 0)
554*7c478bd9Sstevel@tonic-gate 		return (ret);
555*7c478bd9Sstevel@tonic-gate 	RE_NREC_ADJ(rootp, ri.nrecs);
556*7c478bd9Sstevel@tonic-gate 	return (0);
557*7c478bd9Sstevel@tonic-gate }
558*7c478bd9Sstevel@tonic-gate 
559*7c478bd9Sstevel@tonic-gate /*
560*7c478bd9Sstevel@tonic-gate  * __bam_pinsert --
561*7c478bd9Sstevel@tonic-gate  *	Insert a new key into a parent page, completing the split.
562*7c478bd9Sstevel@tonic-gate  */
563*7c478bd9Sstevel@tonic-gate static int
__bam_pinsert(dbc,parent,lchild,rchild)564*7c478bd9Sstevel@tonic-gate __bam_pinsert(dbc, parent, lchild, rchild)
565*7c478bd9Sstevel@tonic-gate 	DBC *dbc;
566*7c478bd9Sstevel@tonic-gate 	EPG *parent;
567*7c478bd9Sstevel@tonic-gate 	PAGE *lchild, *rchild;
568*7c478bd9Sstevel@tonic-gate {
569*7c478bd9Sstevel@tonic-gate 	BINTERNAL bi, *child_bi;
570*7c478bd9Sstevel@tonic-gate 	BKEYDATA *child_bk, *tmp_bk;
571*7c478bd9Sstevel@tonic-gate 	BTREE *t;
572*7c478bd9Sstevel@tonic-gate 	DB *dbp;
573*7c478bd9Sstevel@tonic-gate 	DBT a, b, hdr, data;
574*7c478bd9Sstevel@tonic-gate 	PAGE *ppage;
575*7c478bd9Sstevel@tonic-gate 	RINTERNAL ri;
576*7c478bd9Sstevel@tonic-gate 	db_indx_t off;
577*7c478bd9Sstevel@tonic-gate 	db_recno_t nrecs;
578*7c478bd9Sstevel@tonic-gate 	u_int32_t n, nbytes, nksize;
579*7c478bd9Sstevel@tonic-gate 	int ret;
580*7c478bd9Sstevel@tonic-gate 
581*7c478bd9Sstevel@tonic-gate 	dbp = dbc->dbp;
582*7c478bd9Sstevel@tonic-gate 	t = dbp->internal;
583*7c478bd9Sstevel@tonic-gate 	ppage = parent->page;
584*7c478bd9Sstevel@tonic-gate 
585*7c478bd9Sstevel@tonic-gate 	/* If handling record numbers, count records split to the right page. */
586*7c478bd9Sstevel@tonic-gate 	nrecs = dbp->type == DB_RECNO || F_ISSET(dbp, DB_BT_RECNUM) ?
587*7c478bd9Sstevel@tonic-gate 	    __bam_total(rchild) : 0;
588*7c478bd9Sstevel@tonic-gate 
589*7c478bd9Sstevel@tonic-gate 	/*
590*7c478bd9Sstevel@tonic-gate 	 * Now we insert the new page's first key into the parent page, which
591*7c478bd9Sstevel@tonic-gate 	 * completes the split.  The parent points to a PAGE and a page index
592*7c478bd9Sstevel@tonic-gate 	 * offset, where the new key goes ONE AFTER the index, because we split
593*7c478bd9Sstevel@tonic-gate 	 * to the right.
594*7c478bd9Sstevel@tonic-gate 	 *
595*7c478bd9Sstevel@tonic-gate 	 * XXX
596*7c478bd9Sstevel@tonic-gate 	 * Some btree algorithms replace the key for the old page as well as
597*7c478bd9Sstevel@tonic-gate 	 * the new page.  We don't, as there's no reason to believe that the
598*7c478bd9Sstevel@tonic-gate 	 * first key on the old page is any better than the key we have, and,
599*7c478bd9Sstevel@tonic-gate 	 * in the case of a key being placed at index 0 causing the split, the
600*7c478bd9Sstevel@tonic-gate 	 * key is unavailable.
601*7c478bd9Sstevel@tonic-gate 	 */
602*7c478bd9Sstevel@tonic-gate 	off = parent->indx + O_INDX;
603*7c478bd9Sstevel@tonic-gate 
604*7c478bd9Sstevel@tonic-gate 	/*
605*7c478bd9Sstevel@tonic-gate 	 * Calculate the space needed on the parent page.
606*7c478bd9Sstevel@tonic-gate 	 *
607*7c478bd9Sstevel@tonic-gate 	 * Prefix trees: space hack used when inserting into BINTERNAL pages.
608*7c478bd9Sstevel@tonic-gate 	 * Retain only what's needed to distinguish between the new entry and
609*7c478bd9Sstevel@tonic-gate 	 * the LAST entry on the page to its left.  If the keys compare equal,
610*7c478bd9Sstevel@tonic-gate 	 * retain the entire key.  We ignore overflow keys, and the entire key
611*7c478bd9Sstevel@tonic-gate 	 * must be retained for the next-to-leftmost key on the leftmost page
612*7c478bd9Sstevel@tonic-gate 	 * of each level, or the search will fail.  Applicable ONLY to internal
613*7c478bd9Sstevel@tonic-gate 	 * pages that have leaf pages as children.  Further reduction of the
614*7c478bd9Sstevel@tonic-gate 	 * key between pairs of internal pages loses too much information.
615*7c478bd9Sstevel@tonic-gate 	 */
616*7c478bd9Sstevel@tonic-gate 	switch (TYPE(rchild)) {
617*7c478bd9Sstevel@tonic-gate 	case P_IBTREE:
618*7c478bd9Sstevel@tonic-gate 		child_bi = GET_BINTERNAL(rchild, 0);
619*7c478bd9Sstevel@tonic-gate 		nbytes = BINTERNAL_PSIZE(child_bi->len);
620*7c478bd9Sstevel@tonic-gate 
621*7c478bd9Sstevel@tonic-gate 		if (P_FREESPACE(ppage) < nbytes)
622*7c478bd9Sstevel@tonic-gate 			return (DB_NEEDSPLIT);
623*7c478bd9Sstevel@tonic-gate 
624*7c478bd9Sstevel@tonic-gate 		/* Add a new record for the right page. */
625*7c478bd9Sstevel@tonic-gate 		memset(&bi, 0, sizeof(bi));
626*7c478bd9Sstevel@tonic-gate 		bi.len = child_bi->len;
627*7c478bd9Sstevel@tonic-gate 		B_TSET(bi.type, child_bi->type, 0);
628*7c478bd9Sstevel@tonic-gate 		bi.pgno = rchild->pgno;
629*7c478bd9Sstevel@tonic-gate 		bi.nrecs = nrecs;
630*7c478bd9Sstevel@tonic-gate 		memset(&hdr, 0, sizeof(hdr));
631*7c478bd9Sstevel@tonic-gate 		hdr.data = &bi;
632*7c478bd9Sstevel@tonic-gate 		hdr.size = SSZA(BINTERNAL, data);
633*7c478bd9Sstevel@tonic-gate 		memset(&data, 0, sizeof(data));
634*7c478bd9Sstevel@tonic-gate 		data.data = child_bi->data;
635*7c478bd9Sstevel@tonic-gate 		data.size = child_bi->len;
636*7c478bd9Sstevel@tonic-gate 		if ((ret = __db_pitem(dbc, ppage, off,
637*7c478bd9Sstevel@tonic-gate 		    BINTERNAL_SIZE(child_bi->len), &hdr, &data)) != 0)
638*7c478bd9Sstevel@tonic-gate 			return (ret);
639*7c478bd9Sstevel@tonic-gate 
640*7c478bd9Sstevel@tonic-gate 		/* Increment the overflow ref count. */
641*7c478bd9Sstevel@tonic-gate 		if (B_TYPE(child_bi->type) == B_OVERFLOW)
642*7c478bd9Sstevel@tonic-gate 			if ((ret = __db_ovref(dbc,
643*7c478bd9Sstevel@tonic-gate 			    ((BOVERFLOW *)(child_bi->data))->pgno, 1)) != 0)
644*7c478bd9Sstevel@tonic-gate 				return (ret);
645*7c478bd9Sstevel@tonic-gate 		break;
646*7c478bd9Sstevel@tonic-gate 	case P_LBTREE:
647*7c478bd9Sstevel@tonic-gate 		child_bk = GET_BKEYDATA(rchild, 0);
648*7c478bd9Sstevel@tonic-gate 		switch (B_TYPE(child_bk->type)) {
649*7c478bd9Sstevel@tonic-gate 		case B_KEYDATA:
650*7c478bd9Sstevel@tonic-gate 			nbytes = BINTERNAL_PSIZE(child_bk->len);
651*7c478bd9Sstevel@tonic-gate 			nksize = child_bk->len;
652*7c478bd9Sstevel@tonic-gate 			if (t->bt_prefix == NULL)
653*7c478bd9Sstevel@tonic-gate 				goto noprefix;
654*7c478bd9Sstevel@tonic-gate 			if (ppage->prev_pgno == PGNO_INVALID && off <= 1)
655*7c478bd9Sstevel@tonic-gate 				goto noprefix;
656*7c478bd9Sstevel@tonic-gate 			tmp_bk = GET_BKEYDATA(lchild, NUM_ENT(lchild) - P_INDX);
657*7c478bd9Sstevel@tonic-gate 			if (B_TYPE(tmp_bk->type) != B_KEYDATA)
658*7c478bd9Sstevel@tonic-gate 				goto noprefix;
659*7c478bd9Sstevel@tonic-gate 			memset(&a, 0, sizeof(a));
660*7c478bd9Sstevel@tonic-gate 			a.size = tmp_bk->len;
661*7c478bd9Sstevel@tonic-gate 			a.data = tmp_bk->data;
662*7c478bd9Sstevel@tonic-gate 			memset(&b, 0, sizeof(b));
663*7c478bd9Sstevel@tonic-gate 			b.size = child_bk->len;
664*7c478bd9Sstevel@tonic-gate 			b.data = child_bk->data;
665*7c478bd9Sstevel@tonic-gate 			nksize = t->bt_prefix(&a, &b);
666*7c478bd9Sstevel@tonic-gate 			if ((n = BINTERNAL_PSIZE(nksize)) < nbytes)
667*7c478bd9Sstevel@tonic-gate 				nbytes = n;
668*7c478bd9Sstevel@tonic-gate 			else
669*7c478bd9Sstevel@tonic-gate noprefix:			nksize = child_bk->len;
670*7c478bd9Sstevel@tonic-gate 
671*7c478bd9Sstevel@tonic-gate 			if (P_FREESPACE(ppage) < nbytes)
672*7c478bd9Sstevel@tonic-gate 				return (DB_NEEDSPLIT);
673*7c478bd9Sstevel@tonic-gate 
674*7c478bd9Sstevel@tonic-gate 			memset(&bi, 0, sizeof(bi));
675*7c478bd9Sstevel@tonic-gate 			bi.len = nksize;
676*7c478bd9Sstevel@tonic-gate 			B_TSET(bi.type, child_bk->type, 0);
677*7c478bd9Sstevel@tonic-gate 			bi.pgno = rchild->pgno;
678*7c478bd9Sstevel@tonic-gate 			bi.nrecs = nrecs;
679*7c478bd9Sstevel@tonic-gate 			memset(&hdr, 0, sizeof(hdr));
680*7c478bd9Sstevel@tonic-gate 			hdr.data = &bi;
681*7c478bd9Sstevel@tonic-gate 			hdr.size = SSZA(BINTERNAL, data);
682*7c478bd9Sstevel@tonic-gate 			memset(&data, 0, sizeof(data));
683*7c478bd9Sstevel@tonic-gate 			data.data = child_bk->data;
684*7c478bd9Sstevel@tonic-gate 			data.size = nksize;
685*7c478bd9Sstevel@tonic-gate 			if ((ret = __db_pitem(dbc, ppage, off,
686*7c478bd9Sstevel@tonic-gate 			    BINTERNAL_SIZE(nksize), &hdr, &data)) != 0)
687*7c478bd9Sstevel@tonic-gate 				return (ret);
688*7c478bd9Sstevel@tonic-gate 			break;
689*7c478bd9Sstevel@tonic-gate 		case B_DUPLICATE:
690*7c478bd9Sstevel@tonic-gate 		case B_OVERFLOW:
691*7c478bd9Sstevel@tonic-gate 			nbytes = BINTERNAL_PSIZE(BOVERFLOW_SIZE);
692*7c478bd9Sstevel@tonic-gate 
693*7c478bd9Sstevel@tonic-gate 			if (P_FREESPACE(ppage) < nbytes)
694*7c478bd9Sstevel@tonic-gate 				return (DB_NEEDSPLIT);
695*7c478bd9Sstevel@tonic-gate 
696*7c478bd9Sstevel@tonic-gate 			memset(&bi, 0, sizeof(bi));
697*7c478bd9Sstevel@tonic-gate 			bi.len = BOVERFLOW_SIZE;
698*7c478bd9Sstevel@tonic-gate 			B_TSET(bi.type, child_bk->type, 0);
699*7c478bd9Sstevel@tonic-gate 			bi.pgno = rchild->pgno;
700*7c478bd9Sstevel@tonic-gate 			bi.nrecs = nrecs;
701*7c478bd9Sstevel@tonic-gate 			memset(&hdr, 0, sizeof(hdr));
702*7c478bd9Sstevel@tonic-gate 			hdr.data = &bi;
703*7c478bd9Sstevel@tonic-gate 			hdr.size = SSZA(BINTERNAL, data);
704*7c478bd9Sstevel@tonic-gate 			memset(&data, 0, sizeof(data));
705*7c478bd9Sstevel@tonic-gate 			data.data = child_bk;
706*7c478bd9Sstevel@tonic-gate 			data.size = BOVERFLOW_SIZE;
707*7c478bd9Sstevel@tonic-gate 			if ((ret = __db_pitem(dbc, ppage, off,
708*7c478bd9Sstevel@tonic-gate 			    BINTERNAL_SIZE(BOVERFLOW_SIZE), &hdr, &data)) != 0)
709*7c478bd9Sstevel@tonic-gate 				return (ret);
710*7c478bd9Sstevel@tonic-gate 
711*7c478bd9Sstevel@tonic-gate 			/* Increment the overflow ref count. */
712*7c478bd9Sstevel@tonic-gate 			if (B_TYPE(child_bk->type) == B_OVERFLOW)
713*7c478bd9Sstevel@tonic-gate 				if ((ret = __db_ovref(dbc,
714*7c478bd9Sstevel@tonic-gate 				    ((BOVERFLOW *)child_bk)->pgno, 1)) != 0)
715*7c478bd9Sstevel@tonic-gate 					return (ret);
716*7c478bd9Sstevel@tonic-gate 			break;
717*7c478bd9Sstevel@tonic-gate 		default:
718*7c478bd9Sstevel@tonic-gate 			return (__db_pgfmt(dbp, rchild->pgno));
719*7c478bd9Sstevel@tonic-gate 		}
720*7c478bd9Sstevel@tonic-gate 		break;
721*7c478bd9Sstevel@tonic-gate 	case P_IRECNO:
722*7c478bd9Sstevel@tonic-gate 	case P_LRECNO:
723*7c478bd9Sstevel@tonic-gate 		nbytes = RINTERNAL_PSIZE;
724*7c478bd9Sstevel@tonic-gate 
725*7c478bd9Sstevel@tonic-gate 		if (P_FREESPACE(ppage) < nbytes)
726*7c478bd9Sstevel@tonic-gate 			return (DB_NEEDSPLIT);
727*7c478bd9Sstevel@tonic-gate 
728*7c478bd9Sstevel@tonic-gate 		/* Add a new record for the right page. */
729*7c478bd9Sstevel@tonic-gate 		memset(&hdr, 0, sizeof(hdr));
730*7c478bd9Sstevel@tonic-gate 		hdr.data = &ri;
731*7c478bd9Sstevel@tonic-gate 		hdr.size = RINTERNAL_SIZE;
732*7c478bd9Sstevel@tonic-gate 		ri.pgno = rchild->pgno;
733*7c478bd9Sstevel@tonic-gate 		ri.nrecs = nrecs;
734*7c478bd9Sstevel@tonic-gate 		if ((ret = __db_pitem(dbc,
735*7c478bd9Sstevel@tonic-gate 		    ppage, off, RINTERNAL_SIZE, &hdr, NULL)) != 0)
736*7c478bd9Sstevel@tonic-gate 			return (ret);
737*7c478bd9Sstevel@tonic-gate 		break;
738*7c478bd9Sstevel@tonic-gate 	default:
739*7c478bd9Sstevel@tonic-gate 		return (__db_pgfmt(dbp, rchild->pgno));
740*7c478bd9Sstevel@tonic-gate 	}
741*7c478bd9Sstevel@tonic-gate 
742*7c478bd9Sstevel@tonic-gate 	/* Adjust the parent page's left page record count. */
743*7c478bd9Sstevel@tonic-gate 	if (dbp->type == DB_RECNO || F_ISSET(dbp, DB_BT_RECNUM)) {
744*7c478bd9Sstevel@tonic-gate 		/* Log the change. */
745*7c478bd9Sstevel@tonic-gate 		if (DB_LOGGING(dbc) &&
746*7c478bd9Sstevel@tonic-gate 		    (ret = __bam_cadjust_log(dbp->dbenv->lg_info,
747*7c478bd9Sstevel@tonic-gate 		    dbc->txn, &LSN(ppage), 0, dbp->log_fileid,
748*7c478bd9Sstevel@tonic-gate 		    PGNO(ppage), &LSN(ppage), (u_int32_t)parent->indx,
749*7c478bd9Sstevel@tonic-gate 		    -(int32_t)nrecs, (int32_t)0)) != 0)
750*7c478bd9Sstevel@tonic-gate 			return (ret);
751*7c478bd9Sstevel@tonic-gate 
752*7c478bd9Sstevel@tonic-gate 		/* Update the left page count. */
753*7c478bd9Sstevel@tonic-gate 		if (dbp->type == DB_RECNO)
754*7c478bd9Sstevel@tonic-gate 			GET_RINTERNAL(ppage, parent->indx)->nrecs -= nrecs;
755*7c478bd9Sstevel@tonic-gate 		else
756*7c478bd9Sstevel@tonic-gate 			GET_BINTERNAL(ppage, parent->indx)->nrecs -= nrecs;
757*7c478bd9Sstevel@tonic-gate 	}
758*7c478bd9Sstevel@tonic-gate 
759*7c478bd9Sstevel@tonic-gate 	return (0);
760*7c478bd9Sstevel@tonic-gate }
761*7c478bd9Sstevel@tonic-gate 
762*7c478bd9Sstevel@tonic-gate /*
763*7c478bd9Sstevel@tonic-gate  * __bam_psplit --
764*7c478bd9Sstevel@tonic-gate  *	Do the real work of splitting the page.
765*7c478bd9Sstevel@tonic-gate  */
766*7c478bd9Sstevel@tonic-gate static int
__bam_psplit(dbc,cp,lp,rp,splitret)767*7c478bd9Sstevel@tonic-gate __bam_psplit(dbc, cp, lp, rp, splitret)
768*7c478bd9Sstevel@tonic-gate 	DBC *dbc;
769*7c478bd9Sstevel@tonic-gate 	EPG *cp;
770*7c478bd9Sstevel@tonic-gate 	PAGE *lp, *rp;
771*7c478bd9Sstevel@tonic-gate 	db_indx_t *splitret;
772*7c478bd9Sstevel@tonic-gate {
773*7c478bd9Sstevel@tonic-gate 	DB *dbp;
774*7c478bd9Sstevel@tonic-gate 	PAGE *pp;
775*7c478bd9Sstevel@tonic-gate 	db_indx_t half, nbytes, off, splitp, top;
776*7c478bd9Sstevel@tonic-gate 	int adjust, cnt, isbigkey, ret;
777*7c478bd9Sstevel@tonic-gate 
778*7c478bd9Sstevel@tonic-gate 	dbp = dbc->dbp;
779*7c478bd9Sstevel@tonic-gate 	pp = cp->page;
780*7c478bd9Sstevel@tonic-gate 	adjust = TYPE(pp) == P_LBTREE ? P_INDX : O_INDX;
781*7c478bd9Sstevel@tonic-gate 
782*7c478bd9Sstevel@tonic-gate 	/*
783*7c478bd9Sstevel@tonic-gate 	 * If we're splitting the first (last) page on a level because we're
784*7c478bd9Sstevel@tonic-gate 	 * inserting (appending) a key to it, it's likely that the data is
785*7c478bd9Sstevel@tonic-gate 	 * sorted.  Moving a single item to the new page is less work and can
786*7c478bd9Sstevel@tonic-gate 	 * push the fill factor higher than normal.  If we're wrong it's not
787*7c478bd9Sstevel@tonic-gate 	 * a big deal, we'll just do the split the right way next time.
788*7c478bd9Sstevel@tonic-gate 	 */
789*7c478bd9Sstevel@tonic-gate 	off = 0;
790*7c478bd9Sstevel@tonic-gate 	if (NEXT_PGNO(pp) == PGNO_INVALID &&
791*7c478bd9Sstevel@tonic-gate 	    ((ISINTERNAL(pp) && cp->indx == NUM_ENT(cp->page) - 1) ||
792*7c478bd9Sstevel@tonic-gate 	    (!ISINTERNAL(pp) && cp->indx == NUM_ENT(cp->page))))
793*7c478bd9Sstevel@tonic-gate 		off = NUM_ENT(cp->page) - adjust;
794*7c478bd9Sstevel@tonic-gate 	else if (PREV_PGNO(pp) == PGNO_INVALID && cp->indx == 0)
795*7c478bd9Sstevel@tonic-gate 		off = adjust;
796*7c478bd9Sstevel@tonic-gate 
797*7c478bd9Sstevel@tonic-gate 	if (off != 0)
798*7c478bd9Sstevel@tonic-gate 		goto sort;
799*7c478bd9Sstevel@tonic-gate 
800*7c478bd9Sstevel@tonic-gate 	/*
801*7c478bd9Sstevel@tonic-gate 	 * Split the data to the left and right pages.  Try not to split on
802*7c478bd9Sstevel@tonic-gate 	 * an overflow key.  (Overflow keys on internal pages will slow down
803*7c478bd9Sstevel@tonic-gate 	 * searches.)  Refuse to split in the middle of a set of duplicates.
804*7c478bd9Sstevel@tonic-gate 	 *
805*7c478bd9Sstevel@tonic-gate 	 * First, find the optimum place to split.
806*7c478bd9Sstevel@tonic-gate 	 *
807*7c478bd9Sstevel@tonic-gate 	 * It's possible to try and split past the last record on the page if
808*7c478bd9Sstevel@tonic-gate 	 * there's a very large record at the end of the page.  Make sure this
809*7c478bd9Sstevel@tonic-gate 	 * doesn't happen by bounding the check at the next-to-last entry on
810*7c478bd9Sstevel@tonic-gate 	 * the page.
811*7c478bd9Sstevel@tonic-gate 	 *
812*7c478bd9Sstevel@tonic-gate 	 * Note, we try and split half the data present on the page.  This is
813*7c478bd9Sstevel@tonic-gate 	 * because another process may have already split the page and left
814*7c478bd9Sstevel@tonic-gate 	 * it half empty.  We don't try and skip the split -- we don't know
815*7c478bd9Sstevel@tonic-gate 	 * how much space we're going to need on the page, and we may need up
816*7c478bd9Sstevel@tonic-gate 	 * to half the page for a big item, so there's no easy test to decide
817*7c478bd9Sstevel@tonic-gate 	 * if we need to split or not.  Besides, if two threads are inserting
818*7c478bd9Sstevel@tonic-gate 	 * data into the same place in the database, we're probably going to
819*7c478bd9Sstevel@tonic-gate 	 * need more space soon anyway.
820*7c478bd9Sstevel@tonic-gate 	 */
821*7c478bd9Sstevel@tonic-gate 	top = NUM_ENT(pp) - adjust;
822*7c478bd9Sstevel@tonic-gate 	half = (dbp->pgsize - HOFFSET(pp)) / 2;
823*7c478bd9Sstevel@tonic-gate 	for (nbytes = 0, off = 0; off < top && nbytes < half; ++off)
824*7c478bd9Sstevel@tonic-gate 		switch (TYPE(pp)) {
825*7c478bd9Sstevel@tonic-gate 		case P_IBTREE:
826*7c478bd9Sstevel@tonic-gate 			if (B_TYPE(GET_BINTERNAL(pp, off)->type) == B_KEYDATA)
827*7c478bd9Sstevel@tonic-gate 				nbytes +=
828*7c478bd9Sstevel@tonic-gate 				   BINTERNAL_SIZE(GET_BINTERNAL(pp, off)->len);
829*7c478bd9Sstevel@tonic-gate 			else
830*7c478bd9Sstevel@tonic-gate 				nbytes += BINTERNAL_SIZE(BOVERFLOW_SIZE);
831*7c478bd9Sstevel@tonic-gate 			break;
832*7c478bd9Sstevel@tonic-gate 		case P_LBTREE:
833*7c478bd9Sstevel@tonic-gate 			if (B_TYPE(GET_BKEYDATA(pp, off)->type) == B_KEYDATA)
834*7c478bd9Sstevel@tonic-gate 				nbytes +=
835*7c478bd9Sstevel@tonic-gate 				    BKEYDATA_SIZE(GET_BKEYDATA(pp, off)->len);
836*7c478bd9Sstevel@tonic-gate 			else
837*7c478bd9Sstevel@tonic-gate 				nbytes += BOVERFLOW_SIZE;
838*7c478bd9Sstevel@tonic-gate 
839*7c478bd9Sstevel@tonic-gate 			++off;
840*7c478bd9Sstevel@tonic-gate 			if (B_TYPE(GET_BKEYDATA(pp, off)->type) == B_KEYDATA)
841*7c478bd9Sstevel@tonic-gate 				nbytes +=
842*7c478bd9Sstevel@tonic-gate 				    BKEYDATA_SIZE(GET_BKEYDATA(pp, off)->len);
843*7c478bd9Sstevel@tonic-gate 			else
844*7c478bd9Sstevel@tonic-gate 				nbytes += BOVERFLOW_SIZE;
845*7c478bd9Sstevel@tonic-gate 			break;
846*7c478bd9Sstevel@tonic-gate 		case P_IRECNO:
847*7c478bd9Sstevel@tonic-gate 			nbytes += RINTERNAL_SIZE;
848*7c478bd9Sstevel@tonic-gate 			break;
849*7c478bd9Sstevel@tonic-gate 		case P_LRECNO:
850*7c478bd9Sstevel@tonic-gate 			nbytes += BKEYDATA_SIZE(GET_BKEYDATA(pp, off)->len);
851*7c478bd9Sstevel@tonic-gate 			break;
852*7c478bd9Sstevel@tonic-gate 		default:
853*7c478bd9Sstevel@tonic-gate 			return (__db_pgfmt(dbp, pp->pgno));
854*7c478bd9Sstevel@tonic-gate 		}
855*7c478bd9Sstevel@tonic-gate sort:	splitp = off;
856*7c478bd9Sstevel@tonic-gate 
857*7c478bd9Sstevel@tonic-gate 	/*
858*7c478bd9Sstevel@tonic-gate 	 * Splitp is either at or just past the optimum split point.  If
859*7c478bd9Sstevel@tonic-gate 	 * it's a big key, try and find something close by that's not.
860*7c478bd9Sstevel@tonic-gate 	 */
861*7c478bd9Sstevel@tonic-gate 	if (TYPE(pp) == P_IBTREE)
862*7c478bd9Sstevel@tonic-gate 		isbigkey = B_TYPE(GET_BINTERNAL(pp, off)->type) != B_KEYDATA;
863*7c478bd9Sstevel@tonic-gate 	else if (TYPE(pp) == P_LBTREE)
864*7c478bd9Sstevel@tonic-gate 		isbigkey = B_TYPE(GET_BKEYDATA(pp, off)->type) != B_KEYDATA;
865*7c478bd9Sstevel@tonic-gate 	else
866*7c478bd9Sstevel@tonic-gate 		isbigkey = 0;
867*7c478bd9Sstevel@tonic-gate 	if (isbigkey)
868*7c478bd9Sstevel@tonic-gate 		for (cnt = 1; cnt <= 3; ++cnt) {
869*7c478bd9Sstevel@tonic-gate 			off = splitp + cnt * adjust;
870*7c478bd9Sstevel@tonic-gate 			if (off < (db_indx_t)NUM_ENT(pp) &&
871*7c478bd9Sstevel@tonic-gate 			    ((TYPE(pp) == P_IBTREE &&
872*7c478bd9Sstevel@tonic-gate 			    B_TYPE(GET_BINTERNAL(pp,off)->type) == B_KEYDATA) ||
873*7c478bd9Sstevel@tonic-gate 			    B_TYPE(GET_BKEYDATA(pp, off)->type) == B_KEYDATA)) {
874*7c478bd9Sstevel@tonic-gate 				splitp = off;
875*7c478bd9Sstevel@tonic-gate 				break;
876*7c478bd9Sstevel@tonic-gate 			}
877*7c478bd9Sstevel@tonic-gate 			if (splitp <= (db_indx_t)(cnt * adjust))
878*7c478bd9Sstevel@tonic-gate 				continue;
879*7c478bd9Sstevel@tonic-gate 			off = splitp - cnt * adjust;
880*7c478bd9Sstevel@tonic-gate 			if (TYPE(pp) == P_IBTREE ?
881*7c478bd9Sstevel@tonic-gate 			    B_TYPE(GET_BINTERNAL(pp, off)->type) == B_KEYDATA :
882*7c478bd9Sstevel@tonic-gate 			    B_TYPE(GET_BKEYDATA(pp, off)->type) == B_KEYDATA) {
883*7c478bd9Sstevel@tonic-gate 				splitp = off;
884*7c478bd9Sstevel@tonic-gate 				break;
885*7c478bd9Sstevel@tonic-gate 			}
886*7c478bd9Sstevel@tonic-gate 		}
887*7c478bd9Sstevel@tonic-gate 
888*7c478bd9Sstevel@tonic-gate 	/*
889*7c478bd9Sstevel@tonic-gate 	 * We can't split in the middle a set of duplicates.  We know that
890*7c478bd9Sstevel@tonic-gate 	 * no duplicate set can take up more than about 25% of the page,
891*7c478bd9Sstevel@tonic-gate 	 * because that's the point where we push it off onto a duplicate
892*7c478bd9Sstevel@tonic-gate 	 * page set.  So, this loop can't be unbounded.
893*7c478bd9Sstevel@tonic-gate 	 */
894*7c478bd9Sstevel@tonic-gate 	if (F_ISSET(dbp, DB_AM_DUP) && TYPE(pp) == P_LBTREE &&
895*7c478bd9Sstevel@tonic-gate 	    pp->inp[splitp] == pp->inp[splitp - adjust])
896*7c478bd9Sstevel@tonic-gate 		for (cnt = 1;; ++cnt) {
897*7c478bd9Sstevel@tonic-gate 			off = splitp + cnt * adjust;
898*7c478bd9Sstevel@tonic-gate 			if (off < NUM_ENT(pp) &&
899*7c478bd9Sstevel@tonic-gate 			    pp->inp[splitp] != pp->inp[off]) {
900*7c478bd9Sstevel@tonic-gate 				splitp = off;
901*7c478bd9Sstevel@tonic-gate 				break;
902*7c478bd9Sstevel@tonic-gate 			}
903*7c478bd9Sstevel@tonic-gate 			if (splitp <= (db_indx_t)(cnt * adjust))
904*7c478bd9Sstevel@tonic-gate 				continue;
905*7c478bd9Sstevel@tonic-gate 			off = splitp - cnt * adjust;
906*7c478bd9Sstevel@tonic-gate 			if (pp->inp[splitp] != pp->inp[off]) {
907*7c478bd9Sstevel@tonic-gate 				splitp = off + adjust;
908*7c478bd9Sstevel@tonic-gate 				break;
909*7c478bd9Sstevel@tonic-gate 			}
910*7c478bd9Sstevel@tonic-gate 		}
911*7c478bd9Sstevel@tonic-gate 
912*7c478bd9Sstevel@tonic-gate 
913*7c478bd9Sstevel@tonic-gate 	/* We're going to split at splitp. */
914*7c478bd9Sstevel@tonic-gate 	if ((ret = __bam_copy(dbp, pp, lp, 0, splitp)) != 0)
915*7c478bd9Sstevel@tonic-gate 		return (ret);
916*7c478bd9Sstevel@tonic-gate 	if ((ret = __bam_copy(dbp, pp, rp, splitp, NUM_ENT(pp))) != 0)
917*7c478bd9Sstevel@tonic-gate 		return (ret);
918*7c478bd9Sstevel@tonic-gate 
919*7c478bd9Sstevel@tonic-gate 	*splitret = splitp;
920*7c478bd9Sstevel@tonic-gate 	return (0);
921*7c478bd9Sstevel@tonic-gate }
922*7c478bd9Sstevel@tonic-gate 
923*7c478bd9Sstevel@tonic-gate /*
924*7c478bd9Sstevel@tonic-gate  * __bam_copy --
925*7c478bd9Sstevel@tonic-gate  *	Copy a set of records from one page to another.
926*7c478bd9Sstevel@tonic-gate  *
927*7c478bd9Sstevel@tonic-gate  * PUBLIC: int __bam_copy __P((DB *, PAGE *, PAGE *, u_int32_t, u_int32_t));
928*7c478bd9Sstevel@tonic-gate  */
929*7c478bd9Sstevel@tonic-gate int
__bam_copy(dbp,pp,cp,nxt,stop)930*7c478bd9Sstevel@tonic-gate __bam_copy(dbp, pp, cp, nxt, stop)
931*7c478bd9Sstevel@tonic-gate 	DB *dbp;
932*7c478bd9Sstevel@tonic-gate 	PAGE *pp, *cp;
933*7c478bd9Sstevel@tonic-gate 	u_int32_t nxt, stop;
934*7c478bd9Sstevel@tonic-gate {
935*7c478bd9Sstevel@tonic-gate 	db_indx_t nbytes, off;
936*7c478bd9Sstevel@tonic-gate 
937*7c478bd9Sstevel@tonic-gate 	/*
938*7c478bd9Sstevel@tonic-gate 	 * Copy the rest of the data to the right page.  Nxt is the next
939*7c478bd9Sstevel@tonic-gate 	 * offset placed on the target page.
940*7c478bd9Sstevel@tonic-gate 	 */
941*7c478bd9Sstevel@tonic-gate 	for (off = 0; nxt < stop; ++nxt, ++NUM_ENT(cp), ++off) {
942*7c478bd9Sstevel@tonic-gate 		switch (TYPE(pp)) {
943*7c478bd9Sstevel@tonic-gate 		case P_IBTREE:
944*7c478bd9Sstevel@tonic-gate 			if (B_TYPE(GET_BINTERNAL(pp, nxt)->type) == B_KEYDATA)
945*7c478bd9Sstevel@tonic-gate 				nbytes =
946*7c478bd9Sstevel@tonic-gate 				    BINTERNAL_SIZE(GET_BINTERNAL(pp, nxt)->len);
947*7c478bd9Sstevel@tonic-gate 			else
948*7c478bd9Sstevel@tonic-gate 				nbytes = BINTERNAL_SIZE(BOVERFLOW_SIZE);
949*7c478bd9Sstevel@tonic-gate 			break;
950*7c478bd9Sstevel@tonic-gate 		case P_LBTREE:
951*7c478bd9Sstevel@tonic-gate 			/*
952*7c478bd9Sstevel@tonic-gate 			 * If we're on a key and it's a duplicate, just copy
953*7c478bd9Sstevel@tonic-gate 			 * the offset.
954*7c478bd9Sstevel@tonic-gate 			 */
955*7c478bd9Sstevel@tonic-gate 			if (off != 0 && (nxt % P_INDX) == 0 &&
956*7c478bd9Sstevel@tonic-gate 			    pp->inp[nxt] == pp->inp[nxt - P_INDX]) {
957*7c478bd9Sstevel@tonic-gate 				cp->inp[off] = cp->inp[off - P_INDX];
958*7c478bd9Sstevel@tonic-gate 				continue;
959*7c478bd9Sstevel@tonic-gate 			}
960*7c478bd9Sstevel@tonic-gate 			/* FALLTHROUGH */
961*7c478bd9Sstevel@tonic-gate 		case P_LRECNO:
962*7c478bd9Sstevel@tonic-gate 			if (B_TYPE(GET_BKEYDATA(pp, nxt)->type) == B_KEYDATA)
963*7c478bd9Sstevel@tonic-gate 				nbytes =
964*7c478bd9Sstevel@tonic-gate 				    BKEYDATA_SIZE(GET_BKEYDATA(pp, nxt)->len);
965*7c478bd9Sstevel@tonic-gate 			else
966*7c478bd9Sstevel@tonic-gate 				nbytes = BOVERFLOW_SIZE;
967*7c478bd9Sstevel@tonic-gate 			break;
968*7c478bd9Sstevel@tonic-gate 		case P_IRECNO:
969*7c478bd9Sstevel@tonic-gate 			nbytes = RINTERNAL_SIZE;
970*7c478bd9Sstevel@tonic-gate 			break;
971*7c478bd9Sstevel@tonic-gate 		default:
972*7c478bd9Sstevel@tonic-gate 			return (__db_pgfmt(dbp, pp->pgno));
973*7c478bd9Sstevel@tonic-gate 		}
974*7c478bd9Sstevel@tonic-gate 		cp->inp[off] = HOFFSET(cp) -= nbytes;
975*7c478bd9Sstevel@tonic-gate 		memcpy(P_ENTRY(cp, off), P_ENTRY(pp, nxt), nbytes);
976*7c478bd9Sstevel@tonic-gate 	}
977*7c478bd9Sstevel@tonic-gate 	return (0);
978*7c478bd9Sstevel@tonic-gate }
979